diff options
-rw-r--r-- | Makefile.am | 10 | ||||
-rw-r--r-- | deps/yxml.c | 1024 | ||||
-rw-r--r-- | deps/yxml.h | 134 | ||||
-rw-r--r-- | src/fl_load.c | 412 | ||||
-rw-r--r-- | src/xmlread.c | 537 |
5 files changed, 1360 insertions, 757 deletions
diff --git a/Makefile.am b/Makefile.am index 7cf3459..c9186c0 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,7 +1,7 @@ EXTRA_DIST=ChangeLog noinst_PROGRAMS= AM_CFLAGS=$(GLIB_CFLAGS) $(GNUTLS_CFLAGS) $(SQLITE_CFLAGS) -AM_CPPFLAGS=-I$(builddir)/src -I$(srcdir)/deps/ylib +AM_CPPFLAGS=-I$(builddir)/src -I$(srcdir)/deps -I$(srcdir)/deps/ylib @@ -37,8 +37,8 @@ makeheaders_SOURCES=deps/makeheaders.c noinst_LIBRARIES=libdeps.a -libdeps_a_SOURCES=deps/ylib/yuri.c -EXTRA_DIST+=deps/ylib/yuri.h +libdeps_a_SOURCES=deps/ylib/yuri.c deps/yxml.c +EXTRA_DIST+=deps/ylib/yuri.h deps/yxml.h bin_PROGRAMS=ncdc @@ -72,8 +72,7 @@ ncdc_SOURCES=\ src/uit_userlist.c\ src/ui_util.c\ src/util.c\ - src/vars.c\ - src/xmlread.c + src/vars.c auto_headers=$(ncdc_SOURCES:.c=.h) noinst_HEADERS=src/doc.h src/ncdc.h @@ -138,4 +137,3 @@ src/uit_userlist.$(OBJEXT): src/uit_userlist.h src/ui_util.$(OBJEXT): src/ui_util.h src/util.$(OBJEXT): src/util.h src/vars.$(OBJEXT): src/vars.h -src/xmlread.$(OBJEXT): src/xmlread.h diff --git a/deps/yxml.c b/deps/yxml.c new file mode 100644 index 0000000..17c2dd2 --- /dev/null +++ b/deps/yxml.c @@ -0,0 +1,1024 @@ +/* THIS FILE IS AUTOMATICALLY GENERATED, DO NOT EDIT! */ + +/* Copyright (c) 2013 Yoran Heling + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#include <yxml.h> +#include <string.h> + +typedef enum { + YXMLS_string, + YXMLS_attr0, + YXMLS_attr1, + YXMLS_attr2, + YXMLS_attr3, + YXMLS_attr4, + YXMLS_cd0, + YXMLS_cd1, + YXMLS_cd2, + YXMLS_comment0, + YXMLS_comment1, + YXMLS_comment2, + YXMLS_comment3, + YXMLS_comment4, + YXMLS_dt0, + YXMLS_dt1, + YXMLS_dt2, + YXMLS_dt3, + YXMLS_dt4, + YXMLS_elem0, + YXMLS_elem1, + YXMLS_elem2, + YXMLS_elem3, + YXMLS_enc0, + YXMLS_enc1, + YXMLS_enc2, + YXMLS_enc3, + YXMLS_etag0, + YXMLS_etag1, + YXMLS_etag2, + YXMLS_init, + YXMLS_le0, + YXMLS_le1, + YXMLS_le2, + YXMLS_le3, + YXMLS_lee1, + YXMLS_lee2, + YXMLS_leq0, + YXMLS_misc0, + YXMLS_misc1, + YXMLS_misc2, + YXMLS_misc2a, + YXMLS_misc3, + YXMLS_pi0, + YXMLS_pi1, + YXMLS_pi2, + YXMLS_pi3, + YXMLS_pi4, + YXMLS_std0, + YXMLS_std1, + YXMLS_std2, + YXMLS_std3, + YXMLS_ver0, + YXMLS_ver1, + YXMLS_ver2, + YXMLS_ver3, + YXMLS_xmldecl0, + YXMLS_xmldecl1, + YXMLS_xmldecl2, + YXMLS_xmldecl3, + YXMLS_xmldecl4, + YXMLS_xmldecl5, + YXMLS_xmldecl6, + YXMLS_xmldecl7 +} yxml_state_t; + + +#define yxml_isChar(c) 1 +/* 0xd should be part of SP, too, but yxml_parse() already normalizes that into 0xa */ +#define yxml_isSP(c) (c == 0x20 || c == 0x09 || c == 0x0a) +#define yxml_isAlpha(c) ((c|32)-'a' < 26) +#define yxml_isNum(c) (c-'0' < 10) +#define yxml_isHex(c) (yxml_isNum(c) || (c|32)-'a' < 6) +#define yxml_isEncName(c) (yxml_isAlpha(c) || yxml_isNum(c) || c == '.' 
|| c == '_' || c == '-') +#define yxml_isNameStart(c) (yxml_isAlpha(c) || c == ':' || c == '_' || c >= 128) +#define yxml_isName(c) (yxml_isNameStart(c) || yxml_isNum(c) || c == '-' || c == '.') +/* XXX: The valid characters are dependent on the quote char, hence the access to x->quote */ +#define yxml_isAttValue(c) (yxml_isChar(c) && c != x->quote && c != '<' && c != '&') +/* Anything between '&' and ';', the yxml_ref* functions will do further + * validation. Strictly speaking, this is "yxml_isName(c) || c == '#'", but + * this parser doesn't understand entities with '.', ':', etc, anyway. */ +#define yxml_isRef(c) (yxml_isNum(c) || yxml_isAlpha(c) || c == '#') + + +/* Set the given char value to ch (0<=ch<=255). + * This can't be done with simple assignment because char may be signed, and + * unsigned-to-signed overflow is implementation defined in C. This function + * /looks/ inefficient, but gcc compiles it down to a single movb instruction + * on x86, even with -O0. */ +static inline void yxml_setchar(char *dest, unsigned ch) { + unsigned char _ch = ch; + memcpy(dest, &_ch, 1); +} + + +/* Similar to yxml_setchar(), but will convert ch (any valid unicode point) to + * UTF-8 and appends a '\0'. dest must have room for at least 5 bytes. 
*/ +static void yxml_setutf8(char *dest, unsigned ch) { + if(ch <= 0x007F) + yxml_setchar(dest++, ch); + else if(ch <= 0x07FF) { + yxml_setchar(dest++, 0xC0 | (ch>>6)); + yxml_setchar(dest++, 0x80 | (ch & 0x3F)); + } else if(ch <= 0xFFFF) { + yxml_setchar(dest++, 0xE0 | (ch>>12)); + yxml_setchar(dest++, 0x80 | ((ch>>6) & 0x3F)); + yxml_setchar(dest++, 0x80 | (ch & 0x3F)); + } else { + yxml_setchar(dest++, 0xF0 | (ch>>18)); + yxml_setchar(dest++, 0x80 | ((ch>>12) & 0x3F)); + yxml_setchar(dest++, 0x80 | ((ch>>6) & 0x3F)); + yxml_setchar(dest++, 0x80 | (ch & 0x3F)); + } + *dest = 0; +} + + +static inline int yxml_datacontent(yxml_t *x, unsigned ch) { + yxml_setchar(x->data, ch); + x->data[1] = 0; + return YXML_CONTENT; +} + + +static inline int yxml_datapi1(yxml_t *x, unsigned ch) { + yxml_setchar(x->data, ch); + x->data[1] = 0; + return YXML_PICONTENT; +} + + +static inline int yxml_datapi2(yxml_t *x, unsigned ch) { + x->data[0] = '?'; + yxml_setchar(x->data+1, ch); + x->data[2] = 0; + return YXML_PICONTENT; +} + + +static inline int yxml_datacd1(yxml_t *x, unsigned ch) { + x->data[0] = ']'; + yxml_setchar(x->data+1, ch); + x->data[2] = 0; + return YXML_CONTENT; +} + + +static inline int yxml_datacd2(yxml_t *x, unsigned ch) { + x->data[0] = ']'; + x->data[1] = ']'; + yxml_setchar(x->data+2, ch); + x->data[3] = 0; + return YXML_CONTENT; +} + + +static inline int yxml_dataattr(yxml_t *x, unsigned ch) { + /* Normalize attribute values according to the XML spec section 3.3.3. */ + yxml_setchar(x->data, ch == 0x9 || ch == 0xa ? 
0x20 : ch); + x->data[1] = 0; + return YXML_ATTRVAL; +} + + +static int yxml_pushstack(yxml_t *x, char **res, unsigned ch) { + if(x->stacklen+2 >= x->stacksize) + return YXML_ESTACK; + x->stacklen++; + *res = (char *)x->stack+x->stacklen; + x->stack[x->stacklen] = ch; + x->stacklen++; + x->stack[x->stacklen] = 0; + return YXML_OK; +} + + +static int yxml_pushstackc(yxml_t *x, unsigned ch) { + if(x->stacklen+1 >= x->stacksize) + return YXML_ESTACK; + x->stack[x->stacklen] = ch; + x->stacklen++; + x->stack[x->stacklen] = 0; + return YXML_OK; +} + + +static void yxml_popstack(yxml_t *x) { + do + x->stacklen--; + while(x->stack[x->stacklen]); +} + + +static inline int yxml_elemstart (yxml_t *x, unsigned ch) { return yxml_pushstack(x, &x->elem, ch); } +static inline int yxml_elemname (yxml_t *x, unsigned ch) { return yxml_pushstackc(x, ch); } +static inline int yxml_elemnameend(yxml_t *x, unsigned ch) { return YXML_ELEMSTART; } + + +/* Also used in yxml_elemcloseend(), since this function just removes the last + * element from the stack and returns ELEMEND. 
*/ +static int yxml_selfclose(yxml_t *x, unsigned ch) { + yxml_popstack(x); + if(x->stacklen) { + x->elem = (char *)x->stack+x->stacklen-1; + while(*(x->elem-1)) + x->elem--; + return YXML_ELEMEND; + } + x->elem = (char *)x->stack; + x->state = YXMLS_misc3; + return YXML_ELEMEND; +} + + +static inline int yxml_elemclose(yxml_t *x, unsigned ch) { + if(*((unsigned char *)x->elem) != ch) + return YXML_ECLOSE; + x->elem++; + return YXML_OK; +} + + +static inline int yxml_elemcloseend(yxml_t *x, unsigned ch) { + if(*x->elem) + return YXML_ECLOSE; + return yxml_selfclose(x, ch); +} + + +static inline int yxml_attrstart (yxml_t *x, unsigned ch) { return yxml_pushstack(x, &x->attr, ch); } +static inline int yxml_attrname (yxml_t *x, unsigned ch) { return yxml_pushstackc(x, ch); } +static inline int yxml_attrnameend(yxml_t *x, unsigned ch) { return YXML_ATTRSTART; } +static inline int yxml_attrvalend (yxml_t *x, unsigned ch) { yxml_popstack(x); return YXML_ATTREND; } + + +static inline int yxml_pistart (yxml_t *x, unsigned ch) { return yxml_pushstack(x, &x->pi, ch); } +static inline int yxml_piname (yxml_t *x, unsigned ch) { return yxml_pushstackc(x, ch); } +static inline int yxml_pinameend(yxml_t *x, unsigned ch) { + return (x->pi[0]|32) == 'x' && (x->pi[1]|32) == 'm' && (x->pi[2]|32) == 'l' && !x->pi[3] ? 
YXML_ESYN : YXML_PISTART; +} +static inline int yxml_pivalend (yxml_t *x, unsigned ch) { yxml_popstack(x); x->pi = (char *)x->stack; return YXML_PIEND; } + + +static inline int yxml_refstart(yxml_t *x, unsigned ch) { + memset(x->data, 0, sizeof(x->data)); + x->reflen = 0; + return YXML_OK; +} + + +static int yxml_ref(yxml_t *x, unsigned ch) { + if(x->reflen >= sizeof(x->data)-1) + return YXML_EREF; + yxml_setchar(x->data+x->reflen, ch); + x->reflen++; + return YXML_OK; +} + + +static int yxml_refend(yxml_t *x, int ret) { + unsigned char *r = (unsigned char *)x->data; + unsigned ch = 0; + if(*r == '#') { + if(r[1] == 'x') + for(r += 2; yxml_isHex((unsigned)*r); r++) + ch = (ch<<4) + (*r <= '9' ? *r-'0' : (*r|32)-'a' + 10); + else + for(r++; yxml_isNum((unsigned)*r); r++) + ch = (ch*10) + (*r-'0'); + if(*r) + ch = 0; + } else { + uint64_t ri; + memcpy(&ri, r, 8); + if(ri == *((uint64_t *)"lt\0\0\0\0\0")) + ch = '<'; + else if(ri == *((uint64_t *)"gt\0\0\0\0\0")) + ch = '>'; + else if(ri == *((uint64_t *)"amp\0\0\0\0")) + ch = '&'; + else if(ri == *((uint64_t *)"apos\0\0\0")) + ch = '\''; + else if(ri == *((uint64_t *)"quot\0\0\0")) + ch = '"'; + } + + /* Codepoints not allowed in the XML 1.1 definition of a Char */ + if(!ch || ch > 0x10FFFF || ch == 0xFFFE || ch == 0xFFFF || (ch-0xDFFF) < 0x7FF) + return YXML_EREF; + yxml_setutf8(x->data, ch); + return ret; +} + + +static inline int yxml_refcontent(yxml_t *x, unsigned ch) { return yxml_refend(x, YXML_CONTENT); } +static inline int yxml_refattrval(yxml_t *x, unsigned ch) { return yxml_refend(x, YXML_ATTRVAL); } + + +void yxml_init(yxml_t *x, char *stack, size_t stacksize) { + memset(x, 0, sizeof(*x)); + x->line = 1; + x->stack = (unsigned char *)stack; + x->stacksize = stacksize; + *x->stack = 0; + x->elem = x->pi = (char *)x->stack; + x->state = YXMLS_init; +} + + +yxml_ret_t yxml_parse(yxml_t *x, int _ch) { + /* Ensure that characters are in the range of 0..255 rather than -128..127. 
+ * All character comparisons are done with positive integers. */ + unsigned ch = (unsigned)(_ch+256) & 0xff; + if(!ch) + return YXML_ESYN; + x->total++; + + /* End-of-Line normalization, "\rX", "\r\n" and "\n" are recognized and + * normalized to a single '\n' as per XML 1.0 section 2.11. XML 1.1 adds + * some non-ASCII character sequences to this list, but we can only handle + * ASCII here without making assumptions about the input encoding. */ + if(x->ignore == ch) { + x->ignore = 0; + return YXML_OK; + } + x->ignore = (ch == 0xd) * 0xa; + if(ch == 0xa || ch == 0xd) { + ch = 0xa; + x->line++; + x->byte = 0; + } + x->byte++; + + switch((yxml_state_t)x->state) { + case YXMLS_string: + if(ch == *x->string) { + x->string++; + if(!*x->string) + x->state = x->nextstate; + return YXML_OK; + } + break; + case YXMLS_attr0: + if(yxml_isName(ch)) + return yxml_attrname(x, ch); + if(yxml_isSP(ch)) { + x->state = YXMLS_attr1; + return yxml_attrnameend(x, ch); + } + if(ch == (unsigned char)'=') { + x->state = YXMLS_attr2; + return yxml_attrnameend(x, ch); + } + break; + case YXMLS_attr1: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'=') { + x->state = YXMLS_attr2; + return YXML_OK; + } + break; + case YXMLS_attr2: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'\'' || ch == (unsigned char)'"') { + x->state = YXMLS_attr3; + x->quote = ch; + return YXML_OK; + } + break; + case YXMLS_attr3: + if(yxml_isAttValue(ch)) + return yxml_dataattr(x, ch); + if(ch == (unsigned char)'&') { + x->state = YXMLS_attr4; + return yxml_refstart(x, ch); + } + if(x->quote == ch) { + x->state = YXMLS_elem2; + return yxml_attrvalend(x, ch); + } + break; + case YXMLS_attr4: + if(yxml_isRef(ch)) + return yxml_ref(x, ch); + if(ch == (unsigned char)'\x3b') { + x->state = YXMLS_attr3; + return yxml_refattrval(x, ch); + } + break; + case YXMLS_cd0: + if(ch == (unsigned char)']') { + x->state = YXMLS_cd1; + return YXML_OK; + } + if(yxml_isChar(ch)) + return 
yxml_datacontent(x, ch); + break; + case YXMLS_cd1: + if(ch == (unsigned char)']') { + x->state = YXMLS_cd2; + return YXML_OK; + } + if(yxml_isChar(ch)) { + x->state = YXMLS_cd0; + return yxml_datacd1(x, ch); + } + break; + case YXMLS_cd2: + if(ch == (unsigned char)']') + return yxml_datacontent(x, ch); + if(ch == (unsigned char)'>') { + x->state = YXMLS_misc2; + return YXML_OK; + } + if(yxml_isChar(ch)) { + x->state = YXMLS_cd0; + return yxml_datacd2(x, ch); + } + break; + case YXMLS_comment0: + if(ch == (unsigned char)'-') { + x->state = YXMLS_comment1; + return YXML_OK; + } + break; + case YXMLS_comment1: + if(ch == (unsigned char)'-') { + x->state = YXMLS_comment2; + return YXML_OK; + } + break; + case YXMLS_comment2: + if(ch == (unsigned char)'-') { + x->state = YXMLS_comment3; + return YXML_OK; + } + if(yxml_isChar(ch)) + return YXML_OK; + break; + case YXMLS_comment3: + if(ch == (unsigned char)'-') { + x->state = YXMLS_comment4; + return YXML_OK; + } + if(yxml_isChar(ch)) { + x->state = YXMLS_comment2; + return YXML_OK; + } + break; + case YXMLS_comment4: + if(ch == (unsigned char)'>') { + x->state = x->nextstate; + return YXML_OK; + } + break; + case YXMLS_dt0: + if(ch == (unsigned char)'>') { + x->state = YXMLS_misc1; + return YXML_OK; + } + if(ch == (unsigned char)'\'' || ch == (unsigned char)'"') { + x->state = YXMLS_dt1; + x->quote = ch; + x->nextstate = YXMLS_dt0; + return YXML_OK; + } + if(ch == (unsigned char)'<') { + x->state = YXMLS_dt2; + return YXML_OK; + } + if(yxml_isChar(ch)) + return YXML_OK; + break; + case YXMLS_dt1: + if(x->quote == ch) { + x->state = x->nextstate; + return YXML_OK; + } + if(yxml_isChar(ch)) + return YXML_OK; + break; + case YXMLS_dt2: + if(ch == (unsigned char)'?') { + x->state = YXMLS_pi0; + x->nextstate = YXMLS_dt0; + return YXML_OK; + } + if(ch == (unsigned char)'!') { + x->state = YXMLS_dt3; + return YXML_OK; + } + break; + case YXMLS_dt3: + if(ch == (unsigned char)'-') { + x->state = YXMLS_comment1; + x->nextstate = 
YXMLS_dt0; + return YXML_OK; + } + if(yxml_isChar(ch)) { + x->state = YXMLS_dt4; + return YXML_OK; + } + break; + case YXMLS_dt4: + if(ch == (unsigned char)'\'' || ch == (unsigned char)'"') { + x->state = YXMLS_dt1; + x->quote = ch; + x->nextstate = YXMLS_dt4; + return YXML_OK; + } + if(ch == (unsigned char)'>') { + x->state = YXMLS_dt0; + return YXML_OK; + } + if(yxml_isChar(ch)) + return YXML_OK; + break; + case YXMLS_elem0: + if(yxml_isName(ch)) + return yxml_elemname(x, ch); + if(yxml_isSP(ch)) { + x->state = YXMLS_elem1; + return yxml_elemnameend(x, ch); + } + if(ch == (unsigned char)'/') { + x->state = YXMLS_elem3; + return yxml_elemnameend(x, ch); + } + if(ch == (unsigned char)'>') { + x->state = YXMLS_misc2; + return yxml_elemnameend(x, ch); + } + break; + case YXMLS_elem1: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'/') { + x->state = YXMLS_elem3; + return YXML_OK; + } + if(ch == (unsigned char)'>') { + x->state = YXMLS_misc2; + return YXML_OK; + } + if(yxml_isNameStart(ch)) { + x->state = YXMLS_attr0; + return yxml_attrstart(x, ch); + } + break; + case YXMLS_elem2: + if(yxml_isSP(ch)) { + x->state = YXMLS_elem1; + return YXML_OK; + } + if(ch == (unsigned char)'/') { + x->state = YXMLS_elem3; + return YXML_OK; + } + if(ch == (unsigned char)'>') { + x->state = YXMLS_misc2; + return YXML_OK; + } + break; + case YXMLS_elem3: + if(ch == (unsigned char)'>') { + x->state = YXMLS_misc2; + return yxml_selfclose(x, ch); + } + break; + case YXMLS_enc0: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'=') { + x->state = YXMLS_enc1; + return YXML_OK; + } + break; + case YXMLS_enc1: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'\'' || ch == (unsigned char)'"') { + x->state = YXMLS_enc2; + x->quote = ch; + return YXML_OK; + } + break; + case YXMLS_enc2: + if(yxml_isAlpha(ch)) { + x->state = YXMLS_enc3; + return YXML_OK; + } + break; + case YXMLS_enc3: + if(yxml_isEncName(ch)) + return YXML_OK; + if(x->quote == ch) 
{ + x->state = YXMLS_xmldecl4; + return YXML_OK; + } + break; + case YXMLS_etag0: + if(yxml_isNameStart(ch)) { + x->state = YXMLS_etag1; + return yxml_elemclose(x, ch); + } + break; + case YXMLS_etag1: + if(yxml_isName(ch)) + return yxml_elemclose(x, ch); + if(yxml_isSP(ch)) { + x->state = YXMLS_etag2; + return yxml_elemcloseend(x, ch); + } + if(ch == (unsigned char)'>') { + x->state = YXMLS_misc2; + return yxml_elemcloseend(x, ch); + } + break; + case YXMLS_etag2: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'>') { + x->state = YXMLS_misc2; + return YXML_OK; + } + break; + case YXMLS_init: + if(ch == (unsigned char)'\xef') { + x->state = YXMLS_string; + x->nextstate = YXMLS_misc0; + x->string = (unsigned char *)"\xbb\xbf"; + return YXML_OK; + } + if(yxml_isSP(ch)) { + x->state = YXMLS_misc0; + return YXML_OK; + } + if(ch == (unsigned char)'<') { + x->state = YXMLS_le0; + return YXML_OK; + } + break; + case YXMLS_le0: + if(ch == (unsigned char)'!') { + x->state = YXMLS_lee1; + return YXML_OK; + } + if(ch == (unsigned char)'?') { + x->state = YXMLS_leq0; + return YXML_OK; + } + if(yxml_isNameStart(ch)) { + x->state = YXMLS_elem0; + return yxml_elemstart(x, ch); + } + break; + case YXMLS_le1: + if(ch == (unsigned char)'!') { + x->state = YXMLS_lee1; + return YXML_OK; + } + if(ch == (unsigned char)'?') { + x->state = YXMLS_pi0; + x->nextstate = YXMLS_misc1; + return YXML_OK; + } + if(yxml_isNameStart(ch)) { + x->state = YXMLS_elem0; + return yxml_elemstart(x, ch); + } + break; + case YXMLS_le2: + if(ch == (unsigned char)'!') { + x->state = YXMLS_lee2; + return YXML_OK; + } + if(ch == (unsigned char)'?') { + x->state = YXMLS_pi0; + x->nextstate = YXMLS_misc2; + return YXML_OK; + } + if(ch == (unsigned char)'/') { + x->state = YXMLS_etag0; + return YXML_OK; + } + if(yxml_isNameStart(ch)) { + x->state = YXMLS_elem0; + return yxml_elemstart(x, ch); + } + break; + case YXMLS_le3: + if(ch == (unsigned char)'!') { + x->state = YXMLS_comment0; + 
x->nextstate = YXMLS_misc3; + return YXML_OK; + } + if(ch == (unsigned char)'?') { + x->state = YXMLS_pi0; + x->nextstate = YXMLS_misc3; + return YXML_OK; + } + break; + case YXMLS_lee1: + if(ch == (unsigned char)'-') { + x->state = YXMLS_comment1; + x->nextstate = YXMLS_misc1; + return YXML_OK; + } + if(ch == (unsigned char)'D') { + x->state = YXMLS_string; + x->nextstate = YXMLS_dt0; + x->string = (unsigned char *)"OCTYPE"; + return YXML_OK; + } + break; + case YXMLS_lee2: + if(ch == (unsigned char)'-') { + x->state = YXMLS_comment1; + x->nextstate = YXMLS_misc2; + return YXML_OK; + } + if(ch == (unsigned char)'[') { + x->state = YXMLS_string; + x->nextstate = YXMLS_cd0; + x->string = (unsigned char *)"CDATA["; + return YXML_OK; + } + break; + case YXMLS_leq0: + if(ch == (unsigned char)'x') { + x->state = YXMLS_string; + x->nextstate = YXMLS_xmldecl0; + x->string = (unsigned char *)"ml"; + return YXML_OK; + } + if(yxml_isNameStart(ch)) { + x->state = YXMLS_pi1; + x->nextstate = YXMLS_misc1; + return yxml_pistart(x, ch); + } + break; + case YXMLS_misc0: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'<') { + x->state = YXMLS_le0; + return YXML_OK; + } + break; + case YXMLS_misc1: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'<') { + x->state = YXMLS_le1; + return YXML_OK; + } + break; + case YXMLS_misc2: + if(ch == (unsigned char)'<') { + x->state = YXMLS_le2; + return YXML_OK; + } + if(ch == (unsigned char)'&') { + x->state = YXMLS_misc2a; + return yxml_refstart(x, ch); + } + if(yxml_isChar(ch)) + return yxml_datacontent(x, ch); + break; + case YXMLS_misc2a: + if(yxml_isRef(ch)) + return yxml_ref(x, ch); + if(ch == (unsigned char)'\x3b') { + x->state = YXMLS_misc2; + return yxml_refcontent(x, ch); + } + break; + case YXMLS_misc3: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'<') { + x->state = YXMLS_le3; + return YXML_OK; + } + break; + case YXMLS_pi0: + if(yxml_isNameStart(ch)) { + x->state = YXMLS_pi1; + 
return yxml_pistart(x, ch); + } + break; + case YXMLS_pi1: + if(yxml_isName(ch)) + return yxml_piname(x, ch); + if(ch == (unsigned char)'?') { + x->state = YXMLS_pi4; + return yxml_pinameend(x, ch); + } + if(yxml_isSP(ch)) { + x->state = YXMLS_pi2; + return yxml_pinameend(x, ch); + } + break; + case YXMLS_pi2: + if(ch == (unsigned char)'?') { + x->state = YXMLS_pi3; + return YXML_OK; + } + if(yxml_isChar(ch)) + return yxml_datapi1(x, ch); + break; + case YXMLS_pi3: + if(ch == (unsigned char)'>') { + x->state = x->nextstate; + return yxml_pivalend(x, ch); + } + if(yxml_isChar(ch)) { + x->state = YXMLS_pi2; + return yxml_datapi2(x, ch); + } + break; + case YXMLS_pi4: + if(ch == (unsigned char)'>') { + x->state = x->nextstate; + return yxml_pivalend(x, ch); + } + break; + case YXMLS_std0: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'=') { + x->state = YXMLS_std1; + return YXML_OK; + } + break; + case YXMLS_std1: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'\'' || ch == (unsigned char)'"') { + x->state = YXMLS_std2; + x->quote = ch; + return YXML_OK; + } + break; + case YXMLS_std2: + if(ch == (unsigned char)'y') { + x->state = YXMLS_string; + x->nextstate = YXMLS_std3; + x->string = (unsigned char *)"es"; + return YXML_OK; + } + if(ch == (unsigned char)'n') { + x->state = YXMLS_string; + x->nextstate = YXMLS_std3; + x->string = (unsigned char *)"o"; + return YXML_OK; + } + break; + case YXMLS_std3: + if(x->quote == ch) { + x->state = YXMLS_xmldecl6; + return YXML_OK; + } + break; + case YXMLS_ver0: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'=') { + x->state = YXMLS_ver1; + return YXML_OK; + } + break; + case YXMLS_ver1: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'\'' || ch == (unsigned char)'"') { + x->state = YXMLS_string; + x->quote = ch; + x->nextstate = YXMLS_ver2; + x->string = (unsigned char *)"1."; + return YXML_OK; + } + break; + case YXMLS_ver2: + if(yxml_isNum(ch)) { + x->state 
= YXMLS_ver3; + return YXML_OK; + } + break; + case YXMLS_ver3: + if(yxml_isNum(ch)) + return YXML_OK; + if(x->quote == ch) { + x->state = YXMLS_xmldecl2; + return YXML_OK; + } + break; + case YXMLS_xmldecl0: + if(yxml_isSP(ch)) { + x->state = YXMLS_xmldecl1; + return YXML_OK; + } + break; + case YXMLS_xmldecl1: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'v') { + x->state = YXMLS_string; + x->nextstate = YXMLS_ver0; + x->string = (unsigned char *)"ersion"; + return YXML_OK; + } + break; + case YXMLS_xmldecl2: + if(yxml_isSP(ch)) { + x->state = YXMLS_xmldecl3; + return YXML_OK; + } + if(ch == (unsigned char)'?') { + x->state = YXMLS_xmldecl7; + return YXML_OK; + } + break; + case YXMLS_xmldecl3: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'?') { + x->state = YXMLS_xmldecl7; + return YXML_OK; + } + if(ch == (unsigned char)'e') { + x->state = YXMLS_string; + x->nextstate = YXMLS_enc0; + x->string = (unsigned char *)"ncoding"; + return YXML_OK; + } + if(ch == (unsigned char)'s') { + x->state = YXMLS_string; + x->nextstate = YXMLS_std0; + x->string = (unsigned char *)"tandalone"; + return YXML_OK; + } + break; + case YXMLS_xmldecl4: + if(yxml_isSP(ch)) { + x->state = YXMLS_xmldecl5; + return YXML_OK; + } + if(ch == (unsigned char)'?') { + x->state = YXMLS_xmldecl7; + return YXML_OK; + } + break; + case YXMLS_xmldecl5: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'?') { + x->state = YXMLS_xmldecl7; + return YXML_OK; + } + if(ch == (unsigned char)'s') { + x->state = YXMLS_string; + x->nextstate = YXMLS_std0; + x->string = (unsigned char *)"tandalone"; + return YXML_OK; + } + break; + case YXMLS_xmldecl6: + if(yxml_isSP(ch)) + return YXML_OK; + if(ch == (unsigned char)'?') { + x->state = YXMLS_xmldecl7; + return YXML_OK; + } + break; + case YXMLS_xmldecl7: + if(ch == (unsigned char)'>') { + x->state = YXMLS_misc1; + return YXML_OK; + } + break; + } + return YXML_ESYN; +} + + +yxml_ret_t yxml_eof(yxml_t *x) { + 
if(x->state != YXMLS_misc3) + return YXML_EEOF; + return YXML_OK; +} + + +/* vim: set noet sw=4 ts=4: */ diff --git a/deps/yxml.h b/deps/yxml.h new file mode 100644 index 0000000..f1ebaeb --- /dev/null +++ b/deps/yxml.h @@ -0,0 +1,134 @@ +/* Copyright (c) 2013 Yoran Heling + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#include <stdint.h> +#include <stddef.h> + + +typedef enum { + YXML_EEOF = -6, /* Unexpected EOF */ + YXML_EREF = -5, /* Invalid character or entity reference (&whatever;) */ + YXML_ECLOSE = -4, /* Close tag does not match open tag (<Tag> .. </OtherTag>) */ + YXML_ESTACK = -3, /* Stack overflow (too deeply nested tags or too long element/attribute name) */ + YXML_EATTR = -2, /* Too long attribute name */ + YXML_ESYN = -1, /* Syntax error (unexpected byte) */ + YXML_OK = 0, /* Character consumed, no new token present */ + YXML_ELEMSTART = 1, /* Start of an element: '<Tag ..' 
*/ + YXML_CONTENT = 2, /* Element content */ + YXML_ELEMEND = 3, /* End of an element: '.. />' or '</Tag>' */ + YXML_ATTRSTART = 4, /* Attribute: 'Name=..' */ + YXML_ATTRVAL = 5, /* Attribute value */ + YXML_ATTREND = 6, /* End of attribute '.."' */ + YXML_PISTART = 7, /* Start of a processing instruction */ + YXML_PICONTENT = 8, /* Content of a PI */ + YXML_PIEND = 9 /* End of a processing instruction */ +} yxml_ret_t; + +/* When, exactly, are tokens returned? + * + * <TagName + * '>' ELEMSTART + * '/' ELEMSTART, '>' ELEMEND + * ' ' ELEMSTART + * '>' + * '/', '>' ELEMEND + * Attr + * '=' ATTRSTART + * "X ATTRVAL + * 'Y' ATTRVAL + * 'Z' ATTRVAL + * '"' ATTREND + * '>' + * '/', '>' ELEMEND + * + * </TagName + * '>' ELEMEND + */ + + +typedef struct { + /* PUBLIC (read-only) */ + + /* Name of the current element, zero-length if not in any element. Changed + * after YXML_ELEMSTART. The pointer will remain valid up to and including + * the next non-YXML_ATTR* token, the pointed-to buffer will remain valid + * up to and including the YXML_ELEMEND for the corresponding element. */ + char *elem; + + /* The last read character(s) of an attribute value (YXML_ATTRVAL), element + * data (YXML_CONTENT), or processing instruction (YXML_PICONTENT). Changed + * after one of the respective YXML_ values is returned, and only valid + * until the next yxml_parse() call. Usually, this string only consists of + * a single byte, but multiple bytes are returned in the following cases: + * - "<?SomePI ?x ?>": The two characters "?x" + * - "<![CDATA[ ]x ]]>": The two characters "]x" + * - "<![CDATA[ ]]x ]]>": The three characters "]]x" + * - "&#N;" and "&#xN;", where dec(N) > 127. The referenced Unicode + * character is then encoded in multiple UTF-8 bytes. + */ + char data[8]; + + /* Name of the current attribute. Changed after YXML_ATTRSTART, valid up to + * and including the next YXML_ATTREND. 
*/ + char *attr; + + /* Name/target of the current processing instruction, zero-length if not in + * a PI. Changed after YXML_PISTART, valid up to (but excluding) + * the next YXML_PIEND. */ + char *pi; + + /* Line number, byte offset within that line, and total bytes read. These + * values refer to the position _after_ the last byte given to + * yxml_parse(). These are useful for debugging and error reporting. */ + uint64_t byte; + uint64_t total; + uint32_t line; + + + /* PRIVATE */ + int state; + unsigned char *stack; /* Stack of element names + attribute/PI name, separated by \0. Also starts with a \0. */ + size_t stacksize, stacklen; + unsigned reflen; + unsigned quote; + int nextstate; /* Used for '@' state remembering and for the "string" consuming state */ + unsigned ignore; + unsigned char *string; +} yxml_t; + + +void yxml_init(yxml_t *x, char *stack, size_t stacksize); + + +yxml_ret_t yxml_parse(yxml_t *x, int ch); + + +/* May be called after the last character has been given to yxml_parse(). + * Returns YXML_OK if the XML document is valid, YXML_EEOF otherwise. Using + * this function isn't really necessary, but can be used to detect documents + * that don't end correctly. In particular, an error is returned when the XML + * document did not contain a (complete) root element, or when the document + * ended while in a comment or processing instruction. */ +yxml_ret_t yxml_eof(yxml_t *x); + + +/* vim: set noet sw=4 ts=4: */ diff --git a/src/fl_load.c b/src/fl_load.c index c1a4825..f86846d 100644 --- a/src/fl_load.c +++ b/src/fl_load.c @@ -26,6 +26,16 @@ #include "ncdc.h" #include "fl_load.h" +#include <yxml.h> + + +#define STACKSIZE (8*1024) +#define READBUFSIZE (32*1024) + +// Only used for attributes that we care about, and those tend to be short, +// file names being the longest possible values. I am unaware of a filesystem +// that allows filenames longer than 256 bytes, so this should be a safe value. 
+#define MAXATTRVAL 1024 #define S_START 0 // waiting for <FileListing> @@ -34,97 +44,42 @@ #define S_INDIR 3 // In a <Directory>..</Directory> or <FileListing>..</FileListing> #define S_FILEOPEN 4 // In a <File ..> #define S_INFILE 5 // In a <File>..</File> -#define S_UNKNOWN 6 // In some tag we didn't recognize -#define S_END 7 // Received </FileListing> -typedef struct ctx_t { - BZFILE *fh_bz; - FILE *fh_f; - gboolean eof; +typedef struct ctx_t { gboolean local; int state; - char *name; char filetth[24]; gboolean filehastth; guint64 filesize; - gboolean dirincomplete; + char *name; fl_list_t *root; fl_list_t *cur; int unknown_level; -} ctx_t; - -static int readcb(void *context, char *buf, int len, GError **err) { - ctx_t *x = context; - - if(x->fh_bz) { - if(x->eof) - return 0; - int bzerr; - int r = BZ2_bzRead(&bzerr, x->fh_bz, buf, len); - if(bzerr != BZ_OK && bzerr != BZ_STREAM_END) { - g_set_error(err, 1, 0, "bzip2 decompression error (%d): %s", bzerr, g_strerror(errno)); - return -1; - } - if(bzerr == BZ_STREAM_END) - x->eof = TRUE; - return r; + int consume; + char *attrp; + char attr[MAXATTRVAL]; - } + yxml_t x; + char stack[STACKSIZE]; + char buf[READBUFSIZE]; +} ctx_t; - int r = fread(buf, 1, len, x->fh_f); - if(r < 0 && feof(x->fh_f)) - r = 0; - if(r < 0) - g_set_error(err, 1, 0, "Read error: %s", g_strerror(errno)); - return r; -} #define isvalidfilename(x) (\ !(((x)[0] == '.' && (!(x)[1] || ((x)[1] == '.' && !(x)[2])))) && !strchr((x), '/')) -static int entitycb(void *context, int type, const char *arg1, const char *arg2, GError **err) { - ctx_t *x = context; - //printf("%d,%d: %s, %s\n", x->state, type, arg1, arg2); - switch(x->state) { - - // The first token must always be a <FileListing> - case S_START: - if(type == XMLT_OPEN && g_ascii_strcasecmp(arg1, "FileListing") == 0) { - x->state = S_FLOPEN; - return 0; - } - break; - - // Any attributes in a <FileListing> are currently ignored. 
- case S_FLOPEN: - if(type == XMLT_ATTR) - return 0; - if(type == XMLT_ATTDONE) { - x->state = S_INDIR; - return 0; - } - break; - - // Handling the attributes of a Directory element. - case S_DIROPEN: - if(type == XMLT_ATTR && g_ascii_strcasecmp(arg1, "Name") == 0 && !x->name) { - x->name = g_utf8_validate(arg2, -1, NULL) ? g_strdup(arg2) : str_convert("UTF-8", "UTF-8", arg2); - if(!isvalidfilename(x->name)) { - g_set_error(err, 1, 0, "Invalid directory name"); - return -1; - } - return 0; - } - if(type == XMLT_ATTDONE) { +static void fl_load_token(ctx_t *x, yxml_ret_t r, GError **err) { + // Detect the end of the attributes for an open XML element. + if(r != YXML_ATTRSTART && r != YXML_ATTRVAL && r != YXML_ATTREND) { + if(x->state == S_DIROPEN) { if(!x->name) { - g_set_error(err, 1, 0, "Missing Name attribute in Directory element"); - return -1; + g_set_error_literal(err, 1, 0, "Missing Name attribute in Directory element"); + return; } - // Create the directory entry fl_list_t *new = fl_list_create(x->name, FALSE); new->isfile = FALSE; new->sub = g_ptr_array_new_with_free_func(fl_list_free); @@ -134,77 +89,12 @@ static int entitycb(void *context, int type, const char *arg1, const char *arg2, g_free(x->name); x->name = NULL; x->state = S_INDIR; - return 0; - } - // Ignore unknown or duplicate attributes. - if(type == XMLT_ATTR) - return 0; - break; - - // In a directory listing. - case S_INDIR: - if(type == XMLT_OPEN && g_ascii_strcasecmp(arg1, "Directory") == 0) { - x->state = S_DIROPEN; - return 0; - } - if(type == XMLT_OPEN && g_ascii_strcasecmp(arg1, "File") == 0) { - x->state = S_FILEOPEN; - return 0; - } - if(type == XMLT_OPEN) { - x->state = S_UNKNOWN; - x->unknown_level = 1; - return 0; - } - if(type == XMLT_CLOSE) { - char *expect = x->root == x->cur ? 
"FileListing" : "Directory"; - if(arg1 && g_ascii_strcasecmp(arg1, expect) != 0) { - g_set_error(err, 1, 0, "Invalid close tag, expected </%s> but got </%s>", expect, arg1); - return -1; - } - fl_list_sort(x->cur); - if(x->cur == x->root) - x->state = S_END; - else - x->cur = x->cur->parent; - return 0; - } - break; - // Handling the attributes of a File element. (If there are multiple - // attributes with the same name, only the first is used.) - case S_FILEOPEN: - if(type == XMLT_ATTR && g_ascii_strcasecmp(arg1, "Name") == 0 && !x->name) { - x->name = g_utf8_validate(arg2, -1, NULL) ? g_strdup(arg2) : str_convert("UTF-8", "UTF-8", arg2); - if(!isvalidfilename(x->name)) { - g_set_error(err, 1, 0, "Invalid file name"); - return -1; - } - return 0; - } - if(type == XMLT_ATTR && g_ascii_strcasecmp(arg1, "TTH") == 0 && !x->filehastth) { - if(!istth(arg2)) { - g_set_error(err, 1, 0, "Invalid TTH"); - return -1; - } - base32_decode(arg2, x->filetth); - x->filehastth = TRUE; - return 0; - } - if(type == XMLT_ATTR && g_ascii_strcasecmp(arg1, "Size") == 0 && x->filesize == G_MAXUINT64) { - char *end = NULL; - x->filesize = g_ascii_strtoull(arg2, &end, 10); - if(!end || *end) { - g_set_error(err, 1, 0, "Invalid file size"); - return -1; - } - return 0; - } - if(type == XMLT_ATTDONE) { + } else if(x->state == S_FILEOPEN) { if(!x->name || !x->filehastth || x->filesize == G_MAXUINT64) { g_set_error(err, 1, 0, "Missing %s attribute in File element", !x->name ? "Name" : !x->filehastth ? "TTH" : "Size"); - return -1; + return; } // Create the file entry fl_list_t *new = fl_list_create(x->name, x->local); @@ -219,115 +109,209 @@ static int entitycb(void *context, int type, const char *arg1, const char *arg2, g_free(x->name); x->name = NULL; x->state = S_INFILE; - return 0; - } - // Ignore unknown or duplicate attributes. - if(type == XMLT_ATTR) - return 0; - break; - // In a File element. Nothing is allowed here exept a close of the File - // element. (Really?) 
- case S_INFILE: - if(type == XMLT_CLOSE && (!arg1 || g_ascii_strcasecmp(arg1, "File") == 0)) { + } else if(x->state == S_FLOPEN) x->state = S_INDIR; - return 0; + } + + switch(r) { + case YXML_ELEMSTART: + if(x->unknown_level) + x->unknown_level++; + else if(x->state == S_START) { + if(g_ascii_strcasecmp(x->x.elem, "FileListing") == 0) + x->state = S_FLOPEN; + else + g_set_error_literal(err, 1, 0, "XML root element is not <FileListing>"); + } else { + if(g_ascii_strcasecmp(x->x.elem, "File") == 0) + x->state = S_FILEOPEN; + else if(g_ascii_strcasecmp(x->x.elem, "Directory") == 0) + x->state = S_DIROPEN; + else + x->unknown_level++; } break; - // No idea in what kind of tag we are, just count start/end tags so we can - // continue parsing when we're out of this unknown tag. - case S_UNKNOWN: - if(type == XMLT_OPEN) - x->unknown_level++; - else if(type == XMLT_CLOSE && !--x->unknown_level) + case YXML_ELEMEND: + if(x->unknown_level) + x->unknown_level--; + else if(x->state == S_INFILE) x->state = S_INDIR; - return 0; - } - - g_set_error(err, 1, 0, "Unexpected token in state %s: %s, %s", - x->state == S_START ? "START" : - x->state == S_FLOPEN ? "FLOPEN" : - x->state == S_DIROPEN ? "DIROPEN" : - x->state == S_INDIR ? "INDIR" : - x->state == S_FILEOPEN ? "FILEOPEN" : - x->state == S_INFILE ? "INFILE" : - x->state == S_END ? "END" : "UNKNOWN", - type == XMLT_OPEN ? "OPEN" : - type == XMLT_CLOSE ? "CLOSE" : - type == XMLT_ATTR ? "ATTR" : - type == XMLT_ATTDONE ? "ATTDONE" : "???", - arg1 ? 
arg1 : "<NULL>"); - return -1; -} - + else { + fl_list_sort(x->cur); + x->cur = x->cur->parent; + } + break; -static int ctx_open(ctx_t *x, const char *file, GError **err) { - memset(x, 0, sizeof(ctx_t)); + case YXML_ATTRSTART: + x->consume = !x->unknown_level && ( + (x->state == S_DIROPEN && g_ascii_strcasecmp(x->x.attr, "Name") == 0) || + (x->state == S_FILEOPEN && ( + g_ascii_strcasecmp(x->x.attr, "Name") == 0 || + g_ascii_strcasecmp(x->x.attr, "Size") == 0 || + g_ascii_strcasecmp(x->x.attr, "TTH") == 0 + )) + ); + x->attrp = x->attr; + break; - // open file - x->fh_f = fopen(file, "r"); - if(!x->fh_f) { - g_set_error_literal(err, 1, 0, g_strerror(errno)); - return -1; - } + case YXML_ATTRVAL: + if(!x->consume) + break; + if(x->attrp-x->attr > sizeof(x->attr)-5) { + g_set_error_literal(err, 1, 0, "Too long XML attribute"); + return; + } + char *v = x->x.data; + while(*v) + *(x->attrp++) = *(v++); + break; - // open BZ2 decompression - if(strlen(file) > 4 && strcmp(file+(strlen(file)-4), ".bz2") == 0) { - int bzerr; - x->fh_bz = BZ2_bzReadOpen(&bzerr, x->fh_f, 0, 0, NULL, 0); - if(bzerr != BZ_OK) { - g_set_error(err, 1, 0, "Unable to open bzip2 file (%d): %s", bzerr, g_strerror(errno)); - return -1; + case YXML_ATTREND: + if(!x->consume) + break; + *x->attrp = 0; + // Name, for either file or directory + if((*x->x.attr|32) == 'n' && !x->name) { + x->name = g_utf8_validate(x->attr, -1, NULL) ? 
g_strdup(x->attr) : str_convert("UTF-8", "UTF-8", x->attr); + if(!isvalidfilename(x->name)) + g_set_error_literal(err, 1, 0, "Invalid file name"); } - } + // TTH, for files + if((*x->x.attr|32) == 't' && !x->filehastth) { + if(!istth(x->attr)) + g_set_error_literal(err, 1, 0, "Invalid TTH"); + else { + base32_decode(x->attr, x->filetth); + x->filehastth = TRUE; + } + } + // Size, for files + if((*x->x.attr|32) == 's' && x->filesize == G_MAXUINT64) { + char *end = NULL; + x->filesize = g_ascii_strtoull(x->attr, &end, 10); + if(!end || *end) + g_set_error_literal(err, 1, 0, "Invalid file size"); + } + break; - return 0; + default: + break; + } } -static void ctx_close(ctx_t *x) { - if(x->fh_bz) { - int bzerr; - BZ2_bzReadClose(&bzerr, x->fh_bz); +static fl_list_t *fl_load_parse(FILE *fh, BZFILE *bzfh, gboolean local, GError **err) { + ctx_t *x = g_new(ctx_t, 1); + x->state = S_START; + x->root = fl_list_create("", FALSE); + x->root->sub = g_ptr_array_new_with_free_func(fl_list_free); + x->cur = x->root; + x->filesize = G_MAXUINT64; + x->local = local; + x->unknown_level = 0; + x->filehastth = FALSE; + x->name = NULL; + + yxml_init(&x->x, x->stack, STACKSIZE); + int buflen = 0; + int bzeof = 0; + + while(1) { + // Fill buffer + if(bzfh) { + if(bzeof) + break; + int bzerr; + buflen = BZ2_bzRead(&bzerr, bzfh, x->buf, READBUFSIZE); + if(bzerr == BZ_STREAM_END) + bzeof = 1; + else if(bzerr != BZ_OK) { + g_set_error(err, 1, 0, "bzip2 decompression error (%d): %s", bzerr, g_strerror(errno)); + break; + } + } else { + buflen = fread(x->buf, 1, READBUFSIZE, fh); + if(buflen < 0 && feof(fh)) + break; + if(buflen < 0) { + g_set_error(err, 1, 0, "Read error: %s", g_strerror(errno)); + break; + } + } + + // And parse + char *pbuf = x->buf; + while(!*err && buflen > 0) { + yxml_ret_t r = yxml_parse(&x->x, *pbuf); + pbuf++; + buflen--; + if(r == YXML_OK) + continue; + if(r < 0) { + g_set_error_literal(err, 1, 0, "XML parsing error"); + break; + } + fl_load_token(x, r, err); + } + 
if(*err) { + g_prefix_error(err, "Line %"G_GUINT32_FORMAT":%"G_GUINT64_FORMAT": ", x->x.line, x->x.byte); + break; + } } - if(x->fh_f) - fclose(x->fh_f); + if(!*err && yxml_eof(&x->x) < 0) + g_set_error_literal(err, 1, 0, "XML document did not end correctly"); - if(x->name) - g_free(x->name); + fl_list_t *root = x->root; + g_free(x->name); + g_free(x); + return root; } fl_list_t *fl_load(const char *file, GError **err, gboolean local) { g_return_val_if_fail(err == NULL || *err == NULL, NULL); - ctx_t x; + fl_list_t *root = NULL; + FILE *fh; + BZFILE *bzfh = NULL; GError *ierr = NULL; - if(ctx_open(&x, file, &ierr)) + + // open file + fh = fopen(file, "r"); + if(!fh) { + g_set_error_literal(&ierr, 1, 0, g_strerror(errno)); goto end; + } - x.state = S_START; - x.root = fl_list_create("", FALSE); - x.root->sub = g_ptr_array_new_with_free_func(fl_list_free); - x.cur = x.root; - x.filesize = G_MAXUINT64; - x.local = local; + // open BZ2 decompression + if(strlen(file) > 4 && strcmp(file+(strlen(file)-4), ".bz2") == 0) { + int bzerr; + bzfh = BZ2_bzReadOpen(&bzerr, fh, 0, 0, NULL, 0); + if(bzerr != BZ_OK) { + g_set_error(&ierr, 1, 0, "Unable to open bzip2 file (%d): %s", bzerr, g_strerror(errno)); + goto end; + } + } - if(xml_parse(entitycb, readcb, &x, &ierr)) - goto end; + root = fl_load_parse(fh, bzfh, local, &ierr); end: - g_return_val_if_fail(ierr || x.state == S_END, NULL); - ctx_close(&x); + if(bzfh) { + int bzerr; + BZ2_bzReadClose(&bzerr, bzfh); + } + if(fh) + fclose(fh); if(ierr) { g_propagate_error(err, ierr); - if(x.root) - fl_list_free(x.root); - x.root = NULL; + if(root) + fl_list_free(root); + root = NULL; } - return x.root; + return root; } diff --git a/src/xmlread.c b/src/xmlread.c deleted file mode 100644 index 9c0c716..0000000 --- a/src/xmlread.c +++ /dev/null @@ -1,537 +0,0 @@ -/* ncdc - NCurses Direct Connect client - - Copyright (c) 2011-2013 Yoran Heling - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this 
software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be included - in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -*/ - -/* This is a minimal XML stream parser designed for parsing ADC files.xml - * documents. As these documents don't tend to use the full XML specification, - * this parser lacks a few features: - * - * - Character entities (&#...;) are validated to be syntactically correct, but - * are otherwise ignored. - * - Only ASCII characters are allowed in element and attribute names, Unicode - * characters in these constructs result in an error. - * - The contents of attribute values are not validated to contain only - * characters in the allowed ranges. These values are passed to the - * application even if they don't form a valid UTF-8 sequence. The only - * exception to this is the 0 byte, which will result in an error. - * - Element contents (<Tag> ..contents.. </Tag>) are validated but otherwise - * ignored. - * - An element may have multiple attributes with the same name, it is assumed - * that the application handles this situation. - * - No validation is performed that open tags are properly closed. E.g. 
- * "<a></b>" is valid. The application is responsible for this validation. - * - The 'encoding' information in the <?xml ..> tag is ignored. - * - The following features are not supported, and will result in a parse error - * when present in the XML document: - * - CDATA sections (<![CDATA ..) - * - Processing instructions (<? .. ?> - * - Document type declaration (<!DOCTYPE ..>) - * - Attribute-list declarations (<!ATTLIST ..>) - * - Element type declarations (<!ELEMENT ..>) - * - Entity declarations (<!ENTITY ..>) - * - Conditional sections (<![IGNORE .. or <![INCLUDE ..) - * - Notation declarations (<!NOTATION ..>) - * - * (To my knowledge, the parser in DC++ and derivatives behave similarly). - * - * TODO: Since this parser is recursive, figure out some maximum bound on the - * stack space used. (There should be a maximum, limited by MAX_DEPTH) - */ - -#include "ncdc.h" -#include "xmlread.h" - - -#if INTERFACE - -#define XMLT_OPEN 1 // arg1 = tag name -#define XMLT_CLOSE 2 // arg1 = tag name or NULL for self-closing tags -#define XMLT_ATTR 3 // arg1 = name, arg2 = value (not validated to be correct UTF-8) -#define XMLT_ATTDONE 4 // no args, indicates that there are no more attributes for the last opened tag - -// Called whenever an XMLT_ entity has been found. Should return 0 to -// continue processing, anything else to abort. -typedef int (*xml_cb_t)(void *, int, const char *, const char *, GError **); - -// Read callback. Should return -1 on error, 0 on EOF, number of bytes read -// otherwise. -typedef int (*xml_read_t)(void *, char *, int, GError **); - -#endif - - -#define MAX_NAME 128 -#define MAX_ATTRVAL (8*1024) // this is more than enough for file lists. 
-#define MAX_DEPTH 50 -#define READ_BUF_SIZE (32*1024) - -typedef struct ctx_t { - xml_cb_t cb; - xml_read_t read; - void *dat; - - char name[MAX_NAME]; - char val[MAX_ATTRVAL]; - char readbuf[READ_BUF_SIZE]; - char *buf; - gboolean readeof; - int len; - - int level; - int line; - int byte; - GError *err; - jmp_buf jmp; -} ctx_t; - - - -// Helper functions - - -static void err(ctx_t *x, const char *fmt, ...) { - va_list arg; - va_start(arg, fmt); - if(!x->err) { - char *msg = g_strdup_vprintf(fmt, arg); - g_set_error(&x->err, 1, 0, "Line %d:%d: %s", x->line, x->byte, msg); - g_free(msg); - } - va_end(arg); - longjmp(x->jmp, 1); -} - - -static void callcb(ctx_t *x, int type, const char *arg1, const char *arg2) { - if(x->cb(x->dat, type, arg1, arg2, &x->err)) { - g_prefix_error(&x->err, "Line %d:%d: ", x->line, x->byte); - err(x, "Processing aborted by the application"); - } -} - - -// Make sure we have more than n bytes in the buffer. Returns the buffer -// length, which may be smaller on EOF. Also validates that the XML data does -// not contain the 0 byte (this simplifies error checking a bit). -static int fill(ctx_t *x, int n) { - if(G_LIKELY(x->len >= n)) - return x->len; - if(x->readeof) - return x->len; - - if(x->len > 0) - memmove(x->readbuf, x->buf, x->len); - x->buf = x->readbuf; - - do { - int r = x->read(x->dat, x->readbuf + x->len, READ_BUF_SIZE - x->len, &x->err); - if(r < 0) - err(x, "Parse error"); - if(!r) { - x->readeof = TRUE; - break; - } - if(memchr(x->readbuf + x->len, 0, r) != NULL) - err(x, "Invalid zero byte in XML data"); - x->len += r; - } while(x->len < n); - - return x->len; -} - - -// Require n bytes to be present, set error otherwise. 
-static void rfill(ctx_t *x, int n) { - if(G_UNLIKELY(n >= x->len) && fill(x, n) < n) - err(x, "Unexpected EOF"); -} - - -// consume some characters (also updates ->bytes and ->lines) -static void con(ctx_t *x, int n) { - int i = 0; - while(i < n) { - if(x->buf[i++] == '\n') { - x->line++; - x->byte = 0; - } - x->byte++; - } - x->buf += n; - x->len -= n; -} - - -// Validate and consume a string literal -static void lit(ctx_t *x, const char *str) { - int len = strlen(str); - rfill(x, len); - if(strncmp(x->buf, str, len) != 0) - err(x, "Expected '%s'", str); - con(x, len); -} - - - - -// Language definition - - -#define isWhiteSpace(x) (x == 0x20 || x == 0x09 || x == 0x0d || x == 0x0a) -#define isDecimal(x) ('0' <= x && x <= '9') -#define isHex(x) (isDecimal(x) || ('a' <= x && x <= 'f') || ('A' <= x && x <= 'F')) -#define isNameStartChar(x) (x == ':' || ('A' <= x && x <= 'Z') || x == '_' || ('a' <= x && x <= 'z')) -#define isNameChar(x) (isNameStartChar(x) || x == '-' || x == '.' || isDecimal(x)) -#define isCharData(x) (x != '&' && x != '<') - - -// Consumes whitespace until an other character or EOF was found. If req, then -// there must be at least one whitespace character, otherwise it's optional. -static void S(ctx_t *x, int req) { - if(req) { - rfill(x, 1); - if(!isWhiteSpace(*x->buf)) - err(x, "White space expected, got '%c'", *x->buf); - } - while((x->len > 0 || fill(x, 1) > 0) && isWhiteSpace(*x->buf)) - con(x, 1); -} - - -static void Eq(ctx_t *x) { - S(x, 0); - lit(x, "="); - S(x, 0); -} - - -// Parses a CharRef or EntityRef and writes the result to x->val+n, returning -// the number of bytes written (either 0 or 1). -// Note: CharRef's are parsed but ignored. This is what DC++ does, and -// simplifies things a bit. Custom EntityRefs are not supported, only those -// predefined in the XML standard can be used. -static int Reference(ctx_t *x, int n) { - con(x, 1); // Assuming the caller has already verified that this is indeed a Reference. 
- - // We're currently parsing [^;]* here, while the standard requires a (more - // strict) 'Name' token or a CharRef. This doesn't really matter, since we - // validate the contents of name later on. - char name[16] = {}; - int i = 0; - rfill(x, 1); - while(i < 15 && *x->buf != ';') { - name[i++] = *x->buf; - con(x, 1); - rfill(x, 1); - } - if(i >= 15) - err(x, "Entity name too long"); - con(x, 1); - - // Predefined entities -#define p(s, c) if(strcmp(name, s) == 0) {x->val[n] = c; return 1;} - p("lt", '<'); - p("gt", '>'); - p("amp", '&'); - p("apos", '\''); - p("quot", '"'); -#undef p - - // CharRefs - if(name[0] == '#' && name[1] == 'x') { - i = 2; - do - if(!isHex(name[i])) - err(x, "Invalid character reference '&%s;'", name); - while(++i < strlen(name)); - return 0; - } - - // decimal CharRef - if(name[0] == '#') { - i = 1; - do - if(!isDecimal(name[i])) - err(x, "Invalid character reference '&%s;'", name); - while(++i < strlen(name)); - return 0; - } - - // Anything else is an error - err(x, "Unknown entity reference '&%s;'", name); - return 0; -} - - -// Parses an attribute value and writes its (decoded) contents to x->val. 
-static void AttValue(ctx_t *x) { - rfill(x, 2); - char esc = *x->buf; - if(esc != '"' && esc != '\'') - err(x, "' or \" expected, got '%c'", *x->buf); - con(x, 1); - - int n = 0; - while(*x->buf != esc) { - if(*x->buf == '<') - err(x, "Invalid '<' in attribute value"); - if(n >= MAX_ATTRVAL-4) - err(x, "Too long attribute value."); - if(*x->buf == '&') - n += Reference(x, n); - else { - x->val[n++] = *x->buf; - con(x, 1); - } - rfill(x, 1); - } - x->val[n] = 0; - - if(*x->buf != esc) - err(x, "%c expected, got %c", esc, *x->buf); - con(x, 1); -} - - -static void comment(ctx_t *x) { - lit(x, "<!--"); - while(1) { - rfill(x, 3); - if(x->buf[0] == '-' && x->buf[1] == '-') { - if(x->buf[2] != '>') - err(x, "'--' not allowed in XML comment"); - con(x, 3); - break; - } - con(x, 1); - } -} - - -// Consumes any number of whitespace and comments. (So it's actually Misc*) -static void Misc(ctx_t *x) { - while(fill(x, 4) >= 4) { - if(strncmp(x->buf, "<!--", 4) == 0) { - comment(x); - continue; - } - if(!isWhiteSpace(*x->buf)) - break; - S(x, 0); - } - S(x, 0); -} - - -// Consumes a name and stores it in x->name. -static void Name(ctx_t *x) { - rfill(x, 1); - int n = 0; - if(!isNameStartChar(*x->buf)) - err(x, "Invalid character in element or attribute name"); - x->name[n++] = *x->buf; - con(x, 1); - while(n < MAX_NAME-1 && fill(x, 1) > 0 && isNameChar(*x->buf)) { - x->name[n++] = *x->buf; - con(x, 1); - } - if(n >= MAX_NAME-1) - err(x, "Too long element or attribute name"); - x->name[n] = 0; -} - - -// Returns the number of bytes consumed. 
-static int CharData(ctx_t *x) { - int r = 0; - while(fill(x, 3) >= 3) { - if(!isCharData(*x->buf)) - return r; - if(strncmp(x->buf, "]]>", 3) == 0) - err(x, "']]>' not allowed in content"); - r++; - con(x, 1); - } - - while(fill(x, 1) >= 1) { - if(!isCharData(*x->buf)) - return r; - r++; - con(x, 1); - } - return r; -} - - -static void element(ctx_t *x); - -static void content(ctx_t *x) { - CharData(x); - while(1) { - // Getting an EOF 2 bytes after content is always an error regardless of - // the content (since content always follows a close tag), so this rfill - // usage is safe. - rfill(x, 2); - if(x->buf[0] == '<' && x->buf[1] == '/') - return; - else if(x->buf[0] == '<' && x->buf[1] == '!') - comment(x); - else if(x->buf[0] == '<') - element(x); - else if(x->buf[0] == '&') - Reference(x, 0); - else if(!CharData(x)) // shouldn't happen, actually. - err(x, "Invalid character in content"); - } -} - - -static void element(ctx_t *x) { - if(x->level <= 0) - err(x, "Maximum element depth exceeded"); - - lit(x, "<"); - Name(x); - callcb(x, XMLT_OPEN, x->name, NULL); - - while(1) { - // Is this tag ending yet? 
- rfill(x, 1); - if(*x->buf == '>' || *x->buf == '/') - break; - S(x, 1); - if(*x->buf == '>' || *x->buf == '/') - break; - - // Otherwise, we have an attribute - Name(x); - Eq(x); - AttValue(x); - callcb(x, XMLT_ATTR, x->name, x->val); - } - - callcb(x, XMLT_ATTDONE, NULL, NULL); - - // EmptyElementTag - if(*x->buf == '/') { - lit(x, "/>"); - callcb(x, XMLT_CLOSE, NULL, NULL); - return; - } - - // Otherwise, this was an STag - lit(x, ">"); - x->level--; - content(x); - x->level++; - lit(x, "</"); - Name(x); - lit(x, ">"); - callcb(x, XMLT_CLOSE, x->name, NULL); -} - - -static void XMLDecl(ctx_t *x) { - if(fill(x, 5) < 5 || strncmp(x->buf, "<?xml", 5) != 0) - return; - - con(x, 5); - S(x, 1); - - // version - lit(x, "version"); - Eq(x); - AttValue(x); - if(x->val[0] != '1' || x->val[1] != '.') - err(x, "Invalid XML version"); - int i = 2; - do - if(!isDecimal(x->val[i])) - err(x, "Invalid XML version"); - while(++i < strlen(x->val)); - - // Accepts either whitespace or a '?' to signal the end of this XML - // declaration. -#define se rfill(x, 1); if(x->buf[0] == '?') goto end; S(x, 1); rfill(x, 1); if(x->buf[0] == '?') goto end; - - // encoding - se - if(x->buf[0] == 'e') { - lit(x, "encoding"); - Eq(x); - AttValue(x); - se - } - - // standalone - lit(x, "standalone"); - Eq(x); - AttValue(x); - if(strcmp(x->val, "yes") != 0 && strcmp(x->val, "no") != 0) - err(x, "Invalid value for \"standalone\""); - S(x, 0); -#undef se - -end: - lit(x, "?>"); -} - - - - -// Parses the complete XML document, returns 0 on success or -1 on error. -int xml_parse(xml_cb_t cb, xml_read_t read, void *dat, GError **e) { - // Don't allocate this the stack, it's fairly large. 
- ctx_t *x = g_new(ctx_t, 1); - x->dat = dat; - x->cb = cb; - x->read = read; - - x->buf = x->readbuf; - x->readeof = FALSE; - x->len = 0; - - x->line = x->byte = 1; - x->level = MAX_DEPTH; - x->err = NULL; - - if(!setjmp(x->jmp)) { - // UTF-8 BOM - if(fill(x, 3) >= 3 && strncmp(x->buf, "\xef\xbb\xbf", 3) == 0) - con(x, 3); - XMLDecl(x); - Misc(x); - element(x); - Misc(x); - // We should have consumed everything now. - if(fill(x, 1)) - err(x, "Expected end-of-file"); - } - - if(x->err) - g_propagate_error(e, x->err); - g_free(x); - return 0; -} - |