/* To compile, ensure that khash.h from https://github.com/attractivechaos/klib * and yxml.c and yxml.h from http://dev.yorhel.nl/yxml are available in the * same directory at this file, and then run: * * gcc -Wall -Wextra -O2 -I. yxml.c dcfilestats.c -lbz2 -o dcfilestats * * And then to analyze a directory containing file lists: * * ./dcfilestats /path/to/dir * * If your directory contains more than NUMLISTS (see the #define below) files, * make sure to modify that define and recompile. * * Progress will be written to standard error, some final stats to standard * out, and more detailed stats to the following files: * * dcfiledist * dcfilesize * dclistsize * dcnumfiles * * More information and stats are available at http://dev.yorhel.nl/yxml * * Copyright (c) 2014 by Yoran Heling * License: MIT */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NUMLISTS 14200 /* Upper bound on the number of file lists we're processing */ typedef struct { char tth[24]; uint16_t num; uint8_t bin; } file_t; static inline khint_t file_hash_func_(void *tth) { return *((khint_t*)tth); } #define file_hash_func(k) file_hash_func_((k).tth) #define file_equal_func(a, b) (memcmp((a).tth, (b).tth, 24) == 0) KHASH_INIT(h, file_t, int, 0, file_hash_func, file_equal_func); static uint64_t total_comp, total_uncomp; static uint32_t total_files, cur_files, total_unique, cur_unique; static uint16_t num_lists; static bool x_infile, x_intth, x_insize; static uint64_t x_size; static char *x_tthval, x_tthbuf[40]; static khash_t(h) *cur_hash, *total_hash; static FILE *dcfiledist, *dcfilesize, *dclistsize, *dcnumfiles; /* Stolen from Globster; http://g.blicky.net/globster.git/tree/src/util/base32.c */ static void base32_decode(const char *from, char *to, int len) { int i = 0, bits = 0, idx = 0, value = 0; while(idx < len) { value = (value << 5) | (from[i] <= '9' ? (26+(from[i]-'2')) : from[i]-'A'); i++; bits += 5; while(bits >= 8) { to[idx++] = (value >> (bits-8)) & 0xFF; bits -= 8; } } } static uint8_t size2bin(uint64_t size) { /* ..1k -> 0, ..2k -> 1, ..4k -> 2, ..8k -> 3, ..16k -> 4 */ uint64_t x = 1024; uint8_t n = 0; while(size > x) { x <<= 1; n++; } return n; } static void handlefile(const char *tth, uint64_t size) { cur_files++; total_files++; file_t f; base32_decode(tth, f.tth, 24); int r; khint_t i = kh_put(h, cur_hash, f, &r); if(r == 0) return; cur_unique++; memcpy(kh_key(cur_hash, i).tth, f.tth, 24); i = kh_put(h, total_hash, f, &r); file_t *p = &kh_key(total_hash, i); if(r != 0) { p->num = 0; p->bin = size2bin(size); memcpy(p->tth, f.tth, 24); total_unique++; } p->num++; } static int parsechar(yxml_t *x, char c) { char *tmp; yxml_ret_t r = yxml_parse(x, c); switch(r) { case YXML_ELEMSTART: x_infile = x->elem[0] == 'F' && x->elem[1] == 'i' && x->elem[2] == 'l' && x->elem[3] == 'e' && !x->elem[4]; x_tthval = x_tthbuf; x_size = UINT64_MAX; break; case YXML_ELEMEND: if(!x_infile) break; if(x_tthval-x_tthbuf != 39 || x_size == UINT64_MAX) fprintf(stderr, "\n Missing or invalid TTH or Size at %"PRIu32":%"PRIu64"\n", x->line, x->byte); else handlefile(x_tthbuf, x_size); x_infile = false; break; case YXML_ATTRSTART: x_intth = x->attr[0] == 'T' && x->attr[1] == 'T' && x->attr[2] == 'H' && !x->attr[3]; x_insize = x->attr[0] == 'S' && x->attr[1] == 'i' && x->attr[2] == 'z' && x->attr[3] == 'e' && !x->attr[4]; if(x_insize) x_size = 0; break; case YXML_ATTREND: x_intth = x_insize = false; break; case YXML_ATTRVAL: if(x_intth) { tmp = x->data; while((((unsigned)*tmp)-'A' < 26 || ((unsigned)*tmp)-'0' < 8) && x_tthval-x_tthbuf < 39) *(x_tthval++) = *(tmp++); *x_tthval = 0; if(*tmp) { fprintf(stderr, "\n Invalid TTH at %"PRIu32":%"PRIu64"\n", x->line, x->byte); x_infile = x_intth = false; return 0; } } if(x_insize) { tmp = x->data; while(((unsigned)*tmp)-'0' < 10 && x_size < UINT64_MAX/4) x_size = x_size*10 + (*(tmp++)-'0'); if(*tmp) { fprintf(stderr, "\n Invalid Size at %"PRIu32":%"PRIu64"\n", x->line, x->byte); x_infile = x_insize = false; return 0; } } break; default: if(r < 0) { fprintf(stderr, "XML parse error at %"PRIu32":%"PRIu64"\n", x->line, x->byte); return 1; } } return 0; } static void parsefile(const char *path, const char *fn) { fprintf(stderr, "Parsing %s... ", fn); fflush(stderr); static char fnbuf[4096]; snprintf(fnbuf, sizeof(fnbuf), "%s/%s", path, fn); struct stat st; if(stat(fnbuf, &st) < 0) { fprintf(stderr, "stat(): %s\n", strerror(errno)); return; } FILE *f = fopen(fnbuf, "r"); if(!f) { fprintf(stderr, "fopen(): %s\n", strerror(errno)); return; } int err; BZFILE *bf = BZ2_bzReadOpen(&err, f, 0, 0, NULL, 0); if(!bf) { fprintf(stderr, "bzReadOpen(): %d (%s)\n", err, strerror(errno)); fclose(f); return; } static char xmlbuf[8196]; yxml_t x[1]; yxml_init(x, xmlbuf, sizeof(xmlbuf)); cur_files = cur_unique = 0; kh_clear(h, cur_hash); int i=0,r; uint64_t rd = 0; static char buf[4096]; while(i >= 0 && (r = BZ2_bzRead(&err, bf, buf, sizeof(buf))) > 0) { rd += r; for(i=0; i\n"); return 1; } fprintf(stderr, "Creating output files\n"); #define OF(f) f = fopen(#f, "w");\ if(!f) {\ fprintf(stderr, "Error opening "#f": %s\n", strerror(errno));\ return 1;\ } OF(dcfiledist); OF(dcfilesize); OF(dclistsize); OF(dcnumfiles); fprintf(stderr, "Reading directory...\n"); struct dirent **lst; int n = scandir(argv[1], &lst, NULL, alphasort); if(n < 0) { fprintf(stderr, "Error reading %s: %s\n", argv[1], strerror(errno)); return 1; } assert(n-2 < NUMLISTS); cur_hash = kh_init(h); total_hash = kh_init(h); int i; for(i=0; id_name == '.') continue; parsefile(argv[1], lst[i]->d_name); } aggregate(); fclose(dcfiledist); fclose(dcfilesize); fclose(dclistsize); fclose(dcnumfiles); printf("\n#lists: %"PRIu16"\n", num_lists); printf("Total list size (compressed/uncompressed): %"PRIu64"/%"PRIu64"\n", total_comp, total_uncomp); printf("Total number of files (unique/total): %"PRIu32"/%"PRIu32"\n", total_unique, total_files); return 0; }