diff options
author | Yorhel <git@yorhel.nl> | 2014-01-09 20:12:17 +0100 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2014-01-09 20:12:17 +0100 |
commit | 8cfbc07271b6f170acf6c6dbe44836644fbf5e61 (patch) | |
tree | f893ef62451e2f76d0bc3c13985611776d723046 /dcfilestats.c |
Diffstat (limited to 'dcfilestats.c')
-rw-r--r-- | dcfilestats.c | 347 |
1 files changed, 347 insertions, 0 deletions
diff --git a/dcfilestats.c b/dcfilestats.c new file mode 100644 index 0000000..0127b98 --- /dev/null +++ b/dcfilestats.c @@ -0,0 +1,347 @@ +/* To compile, ensure that khash.h from https://github.com/attractivechaos/klib + * and yxml.c and yxml.h from http://dev.yorhel.nl/yxml are available in the + * same directory at this file, and then run: + * + * gcc -Wall -Wextra -O2 -I. yxml.c dcfilestats.c -lbz2 -o dcfilestats + * + * And then to analyze a directory containing file lists: + * + * ./dcfilestats /path/to/dir + * + * If your directory contains more than NUMLISTS (see the #define below) files, + * make sure to modify that define and recompile. + * + * Progress will be written to standard error, some final stats to standard + * out, and more detailed stats to the following files: + * + * dcfiledist + * dcfilesize + * dclistsize + * dcnumfiles + * + * More information and stats are available at http://dev.yorhel.nl/yxml + * + * Copyright (c) 2014 by Yoran Heling + * License: MIT + */ + +#include <stdint.h> +#include <inttypes.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <stdio.h> +#include <assert.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <dirent.h> + +#include <bzlib.h> +#include <yxml.h> +#include <khash.h> + +#define NUMLISTS 14200 /* Upper bound on the number of file lists we're processing */ + + +typedef struct { + char tth[24]; + uint16_t num; + uint8_t bin; +} file_t; + + +static inline khint_t file_hash_func_(void *tth) { return *((khint_t*)tth); } +#define file_hash_func(k) file_hash_func_((k).tth) +#define file_equal_func(a, b) (memcmp((a).tth, (b).tth, 24) == 0) +KHASH_INIT(h, file_t, int, 0, file_hash_func, file_equal_func); + + +static uint64_t total_comp, + total_uncomp; +static uint32_t total_files, + cur_files, + total_unique, + cur_unique; + +static uint16_t num_lists; + +static bool x_infile, x_intth, x_insize; +static uint64_t x_size; +static char *x_tthval, + x_tthbuf[40]; + +static khash_t(h) *cur_hash, + *total_hash; + +static FILE *dcfiledist, + *dcfilesize, + *dclistsize, + *dcnumfiles; + +/* Stolen from Globster; http://g.blicky.net/globster.git/tree/src/util/base32.c */ +static void base32_decode(const char *from, char *to, int len) { + int i = 0, bits = 0, idx = 0, value = 0; + while(idx < len) { + value = (value << 5) | (from[i] <= '9' ? (26+(from[i]-'2')) : from[i]-'A'); + i++; + bits += 5; + while(bits >= 8) { + to[idx++] = (value >> (bits-8)) & 0xFF; + bits -= 8; + } + } +} + + +static uint8_t size2bin(uint64_t size) { + /* ..1k -> 0, ..2k -> 1, ..4k -> 2, ..8k -> 3, ..16k -> 4 */ + uint64_t x = 1024; + uint8_t n = 0; + while(size > x) { + x <<= 1; + n++; + } + return n; +} + + +static void handlefile(const char *tth, uint64_t size) { + cur_files++; + total_files++; + + file_t f; + base32_decode(tth, f.tth, 24); + + int r; + khint_t i = kh_put(h, cur_hash, f, &r); + if(r == 0) + return; + cur_unique++; + memcpy(kh_key(cur_hash, i).tth, f.tth, 24); + + i = kh_put(h, total_hash, f, &r); + file_t *p = &kh_key(total_hash, i); + if(r != 0) { + p->num = 0; + p->bin = size2bin(size); + memcpy(p->tth, f.tth, 24); + total_unique++; + } + p->num++; +} + + +static int parsechar(yxml_t *x, char c) { + char *tmp; + yxml_ret_t r = yxml_parse(x, c); + switch(r) { + case YXML_ELEMSTART: + x_infile = x->elem[0] == 'F' && x->elem[1] == 'i' && x->elem[2] == 'l' && x->elem[3] == 'e' && !x->elem[4]; + x_tthval = x_tthbuf; + x_size = UINT64_MAX; + break; + + case YXML_ELEMEND: + if(!x_infile) + break; + if(x_tthval-x_tthbuf != 39 || x_size == UINT64_MAX) + fprintf(stderr, "\n Missing or invalid TTH or Size at %"PRIu32":%"PRIu64"\n", x->line, x->byte); + else + handlefile(x_tthbuf, x_size); + x_infile = false; + break; + + case YXML_ATTRSTART: + x_intth = x->attr[0] == 'T' && x->attr[1] == 'T' && x->attr[2] == 'H' && !x->attr[3]; + x_insize = x->attr[0] == 'S' && x->attr[1] == 'i' && x->attr[2] == 'z' && x->attr[3] == 'e' && !x->attr[4]; + if(x_insize) + x_size = 0; + break; + + case YXML_ATTREND: + x_intth = x_insize = false; + break; + + case YXML_ATTRVAL: + if(x_intth) { + tmp = x->data; + while((((unsigned)*tmp)-'A' < 26 || ((unsigned)*tmp)-'0' < 8) && x_tthval-x_tthbuf < 39) + *(x_tthval++) = *(tmp++); + *x_tthval = 0; + if(*tmp) { + fprintf(stderr, "\n Invalid TTH at %"PRIu32":%"PRIu64"\n", x->line, x->byte); + x_infile = x_intth = false; + return 0; + } + } + if(x_insize) { + tmp = x->data; + while(((unsigned)*tmp)-'0' < 10 && x_size < UINT64_MAX/4) + x_size = x_size*10 + (*(tmp++)-'0'); + if(*tmp) { + fprintf(stderr, "\n Invalid Size at %"PRIu32":%"PRIu64"\n", x->line, x->byte); + x_infile = x_insize = false; + return 0; + } + } + break; + default: + if(r < 0) { + fprintf(stderr, "XML parse error at %"PRIu32":%"PRIu64"\n", x->line, x->byte); + return 1; + } + } + return 0; +} + + +static void parsefile(const char *path, const char *fn) { + fprintf(stderr, "Parsing %s... ", fn); + fflush(stderr); + + static char fnbuf[4096]; + snprintf(fnbuf, sizeof(fnbuf), "%s/%s", path, fn); + + struct stat st; + if(stat(fnbuf, &st) < 0) { + fprintf(stderr, "stat(): %s\n", strerror(errno)); + return; + } + + FILE *f = fopen(fnbuf, "r"); + if(!f) { + fprintf(stderr, "fopen(): %s\n", strerror(errno)); + return; + } + + int err; + BZFILE *bf = BZ2_bzReadOpen(&err, f, 0, 0, NULL, 0); + if(!bf) { + fprintf(stderr, "bzReadOpen(): %d (%s)\n", err, strerror(errno)); + fclose(f); + return; + } + + static char xmlbuf[8196]; + yxml_t x[1]; + yxml_init(x, xmlbuf, sizeof(xmlbuf)); + cur_files = cur_unique = 0; + kh_clear(h, cur_hash); + + int i=0,r; + uint64_t rd = 0; + static char buf[4096]; + while(i >= 0 && (r = BZ2_bzRead(&err, bf, buf, sizeof(buf))) > 0) { + rd += r; + + for(i=0; i<r; i++) + if(parsechar(x, buf[i])) { + i = -1; + break; + } + + if(err != BZ_OK) + break; + } + + + if(i == -1) + ; /* parsechar() failed */ + else if(err != BZ_STREAM_END) + fprintf(stderr, "bzRead(): %d (%s)\n", err, strerror(errno)); + else { + fprintf(dclistsize, "%"PRIu64" %"PRIu64"\n", (uint64_t)st.st_size, rd); + fprintf(dcnumfiles, "%"PRIu32" %"PRIu32"\n", cur_unique, cur_files); + total_comp += st.st_size; + total_uncomp += rd; + num_lists++; + fprintf(stderr, "ok %9"PRIu64" bytes, %7"PRIu32"/%8"PRIu32" unique files\n", rd, cur_unique, total_unique); + } + + BZ2_bzReadClose(&err, bf); + fclose(f); +} + + +static void aggregate() { + fprintf(stderr, "Aggregating file stats..."); + fflush(stderr); + static uint32_t dist[NUMLISTS] = {}; + static uint32_t sizes[64] = {}; /* More than enough, 1024<<64 is not a valid file size */ + + khint_t i; + for(i=kh_begin(total_hash); i!=kh_end(total_hash); i++) { + if(!kh_exist(total_hash, i)) + continue; + uint16_t n = kh_key(total_hash, i).num; + assert(n < NUMLISTS); + dist[n]++; + + uint8_t bin = kh_key(total_hash, i).bin; + assert(bin < sizeof(sizes)/sizeof(*sizes)); + sizes[bin]++; + } + + uint16_t j; + for(j=1; j<NUMLISTS; j++) + if(dist[j]) + fprintf(dcfiledist, "%"PRIu16" %"PRIu32"\n", j, dist[j]); + + uint8_t k; + for(k=0; k<sizeof(sizes)/sizeof(*sizes); k++) + if(sizes[k]) + fprintf(dcfilesize, "%"PRIu8" %"PRIu32"\n", k, sizes[k]); + + fprintf(stderr, " done!\n"); +} + + +int main(int argc, char **argv) { + if(argc <= 1) { + fprintf(stderr, "Usage: ./stats <path>\n"); + return 1; + } + + fprintf(stderr, "Creating output files\n"); +#define OF(f) f = fopen(#f, "w");\ + if(!f) {\ + fprintf(stderr, "Error opening "#f": %s\n", strerror(errno));\ + return 1;\ + } + OF(dcfiledist); + OF(dcfilesize); + OF(dclistsize); + OF(dcnumfiles); + + fprintf(stderr, "Reading directory...\n"); + struct dirent **lst; + int n = scandir(argv[1], &lst, NULL, alphasort); + if(n < 0) { + fprintf(stderr, "Error reading %s: %s\n", argv[1], strerror(errno)); + return 1; + } + assert(n-2 < NUMLISTS); + + cur_hash = kh_init(h); + total_hash = kh_init(h); + + int i; + for(i=0; i<n; i++) { + if(*lst[i]->d_name == '.') + continue; + parsefile(argv[1], lst[i]->d_name); + } + + aggregate(); + fclose(dcfiledist); + fclose(dcfilesize); + fclose(dclistsize); + fclose(dcnumfiles); + + printf("\n#lists: %"PRIu16"\n", num_lists); + printf("Total list size (compressed/uncompressed): %"PRIu64"/%"PRIu64"\n", total_comp, total_uncomp); + printf("Total number of files (unique/total): %"PRIu32"/%"PRIu32"\n", total_unique, total_files); + return 0; +} |