summary refs log tree commit diff
path: root/dcfilestats.c
diff options
context:
space:
mode:
author Yorhel <git@yorhel.nl> 2014-01-09 20:12:17 +0100
committer Yorhel <git@yorhel.nl> 2014-01-09 20:12:17 +0100
commit 8cfbc07271b6f170acf6c6dbe44836644fbf5e61 (patch)
tree f893ef62451e2f76d0bc3c13985611776d723046 /dcfilestats.c
Initial commit HEAD master
Diffstat (limited to 'dcfilestats.c')
-rw-r--r-- dcfilestats.c 347
1 files changed, 347 insertions, 0 deletions
diff --git a/dcfilestats.c b/dcfilestats.c
new file mode 100644
index 0000000..0127b98
--- /dev/null
+++ b/dcfilestats.c
@@ -0,0 +1,347 @@
+/* To compile, ensure that khash.h from https://github.com/attractivechaos/klib
+ * and yxml.c and yxml.h from http://dev.yorhel.nl/yxml are available in the
+ * same directory as this file, and then run:
+ *
+ * gcc -Wall -Wextra -O2 -I. yxml.c dcfilestats.c -lbz2 -o dcfilestats
+ *
+ * And then to analyze a directory containing file lists:
+ *
+ * ./dcfilestats /path/to/dir
+ *
+ * If your directory contains more than NUMLISTS (see the #define below) files,
+ * make sure to modify that define and recompile.
+ *
+ * Progress will be written to standard error, some final stats to standard
+ * out, and more detailed stats to the following files:
+ *
+ * dcfiledist
+ * dcfilesize
+ * dclistsize
+ * dcnumfiles
+ *
+ * More information and stats are available at http://dev.yorhel.nl/yxml
+ *
+ * Copyright (c) 2014 by Yoran Heling
+ * License: MIT
+ */
+
+#include <stdint.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <assert.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+
+#include <bzlib.h>
+#include <yxml.h>
+#include <khash.h>
+
+/* NUMLISTS sizes the per-file occurrence histogram in aggregate() and is
+ * checked against the number of directory entries in main(). The per-file
+ * list counter (file_t.num) and num_lists are both uint16_t, so this value
+ * has to stay at or below 65535. */
+#define NUMLISTS 14200 /* Upper bound on the number of file lists we're processing */
+
+
+/* Key type of both hash sets (cur_hash and total_hash). Entries are hashed
+ * and compared on the tth field only; num and bin are bookkeeping that
+ * handlefile() maintains for the entries stored in total_hash. */
+typedef struct {
+ char tth[24]; /* binary TTH, as produced by base32_decode() in handlefile() */
+ uint16_t num; /* number of file lists in which this TTH has been seen */
+ uint8_t bin; /* size2bin() of the size given when the TTH was first seen */
+} file_t;
+
+
+/* Hash function for file_t keys: uses the leading sizeof(khint_t) bytes of
+ * the binary TTH as the hash value.
+ *
+ * The bytes are copied out with memcpy() instead of being read through a
+ * casted pointer: file_t.tth is a char array, so it is neither guaranteed to
+ * be suitably aligned for khint_t nor may it be accessed as a khint_t object
+ * under the C aliasing rules. */
+static inline khint_t file_hash_func_(const void *tth) {
+  khint_t hash;
+  memcpy(&hash, tth, sizeof(hash));
+  return hash;
+}
+/* khash glue: keys are hashed and compared on their 24-byte tth field only. */
+#define file_hash_func(k) file_hash_func_((k).tth)
+#define file_equal_func(a, b) (memcmp((a).tth, (b).tth, 24) == 0)
+/* Instantiate hash type "h" with file_t keys. The fourth argument (is_map) is
+ * 0, so this is a set and the int value type is never stored. */
+KHASH_INIT(h, file_t, int, 0, file_hash_func, file_equal_func);
+
+
+/* Byte totals over all successfully parsed lists, accumulated in parsefile():
+ * total_comp sums the on-disk (st_size) sizes, total_uncomp the number of
+ * decompressed bytes read. */
+static uint64_t total_comp,
+ total_uncomp;
+/* File counters maintained by handlefile(). The cur_* counters are reset by
+ * parsefile() for every list, the total_* counters run over all lists.
+ * *_files counts every accepted <File> entry, *_unique counts entries that
+ * were newly inserted into the corresponding hash set. */
+static uint32_t total_files,
+ cur_files,
+ total_unique,
+ cur_unique;
+
+/* Number of lists that parsefile() processed without error. */
+static uint16_t num_lists;
+
+/* XML parser state used by parsechar():
+ *   x_infile   the element that was opened last is named "File"
+ *   x_intth    currently inside a "TTH" attribute
+ *   x_insize   currently inside a "Size" attribute
+ *   x_size     Size value accumulated so far; UINT64_MAX means no Size
+ *              attribute has been seen for the current element
+ *   x_tthval   write position inside x_tthbuf
+ *   x_tthbuf   TTH text collected so far (at most 39 characters plus NUL) */
+static bool x_infile, x_intth, x_insize;
+static uint64_t x_size;
+static char *x_tthval,
+ x_tthbuf[40];
+
+/* cur_hash holds the TTHs of the list being parsed and is cleared by
+ * parsefile() for every list; total_hash holds the TTHs of all lists. */
+static khash_t(h) *cur_hash,
+ *total_hash;
+
+/* Output files, opened in main() under the same names as the variables.
+ * dcfiledist and dcfilesize are written by aggregate(), dclistsize and
+ * dcnumfiles by parsefile(). */
+static FILE *dcfiledist,
+ *dcfilesize,
+ *dclistsize,
+ *dcnumfiles;
+
+/* Decode unpadded base32 text (alphabet A-Z, 2-7) into raw bytes.
+ *
+ *   from  base32 characters; enough of them must be present to produce len
+ *         bytes (39 characters for a 24-byte TTH). The characters are not
+ *         validated here, that is the caller's job.
+ *   to    output buffer with room for at least len bytes.
+ *   len   number of bytes to produce.
+ *
+ * Adapted from Globster; http://g.blicky.net/globster.git/tree/src/util/base32.c */
+static void base32_decode(const char *from, char *to, int len) {
+  /* The accumulator is unsigned and is trimmed to the not-yet-consumed bits
+   * after every output byte. Shifting an ever-growing signed int left, as a
+   * plain "value << 5" on an int would do, overflows after a handful of
+   * characters, which is undefined behaviour. */
+  unsigned int value = 0;
+  int i = 0, bits = 0, idx = 0;
+  while(idx < len) {
+    unsigned int digit = from[i] <= '9' ? 26u + (unsigned int)(from[i]-'2') : (unsigned int)(from[i]-'A');
+    /* Masking keeps a stray invalid character from clobbering earlier bits. */
+    value = (value << 5) | (digit & 0x1Fu);
+    i++;
+    bits += 5;
+    /* At most 7 bits are left over from the previous round, so there is never
+     * more than one complete byte available here. */
+    if(bits >= 8) {
+      bits -= 8;
+      to[idx++] = (char)((value >> bits) & 0xFFu);
+      value &= (1u << bits) - 1u;
+    }
+  }
+}
+
+
+/* Map a file size in bytes to a logarithmic bin number.
+ *
+ * Bin 0 holds sizes up to and including 1 KiB, bin 1 sizes up to 2 KiB, bin 2
+ * sizes up to 4 KiB, and so on; bin 53 ends at 2^63 bytes. Everything larger
+ * than that is returned as bin 54. */
+static uint8_t size2bin(uint64_t size) {
+  uint64_t limit = 1024;
+  uint8_t bin = 0;
+  while(size > limit) {
+    bin++;
+    /* Doubling 2^63 would wrap the limit around to 0, after which the loop
+     * condition stays true forever. Stop at the last representable limit. */
+    if(limit > UINT64_MAX/2)
+      break;
+    limit <<= 1;
+  }
+  return bin;
+}
+
+
+/* Record one file entry of the list that is currently being parsed.
+ *
+ *   tth   base32 TTH text; 39 characters are decoded into a 24-byte key.
+ *   size  file size in bytes, used to pick the size bin of a new entry.
+ *
+ * Every call counts towards cur_files and total_files. A TTH that is already
+ * present in cur_hash is otherwise ignored, so each TTH contributes at most
+ * once per list to cur_unique and to the per-file list counter (num) kept in
+ * total_hash. The size bin is taken from the first occurrence of a TTH.
+ *
+ * Terminates the program when a hash table cannot grow. */
+static void handlefile(const char *tth, uint64_t size) {
+  cur_files++;
+  total_files++;
+
+  /* Clear the whole key first: it is copied by value into the hash tables and
+   * num/bin should not carry indeterminate values along. */
+  file_t f;
+  memset(&f, 0, sizeof(f));
+  base32_decode(tth, f.tth, 24);
+
+  /* kh_put() stores a copy of the key when it creates a new entry, so there
+   * is no need to copy the TTH into the table afterwards. */
+  int r;
+  kh_put(h, cur_hash, f, &r);
+  if(r < 0) {
+    fprintf(stderr, "\n Out of memory\n");
+    exit(1);
+  }
+  if(r == 0) /* already seen in this list */
+    return;
+  cur_unique++;
+
+  khint_t i = kh_put(h, total_hash, f, &r);
+  if(r < 0) {
+    fprintf(stderr, "\n Out of memory\n");
+    exit(1);
+  }
+  file_t *p = &kh_key(total_hash, i);
+  if(r != 0) { /* first time this TTH is seen in any list */
+    p->num = 0;
+    p->bin = size2bin(size);
+    total_unique++;
+  }
+  p->num++;
+}
+
+
+/* Feed one byte of file list XML to the yxml parser and act on the token it
+ * returns.
+ *
+ * The TTH and Size attribute values of an element named "File" are collected
+ * in x_tthbuf and x_size; when that element ends, handlefile() is called if a
+ * complete 39-character TTH and a Size were seen. Invalid attribute values are
+ * reported on stderr and cause the current element to be skipped.
+ *
+ *   x  yxml parser state, initialized by the caller.
+ *   c  next input byte.
+ *
+ * Returns 1 when yxml reports an error (the caller should stop parsing this
+ * list), 0 otherwise. */
+static int parsechar(yxml_t *x, char c) {
+  char *tmp;
+  yxml_ret_t r = yxml_parse(x, c);
+  switch(r) {
+  case YXML_ELEMSTART:
+    x_infile = strcmp(x->elem, "File") == 0;
+    x_tthval = x_tthbuf;
+    x_size = UINT64_MAX; /* marks "no Size attribute seen" */
+    break;
+
+  case YXML_ELEMEND:
+    if(!x_infile)
+      break;
+    if(x_tthval-x_tthbuf != 39 || x_size == UINT64_MAX)
+      fprintf(stderr, "\n Missing or invalid TTH or Size at %"PRIu32":%"PRIu64"\n", x->line, x->byte);
+    else
+      handlefile(x_tthbuf, x_size);
+    x_infile = false;
+    break;
+
+  case YXML_ATTRSTART:
+    x_intth = strcmp(x->attr, "TTH") == 0;
+    x_insize = strcmp(x->attr, "Size") == 0;
+    if(x_insize)
+      x_size = 0;
+    break;
+
+  case YXML_ATTREND:
+    x_intth = x_insize = false;
+    break;
+
+  case YXML_ATTRVAL:
+    if(x_intth) {
+      /* Only the base32 alphabet (A-Z and 2-7) is accepted; '0' and '1' are
+       * not part of it and would silently decode to wrong bytes. */
+      tmp = x->data;
+      while(((*tmp >= 'A' && *tmp <= 'Z') || (*tmp >= '2' && *tmp <= '7')) && x_tthval-x_tthbuf < 39)
+        *(x_tthval++) = *(tmp++);
+      *x_tthval = 0;
+      if(*tmp) { /* invalid character, or more than 39 characters */
+        fprintf(stderr, "\n Invalid TTH at %"PRIu32":%"PRIu64"\n", x->line, x->byte);
+        x_infile = x_intth = false;
+        return 0;
+      }
+    }
+    if(x_insize) {
+      tmp = x->data;
+      while(*tmp >= '0' && *tmp <= '9') {
+        uint64_t digit = (uint64_t)(*tmp - '0');
+        /* Refuse anything above INT64_MAX. This keeps x_size*10 from
+         * wrapping around and keeps the value away from the UINT64_MAX
+         * "missing" marker. */
+        if(x_size > ((uint64_t)INT64_MAX - digit) / 10)
+          break;
+        x_size = x_size*10 + digit;
+        tmp++;
+      }
+      if(*tmp) { /* non-digit character, or value out of range */
+        fprintf(stderr, "\n Invalid Size at %"PRIu32":%"PRIu64"\n", x->line, x->byte);
+        x_infile = x_insize = false;
+        return 0;
+      }
+    }
+    break;
+
+  default:
+    if(r < 0) {
+      fprintf(stderr, "XML parse error at %"PRIu32":%"PRIu64"\n", x->line, x->byte);
+      return 1;
+    }
+  }
+  return 0;
+}
+
+
+/* Decompress and parse a single bzip2-compressed XML file list.
+ *
+ *   path  directory that contains the list.
+ *   fn    file name of the list inside that directory.
+ *
+ * Progress and errors are written to stderr. The per-list counters and
+ * cur_hash are reset before parsing. When the whole bzip2 stream was read and
+ * parsed without error, one line is appended to dclistsize (compressed and
+ * uncompressed size) and one to dcnumfiles (unique and total files in this
+ * list), and the global totals and num_lists are updated. */
+static void parsefile(const char *path, const char *fn) {
+  fprintf(stderr, "Parsing %s... ", fn);
+  fflush(stderr);
+
+  static char fnbuf[4096];
+  int len = snprintf(fnbuf, sizeof(fnbuf), "%s/%s", path, fn);
+  if(len < 0 || (size_t)len >= sizeof(fnbuf)) {
+    /* A truncated name would make us open the wrong file, or none at all. */
+    fprintf(stderr, "path too long\n");
+    return;
+  }
+
+  struct stat st;
+  if(stat(fnbuf, &st) < 0) {
+    fprintf(stderr, "stat(): %s\n", strerror(errno));
+    return;
+  }
+
+  /* The lists are compressed data, so open them in binary mode. */
+  FILE *f = fopen(fnbuf, "rb");
+  if(!f) {
+    fprintf(stderr, "fopen(): %s\n", strerror(errno));
+    return;
+  }
+
+  int err;
+  BZFILE *bf = BZ2_bzReadOpen(&err, f, 0, 0, NULL, 0);
+  if(!bf) {
+    fprintf(stderr, "bzReadOpen(): %d (%s)\n", err, strerror(errno));
+    fclose(f);
+    return;
+  }
+
+  static char xmlbuf[8196];
+  yxml_t x[1];
+  yxml_init(x, xmlbuf, sizeof(xmlbuf));
+  cur_files = cur_unique = 0;
+  kh_clear(h, cur_hash);
+
+  bool xmlfail = false; /* set when parsechar() reports an XML error */
+  uint64_t rd = 0;      /* number of decompressed bytes read so far */
+  int i, r;
+  static char buf[4096];
+  while(!xmlfail && (r = BZ2_bzRead(&err, bf, buf, sizeof(buf))) > 0) {
+    rd += r;
+
+    for(i=0; i<r; i++)
+      if(parsechar(x, buf[i])) {
+        xmlfail = true;
+        break;
+      }
+
+    /* The last chunk comes with BZ_STREAM_END, a failed read with an error
+     * code; either way there is nothing more to read. */
+    if(err != BZ_OK)
+      break;
+  }
+
+  if(xmlfail)
+    ; /* parsechar() has already reported the error */
+  else if(err != BZ_STREAM_END)
+    fprintf(stderr, "bzRead(): %d (%s)\n", err, strerror(errno));
+  else {
+    fprintf(dclistsize, "%"PRIu64" %"PRIu64"\n", (uint64_t)st.st_size, rd);
+    fprintf(dcnumfiles, "%"PRIu32" %"PRIu32"\n", cur_unique, cur_files);
+    total_comp += st.st_size;
+    total_uncomp += rd;
+    num_lists++;
+    fprintf(stderr, "ok %9"PRIu64" bytes, %7"PRIu32"/%8"PRIu32" unique files\n", rd, cur_unique, total_unique);
+  }
+
+  BZ2_bzReadClose(&err, bf);
+  fclose(f);
+}
+
+
+/* Build two histograms from total_hash and write them out.
+ *
+ * dcfiledist gets one "<j> <count>" line for every j for which at least one
+ * file was found in exactly j lists; dcfilesize gets one "<bin> <count>" line
+ * for every non-empty size bin (see size2bin()). Progress goes to stderr. */
+static void aggregate(void) {
+  fprintf(stderr, "Aggregating file stats...");
+  fflush(stderr);
+  /* Objects with static storage duration start out zeroed, so no explicit
+   * initializer is needed. */
+  static uint32_t dist[NUMLISTS];
+  static uint32_t sizes[64]; /* More than enough, 1024<<64 is not a valid file size */
+
+  khint_t i;
+  for(i=kh_begin(total_hash); i!=kh_end(total_hash); i++) {
+    if(!kh_exist(total_hash, i))
+      continue;
+    uint16_t n = kh_key(total_hash, i).num;
+    assert(n < NUMLISTS);
+    dist[n]++;
+
+    uint8_t bin = kh_key(total_hash, i).bin;
+    assert(bin < sizeof(sizes)/sizeof(*sizes));
+    sizes[bin]++;
+  }
+
+  /* Use an index type wider than 16 bits, so that this loop still ends when
+   * NUMLISTS is raised beyond 65535. Index 0 is skipped: every stored entry
+   * has been counted at least once. */
+  uint32_t j;
+  for(j=1; j<NUMLISTS; j++)
+    if(dist[j])
+      fprintf(dcfiledist, "%"PRIu32" %"PRIu32"\n", j, dist[j]);
+
+  unsigned int k;
+  for(k=0; k<sizeof(sizes)/sizeof(*sizes); k++)
+    if(sizes[k])
+      fprintf(dcfilesize, "%u %"PRIu32"\n", k, sizes[k]);
+
+  fprintf(stderr, " done!\n");
+}
+
+
+/* Entry point: parse every file list in the directory given as argv[1] and
+ * write the statistics.
+ *
+ * Creates the output files dcfiledist, dcfilesize, dclistsize and dcnumfiles
+ * in the current directory, runs parsefile() on every directory entry whose
+ * name does not start with a dot (in alphasort order), runs aggregate() and
+ * prints the totals to standard output.
+ *
+ * Returns 0 on success and 1 on a usage error, when a file or the directory
+ * cannot be opened, when the directory holds too many entries for NUMLISTS,
+ * on memory exhaustion, or when an output file could not be written. */
+int main(int argc, char **argv) {
+  if(argc <= 1) {
+    fprintf(stderr, "Usage: %s <path>\n", argc > 0 ? argv[0] : "dcfilestats");
+    return 1;
+  }
+
+  fprintf(stderr, "Creating output files\n");
+  /* Open the output file that is named after the variable holding it. */
+#define OF(f) do {\
+    f = fopen(#f, "w");\
+    if(!f) {\
+      fprintf(stderr, "Error opening "#f": %s\n", strerror(errno));\
+      return 1;\
+    }\
+  } while(0)
+  OF(dcfiledist);
+  OF(dcfilesize);
+  OF(dclistsize);
+  OF(dcnumfiles);
+#undef OF
+
+  fprintf(stderr, "Reading directory...\n");
+  struct dirent **lst;
+  int n = scandir(argv[1], &lst, NULL, alphasort);
+  if(n < 0) {
+    fprintf(stderr, "Error reading %s: %s\n", argv[1], strerror(errno));
+    return 1;
+  }
+  /* Two entries are allowed for "." and "..". This is a real check rather
+   * than an assert() so that it also holds in builds with NDEBUG defined. */
+  if(n-2 >= NUMLISTS) {
+    fprintf(stderr, "Too many entries in %s (%d), increase NUMLISTS and recompile\n", argv[1], n);
+    return 1;
+  }
+
+  cur_hash = kh_init(h);
+  total_hash = kh_init(h);
+  if(!cur_hash || !total_hash) {
+    fprintf(stderr, "Out of memory\n");
+    return 1;
+  }
+
+  int i;
+  for(i=0; i<n; i++) {
+    if(*lst[i]->d_name != '.') /* skips ".", ".." and hidden files */
+      parsefile(argv[1], lst[i]->d_name);
+    free(lst[i]);
+  }
+  free(lst);
+
+  aggregate();
+
+  /* fclose() flushes buffered output, so this is where a write error such as
+   * a full disk shows up. */
+  int rc = 0;
+#define CF(f) do {\
+    if(fclose(f) != 0) {\
+      fprintf(stderr, "Error writing "#f": %s\n", strerror(errno));\
+      rc = 1;\
+    }\
+  } while(0)
+  CF(dcfiledist);
+  CF(dcfilesize);
+  CF(dclistsize);
+  CF(dcnumfiles);
+#undef CF
+
+  printf("\n#lists: %"PRIu16"\n", num_lists);
+  printf("Total list size (compressed/uncompressed): %"PRIu64"/%"PRIu64"\n", total_comp, total_uncomp);
+  printf("Total number of files (unique/total): %"PRIu32"/%"PRIu32"\n", total_unique, total_files);
+
+  kh_destroy(h, cur_hash);
+  kh_destroy(h, total_hash);
+  return rc;
+}