summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYorhel <git@yorhel.nl>2019-05-14 18:37:14 +0200
committerYorhel <git@yorhel.nl>2019-05-14 18:38:12 +0200
commit04719fb0dd62858f97996f791a2318e062b90544 (patch)
tree47f820a61ab0ee7757740e8cd6367f31e1e310ed
Initial commit
-rw-r--r--.gitignore1
-rw-r--r--LICENSE20
-rw-r--r--Makefile41
-rw-r--r--README.md9
-rwxr-xr-xbtree.pl124
-rwxr-xr-xlmdb.pl40
-rwxr-xr-xplain.pl39
7 files changed, 274 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..19e0180
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+db/
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..01615da
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2019 Yoran Heling
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..7137863
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,41 @@
+DICT=crackstation-human-only.txt.gz
+
+all: bench
+
+db/plain: ${DICT}
+ mkdir -p db
+ zcat ${DICT} | LC_COLLATE=C sort >$@
+
+db/gzip: db/plain
+ cat db/plain | gzip >$@
+
+db/lmdb: db/plain
+ ./lmdb.pl create db/lmdb <db/plain
+
+db/btree-1k-plain: db/plain
+ ./btree.pl create 1024 plain <db/plain >db/btree-1k-plain
+
+db/btree-4k-plain: db/plain
+ ./btree.pl create 4096 plain <db/plain >db/btree-4k-plain
+
+db/btree-1k-gzip: db/plain
+ ./btree.pl create 1024 gzip <db/plain >db/btree-1k-gzip
+
+db/btree-4k-gzip: db/plain
+ ./btree.pl create 4096 gzip <db/plain >db/btree-4k-gzip
+
+db/btree-1k-zstd: db/plain
+ ./btree.pl create 1024 zstd <db/plain >db/btree-1k-zstd
+
+db/btree-4k-zstd: db/plain
+ ./btree.pl create 4096 zstd <db/plain >db/btree-4k-zstd
+
+bench: db/plain db/gzip db/lmdb db/btree-1k-plain db/btree-4k-plain db/btree-1k-gzip db/btree-4k-gzip db/btree-1k-zstd db/btree-4k-zstd
+ time -v ./btree.pl bench 1024 plain db/btree-1k-plain
+ time -v ./btree.pl bench 4096 plain db/btree-4k-plain
+ time -v ./btree.pl bench 1024 gzip db/btree-1k-gzip
+ time -v ./btree.pl bench 4096 gzip db/btree-4k-gzip
+ time -v ./btree.pl bench 1024 zstd db/btree-1k-zstd
+ time -v ./btree.pl bench 4096 zstd db/btree-4k-zstd
+ time -v ./plain.pl db/plain db/gzip
+ time -v ./lmdb.pl bench db/lmdb
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9b904c0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,9 @@
+These are the scripts behind my little article, [Fast Key Lookup with a Small
+Read-Only Database](https://dev.yorhel.nl/doc/pwlookup).
+
+Requirements:
+
+- Perl (64bit!)
+- Compress::Zstd
+- LMDB_File
+- make
diff --git a/btree.pl b/btree.pl
new file mode 100755
index 0000000..04bb168
--- /dev/null
+++ b/btree.pl
@@ -0,0 +1,124 @@
+#!/usr/bin/perl
+
+# Usage:
+#
+# ./btree.pl create blocksize compression <sorted-dictionary >dbfile
+# ./btree.pl lookup blocksize compression dbfile query
+# ./btree.pl bench blocksize compression dbfile
+
+use strict;
+use warnings;
+use v5.10;
+
+
+my($cmd, $blocksize, $compress, $dbfile, $query) = @ARGV;
+
+$blocksize ||= 4096; # Actual block sizes may differ a bit.
+$compress ||= 'plain';
+
+if($compress eq 'zstd') {
+ require Compress::Zstd;
+ no warnings 'once';
+ *compress = \*Compress::Zstd::compress;
+ *uncompress = \*Compress::Zstd::decompress;
+} elsif($compress eq 'gzip') {
+ require Compress::Zlib;
+ no warnings 'once';
+ *compress = \*Compress::Zlib::compress;
+ *uncompress = \*Compress::Zlib::uncompress;
+} else {
+ *compress = sub { shift };
+ *uncompress = sub { shift };
+}
+
+
+# Encode/decode a block reference, [ leaf, length, offset ]. Encoded in a single 64bit integer as (leaf | length << 1 | offset << 16)
+sub eref($) { pack 'Q', ($_[0][0]?1:0) | $_[0][1]<<1 | $_[0][2]<<16 }
+sub dref($) { my $v = unpack 'Q', $_[0]; [$v&1, ($v>>1)&((1<<15)-1), $v>>16] }
+
+# Write a block and return its reference.
+sub writeblock {
+ state $off = 0;
+ my $buf = compress($_[0]);
+ my $len = length $buf;
+ print $buf;
+ my $oldoff = $off;
+ $off += $len;
+ [$_[1], $len, $oldoff]
+}
+
+# Read a block given a file handle and a reference.
+sub readblock {
+ my($F, $ref) = @_;
+ die $! if !sysseek $F, $ref->[2], 0;
+ die $! if $ref->[1] != sysread $F, (my $buf), $ref->[1];
+ uncompress($buf)
+}
+
+sub encode {
+ my $leaf = "\0";
+ my @nodes = ('');
+ my $ref;
+
+ my $flush = sub {
+ my $minsize = $_[0];
+ return if $minsize > length $leaf;
+
+ my $str = $leaf =~ /^\x00([^\x00]*)/ && $1;
+ $ref = writeblock $leaf, 1;
+ $leaf = "\0";
+ $nodes[0] .= "$str\x00".eref($ref);
+
+ for(my $i=0; $i <= $#nodes && $minsize < length $nodes[$i]; $i++) {
+ my $str = $nodes[$i] =~ s/^([^\x00]*)\x00// && $1;
+ $ref = writeblock $nodes[$i], 0;
+ $nodes[$i] = '';
+ if($minsize) {
+ $nodes[$i+1] ||= '';
+ $nodes[$i+1] .= "$str\x00".eref($ref);
+ }
+ }
+ };
+
+ while(<STDIN>) {
+ chomp;
+ $leaf .= "$_\0";
+ $flush->($blocksize);
+ }
+ $flush->(0);
+ print eref $ref;
+}
+
+
+sub lookup_rec {
+ my($q, $F, $ref) = @_;
+ my $buf = readblock $F, $ref;
+ if($ref->[0]) {
+ return $buf =~ /\x00\Q$q\E\x00/;
+ } else {
+ while($buf =~ /(.{8})([^\x00]*)\x00/sg) {
+ return lookup_rec($q, $F, dref $1) if $q lt $2;
+ }
+ return lookup_rec($q, $F, dref substr $buf, -8)
+ }
+}
+
+sub lookup {
+ my($q, $f) = @_;
+ open my $F, '<', $f or die $!;
+ sysseek $F, -8, 2 or die $!;
+ die $! if 8 != sysread $F, (my $buf), 8;
+ lookup_rec($q, $F, dref $buf)
+}
+
+sub bench {
+ my $f = shift;
+ use Benchmark 'timethis', ':hireswallclock';
+ sub rstr { state $s=['a'..'z','A'..'Z','0'..'9','!','@','#']; join '', map $s->[rand @$s], 1..5 }
+ srand 0;
+ timethis 10000, sub { lookup rstr(), $f }, "btree-$compress-$blocksize";
+}
+
+encode if $cmd eq 'create';
+printf "%s\n", lookup($query, $dbfile) ? "Found" : "Not found" if $cmd eq 'lookup';
+bench $dbfile if $cmd eq 'bench';
diff --git a/lmdb.pl b/lmdb.pl
new file mode 100755
index 0000000..9daaaf1
--- /dev/null
+++ b/lmdb.pl
@@ -0,0 +1,40 @@
+#!/usr/bin/perl
+
+# Usage:
+#
+# ./lmdb.pl create dbfile <dictionary
+# ./lmdb.pl bench dbfile
+
+use strict;
+use warnings;
+use v5.10;
+use LMDB_File ':flags';
+
+my($cmd, $dbfile) = @ARGV;
+
+sub encode {
+ my $env = LMDB::Env->new($dbfile, { mapsize => 1<<36, maxdbs => 1, flags => MDB_NOSUBDIR });
+ my $txn = $env->BeginTxn();
+ my $dbi = $txn->open('db', MDB_CREATE);
+ while(<STDIN>) {
+ chomp;
+ $txn->put($dbi => $_, '') if length($_) && length($_) < 510;
+ }
+ $txn->commit;
+}
+
+
+sub bench {
+ my $env = LMDB::Env->new($dbfile, { mapsize => 1<<32, maxdbs => 1, flags => MDB_RDONLY|MDB_NOSUBDIR });
+ my $txn = $env->BeginTxn();
+ my $dbi = $txn->open('db');
+ my $lookup = sub { eval { $txn->get($dbi => $_[0], my $d); 1 } };
+
+ use Benchmark 'timethis', ':hireswallclock';
+ sub rstr { state $s=['a'..'z','A'..'Z','0'..'9','!','@','#']; join '', map $s->[rand @$s], 1..5 }
+ srand 0;
+ timethis 200000, sub { $lookup->(rstr()) }, 'lmdb';
+}
+
+encode if $cmd eq 'create';
+bench if $cmd eq 'bench';
diff --git a/plain.pl b/plain.pl
new file mode 100755
index 0000000..4bc30f5
--- /dev/null
+++ b/plain.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/perl
+
+# Usage: ./plain.pl db-plain db-gzip
+
+use strict;
+use warnings;
+use PerlIO::gzip;
+use v5.10;
+
+sub lookup_str {
+ my($F, $q) = @_;
+ while(<$F>) {
+ chomp;
+ return 1 if $q eq $_;
+ return 0 if $q lt $_;
+ }
+ return 0;
+}
+
+sub lookup_plain {
+ my($q, $f) = @_;
+ open my $F, '<', $f or die $!;
+ lookup_str $F, $q
+}
+
+sub lookup_gzip {
+ my($q, $f) = @_;
+ open my $F, '<:gzip', $f or die $!;
+ lookup_str $F, $q
+}
+
+my($plain, $gzip) = @ARGV;
+
+use Benchmark 'timethis', ':hireswallclock';
+sub rstr { state $s=['a'..'z','A'..'Z','0'..'9','!','@','#']; join '', map $s->[rand @$s], 1..5 }
+srand 0;
+timethis 2, sub { lookup_plain rstr(), $plain }, 'plain';
+srand 0;
+timethis 2, sub { lookup_gzip rstr(), $gzip }, 'gzip';