From 04719fb0dd62858f97996f791a2318e062b90544 Mon Sep 17 00:00:00 2001 From: Yorhel Date: Tue, 14 May 2019 18:37:14 +0200 Subject: Initial commit --- .gitignore | 1 + LICENSE | 20 ++++++++++ Makefile | 41 ++++++++++++++++++++ README.md | 9 +++++ btree.pl | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ lmdb.pl | 40 ++++++++++++++++++++ plain.pl | 39 +++++++++++++++++++ 7 files changed, 274 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 Makefile create mode 100644 README.md create mode 100755 btree.pl create mode 100755 lmdb.pl create mode 100755 plain.pl diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..19e0180 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +db/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..01615da --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) 2019 Yoran Heling + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7137863 --- /dev/null +++ b/Makefile @@ -0,0 +1,41 @@ +DICT=crackstation-human-only.txt.gz + +all: bench + +db/plain: ${DICT} + mkdir -p db + zcat ${DICT} | LC_COLLATE=C sort >$@ + +db/gzip: db/plain + cat db/plain | gzip >$@ + +db/lmdb: db/plain + ./lmdb.pl create db/lmdb db/btree-1k-plain + +db/btree-4k-plain: db/plain + ./btree.pl create 4096 plain db/btree-4k-plain + +db/btree-1k-gzip: db/plain + ./btree.pl create 1024 gzip db/btree-1k-gzip + +db/btree-4k-gzip: db/plain + ./btree.pl create 4096 gzip db/btree-4k-gzip + +db/btree-1k-zstd: db/plain + ./btree.pl create 1024 zstd db/btree-1k-zstd + +db/btree-4k-zstd: db/plain + ./btree.pl create 4096 zstd db/btree-4k-zstd + +bench: db/plain db/gzip db/lmdb db/btree-1k-plain db/btree-4k-plain db/btree-1k-gzip db/btree-4k-gzip db/btree-1k-zstd db/btree-4k-zstd + time -v ./btree.pl bench 1024 plain db/btree-1k-plain + time -v ./btree.pl bench 4096 plain db/btree-4k-plain + time -v ./btree.pl bench 1024 gzip db/btree-1k-gzip + time -v ./btree.pl bench 4096 gzip db/btree-4k-gzip + time -v ./btree.pl bench 1024 zstd db/btree-1k-zstd + time -v ./btree.pl bench 4096 zstd db/btree-4k-zstd + time -v ./plain.pl db/plain db/gzip + time -v ./lmdb.pl bench db/lmdb diff --git a/README.md b/README.md new file mode 100644 index 0000000..9b904c0 --- /dev/null +++ b/README.md @@ -0,0 +1,9 @@ +These are the scripts behind my little article, [Fast Key Lookup with a Small +Read-Only Database](https://dev.yorhel.nl/doc/pwlookup). + +Requirements: + +- Perl (64bit!) +- Compress::Zstd +- LMDB_File +- make diff --git a/btree.pl b/btree.pl new file mode 100755 index 0000000..04bb168 --- /dev/null +++ b/btree.pl @@ -0,0 +1,124 @@ +#!/usr/bin/perl + +# Usage: +# +# ./btree.pl create blocksize compression dbfile +# ./btree.pl lookup blocksize compression dbfile query +# ./btree.pl bench blocksize compression dbfile + +use strict; +use warnings; +use v5.10; + + +my($cmd, $blocksize, $compress, $dbfile, $query) = @ARGV; + +$blocksize ||= 4096; # Actual block sizes may differ a bit. +$compress ||= 'plain'; + +if($compress eq 'zstd') { + require Compress::Zstd; + no warnings 'once'; + *compress = \*Compress::Zstd::compress; + *uncompress = \*Compress::Zstd::decompress; +} elsif($compress eq 'gzip') { + require Compress::Zlib; + no warnings 'once'; + *compress = \*Compress::Zlib::compress; + *uncompress = \*Compress::Zlib::uncompress; +} else { + *compress = sub { shift }; + *uncompress = sub { shift }; +} + + +# Encode/decode a block reference, [ leaf, length, offset ]. Encoded in a single 64bit integer as (leaf | length << 1 | offset << 16) +sub eref($) { pack 'Q', ($_[0][0]?1:0) | $_[0][1]<<1 | $_[0][2]<<16 } +sub dref($) { my $v = unpack 'Q', $_[0]; [$v&1, ($v>>1)&((1<<15)-1), $v>>16] } + +# Write a block and return its reference. +sub writeblock { + state $off = 0; + my $buf = compress($_[0]); + my $len = length $buf; + print $buf; + my $oldoff = $off; + $off += $len; + [$_[1], $len, $oldoff] +} + +# Read a block given a file handle and a reference. +sub readblock { + my($F, $ref) = @_; + die $! if !sysseek $F, $ref->[2], 0; + die $! if $ref->[1] != sysread $F, (my $buf), $ref->[1]; + uncompress($buf) +} + +sub encode { + my $leaf = "\0"; + my @nodes = (''); + my $ref; + + my $flush = sub { + my $minsize = $_[0]; + return if $minsize > length $leaf; + + my $str = $leaf =~ /^\x00([^\x00]*)/ && $1; + $ref = writeblock $leaf, 1; + $leaf = "\0"; + $nodes[0] .= "$str\x00".eref($ref); + + for(my $i=0; $i <= $#nodes && $minsize < length $nodes[$i]; $i++) { + my $str = $nodes[$i] =~ s/^([^\x00]*)\x00// && $1; + $ref = writeblock $nodes[$i], 0; + $nodes[$i] = ''; + if($minsize) { + $nodes[$i+1] ||= ''; + $nodes[$i+1] .= "$str\x00".eref($ref); + } + } + }; + + while() { + chomp; + $leaf .= "$_\0"; + $flush->($blocksize); + } + $flush->(0); + print eref $ref; +} + + +sub lookup_rec { + my($q, $F, $ref) = @_; + my $buf = readblock $F, $ref; + if($ref->[0]) { + return $buf =~ /\x00\Q$q\E\x00/; + } else { + while($buf =~ /(.{8})([^\x00]*)\x00/sg) { + return lookup_rec($q, $F, dref $1) if $q lt $2; + } + return lookup_rec($q, $F, dref substr $buf, -8) + } +} + +sub lookup { + my($q, $f) = @_; + open my $F, '<', $f or die $!; + sysseek $F, -8, 2 or die $!; + die $! if 8 != sysread $F, (my $buf), 8; + lookup_rec($q, $F, dref $buf) +} + +sub bench { + my $f = shift; + use Benchmark 'timethis', ':hireswallclock'; + sub rstr { state $s=['a'..'z','A'..'Z','0'..'9','!','@','#']; join '', map $s->[rand @$s], 1..5 } + srand 0; + timethis 10000, sub { lookup rstr(), $f }, "btree-$compress-$blocksize"; +} + +encode if $cmd eq 'create'; +printf "%s\n", lookup($query, $dbfile) ? "Found" : "Not found" if $cmd eq 'lookup'; +bench $dbfile if $cmd eq 'bench'; diff --git a/lmdb.pl b/lmdb.pl new file mode 100755 index 0000000..9daaaf1 --- /dev/null +++ b/lmdb.pl @@ -0,0 +1,40 @@ +#!/usr/bin/perl + +# Usage: +# +# ./lmdb.pl create dbfile new($dbfile, { mapsize => 1<<36, maxdbs => 1, flags => MDB_NOSUBDIR }); + my $txn = $env->BeginTxn(); + my $dbi = $txn->open('db', MDB_CREATE); + while() { + chomp; + $txn->put($dbi => $_, '') if length($_) && length($_) < 510; + } + $txn->commit; +} + + +sub bench { + my $env = LMDB::Env->new($dbfile, { mapsize => 1<<32, maxdbs => 1, flags => MDB_RDONLY|MDB_NOSUBDIR }); + my $txn = $env->BeginTxn(); + my $dbi = $txn->open('db'); + my $lookup = sub { eval { $txn->get($dbi => $_[0], my $d); 1 } }; + + use Benchmark 'timethis', ':hireswallclock'; + sub rstr { state $s=['a'..'z','A'..'Z','0'..'9','!','@','#']; join '', map $s->[rand @$s], 1..5 } + srand 0; + timethis 200000, sub { $lookup->(rstr()) }, 'lmdb'; +} + +encode if $cmd eq 'create'; +bench if $cmd eq 'bench'; diff --git a/plain.pl b/plain.pl new file mode 100755 index 0000000..4bc30f5 --- /dev/null +++ b/plain.pl @@ -0,0 +1,39 @@ +#!/usr/bin/perl + +# Usage: ./plain.pl db-plain db-gzip + +use strict; +use warnings; +use PerlIO::gzip; +use v5.10; + +sub lookup_str { + my($F, $q) = @_; + while(<$F>) { + chomp; + return 1 if $q eq $_; + return 0 if $q lt $_; + } + return 0; +} + +sub lookup_plain { + my($q, $f) = @_; + open my $F, '<', $f or die $!; + lookup_str $F, $q +} + +sub lookup_gzip { + my($q, $f) = @_; + open my $F, '<:gzip', $f or die $!; + lookup_str $F, $q +} + +my($plain, $gzip) = @ARGV; + +use Benchmark 'timethis', ':hireswallclock'; +sub rstr { state $s=['a'..'z','A'..'Z','0'..'9','!','@','#']; join '', map $s->[rand @$s], 1..5 } +srand 0; +timethis 2, sub { lookup_plain rstr(), $plain }, 'plain'; +srand 0; +timethis 2, sub { lookup_gzip rstr(), $gzip }, 'gzip'; -- cgit v1.2.3