summaryrefslogtreecommitdiff
path: root/btree.pl
diff options
context:
space:
mode:
authorYorhel <git@yorhel.nl>2019-05-14 18:37:14 +0200
committerYorhel <git@yorhel.nl>2019-05-14 18:38:12 +0200
commit04719fb0dd62858f97996f791a2318e062b90544 (patch)
tree47f820a61ab0ee7757740e8cd6367f31e1e310ed /btree.pl
Initial commit
Diffstat (limited to 'btree.pl')
-rwxr-xr-xbtree.pl124
1 files changed, 124 insertions, 0 deletions
diff --git a/btree.pl b/btree.pl
new file mode 100755
index 0000000..04bb168
--- /dev/null
+++ b/btree.pl
@@ -0,0 +1,124 @@
+#!/usr/bin/perl
+
+# Usage:
+#
+# ./btree.pl create blocksize compression <sorted-dictionary >dbfile
+# ./btree.pl lookup blocksize compression dbfile query
+# ./btree.pl bench blocksize compression dbfile
+
+use strict;
+use warnings;
+use v5.10;
+
+
+my($cmd, $blocksize, $compress, $dbfile, $query) = @ARGV;
+
+$blocksize ||= 4096; # Actual block sizes may differ a bit.
+$compress ||= 'plain';
+
+if($compress eq 'zstd') {
+ require Compress::Zstd;
+ no warnings 'once';
+ *compress = \*Compress::Zstd::compress;
+ *uncompress = \*Compress::Zstd::decompress;
+} elsif($compress eq 'gzip') {
+ require Compress::Zlib;
+ no warnings 'once';
+ *compress = \*Compress::Zlib::compress;
+ *uncompress = \*Compress::Zlib::uncompress;
+} else {
+ *compress = sub { shift };
+ *uncompress = sub { shift };
+}
+
+
+# Encode/decode a block reference, [ leaf, length, offset ]. Encoded in a single 64bit integer as (leaf | length << 1 | offset << 16)
+sub eref($) { pack 'Q', ($_[0][0]?1:0) | $_[0][1]<<1 | $_[0][2]<<16 }
+sub dref($) { my $v = unpack 'Q', $_[0]; [$v&1, ($v>>1)&((1<<15)-1), $v>>16] }
+
+# Write a block and return its reference.
+sub writeblock {
+ state $off = 0;
+ my $buf = compress($_[0]);
+ my $len = length $buf;
+ print $buf;
+ my $oldoff = $off;
+ $off += $len;
+ [$_[1], $len, $oldoff]
+}
+
+# Read a block given a file handle and a reference.
+sub readblock {
+ my($F, $ref) = @_;
+ die $! if !sysseek $F, $ref->[2], 0;
+ die $! if $ref->[1] != sysread $F, (my $buf), $ref->[1];
+ uncompress($buf)
+}
+
+sub encode {
+ my $leaf = "\0";
+ my @nodes = ('');
+ my $ref;
+
+ my $flush = sub {
+ my $minsize = $_[0];
+ return if $minsize > length $leaf;
+
+ my $str = $leaf =~ /^\x00([^\x00]*)/ && $1;
+ $ref = writeblock $leaf, 1;
+ $leaf = "\0";
+ $nodes[0] .= "$str\x00".eref($ref);
+
+ for(my $i=0; $i <= $#nodes && $minsize < length $nodes[$i]; $i++) {
+ my $str = $nodes[$i] =~ s/^([^\x00]*)\x00// && $1;
+ $ref = writeblock $nodes[$i], 0;
+ $nodes[$i] = '';
+ if($minsize) {
+ $nodes[$i+1] ||= '';
+ $nodes[$i+1] .= "$str\x00".eref($ref);
+ }
+ }
+ };
+
+ while(<STDIN>) {
+ chomp;
+ $leaf .= "$_\0";
+ $flush->($blocksize);
+ }
+ $flush->(0);
+ print eref $ref;
+}
+
+
+sub lookup_rec {
+ my($q, $F, $ref) = @_;
+ my $buf = readblock $F, $ref;
+ if($ref->[0]) {
+ return $buf =~ /\x00\Q$q\E\x00/;
+ } else {
+ while($buf =~ /(.{8})([^\x00]*)\x00/sg) {
+ return lookup_rec($q, $F, dref $1) if $q lt $2;
+ }
+ return lookup_rec($q, $F, dref substr $buf, -8)
+ }
+}
+
+sub lookup {
+ my($q, $f) = @_;
+ open my $F, '<', $f or die $!;
+ sysseek $F, -8, 2 or die $!;
+ die $! if 8 != sysread $F, (my $buf), 8;
+ lookup_rec($q, $F, dref $buf)
+}
+
+sub bench {
+ my $f = shift;
+ use Benchmark 'timethis', ':hireswallclock';
+ sub rstr { state $s=['a'..'z','A'..'Z','0'..'9','!','@','#']; join '', map $s->[rand @$s], 1..5 }
+ srand 0;
+ timethis 10000, sub { lookup rstr(), $f }, "btree-$compress-$blocksize";
+}
+
+encode if $cmd eq 'create';
+printf "%s\n", lookup($query, $dbfile) ? "Found" : "Not found" if $cmd eq 'lookup';
+bench $dbfile if $cmd eq 'bench';