#!/usr/bin/perl # Usage: # # ./btree.pl create blocksize compression dbfile # ./btree.pl lookup blocksize compression dbfile query # ./btree.pl bench blocksize compression dbfile use strict; use warnings; use v5.10; my($cmd, $blocksize, $compress, $dbfile, $query) = @ARGV; $blocksize ||= 4096; # Actual block sizes may differ a bit. $compress ||= 'plain'; if($compress eq 'zstd') { require Compress::Zstd; no warnings 'once'; *compress = \*Compress::Zstd::compress; *uncompress = \*Compress::Zstd::decompress; } elsif($compress eq 'gzip') { require Compress::Zlib; no warnings 'once'; *compress = \*Compress::Zlib::compress; *uncompress = \*Compress::Zlib::uncompress; } else { *compress = sub { shift }; *uncompress = sub { shift }; } # Encode/decode a block reference, [ leaf, length, offset ]. Encoded in a single 64bit integer as (leaf | length << 1 | offset << 16) sub eref($) { pack 'Q', ($_[0][0]?1:0) | $_[0][1]<<1 | $_[0][2]<<16 } sub dref($) { my $v = unpack 'Q', $_[0]; [$v&1, ($v>>1)&((1<<15)-1), $v>>16] } # Write a block and return its reference. sub writeblock { state $off = 0; my $buf = compress($_[0]); my $len = length $buf; print $buf; my $oldoff = $off; $off += $len; [$_[1], $len, $oldoff] } # Read a block given a file handle and a reference. sub readblock { my($F, $ref) = @_; die $! if !sysseek $F, $ref->[2], 0; die $! if $ref->[1] != sysread $F, (my $buf), $ref->[1]; uncompress($buf) } sub encode { my $leaf = "\0"; my @nodes = (''); my $ref; my $flush = sub { my $minsize = $_[0]; return if $minsize > length $leaf; my $str = $leaf =~ /^\x00([^\x00]*)/ && $1; $ref = writeblock $leaf, 1; $leaf = "\0"; $nodes[0] .= "$str\x00".eref($ref); for(my $i=0; $i <= $#nodes && $minsize < length $nodes[$i]; $i++) { my $str = $nodes[$i] =~ s/^([^\x00]*)\x00// && $1; $ref = writeblock $nodes[$i], 0; $nodes[$i] = ''; if($minsize || $nodes[$i+1]) { $nodes[$i+1] ||= ''; $nodes[$i+1] .= "$str\x00".eref($ref); } } }; while() { chomp; $leaf .= "$_\0"; $flush->($blocksize); } $flush->(0); print eref $ref; } sub lookup_rec { my($q, $F, $ref) = @_; my $buf = readblock $F, $ref; if($ref->[0]) { return $buf =~ /\x00\Q$q\E\x00/; } else { while($buf =~ /(.{8})([^\x00]*)\x00/sg) { return lookup_rec($q, $F, dref $1) if $q lt $2; } return lookup_rec($q, $F, dref substr $buf, -8) } } sub lookup { my($q, $f) = @_; open my $F, '<', $f or die $!; sysseek $F, -8, 2 or die $!; die $! if 8 != sysread $F, (my $buf), 8; lookup_rec($q, $F, dref $buf) } sub bench { my $f = shift; use Benchmark 'timethis', ':hireswallclock'; sub rstr { state $s=['a'..'z','A'..'Z','0'..'9','!','@','#']; join '', map $s->[rand @$s], 1..5 } srand 0; timethis 10000, sub { lookup rstr(), $f }, "btree-$compress-$blocksize"; } encode if $cmd eq 'create'; printf "%s\n", lookup($query, $dbfile) ? "Found" : "Not found" if $cmd eq 'lookup'; bench $dbfile if $cmd eq 'bench';