1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
|
#!/usr/bin/perl
# Usage:
#
# ./btree.pl create blocksize compression <sorted-dictionary >dbfile
# ./btree.pl lookup blocksize compression dbfile query
# ./btree.pl bench blocksize compression dbfile
use strict;
use warnings;
use v5.10;
my($cmd, $blocksize, $compress, $dbfile, $query) = @ARGV;
$blocksize ||= 4096; # Actual block sizes may differ a bit.
$compress ||= 'plain';
if($compress eq 'zstd') {
require Compress::Zstd;
no warnings 'once';
*compress = \*Compress::Zstd::compress;
*uncompress = \*Compress::Zstd::decompress;
} elsif($compress eq 'gzip') {
require Compress::Zlib;
no warnings 'once';
*compress = \*Compress::Zlib::compress;
*uncompress = \*Compress::Zlib::uncompress;
} else {
*compress = sub { shift };
*uncompress = sub { shift };
}
# Encode/decode a block reference, [ leaf, length, offset ]. Encoded in a single 64bit integer as (leaf | length << 1 | offset << 16)
sub eref($) { pack 'Q', ($_[0][0]?1:0) | $_[0][1]<<1 | $_[0][2]<<16 }
sub dref($) { my $v = unpack 'Q', $_[0]; [$v&1, ($v>>1)&((1<<15)-1), $v>>16] }
# Write a block and return its reference.
sub writeblock {
state $off = 0;
my $buf = compress($_[0]);
my $len = length $buf;
print $buf;
my $oldoff = $off;
$off += $len;
[$_[1], $len, $oldoff]
}
# Read a block given a file handle and a reference.
sub readblock {
my($F, $ref) = @_;
die $! if !sysseek $F, $ref->[2], 0;
die $! if $ref->[1] != sysread $F, (my $buf), $ref->[1];
uncompress($buf)
}
sub encode {
my $leaf = "\0";
my @nodes = ('');
my $ref;
my $flush = sub {
my $minsize = $_[0];
return if $minsize > length $leaf;
my $str = $leaf =~ /^\x00([^\x00]*)/ && $1;
$ref = writeblock $leaf, 1;
$leaf = "\0";
$nodes[0] .= "$str\x00".eref($ref);
for(my $i=0; $i <= $#nodes && $minsize < length $nodes[$i]; $i++) {
my $str = $nodes[$i] =~ s/^([^\x00]*)\x00// && $1;
$ref = writeblock $nodes[$i], 0;
$nodes[$i] = '';
if($minsize || $nodes[$i+1]) {
$nodes[$i+1] ||= '';
$nodes[$i+1] .= "$str\x00".eref($ref);
}
}
};
while(<STDIN>) {
chomp;
$leaf .= "$_\0";
$flush->($blocksize);
}
$flush->(0);
print eref $ref;
}
sub lookup_rec {
my($q, $F, $ref) = @_;
my $buf = readblock $F, $ref;
if($ref->[0]) {
return $buf =~ /\x00\Q$q\E\x00/;
} else {
while($buf =~ /(.{8})([^\x00]*)\x00/sg) {
return lookup_rec($q, $F, dref $1) if $q lt $2;
}
return lookup_rec($q, $F, dref substr $buf, -8)
}
}
sub lookup {
my($q, $f) = @_;
open my $F, '<', $f or die $!;
sysseek $F, -8, 2 or die $!;
die $! if 8 != sysread $F, (my $buf), 8;
lookup_rec($q, $F, dref $buf)
}
sub bench {
my $f = shift;
use Benchmark 'timethis', ':hireswallclock';
sub rstr { state $s=['a'..'z','A'..'Z','0'..'9','!','@','#']; join '', map $s->[rand @$s], 1..5 }
srand 0;
timethis 10000, sub { lookup rstr(), $f }, "btree-$compress-$blocksize";
}
encode if $cmd eq 'create';
printf "%s\n", lookup($query, $dbfile) ? "Found" : "Not found" if $cmd eq 'lookup';
bench $dbfile if $cmd eq 'bench';
|