summaryrefslogtreecommitdiff
path: root/btree.pl
blob: fb792bf41c126821818b09feaac77aea76fdfad3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/perl

# Usage:
#
#   ./btree.pl create blocksize compression <sorted-dictionary >dbfile
#   ./btree.pl lookup blocksize compression dbfile query
#   ./btree.pl bench  blocksize compression dbfile

use strict;
use warnings;
use v5.10;


my($cmd, $blocksize, $compress, $dbfile, $query) = @ARGV;

$blocksize ||= 4096; # Actual block sizes may differ a bit.
$compress ||= 'plain';

if($compress eq 'zstd') {
    require Compress::Zstd;
    no warnings 'once';
    *compress = \*Compress::Zstd::compress;
    *uncompress = \*Compress::Zstd::decompress;
} elsif($compress eq 'gzip') {
    require Compress::Zlib;
    no warnings 'once';
    *compress = \*Compress::Zlib::compress;
    *uncompress = \*Compress::Zlib::uncompress;
} else {
    *compress = sub { shift };
    *uncompress = sub { shift };
}


# Encode/decode a block reference, [ leaf, length, offset ]. Encoded in a single 64bit integer as (leaf | length << 1 | offset << 16)
sub eref($) { pack 'Q', ($_[0][0]?1:0) | $_[0][1]<<1 | $_[0][2]<<16 }
sub dref($) { my $v = unpack 'Q', $_[0]; [$v&1, ($v>>1)&((1<<15)-1), $v>>16] }

# Write a block and return its reference.
sub writeblock {
    state $off = 0;
    my $buf = compress($_[0]);
    my $len = length $buf;
    print $buf;
    my $oldoff = $off;
    $off += $len;
    [$_[1], $len, $oldoff]
}

# Read a block given a file handle and a reference.
sub readblock {
    my($F, $ref) = @_;
    die $! if !sysseek $F, $ref->[2], 0;
    die $! if $ref->[1] != sysread $F, (my $buf), $ref->[1];
    uncompress($buf)
}

sub encode {
    my $leaf = "\0";
    my @nodes = ('');
    my $ref;

    my $flush = sub {
        my $minsize = $_[0];
        return if $minsize > length $leaf;

        my $str = $leaf =~ /^\x00([^\x00]*)/ && $1;
        $ref = writeblock $leaf, 1;
        $leaf = "\0";
        $nodes[0] .= "$str\x00".eref($ref);

        for(my $i=0; $i <= $#nodes && $minsize < length $nodes[$i]; $i++) {
            my $str = $nodes[$i] =~ s/^([^\x00]*)\x00// && $1;
            $ref = writeblock $nodes[$i], 0;
            $nodes[$i] = '';
            if($minsize || $nodes[$i+1]) {
                $nodes[$i+1] ||= '';
                $nodes[$i+1] .= "$str\x00".eref($ref);
            }
        }
    };

    while(<STDIN>) {
        chomp;
        $leaf .= "$_\0";
        $flush->($blocksize);
    }
    $flush->(0);
    print eref $ref;
}


sub lookup_rec {
    my($q, $F, $ref) = @_;
    my $buf = readblock $F, $ref;
    if($ref->[0]) {
        return $buf =~ /\x00\Q$q\E\x00/;
    } else {
        while($buf =~ /(.{8})([^\x00]*)\x00/sg) {
            return lookup_rec($q, $F, dref $1) if $q lt $2;
        }
        return lookup_rec($q, $F, dref substr $buf, -8)
    }
}

sub lookup {
    my($q, $f) = @_;
    open my $F, '<', $f or die $!;
    sysseek $F, -8, 2 or die $!;
    die $! if 8 != sysread $F, (my $buf), 8;
    lookup_rec($q, $F, dref $buf)
}

sub bench {
    my $f = shift;
    use Benchmark 'timethis', ':hireswallclock';
    sub rstr { state $s=['a'..'z','A'..'Z','0'..'9','!','@','#']; join '', map $s->[rand @$s], 1..5 }
    srand 0;
    timethis 10000, sub { lookup rstr(), $f }, "btree-$compress-$blocksize";
}

encode if $cmd eq 'create';
printf "%s\n", lookup($query, $dbfile) ? "Found" : "Not found" if $cmd eq 'lookup';
bench $dbfile if $cmd eq 'bench';