summaryrefslogtreecommitdiff
path: root/lib/PWLookup.pm
blob: 6e2f03e434508ea4d0b95b208c12b7f3913e58b8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/perl

# This script is based on the btree.pl that I wrote as part of a little
# experiment: https://dev.yorhel.nl/doc/pwlookup
#
# It is hardcoded to use gzip (because that's available in a standard Perl
# distribution) compression level 9 (saves a few MiB with no noticable impact
# on lookup performance) with 4k block sizes (because that is fast enough and
# offers good compression).
#
# Creating the database:
#
#   perl PWlookup.pm create <sorted-dictionary >dbfile
#
# Extracting all passwords from the database:
#
#   perl PWLookup.pm extract dbfile >sorted-dictionary
#
# Performing lookups (from the CLI):
#
#   perl PWLookup.pm lookup dbfile query
#
# Performing lookups (from Perl):
#
#   use PWLookup;
#   my $pw_exists = PWLookup::lookup($dbfile, $query);

package PWLookup;

use strict;
use warnings;
use v5.10;
use Compress::Zlib qw/compress uncompress/;
use Encode qw/encode_utf8 decode_utf8/;

my $blocksize = 4096;

# Encode/decode a block reference, [ leaf, length, offset ]. Encoded in a single 64bit integer as (leaf | length << 1 | offset << 16)
sub eref($) { pack 'Q', ($_[0][0]?1:0) | $_[0][1]<<1 | $_[0][2]<<16 }
sub dref($) { my $v = unpack 'Q', $_[0]; [$v&1, ($v>>1)&((1<<15)-1), $v>>16] }

# Write a block and return its reference.
sub writeblock {
    state $off = 0;
    my $buf = compress($_[0], 9);
    my $len = length $buf;
    print $buf;
    my $oldoff = $off;
    $off += $len;
    [$_[1], $len, $oldoff]
}

# Read a block given a file handle and a reference.
sub readblock {
    my($F, $ref) = @_;
    die $! if !sysseek $F, $ref->[2], 0;
    die $! if $ref->[1] != sysread $F, (my $buf), $ref->[1];
    uncompress($buf)
}

sub encode {
    my $leaf = "\0";
    my @nodes = ('');
    my $ref;

    my $flush = sub {
        my $minsize = $_[0];
        return if $minsize > length $leaf;

        my $str = $leaf =~ /^\x00([^\x00]*)/ && $1;
        $ref = writeblock $leaf, 1;
        $leaf = "\0";
        $nodes[0] .= "$str\x00".eref($ref);

        for(my $i=0; $i <= $#nodes && $minsize < length $nodes[$i]; $i++) {
            my $str = $nodes[$i] =~ s/^([^\x00]*)\x00// && $1;
            $ref = writeblock $nodes[$i], 0;
            $nodes[$i] = '';
            if($minsize || $nodes[$i+1]) {
                $nodes[$i+1] ||= '';
                $nodes[$i+1] .= "$str\x00".eref($ref);
            }
        }
    };

    my $last;
    while((my $p = <STDIN>)) {
        chomp($p);
        # No need to store passwords that are rejected by form validation
        if(!length($p) || length($p) > 500 || !eval { decode_utf8((local $_=$p), Encode::FB_CROAK); 1 } || $p =~ /\x00/) {
            warn sprintf "Rejecting: %s\n", ($p =~ s/([^\x21-\x7e])/sprintf '%%%02x', ord $1/ger);
            next;
        }
        # Extra check to make sure the input is unique and sorted according to Perl's string comparison
        if(defined($last) && $last ge $p) {
            warn "Rejecting due to uniqueness or incorrect sorting: $p\n";
            next;
        }
        $leaf .= "$p\0";
        $flush->($blocksize);
    }
    $flush->(0);
    print eref $ref;
}


sub lookup_rec {
    my($F, $q, $ref) = @_;
    my $buf = readblock $F, $ref;
    if($ref->[0]) {
        return $buf =~ /\x00\Q$q\E\x00/;
    } else {
        while($buf =~ /(.{8})([^\x00]+)\x00/sg) {
            return lookup_rec($F, $q, dref $1) if $q lt $2;
        }
        return lookup_rec($F, $q, dref substr $buf, -8)
    }
}

sub lookup {
    my($f, $q) = @_;
    open my $F, '<', $f or die $!;
    sysseek $F, -8, 2 or die $!;
    die $! if 8 != sysread $F, (my $buf), 8;
    lookup_rec($F, encode_utf8($q), dref $buf)
}


sub extract_rec {
    my($F, $ref) = @_;
    my $buf = readblock $F, $ref;
    if($ref->[0]) {
        print "$1\n" while $buf =~ /\x00([^\x00]+)/g;
    } else {
        extract_rec($F, dref $1) while $buf =~ /(.{8})[^\x00]+\x00/sg;
        extract_rec($F, dref substr $buf, -8)
    }
}

sub extract {
    my($f) = @_;
    open my $F, '<', $f or die $!;
    sysseek $F, -8, 2 or die $!;
    die $! if 8 != sysread $F, (my $buf), 8;
    extract_rec($F, dref $buf)
}


if(!caller) {
    encode() if $ARGV[0] eq 'create';
    extract($ARGV[1]) if $ARGV[0] eq 'extract';
    printf "%s\n", lookup($ARGV[1], decode_utf8 $ARGV[2]) ? 'Found' : 'Not found' if $ARGV[0] eq 'lookup';
}

1;