summaryrefslogtreecommitdiff
path: root/util/dbdump.pl
blob: f749ed727f24e4ecff797814dc27a83ee1d68f3c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
#!/usr/bin/perl
my $HELP=<<_;
Usage:

util/dbdump.pl export-db output.tar.zst

  Write a full database export as a .tar.zst

  The uncompressed directory is written to "output.tar.zst_dir"

util/dbdump.pl export-img output-dir

  Create or update a directory with hardlinks to images.

util/dbdump.pl export-data data.sql

  Create an SQL script that is usable as replacement for 'sql/all.sql'.
  (Similar to the dump created by devdump.pl, except this one includes *all* data)

  This allows recreating the full database using the definitions in sql/*.
  The script does not rely on column order, so can be used to re-order table columns.

util/dbdump.pl export-votes output.gz
util/dbdump.pl export-tags output.gz
util/dbdump.pl export-traits output.gz
_

# TODO:
# - Import
# - Consolidate with devdump.pl?

use strict;
use warnings;
use autodie;
use DBI;
use DBD::Pg;
use File::Copy 'cp';
use File::Find 'find';
use Time::HiRes 'time';

use Cwd 'abs_path';
our $ROOT;
BEGIN { ($ROOT = abs_path $0) =~ s{/util/dbdump\.pl$}{}; }

use lib "$ROOT/lib";
use VNDB::Schema;


# Tables and columns to export.
#
# Tables are exported with an explicit ORDER BY to make them more deterministic
# and avoid potentially leaking information about internal state (such as when
# a user last updated their account).
#
# Hidden DB entries, private user lists and various other rows with no
# interesting references are excluded from the dumps. Keeping all references
# consistent with those omissions complicates the WHERE clauses somewhat.
my %tables = (
    anime               => { where => 'id IN(SELECT va.aid FROM vn_anime va JOIN vn v ON v.id = va.id WHERE NOT v.hidden)' },
    chars               => { where => 'NOT hidden' },
    chars_traits        => { where => 'id IN(SELECT id FROM chars WHERE NOT hidden) AND tid IN(SELECT id FROM traits WHERE NOT hidden)' },
    chars_vns           => { where => 'id IN(SELECT id FROM chars WHERE NOT hidden)'
                                .' AND vid IN(SELECT id FROM vn WHERE NOT hidden)'
                                .' AND (rid IS NULL OR rid IN(SELECT id FROM releases WHERE NOT hidden))'
                           , order => 'id, vid, rid' },
    docs                => { where => 'NOT hidden' },
    images              => { where => "c_weight > 0" }, # Only images with a positive weight are referenced.
    image_votes         => { where => "id IN(SELECT id FROM images WHERE c_weight > 0)", order => 'uid, id' },
    producers           => { where => 'NOT hidden' },
    producers_relations => { where => 'id IN(SELECT id FROM producers WHERE NOT hidden)' },
    quotes              => { where => 'vid IN(SELECT id FROM vn WHERE NOT hidden)' },
    releases            => { where => 'NOT hidden' },
    releases_lang       => { where => 'id IN(SELECT id FROM releases WHERE NOT hidden)' },
    releases_media      => { where => 'id IN(SELECT id FROM releases WHERE NOT hidden)' },
    releases_platforms  => { where => 'id IN(SELECT id FROM releases WHERE NOT hidden)' },
    releases_producers  => { where => 'id IN(SELECT id FROM releases WHERE NOT hidden) AND pid IN(SELECT id FROM producers WHERE NOT hidden)' },
    releases_vn         => { where => 'id IN(SELECT id FROM releases WHERE NOT hidden) AND vid IN(SELECT id FROM vn WHERE NOT hidden)' },
    rlists              => { where => 'EXISTS(SELECT 1 FROM releases r'
                                                    .' JOIN releases_vn rv ON rv.id = r.id'
                                                    .' JOIN vn v ON v.id = rv.vid'
                                                    .' JOIN ulist_vns_labels uvl ON uvl.vid = rv.vid'
                                                    .' JOIN ulist_labels ul ON ul.uid = uvl.uid AND ul.id = uvl.lbl'
                                                   .' WHERE r.id = rlists.rid AND uvl.uid = rlists.uid AND NOT r.hidden AND NOT v.hidden AND NOT ul.private)' },
    staff               => { where => 'NOT hidden' },
    staff_alias         => { where => 'id IN(SELECT id FROM staff WHERE NOT hidden)' },
    tags                => { where => 'NOT hidden' },
    tags_parents        => { where => 'id IN(SELECT id FROM tags WHERE NOT hidden)' },
    tags_vn             => { where => 'tag IN(SELECT id FROM tags WHERE NOT hidden) AND vid IN(SELECT id FROM vn WHERE NOT hidden)', order => 'tag, vid, uid, date' },
    traits              => { where => 'NOT hidden' },
    traits_parents      => { where => 'id IN(SELECT id FROM traits WHERE NOT hidden)' },
    ulist_labels        => { where => 'NOT private AND EXISTS(SELECT 1 FROM ulist_vns_labels uvl WHERE uvl.lbl = id AND ulist_labels.uid = uvl.uid)' },
    ulist_vns           => { where => 'vid IN(SELECT id FROM vn WHERE NOT hidden)'
                                .' AND EXISTS(SELECT 1 FROM ulist_vns_labels uvl'
                                                    .' JOIN ulist_labels ul ON ul.uid = uvl.uid AND ul.id = uvl.lbl'
                                                   .' WHERE ulist_vns.uid = uvl.uid AND ulist_vns.vid = uvl.vid AND NOT ul.private)' },
    ulist_vns_labels    => { where => 'vid IN(SELECT id FROM vn WHERE NOT hidden)'
                                .' AND EXISTS(SELECT 1 FROM ulist_labels ul WHERE ul.uid = ulist_vns_labels.uid AND id = lbl AND NOT ul.private)' },
    users               => { where => 'id IN(SELECT DISTINCT uvl.uid FROM ulist_vns_labels uvl JOIN ulist_labels ul ON ul.uid = uvl.uid AND ul.id = uvl.lbl WHERE NOT ul.private)'
                                 .' OR id IN(SELECT DISTINCT uid FROM tags_vn)'
                                 .' OR id IN(SELECT DISTINCT uid FROM image_votes)'
                                 .' OR id IN(SELECT DISTINCT uid FROM vn_length_votes WHERE NOT private)' },
    vn                  => { where => 'NOT hidden' },
    vn_anime            => { where => 'id IN(SELECT id FROM vn WHERE NOT hidden)' },
    vn_relations        => { where => 'id IN(SELECT id FROM vn WHERE NOT hidden)' },
    vn_screenshots      => { where => 'id IN(SELECT id FROM vn WHERE NOT hidden)' },
    vn_seiyuu           => { where => 'id IN(SELECT id FROM vn WHERE NOT hidden)'
                                .' AND aid IN(SELECT sa.aid FROM staff_alias sa JOIN staff s ON s.id = sa.id WHERE NOT s.hidden)'
                                .' AND cid IN(SELECT id FROM chars WHERE NOT hidden)' },
    vn_staff            => { where => 'id IN(SELECT id FROM vn WHERE NOT hidden) AND aid IN(SELECT sa.aid FROM staff_alias sa JOIN staff s ON s.id = sa.id WHERE NOT s.hidden)' },
    vn_titles           => { where => 'id IN(SELECT id FROM vn WHERE NOT hidden)' },
    vn_length_votes     => { where => 'vid IN(SELECT id FROM vn WHERE NOT hidden) AND NOT private'
                           , order => 'vid, uid' },
    wikidata            => { where => q{id IN(SELECT l_wikidata FROM producers WHERE NOT hidden
                                        UNION SELECT l_wikidata FROM staff WHERE NOT hidden
                                        UNION SELECT l_wikidata FROM vn WHERE NOT hidden)} },
);

my @tables = map +{ name => $_, %{$tables{$_}} }, sort keys %tables;
my $schema = VNDB::Schema::schema;
my $types = VNDB::Schema::types;
my $references = VNDB::Schema::references;

my $db = DBI->connect('dbi:Pg:dbname=vndb', 'vndb', undef, { RaiseError => 1, AutoCommit => 0 });
$db->do('SET TIME ZONE +0');
$db->do('SET TRANSACTION ISOLATION LEVEL SERIALIZABLE');


sub table_order {
    my $s = $schema->{$_[0]};
    my $c = $tables{$_[0]};
    my $o = $s->{primary} ? join ', ', map "\"$_\"", $s->{primary}->@* : $c ? $c->{order} : '';
    $o ? "ORDER BY $o" : '';
}


sub export_timestamp {
    my $dest = shift;
    open my $F, '>', $dest;
    printf $F "%s\n", $db->selectrow_array('SELECT date_trunc(\'second\', NOW())');
}


sub export_table {
    my($dest, $table) = @_;

    my $schema = $schema->{$table->{name}};
    my @cols = grep $_->{pub}, @{$schema->{cols}};
    die "No columns to export for table '$table->{name}'\n" if !@cols;;

    my $fn = "$dest/$table->{name}";

    # Truncate all timestamptz columns to a day, to avoid leaking privacy-sensitive info.
    my $cols = join ', ', map $_->{type} eq 'timestamptz' ? "date_trunc('day', \"$_->{name}\")" : qq{"$_->{name}"}, @cols;
    my $where = $table->{where} ? "WHERE $table->{where}" : '';
    my $order = table_order $table->{name};
    die "Table '$table->{name}' is missing an ORDER BY clause\n" if !$order;

    my $start = time;
    $db->do(qq{COPY (SELECT $cols FROM "$table->{name}" $where $order) TO STDOUT});
    open my $F, '>:utf8', $fn;
    my $v;
    print $F $v while($db->pg_getcopydata($v) >= 0);
    close $F;

    #printf "# Dumped %s in %.3fs\n", $table->{name}, time-$start;

    open $F, '>', "$fn.header";
    print $F join "\t", map $_->{name}, @cols;
    print $F "\n";
    close $F;
}


sub export_import_script {
    my $dest = shift;
    open my $F, '>', $dest;
    print $F <<'    _' =~ s/^    //mgr;
    -- This script will create the necessary tables and import all data into an
    -- existing PostgreSQL database.
    --
    -- Usage:
    --   Run a 'CREATE DATABASE $database' somewhere.
    --   psql -U $user $database -f import.sql
    --
    -- The imported database does not include any indices, other than primary keys.
    -- You may want to create some indices by hand to speed up complex queries.

    -- Uncomment to import the schema and data into a separate namespace:
    --CREATE SCHEMA vndb;
    --SET search_path TO vndb;

    -- 'vndbid' is a custom base type used in the VNDB codebase, but it's safe to treat
    -- it as just text. If you want to use the proper type, load sql/vndbid.sql from
    -- the VNDB source code into your database and comment out the following line.
    -- (or ignore the error message about 'vndbid' already existing)
    CREATE DOMAIN vndbid AS text;
    _

    print $F "\n\n";
    my %types = map +($_->{type}, 1), grep $_->{pub}, map @{$schema->{$_->{name}}{cols}}, @tables;
    print $F "$types->{$_}{decl}\n" for (sort grep $types->{$_}, keys %types);

    for my $table (@tables) {
        my $schema = $schema->{$table->{name}};
        my @primary = grep { my $n=$_; !!grep $_->{name} eq $n && $_->{pub}, $schema->{cols}->@* } ($schema->{primary}||[])->@*;
        print $F "\n";
        print $F "CREATE TABLE \"$table->{name}\" (\n";
        print $F join ",\n", map "  $_->{decl}" =~ s/" serial/" integer/ir =~ s/ +(?:check|constraint|default) +.*//ir, grep $_->{pub}, @{$schema->{cols}};
        print $F ",\n  PRIMARY KEY(".join(', ', map "\"$_\"", @primary).")" if @primary;
        print $F "\n);\n";
    }

    print $F "\n\n";
    print $F "-- You can comment out tables you don't need, to speed up the import and save some disk space.\n";
    print $F "\\copy $_->{name} from 'db/$_->{name}'\n" for @tables;

    print $F "\n\n";
    print $F "-- These are included to verify the internal consistency of the dump, you can safely comment out this part.\n";
    for my $ref (@$references) {
        next if !$tables{$ref->{from_table}} || !$tables{$ref->{to_table}};
        my %pub = map +($_->{name}, 1), grep $_->{pub}, @{$schema->{$ref->{from_table}}{cols}};
        next if grep !$pub{$_}, @{$ref->{from_cols}};
        print $F "$ref->{decl}\n";
    }
}


sub export_db {
    my $dest = shift;

    my @static = qw{
        LICENSE-CC0.txt
        LICENSE-CC-BY-NC-SA.txt
        LICENSE-DBCL.txt
        LICENSE-ODBL.txt
        README.txt
    };

    # This will die if it already exists, which is good because we want to write to a new empty dir.
    mkdir "${dest}_dir";
    mkdir "${dest}_dir/db";

    cp "$ROOT/util/dump/$_", "${dest}_dir/$_" for @static;

    export_timestamp "${dest}_dir/TIMESTAMP";
    export_table "${dest}_dir/db", $_ for @tables;
    export_import_script "${dest}_dir/import.sql";

    #print "# Compressing\n";
    `tar -cf "$dest" -I 'zstd -7' --sort=name -C "${dest}_dir" @static import.sql TIMESTAMP db`
}


# Copy file while retaining access/modification times
sub cp_p {
    my($from, $to) = @_;
    cp $from, $to;
    utime @{ [stat($from)] }[8,9], $to;
}


# XXX: This does not include images that are linked from descriptions; May want to borrow from util/unusedimages.pl to find those.
sub export_img {
    my $dest = shift;

    {
        no autodie;
        mkdir ${dest};
        mkdir sprintf '%s/%s', $dest, $_ for qw/ch cv sf st/;
        mkdir sprintf '%s/%s/%02d', $dest, $_->[0], $_->[1] for map +([ch=>$_], [cv=>$_], [sf=>$_], [st=>$_]), 0..99;
    }

    cp_p "$ROOT/util/dump/LICENSE-ODBL.txt", "$dest/LICENSE-ODBL.txt";
    cp_p "$ROOT/util/dump/README-img.txt", "$dest/README.txt";
    export_timestamp "$dest/TIMESTAMP";

    my %scr;
    my %dir = (ch => {}, cv => {}, sf => \%scr, st => \%scr);
    $dir{sf}{$_->[0]} = 1 for $db->selectall_array("SELECT vndbid_num(scr) FROM vn_screenshots WHERE $tables{vn_screenshots}{where}");
    $dir{cv}{$_->[0]} = 1 for $db->selectall_array("SELECT vndbid_num(image) FROM vn WHERE image IS NOT NULL AND $tables{vn}{where}");
    $dir{ch}{$_->[0]} = 1 for $db->selectall_array("SELECT vndbid_num(image) FROM chars WHERE image IS NOT NULL AND $tables{chars}{where}");
    $db->rollback;
    undef $db;

    find {
        no_chdir => 1,
        wanted => sub {
            unlink $File::Find::name if $File::Find::name =~ m{(cv|ch|sf|st)/[0-9][0-9]/([0-9]+)\.jpg$} && !$dir{$1}{$2};
        }
    }, $dest;

    for my $d (keys %dir) {
        for my $i (keys %{$dir{$d}}) {
            my $f = sprintf('%s/%02d/%d.jpg', $d, $i % 100, $i);
            link "$ROOT/static/$f", "$dest/$f" if !-e "$dest/$f";
        }
    }
}


sub export_data {
    my $dest = shift;
    my $F = *STDOUT;
    open $F, '>', $dest if $dest ne '-';
    binmode($F, ":utf8");
    select $F;
    print "\\set ON_ERROR_STOP 1\n";
    print "\\i sql/util.sql\n";
    print "\\i sql/schema.sql\n";
    # Would be nice if VNDB::Schema could list sequences, too.
    my @seq = sort @{ $db->selectcol_arrayref(
        "SELECT oid::regclass::text FROM pg_class WHERE relkind = 'S' AND relnamespace = 'public'::regnamespace"
    ) };
    printf "SELECT setval('%s', %d);\n", $_, $db->selectrow_array("SELECT last_value FROM \"$_\"", {}) for @seq;
    for my $t (sort { $a->{name} cmp $b->{name} } values %$schema) {
        my $cols = join ',', map "\"$_->{name}\"", grep $_->{decl} !~ /\sGENERATED\s/, $t->{cols}->@*;
        my $order = table_order $t->{name};
        print "\nCOPY \"$t->{name}\" ($cols) FROM STDIN;\n";
        $db->do("COPY (SELECT $cols FROM \"$t->{name}\" $order) TO STDOUT");
        my $v;
        print $v while($db->pg_getcopydata($v) >= 0);
        print "\\.\n";
    }
    print "\\i sql/func.sql\n";
    print "\\i sql/editfunc.sql\n";
    print "\\i sql/tableattrs.sql\n";
    print "\\i sql/triggers.sql\n";
    print "\\set ON_ERROR_STOP 0\n";
    print "\\i sql/perms.sql\n";
}


sub export_votes {
    my $dest = shift;
    require PerlIO::gzip;

    open my $F, '>:gzip:utf8', $dest;
    $db->do(q{COPY (
        SELECT vndbid_num(uv.vid)||' '||vndbid_num(uv.uid)||' '||uv.vote||' '||to_char(uv.vote_date, 'YYYY-MM-DD')
          FROM ulist_vns uv
          JOIN users u ON u.id = uv.uid
          JOIN vn v ON v.id = uv.vid
         WHERE NOT v.hidden
           AND NOT u.ign_votes
           AND uv.vote IS NOT NULL
           AND EXISTS(SELECT 1 FROM ulist_vns_labels uvl JOIN ulist_labels ul ON ul.id = uvl.lbl AND ul.uid = uvl.uid WHERE uv.uid = uvl.uid AND uv.vid = uvl.vid AND NOT ul.private)
         ORDER BY uv.vid, uv.uid
       ) TO STDOUT
    });
    my $v;
    print $F $v while($db->pg_getcopydata($v) >= 0);
}


sub export_tags {
    my $dest = shift;
    require JSON::XS;
    require PerlIO::gzip;

    my $lst = $db->selectall_arrayref(q{
        SELECT vndbid_num(id) AS id, name, description, searchable, applicable, c_items AS vns, cat, alias,
          (SELECT string_agg(vndbid_num(parent)::text, ',' ORDER BY main desc, parent) FROM tags_parents tp WHERE tp.id = t.id) AS parents
        FROM tags t WHERE NOT hidden ORDER BY id
    }, { Slice => {} });
    for(@$lst) {
      $_->{id} *= 1;
      $_->{meta} = !$_->{searchable} ? JSON::XS::true() : JSON::XS::false(); # For backwards compat
      $_->{searchable} = $_->{searchable} ? JSON::XS::true() : JSON::XS::false();
      $_->{applicable} = $_->{applicable} ? JSON::XS::true() : JSON::XS::false();
      $_->{vns} *= 1;
      $_->{aliases} = [ split /\n/, delete $_->{alias} ];
      $_->{parents} = [ map $_*1, split /,/, ($_->{parents}||'') ];
    }

    open my $F, '>:gzip:utf8', $dest;
    print $F JSON::XS->new->canonical->encode($lst);
}


sub export_traits {
    my $dest = shift;
    require JSON::XS;
    require PerlIO::gzip;

    my $lst = $db->selectall_arrayref(q{
        SELECT vndbid_num(id) AS id, name, alias AS aliases, description, searchable, applicable, c_items AS chars,
               (SELECT string_agg(vndbid_num(parent)::text, ',' ORDER BY main desc, parent) FROM traits_parents tp WHERE tp.id = t.id) AS parents
        FROM traits t WHERE NOT hidden ORDER BY id
    }, { Slice => {} });
    for(@$lst) {
      $_->{id} *= 1;
      $_->{meta} = $_->{searchable} ? JSON::XS::true() : JSON::XS::false(); # For backwards compat
      $_->{searchable} = $_->{searchable} ? JSON::XS::true() : JSON::XS::false();
      $_->{applicable} = $_->{applicable} ? JSON::XS::true() : JSON::XS::false();
      $_->{chars} *= 1;
      $_->{aliases} = [ split /\r?\n/, ($_->{aliases}||'') ];
      $_->{parents} = [ map $_*1, split /,/, ($_->{parents}||'') ];
    }

    open my $F, '>:gzip:utf8', $dest;
    print $F JSON::XS->new->canonical->encode($lst);
}


if($ARGV[0] && $ARGV[0] eq 'export-db' && $ARGV[1]) {
    export_db $ARGV[1];
} elsif($ARGV[0] && $ARGV[0] eq 'export-img' && $ARGV[1]) {
    export_img $ARGV[1];
} elsif($ARGV[0] && $ARGV[0] eq 'export-data' && $ARGV[1]) {
    export_data $ARGV[1];
} elsif($ARGV[0] && $ARGV[0] eq 'export-votes' && $ARGV[1]) {
    export_votes $ARGV[1];
} elsif($ARGV[0] && $ARGV[0] eq 'export-tags' && $ARGV[1]) {
    export_tags $ARGV[1];
} elsif($ARGV[0] && $ARGV[0] eq 'export-traits' && $ARGV[1]) {
    export_traits $ARGV[1];
} else {
    print $HELP;
}