1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
#
# Multi::Wikidata - Fetches information from wikidata
#
package Multi::Wikidata;
use strict;
use warnings;
use Multi::Core;
use JSON::XS 'decode_json';
use AnyEvent::HTTP;
my %C = (
check_timeout => 30, # Check & fetch for entries to update every 30 seconds
fetch_number => 50, # Number of entries to fetch in a single API call
fetch_interval => 24*3600, # Minimum delay between updates of a single entry
api_endpoint => 'https://www.wikidata.org/w/api.php',
);
sub run {
shift;
$C{ua} = "VNDB.org Crawler (Multi v$VNDB::S{version}; contact\@vndb.org)";
%C = (%C, @_);
push_watcher schedule 0, $C{check_timeout}, \&fetch;
}
sub fetch {
pg_cmd q{
SELECT id
FROM wikidata
WHERE id IN(
SELECT l_wikidata FROM producers WHERE l_wp IS NOT NULL AND NOT hidden
UNION SELECT l_wikidata FROM staff WHERE l_wp IS NOT NULL AND NOT hidden
UNION SELECT l_wikidata FROM vn WHERE l_wp IS NOT NULL AND NOT hidden)
AND (lastfetch IS NULL OR lastfetch < now()-($1 * '1 second'::interval))
ORDER BY lastfetch NULLS FIRST
LIMIT $2
}, [ $C{fetch_interval}, $C{fetch_number} ], sub {
my($res) = @_;
return if pg_expect $res, 1 or !$res->nRows;
my @ids = map $res->value($_,0), 0..($res->nRows-1);
my $ids_q = join '|', map "Q$_", @ids;
my $ts = AE::now;
http_get "$C{api_endpoint}?action=wbgetentities&format=json&props=sitelinks|claims&sitefilter=enwiki|jawiki&ids=$ids_q",
'User-Agent' => $C{ua},
timeout => 60,
sub { process(\@ids, $ids_q, $ts, @_) }
}
}
my %props = qw/
P856 website
P3180 vndb
P1933 mobygames
P4773 mobygames_company
P4769 gamefaqs_game
P6182 gamefaqs_company
P5646 anidb_anime
P5649 anidb_person
P1985 ann_anime
P1984 ann_manga
P434 musicbrainz_artist
P2002 twitter
P5659 vgmdb_product
P3435 vgmdb_artist
P1953 discogs_artist
P7013 acdb_char
P7017 acdb_source
P6717 indiedb_game
P2816 howlongtobeat
/;
sub process {
my($ids, $ids_q, $ts, $body, $hdr) = @_;
# Just update lastfetch even if we have some kind of error further on. This
# makes sure we at least don't get into an error loop on the same entry.
my $n = 1;
my $ids_where = join ',', map sprintf('$%d', $n++), @$ids;
pg_cmd "UPDATE wikidata SET lastfetch = NOW() WHERE id IN($ids_where)", $ids;
return AE::log warn => "$ids_q Http error: $hdr->{Status} $hdr->{Reason}"
if $hdr->{Status} !~ /^2/;
my $data = eval { decode_json $body };
return AE::log warn => "$ids_q Error decoding JSON: $@" if !$data;
save($_, $ts, $data->{entities}{"Q$_"}) for @$ids;
}
sub save {
my($id, $ts, $data) = @_;
my @set = ( 'enwiki = $2', 'jawiki = $3');
my @val = ($id, $data->{sitelinks}{enwiki}{title}, $data->{sitelinks}{jawiki}{title});
for my $p (sort keys %props) {
my $v = $data->{claims}{$p};
AE::log warn => "Q$id has multiple properties for '$p', storing the first" if $v && @$v > 1;
$v = $v->[0]{mainsnak}{datavalue}{value};
if(ref $v) {
AE::log warn => "Q$id has a non-scalar value for '$p'";
$v = undef;
}
push @val, $v;
push @set, sprintf '%s = $%d', $props{$p}, scalar @val;
}
my $set = join ', ', @set;
pg_cmd "UPDATE wikidata SET $set WHERE id = \$1", \@val;
AE::log info => sprintf "Q%d in %.1fs with %d vals", $id, AE::now()-$ts, -1+scalar grep defined($_), @val;
}
1;
|