author    Yorhel <git@yorhel.nl>  2019-08-09 16:53:54 +0200
committer Yorhel <git@yorhel.nl>  2019-08-09 16:53:56 +0200
commit    414daab355c8392dc21d5623cd458426d221fedf (patch)
tree      14853fa252b5130e24dcad779505f8b86b9c909b /lib/Multi
parent    76da9bdcf9a6492888f02457a9c7a23454da1092 (diff)
Replace Wikipedia links with Wikidata IDs
+ Automatically fetch other links from Wikidata
Diffstat (limited to 'lib/Multi')
-rw-r--r--  lib/Multi/Wikidata.pm  126
1 file changed, 126 insertions, 0 deletions
diff --git a/lib/Multi/Wikidata.pm b/lib/Multi/Wikidata.pm
new file mode 100644
index 00000000..fea9dbb1
--- /dev/null
+++ b/lib/Multi/Wikidata.pm
@@ -0,0 +1,126 @@
+
+#
+# Multi::Wikidata - Fetches information from Wikidata
+#
+
+package Multi::Wikidata;
+
+use strict;
+use warnings;
+use Multi::Core;
+use JSON::XS 'decode_json';
+use AnyEvent::HTTP;
+
+
+my %C = (
+  check_timeout  => 30,       # Check & fetch for entries to update every 30 seconds
+  fetch_number   => 50,       # Number of entries to fetch in a single API call
+  fetch_interval => 24*3600,  # Minimum delay between updates of a single entry
+  api_endpoint   => 'https://www.wikidata.org/w/api.php',
+);
+
+
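+# Entry point: set the default user-agent, merge any caller-supplied options
+# into %C and schedule the periodic fetch.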
+sub run {
+  shift;
+  $C{ua} = "VNDB.org Crawler (Multi v$VNDB::S{version}; contact\@vndb.org)";
+  %C = (%C, @_);
+
+  push_watcher schedule 0, $C{check_timeout}, \&fetch;
+}
+
+
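+# Grab a batch of wikidata IDs that are referenced by a non-hidden producer,
+# staff or VN entry and haven't been (re)fetched within fetch_interval, then
+# request their sitelinks and claims from the Wikidata API in a single call.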
+sub fetch {
+  pg_cmd q{
+    SELECT id
+      FROM wikidata
+     WHERE id IN(
+                 SELECT l_wikidata FROM producers WHERE l_wikidata IS NOT NULL AND NOT hidden
+           UNION SELECT l_wikidata FROM staff     WHERE l_wikidata IS NOT NULL AND NOT hidden
+           UNION SELECT l_wikidata FROM vn        WHERE l_wikidata IS NOT NULL AND NOT hidden)
+       AND (lastfetch IS NULL OR lastfetch < now()-($1 * '1 second'::interval))
+     ORDER BY lastfetch NULLS FIRST
+     LIMIT $2
+  }, [ $C{fetch_interval}, $C{fetch_number} ], sub {
+    my($res) = @_;
+    return if pg_expect $res, 1 or !$res->nRows;
+    my @ids = map $res->value($_,0), 0..($res->nRows-1);
+
+    my $ids_q = join '|', map "Q$_", @ids;
+    my $ts = AE::now;
+    http_get "$C{api_endpoint}?action=wbgetentities&format=json&props=sitelinks|claims&sitefilter=enwiki|jawiki&ids=$ids_q",
+      headers => { 'User-Agent' => $C{ua} },
+      timeout => 60,
+      sub { process(\@ids, $ids_q, $ts, @_) };
+  }
+}
+
+
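+# Wikidata property IDs and the wikidata table columns their values are stored in.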
+my %props = qw/
+  P856   website
+  P3180  vndb
+  P1933  mobygames
+  P4773  mobygames_company
+  P4769  gamefaqs_game
+  P6182  gamefaqs_company
+  P5646  anidb_anime
+  P5649  anidb_person
+  P1985  ann_anime
+  P1984  ann_manga
+  P434   musicbrainz_artist
+  P2002  twitter
+  P5659  vgmdb_product
+  P3435  vgmdb_artist
+  P1953  discogs_artist
+  P7013  acdb_char
+  P7017  acdb_source
+  P6717  indiedb_game
+  P2816  howlongtobeat
+/;
+
+
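+# Handle the API response for a batch of IDs: bump lastfetch for the whole
+# batch, bail out on HTTP or JSON errors, and hand each returned entity to save().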
+sub process {
+  my($ids, $ids_q, $ts, $body, $hdr) = @_;
+
+  # Just update lastfetch even if we have some kind of error further on. This
+  # makes sure we at least don't get into an error loop on the same entry.
+  my $n = 1;
+  my $ids_where = join ',', map sprintf('$%d', $n++), @$ids;
+  pg_cmd "UPDATE wikidata SET lastfetch = NOW() WHERE id IN($ids_where)", $ids;
+
+  return AE::log warn => "$ids_q HTTP error: $hdr->{Status} $hdr->{Reason}"
+    if $hdr->{Status} !~ /^2/;
+
+  my $data = eval { decode_json $body };
+  return AE::log warn => "$ids_q Error decoding JSON: $@" if !$data;
+
+  save($_, $ts, $data->{entities}{"Q$_"}) for @$ids;
+}
+
+
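+# Store the sitelinks and recognised claims of a single entity. $data is the
+# decoded entity object from the wbgetentities response; its shape is roughly
+# (abbreviated):
+#   { sitelinks => { enwiki => { title => '..' }, jawiki => { title => '..' } },
+#     claims    => { P3180  => [ { mainsnak => { datavalue => { value => '..' } } }, .. ], .. } }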
+sub save {
+  my($id, $ts, $data) = @_;
+
+  my @set = ('enwiki = $2', 'jawiki = $3');
+  my @val = ($id, $data->{sitelinks}{enwiki}{title}, $data->{sitelinks}{jawiki}{title});
+
+  for my $p (sort keys %props) {
+    my $v = $data->{claims}{$p};
+    AE::log warn => "Q$id has multiple values for '$p', storing the first" if $v && @$v > 1;
+
+    $v = $v->[0]{mainsnak}{datavalue}{value};
+    if(ref $v) {
+      AE::log warn => "Q$id has a non-scalar value for '$p'";
+      $v = undef;
+    }
+
+    push @val, $v;
+    push @set, sprintf '%s = $%d', $props{$p}, scalar @val;
+  }
+
+  my $set = join ', ', @set;
+
+  pg_cmd "UPDATE wikidata SET $set WHERE id = \$1", \@val;
+  AE::log info => sprintf "Q%d in %.1fs with %d vals", $id, AE::now()-$ts, -1+scalar grep defined($_), @val;
+}
+
+1;