author     Yorhel <git@yorhel.nl>  2019-08-09 16:53:54 +0200
committer  Yorhel <git@yorhel.nl>  2019-08-09 16:53:56 +0200
commit     414daab355c8392dc21d5623cd458426d221fedf (patch)
tree       14853fa252b5130e24dcad779505f8b86b9c909b /lib/Multi
parent     76da9bdcf9a6492888f02457a9c7a23454da1092 (diff)
Replace Wikipedia links with Wikidata IDs
+ Automatically fetch other links from Wikidata
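
For context: the new Multi::Wikidata module polls the wbgetentities endpoint of the Wikidata API and stores sitelinks plus selected claims. Below is a minimal standalone sketch of the same request, using LWP::Simple in place of the module's AnyEvent::HTTP; the entity ID Q4805 is a placeholder, not taken from this commit.

  # Sketch only: a synchronous one-off fetch, assuming LWP::Simple and
  # JSON::XS are installed. The module itself batches up to 50 IDs per
  # call and runs asynchronously inside Multi.
  use strict;
  use warnings;
  use LWP::Simple 'get';
  use JSON::XS 'decode_json';

  my $id  = 'Q4805';  # placeholder Wikidata ID
  my $url = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json'
          . "&props=sitelinks|claims&sitefilter=enwiki|jawiki&ids=$id";
  my $res = decode_json(get($url) // die "request failed\n");

  # These are the same paths the module's save() walks:
  print $res->{entities}{$id}{sitelinks}{enwiki}{title} // '(no enwiki sitelink)', "\n";
  print $res->{entities}{$id}{claims}{P2002}[0]{mainsnak}{datavalue}{value} // '(no P2002/twitter claim)', "\n";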
Diffstat (limited to 'lib/Multi')
-rw-r--r--  lib/Multi/Wikidata.pm  126
1 files changed, 126 insertions, 0 deletions
diff --git a/lib/Multi/Wikidata.pm b/lib/Multi/Wikidata.pm
new file mode 100644
index 00000000..fea9dbb1
--- /dev/null
+++ b/lib/Multi/Wikidata.pm
@@ -0,0 +1,126 @@
+
+#
+# Multi::Wikidata - Fetches information from wikidata
+#
+
+package Multi::Wikidata;
+
+use strict;
+use warnings;
+use Multi::Core;
+use JSON::XS 'decode_json';
+use AnyEvent::HTTP;
+
+
+my %C = (
+  check_timeout  => 30,      # Check & fetch for entries to update every 30 seconds
+  fetch_number   => 50,      # Number of entries to fetch in a single API call
+  fetch_interval => 24*3600, # Minimum delay between updates of a single entry
+  api_endpoint   => 'https://www.wikidata.org/w/api.php',
+);
+
+
+sub run {
+  shift;
+  $C{ua} = "VNDB.org Crawler (Multi v$VNDB::S{version}; contact\@vndb.org)";
+  %C = (%C, @_);
+
+  push_watcher schedule 0, $C{check_timeout}, \&fetch;
+}
+
+
+sub fetch {
+  pg_cmd q{
+    SELECT id
+      FROM wikidata
+     WHERE id IN(
+              SELECT l_wikidata FROM producers WHERE l_wp IS NOT NULL AND NOT hidden
+        UNION SELECT l_wikidata FROM staff     WHERE l_wp IS NOT NULL AND NOT hidden
+        UNION SELECT l_wikidata FROM vn        WHERE l_wp IS NOT NULL AND NOT hidden)
+       AND (lastfetch IS NULL OR lastfetch < now()-($1 * '1 second'::interval))
+     ORDER BY lastfetch NULLS FIRST
+     LIMIT $2
+  }, [ $C{fetch_interval}, $C{fetch_number} ], sub {
+    my($res) = @_;
+    return if pg_expect $res, 1 or !$res->nRows;
+    my @ids = map $res->value($_,0), 0..($res->nRows-1);
+
+    my $ids_q = join '|', map "Q$_", @ids;
+    my $ts = AE::now;
+    http_get "$C{api_endpoint}?action=wbgetentities&format=json&props=sitelinks|claims&sitefilter=enwiki|jawiki&ids=$ids_q",
+      headers => { 'User-Agent' => $C{ua} },
+      timeout => 60,
+      sub { process(\@ids, $ids_q, $ts, @_) }
+  }
+}
+
+
+my %props = qw/
+  P856   website
+  P3180  vndb
+  P1933  mobygames
+  P4773  mobygames_company
+  P4769  gamefaqs_game
+  P6182  gamefaqs_company
+  P5646  anidb_anime
+  P5649  anidb_person
+  P1985  ann_anime
+  P1984  ann_manga
+  P434   musicbrainz_artist
+  P2002  twitter
+  P5659  vgmdb_product
+  P3435  vgmdb_artist
+  P1953  discogs_artist
+  P7013  acdb_char
+  P7017  acdb_source
+  P6717  indiedb_game
+  P2816  howlongtobeat
+/;
+
+
+sub process {
+  my($ids, $ids_q, $ts, $body, $hdr) = @_;
+
+  # Just update lastfetch even if we have some kind of error further on. This
+  # makes sure we at least don't get into an error loop on the same entry.
+  my $n = 1;
+  my $ids_where = join ',', map sprintf('$%d', $n++), @$ids;
+  pg_cmd "UPDATE wikidata SET lastfetch = NOW() WHERE id IN($ids_where)", $ids;
+
+  return AE::log warn => "$ids_q Http error: $hdr->{Status} $hdr->{Reason}"
+    if $hdr->{Status} !~ /^2/;
+
+  my $data = eval { decode_json $body };
+  return AE::log warn => "$ids_q Error decoding JSON: $@" if !$data;
+
+  save($_, $ts, $data->{entities}{"Q$_"}) for @$ids;
}
+
+
+sub save {
+  my($id, $ts, $data) = @_;
+
+  my @set = ( 'enwiki = $2', 'jawiki = $3');
+  my @val = ($id, $data->{sitelinks}{enwiki}{title}, $data->{sitelinks}{jawiki}{title});
+
+  for my $p (sort keys %props) {
+    my $v = $data->{claims}{$p};
+    AE::log warn => "Q$id has multiple properties for '$p', storing the first" if $v && @$v > 1;
+
+    $v = $v->[0]{mainsnak}{datavalue}{value};
+    if(ref $v) {
+      AE::log warn => "Q$id has a non-scalar value for '$p'";
+      $v = undef;
+    }
+
+    push @val, $v;
+    push @set, sprintf '%s = $%d', $props{$p}, scalar @val;
+  }
+
+  my $set = join ', ', @set;
+
+  pg_cmd "UPDATE wikidata SET $set WHERE id = \$1", \@val;
+  AE::log info => sprintf "Q%d in %.1fs with %d vals", $id, AE::now()-$ts, -1+scalar grep defined($_), @val;
+}
+
+1;
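
A note on save(), sketched here because the placeholder numbering is easy to misread: the loop walks every key of %props in sorted order and pushes an undef for claims that are absent, so the generated UPDATE always sets all mapped columns and thereby also clears values that have been removed from Wikidata. Assuming the column names in %props all exist in the wikidata table, the statement always has this shape:

  UPDATE wikidata SET enwiki = $2, jawiki = $3, mobygames = $4, discogs_artist = $5, ..., website = $22 WHERE id = $1

with the entity ID bound as $1 and the remaining 21 parameters taken from @val in the same order.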