summaryrefslogtreecommitdiff
path: root/lib/Multi/Wikidata.pm
blob: fea9dbb14632f5608821587f46ea63cd36458555 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126

#
#  Multi::Wikidata  -  Fetches information from wikidata
#

package Multi::Wikidata;

use strict;
use warnings;
use Multi::Core;
use JSON::XS 'decode_json';
use AnyEvent::HTTP;


my %C = (
  check_timeout  => 30, # Check & fetch for entries to update every 30 seconds
  fetch_number   => 50, # Number of entries to fetch in a single API call
  fetch_interval => 24*3600, # Minimum delay between updates of a single entry
  api_endpoint => 'https://www.wikidata.org/w/api.php',
);


sub run {
  shift;
  $C{ua} = "VNDB.org Crawler (Multi v$VNDB::S{version}; contact\@vndb.org)";
  %C = (%C, @_);

  push_watcher schedule 0, $C{check_timeout}, \&fetch;
}


sub fetch {
  pg_cmd q{
    SELECT id
      FROM wikidata
     WHERE id IN(
              SELECT l_wikidata FROM producers WHERE l_wp IS NOT NULL AND NOT hidden
        UNION SELECT l_wikidata FROM staff     WHERE l_wp IS NOT NULL AND NOT hidden
        UNION SELECT l_wikidata FROM vn        WHERE l_wp IS NOT NULL AND NOT hidden)
       AND (lastfetch IS NULL OR lastfetch < now()-($1 * '1 second'::interval))
     ORDER BY lastfetch NULLS FIRST
     LIMIT $2
  }, [ $C{fetch_interval}, $C{fetch_number} ], sub {
    my($res) = @_;
    return if pg_expect $res, 1 or !$res->nRows;
    my @ids = map $res->value($_,0), 0..($res->nRows-1);

    my $ids_q = join '|', map "Q$_", @ids;
    my $ts = AE::now;
    http_get "$C{api_endpoint}?action=wbgetentities&format=json&props=sitelinks|claims&sitefilter=enwiki|jawiki&ids=$ids_q",
      'User-Agent' => $C{ua},
      timeout => 60,
      sub { process(\@ids, $ids_q, $ts, @_) }
  }
}


my %props = qw/
  P856  website
  P3180 vndb
  P1933 mobygames
  P4773 mobygames_company
  P4769 gamefaqs_game
  P6182 gamefaqs_company
  P5646 anidb_anime
  P5649 anidb_person
  P1985 ann_anime
  P1984 ann_manga
  P434  musicbrainz_artist
  P2002 twitter
  P5659 vgmdb_product
  P3435 vgmdb_artist
  P1953 discogs_artist
  P7013 acdb_char
  P7017 acdb_source
  P6717 indiedb_game
  P2816 howlongtobeat
/;


sub process {
  my($ids, $ids_q, $ts, $body, $hdr) = @_;

  # Just update lastfetch even if we have some kind of error further on. This
  # makes sure we at least don't get into an error loop on the same entry.
  my $n = 1;
  my $ids_where = join ',', map sprintf('$%d', $n++), @$ids;
  pg_cmd "UPDATE wikidata SET lastfetch = NOW() WHERE id IN($ids_where)", $ids;

  return AE::log warn => "$ids_q Http error: $hdr->{Status} $hdr->{Reason}"
    if $hdr->{Status} !~ /^2/;

  my $data = eval { decode_json $body };
  return AE::log warn => "$ids_q Error decoding JSON: $@" if !$data;

  save($_, $ts, $data->{entities}{"Q$_"}) for @$ids;
}


sub save {
  my($id, $ts, $data) = @_;

  my @set = (     'enwiki = $2',                     'jawiki = $3');
  my @val = ($id, $data->{sitelinks}{enwiki}{title}, $data->{sitelinks}{jawiki}{title});

  for my $p (sort keys %props) {
    my $v = $data->{claims}{$p};
    AE::log warn => "Q$id has multiple properties for '$p', storing the first" if $v && @$v > 1;

    $v = $v->[0]{mainsnak}{datavalue}{value};
    if(ref $v) {
      AE::log warn => "Q$id has a non-scalar value for '$p'";
      $v = undef;
    }

    push @val, $v;
    push @set, sprintf '%s = $%d', $props{$p}, scalar @val;
  }

  my $set = join ', ', @set;

  pg_cmd "UPDATE wikidata SET $set WHERE id = \$1", \@val;
  AE::log info => sprintf "Q%d in %.1fs with %d vals", $id, AE::now()-$ts, -1+scalar grep defined($_), @val;
}

1;