diff options
author | Yorhel <git@yorhel.nl> | 2021-11-19 16:25:16 +0100 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2021-11-19 16:25:16 +0100 |
commit | 0ea2fb67930af01fe5f4484615e129a13bd8649b (patch) | |
tree | 67a209fb8841e343a4c7dc47424325c0f7e406ba | |
parent | 0a98d9df12cfc6ea73729b0b3feb3fee52c9f336 (diff) |
Use new search for VNs; remove search cache updating from Multi
-rw-r--r-- | lib/Multi/API.pm | 5 | ||||
-rw-r--r-- | lib/Multi/IRC.pm | 10 | ||||
-rw-r--r-- | lib/Multi/Maintenance.pm | 40 | ||||
-rw-r--r-- | lib/VNDB/Func.pm | 52 | ||||
-rw-r--r-- | lib/VNWeb/ULists/List.pm | 2 | ||||
-rw-r--r-- | lib/VNWeb/VN/Elm.pm | 13 | ||||
-rw-r--r-- | lib/VNWeb/VN/List.pm | 2 | ||||
-rw-r--r-- | sql/func.sql | 39 | ||||
-rw-r--r-- | sql/triggers.sql | 9 | ||||
-rwxr-xr-x | util/devdump.pl | 3 | ||||
-rw-r--r-- | util/updates/2021-11-19-vn-search.sql | 7 |
11 files changed, 52 insertions, 130 deletions
diff --git a/lib/Multi/API.pm b/lib/Multi/API.pm index 24a8a0ee..8ae8ceee 100644 --- a/lib/Multi/API.pm +++ b/lib/Multi/API.pm @@ -15,7 +15,7 @@ use POE::Filter::VNDBAPI 'encode_filters'; use Encode 'encode_utf8', 'decode_utf8'; use Crypt::URandom 'urandom'; use Crypt::ScryptKDF 'scrypt_raw';; -use VNDB::Func 'imgurl', 'normalize_query', 'norm_ip', 'resolution'; +use VNDB::Func 'imgurl', 'norm_ip', 'resolution'; use VNDB::Types; use VNDB::Config; use JSON::XS; @@ -583,8 +583,7 @@ my %GET_VN = ( [ stra => 'v.olang :op:(:value:)', {'=' => 'IN', '!=' => 'NOT IN'}, join => ',', process => \'lang' ], ], search => [ - [ str => '(:value:)', {'~',1}, split => \&normalize_query, - join => ' AND ', serialize => 'v.c_search LIKE :value:', process => \'like' ], + [ str => 'v.c_search LIKE ALL (search_query(:value:))', {'~',1} ], ], tags => [ [ int => 'v.id :op:(SELECT vid FROM tags_vn_inherit WHERE tag = :value:)', {'=' => 'IN', '!=' => 'NOT IN'}, process => \'g' ], diff --git a/lib/Multi/IRC.pm b/lib/Multi/IRC.pm index 4ef26ec1..9fbc298f 100644 --- a/lib/Multi/IRC.pm +++ b/lib/Multi/IRC.pm @@ -10,7 +10,6 @@ use warnings; use Multi::Core; use AnyEvent::IRC::Client; use AnyEvent::IRC::Util 'prefix_nick'; -use VNDB::Func 'normalize_query'; use VNDB::Config; use TUWF::Misc 'uri_escape'; use POSIX 'strftime'; @@ -366,18 +365,13 @@ vn => [ 0, 0, sub { my($nick, $chan, $q) = @_; return $irc->send_msg(PRIVMSG => $chan, 'You forgot the search query, dummy~~!') if !$q; - my @q = normalize_query($q); - return $irc->send_msg(PRIVMSG => $chan, - "Couldn't do anything with that search query, you might want to add quotes or use longer words.") if !@q; - - my $w = join ' AND ', map "c_search LIKE \$$_", 1..@q; pg_cmd qq{ SELECT id, title FROM vn - WHERE NOT hidden AND $w + WHERE NOT hidden AND c_search LIKE ALL (search_query($1)) ORDER BY title LIMIT 6 - }, [ map "%$_%", @q ], sub { + }, [ $q ], sub { my $res = shift; return if pg_expect $res, 1; return $irc->send_msg(PRIVMSG => $chan, 'No visual novels found.') if !$res->nRows; diff --git a/lib/Multi/Maintenance.pm b/lib/Multi/Maintenance.pm index 7c0bb1ac..6f8e1274 100644 --- a/lib/Multi/Maintenance.pm +++ b/lib/Multi/Maintenance.pm @@ -9,7 +9,6 @@ use strict; use warnings; use Multi::Core; use PerlIO::gzip; -use VNDB::Func 'normalize_titles'; use VNDB::Config; @@ -18,8 +17,6 @@ my $monthly; sub run { push_watcher schedule 7*3600+1800, 24*3600, \&daily; # 7:30 UTC, 30 minutes before the daily DB dumps are created - push_watcher schedule 0, 3600, \&vnsearch_check; - push_watcher pg->listen(vnsearch => on_notify => \&vnsearch_check); set_monthly(); } @@ -176,41 +173,4 @@ sub monthly { } - -# -# V N S E A R C H C A C H E -# - - -sub vnsearch_check { - pg_cmd 'SELECT id FROM vn WHERE c_search IS NULL LIMIT 1', undef, sub { - my $res = shift; - return if pg_expect $res, 1 or !$res->rows; - - my $id = $res->value(0,0); - pg_cmd q|SELECT title, original, alias FROM vn WHERE id = $1 - UNION SELECT r.title, r.original, NULL FROM releases r JOIN releases_vn rv ON rv.id = r.id WHERE rv.vid = $1 AND NOT r.hidden|, - [ $id ], sub { vnsearch_update($id, @_) }; - }; -} - - -sub vnsearch_update { # id, res, time - my($id, $res, $time) = @_; - return if pg_expect $res, 1; - - my $t = normalize_titles(grep length, map - +($_->{title}, $_->{original}, split /[\n,]/, $_->{alias}||''), - $res->rowsAsHashes - ); - - pg_cmd 'UPDATE vn SET c_search = $1 WHERE id = $2', [ $t, $id ], sub { - my($res, $t2) = @_; - return if pg_expect $res, 0; - AE::log info => sprintf 'Updated search cache for %s (%3dms SQL)', $id, ($time+$t2)*1000; - vnsearch_check; - }; -} - - 1; diff --git a/lib/VNDB/Func.pm b/lib/VNDB/Func.pm index 800c37a5..d5692a38 100644 --- a/lib/VNDB/Func.pm +++ b/lib/VNDB/Func.pm @@ -5,8 +5,6 @@ use warnings; use TUWF::Misc 'uri_escape'; use Exporter 'import'; use POSIX 'strftime'; -use Encode 'encode_utf8'; -use Unicode::Normalize 'NFKD', 'compose'; use Socket 'inet_pton', 'inet_ntop', 'AF_INET', 'AF_INET6'; use VNDB::Config; use VNDB::Types; @@ -17,7 +15,6 @@ our @EXPORT = ('bb_format', qw| shorten resolution gtintype - normalize_titles normalize_query imgsize norm_ip minage @@ -93,55 +90,6 @@ sub gtintype { } -# a rather aggressive normalization -sub normalize { - local $_ = lc shift; - use utf8; - # Remove combining markings, except for kana. - # This effectively removes all accents from the characters (e.g. é -> e) - $_ = compose(NFKD($_) =~ s/(?<=[^ア-ンあ-ん])\pM//rg); - # remove some characters that have no significance when searching - tr/\r\n\t,_\-.~~〜∼ー῀:[]()%+!?#$"'`♥★☆♪†「」『』【】・‟“”‛’‘‚„«‹»›//d; - tr/@/a/; - tr/ı/i/; # Turkish lowercase i - tr/×/x/; - s/&/and/; - # Remove spaces. We're doing substring search, so let it cross word boundary to find more stuff - tr/ //d; - # remove commonly used release titles ("x Edition" and "x Version") - # this saves some space and speeds up the search - s/(?: - first|firstpress|firstpresslimited|limited|regular|standard - |package|boxed|download|complete|popular - |lowprice|best|cheap|budget - |special|trial|allages|fullvoice - |cd|cdr|cdrom|dvdrom|dvd|dvdpack|dvdpg|windows - |初回限定|初回|限定|通常|廉価|パッケージ|ダウンロード - )(?:edition|version|版|生産)//xg; - # other common things - s/fandisk/fandisc/g; - s/sempai/senpai/g; - no utf8; - return $_; -} - - -# normalizes each title and returns a concatenated string of unique titles -sub normalize_titles { - my %t = map +(normalize($_), 1), @_; - return join ' ', grep length $_, sort keys %t; -} - - -sub normalize_query { - my $q = shift; - # remove spaces within quotes, so that it's considered as one search word - $q =~ s/"([^"]+)"/(my $s=$1)=~y{ }{}d;$s/ge; - # split into search words and normalize - return map quotemeta($_), grep length $_, map normalize($_), split / /, $q; -} - - # arguments: <image size>, <max dimensions> # returns the size of the thumbnail with the same aspect ratio as the full-size # image, but fits within the specified maximum dimensions diff --git a/lib/VNWeb/ULists/List.pm b/lib/VNWeb/ULists/List.pm index 3fe2ac6c..833b3556 100644 --- a/lib/VNWeb/ULists/List.pm +++ b/lib/VNWeb/ULists/List.pm @@ -269,7 +269,7 @@ sub listing_ { sql('uv.uid =', \$uid), !$own ? sql('uv.vid IN(SELECT vid FROM ulist_vns_labels WHERE uid =', \$uid, 'AND lbl IN(SELECT id FROM ulist_labels WHERE uid =', \$uid, 'AND NOT private))') : (), @where_vns ? sql_or(@where_vns) : (), - $opt->{q} ? map sql('v.c_search like', \"%$_%"), normalize_query $opt->{q} : (), + $opt->{q} ? sql 'v.c_search LIKE ALL (search_query(', \$opt->{q}, '))' : (), defined($opt->{ch}) && $opt->{ch} ? sql('LOWER(SUBSTR(v.title, 1, 1)) =', \$opt->{ch}) : (), defined($opt->{ch}) && !$opt->{ch} ? sql('(ASCII(v.title) <', \97, 'OR ASCII(v.title) >', \122, ') AND (ASCII(v.title) <', \65, 'OR ASCII(v.title) >', \90, ')') : (); diff --git a/lib/VNWeb/VN/Elm.pm b/lib/VNWeb/VN/Elm.pm index 3bf02d59..0f6ca5d4 100644 --- a/lib/VNWeb/VN/Elm.pm +++ b/lib/VNWeb/VN/Elm.pm @@ -13,15 +13,10 @@ elm_api VN => undef, { elm_VNResult tuwf->dbPagei({ results => $data->{hidden}?50:15, page => 1 }, 'SELECT v.id, v.title, v.original, v.hidden FROM (', - sql_join('UNION ALL', map { - my $qs = sql_like $_; - my @qs = normalize_query $_; - ( - /^$RE{vid}$/ ? sql('SELECT 1, id FROM vn WHERE id =', \"$+{id}") : (), - sql('SELECT 1+substr_score(lower(title),', \$qs, '), id FROM vn WHERE title ILIKE', \"$qs%"), - @qs ? (sql 'SELECT 10, id FROM vn WHERE', sql_and map sql('c_search ILIKE', \"%$_%"), @qs) : () - ) - } @q), + sql_join('UNION ALL', map +( + /^$RE{vid}$/ ? sql('SELECT 1, id FROM vn WHERE id =', \"$+{id}") : (), + sql('SELECT 1+substr_score(lower(title),', \sql_like($_), '), id FROM vn WHERE c_search LIKE ALL (search_query(', \"$_", '))'), + ), @q), ') x(prio, id) JOIN vn v ON v.id = x.id WHERE', sql_and($data->{hidden} ? () : 'NOT v.hidden'), ' diff --git a/lib/VNWeb/VN/List.pm b/lib/VNWeb/VN/List.pm index 4dcb1785..5c08727c 100644 --- a/lib/VNWeb/VN/List.pm +++ b/lib/VNWeb/VN/List.pm @@ -263,7 +263,7 @@ TUWF::get qr{/v(?:/(?<char>all|[a-z0]))?}, sub { my $where = sql_and 'NOT v.hidden', $opt->{f}->sql_where(), - $opt->{q} ? map sql('v.c_search LIKE', \"%$_%"), normalize_query $opt->{q} : (), + $opt->{q} ? sql 'v.c_search LIKE ALL (search_query(', \$opt->{q}, '))' : (), defined($opt->{ch}) && $opt->{ch} ? sql('LOWER(SUBSTR(v.title, 1, 1)) =', \$opt->{ch}) : (), defined($opt->{ch}) && !$opt->{ch} ? sql('(ASCII(v.title) <', \97, 'OR ASCII(v.title) >', \122, ') AND (ASCII(v.title) <', \65, 'OR ASCII(v.title) >', \90, ')') : (); diff --git a/sql/func.sql b/sql/func.sql index 3df59b35..179df943 100644 --- a/sql/func.sql +++ b/sql/func.sql @@ -69,6 +69,35 @@ CREATE OR REPLACE FUNCTION search_gen(terms text[]) RETURNS text AS $$ $$ LANGUAGE SQL IMMUTABLE; +CREATE OR REPLACE FUNCTION search_gen_vn(vnid vndbid) RETURNS text AS $$ + SELECT coalesce(string_agg(t, ' '), '') FROM ( + SELECT t FROM ( + SELECT search_norm_term(title) FROM vn WHERE id = vnid + UNION ALL SELECT search_norm_term(original) FROM vn WHERE vnid = id + UNION ALL SELECT search_norm_term(a) FROM vn, regexp_split_to_table(alias, E'\n') a(a) WHERE vnid = id + -- Remove the various editions/version strings from release titles, + -- this reduces the index size and makes VN search more relevant. + -- People looking for editions should be using the release search. + -- (This regex is rather incomplete, sadly) + UNION ALL SELECT regexp_replace(search_norm_term(t), '(?:体験|ダウンロド|dvdpg|(?: + first|firstpress|firstpresslimited|limited|regular|standard + |package|boxed|download|complete|popular|premium|deluxe|collectors? + |lowprice|best|thebest|cheap|budget|reprint|bundle|renewal + |special|trial|allages|fullvoice|web|demo|fulldemo + |cd|cdr|cdrom|dvdrom|dvd|dvdpack|windows|windows7|windows7support|windows10 + |初回限定|初回|限定|通常|廉価|豪華|パッケージ|ダウンロード|ベスト|復刻|新装|7対応 + )?(?:edition|version|thebest|pack|package|版|生産))+$', '', 'xg') + FROM ( + SELECT title FROM releases r JOIN releases_vn rv ON rv.id = r.id WHERE NOT r.hidden AND rv.vid = vnid + UNION ALL + SELECT original FROM releases r JOIN releases_vn rv ON rv.id = r.id WHERE NOT r.hidden AND rv.vid = vnid + ) r(t) + ) x(t) WHERE t IS NOT NULL AND t <> '' GROUP BY t ORDER BY t + ) x(t); +$$ LANGUAGE SQL; + + + -- Split a search query into LIKE patterns. -- Supports double quoting for adjacent terms. -- e.g. 'SEARCH que.ry "word here"' -> '{%search%,%query%,%wordhere%}' @@ -507,7 +536,7 @@ DECLARE BEGIN SELECT id INTO xoldchid FROM changes WHERE itemid = nitemid AND rev = nrev-1; - -- Set c_search to NULL and notify when + -- Update c_search when -- 1. A new VN entry is created -- 2. The vn title/original/alias has changed IF vndbid_type(nitemid) = 'v' THEN @@ -516,12 +545,11 @@ BEGIN -- 2. EXISTS(SELECT 1 FROM vn_hist v1, vn_hist v2 WHERE (v2.title <> v1.title OR v2.original <> v1.original OR v2.alias <> v1.alias) AND v1.chid = xoldchid AND v2.chid = nchid) THEN - UPDATE vn SET c_search = NULL WHERE id = nitemid; - NOTIFY vnsearch; + UPDATE vn SET c_search = search_gen_vn(id) WHERE id = nitemid; END IF; END IF; - -- Set related vn.c_search columns to NULL and notify when + -- Update vn.c_search when -- 1. A new release is created -- 2. A release has been hidden or unhidden -- 3. The release title/original has changed @@ -537,8 +565,7 @@ BEGIN EXISTS(SELECT vid FROM releases_vn_hist WHERE chid = xoldchid EXCEPT SELECT vid FROM releases_vn_hist WHERE chid = nchid) OR EXISTS(SELECT vid FROM releases_vn_hist WHERE chid = nchid EXCEPT SELECT vid FROM releases_vn_hist WHERE chid = xoldchid) THEN - UPDATE vn SET c_search = NULL WHERE id IN(SELECT vid FROM releases_vn_hist WHERE chid IN(nchid, xoldchid)); - NOTIFY vnsearch; + UPDATE vn SET c_search = search_gen_vn(id) WHERE id IN(SELECT vid FROM releases_vn_hist WHERE chid IN(nchid, xoldchid)); END IF; END IF; diff --git a/sql/triggers.sql b/sql/triggers.sql index 8c2a8643..4a201c8a 100644 --- a/sql/triggers.sql +++ b/sql/triggers.sql @@ -203,15 +203,6 @@ CREATE TRIGGER insert_notify AFTER INSERT ON reviews FOR EACH STATEMENT EX --- Send a vnsearch notification when the c_search column is set to NULL. - -CREATE OR REPLACE FUNCTION vn_vnsearch_notify() RETURNS trigger AS 'BEGIN NOTIFY vnsearch; RETURN NULL; END;' LANGUAGE plpgsql; - -CREATE TRIGGER vn_vnsearch_notify AFTER UPDATE ON vn FOR EACH ROW WHEN (OLD.c_search IS NOT NULL AND NEW.c_search IS NULL) EXECUTE PROCEDURE vn_vnsearch_notify(); - - - - -- Create notifications for new posts. CREATE OR REPLACE FUNCTION notify_post() RETURNS trigger AS $$ diff --git a/util/devdump.pl b/util/devdump.pl index 387e0bb9..2a20544c 100755 --- a/util/devdump.pl +++ b/util/devdump.pl @@ -84,7 +84,7 @@ sub copy_entry { for(@$tables) { my $add = ''; $add = " AND vid IN($vids)" if /^releases_vn/ || /^vn_relations/ || /^chars_vns/; - copy $_ => "SELECT * FROM $_ WHERE id IN($ids) $add", $_ eq 'releases' ? { c_search => 'del' } : (); + copy $_ => "SELECT * FROM $_ WHERE id IN($ids) $add", { c_search => 'del' }; copy "${_}_hist" => "SELECT x.* FROM ${_}_hist x JOIN changes c ON c.id = x.chid WHERE c.itemid IN($ids) $add"; } } @@ -182,6 +182,7 @@ sub copy_entry { print "SELECT update_vnvotestats();\n"; print "SELECT update_users_ulist_stats(NULL);\n"; print "SELECT update_images_cache(NULL);\n"; + print "UPDATE vn SET c_search = search_gen_vn(id);\n"; print "UPDATE users u SET c_tags = (SELECT COUNT(*) FROM tags_vn v WHERE v.uid = u.id);\n"; print "UPDATE users u SET c_changes = (SELECT COUNT(*) FROM changes c WHERE c.requester = u.id);\n"; diff --git a/util/updates/2021-11-19-vn-search.sql b/util/updates/2021-11-19-vn-search.sql new file mode 100644 index 00000000..56ce6661 --- /dev/null +++ b/util/updates/2021-11-19-vn-search.sql @@ -0,0 +1,7 @@ +DROP TRIGGER vn_vnsearch_notify ON vn; +DROP FUNCTION vn_vnsearch_notify(); +\i sql/func.sql + +-- Warning: slow +\timing +UPDATE vn SET c_search = search_gen_vn(id); |