diff options
author | Yorhel <git@yorhel.nl> | 2021-07-24 10:23:18 +0200 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2021-07-24 10:23:21 +0200 |
commit | 302f3ac336647da12192ed2f9c435f00b589a02f (patch) | |
tree | c6b61c9ce83f61eddb4b8f70c2a6b665739cf09a /lib | |
parent | 408550a8a9d6f6b9d2ba09d0a2efca855ec65826 (diff) |
Minor improvements to VN search normalization
× -> x from https://vndb.org/t8242.636
Remove minimum character length for ascii (has the potential to slow
things down, but should give more relevant results)
Remove old o->wo normalization, as the database is more consistent now.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/VNDB/Func.pm | 13 |
1 files changed, 4 insertions, 9 deletions
```diff
diff --git a/lib/VNDB/Func.pm b/lib/VNDB/Func.pm
index 7dfd011c..68b76f36 100644
--- a/lib/VNDB/Func.pm
+++ b/lib/VNDB/Func.pm
@@ -104,9 +104,8 @@ sub normalize {
     tr/\r\n\t,_\-.~~〜∼ー῀:[]()%+!?#$"'`♥★☆♪†「」『』【】・‟“”‛’‘‚„«‹»›//d;
     tr/@/a/;
     tr/ı/i/; # Turkish lowercase i
+    tr/×/x/;
     s/&/and/;
-    # Consider wo and o the same thing (when used as separate word)
-    s/(?:^| )o(?:$| )/wo/g;
     # Remove spaces. We're doing substring search, so let it cross word boundary to find more stuff
     tr/ //d;
     # remove commonly used release titles ("x Edition" and "x Version")
@@ -130,20 +129,16 @@ sub normalize {
 # normalizes each title and returns a concatenated string of unique titles
 sub normalize_titles {
     my %t = map +(normalize($_), 1), @_;
-    return join ' ', grep $_, keys %t;
+    return join ' ', grep length $_, sort keys %t;
 }
 
 
 sub normalize_query {
     my $q = shift;
-    # Consider wo and o the same thing (when used as separate word). Has to be
-    # done here (in addition to normalize()) to make it work in combination with
-    # double quote search.
-    $q =~ s/(^| )o($| )/$1wo$2/ig;
     # remove spaces within quotes, so that it's considered as one search word
     $q =~ s/"([^"]+)"/(my $s=$1)=~y{ }{}d;$s/ge;
-    # split into search words, normalize, and remove too short words
-    return map length($_)>=(/^[\x01-\x7F]+$/?2:1) ? quotemeta($_) : (), map normalize($_), split / /, $q;
+    # split into search words and normalize
+    return map quotemeta($_), grep length $_, map normalize($_), split / /, $q;
 }
 
 
```