Minor improvements to VN search normalization

Normalize × -> x (from https://vndb.org/t8242.636). Remove the minimum character length for ASCII search words (has the potential to slow things down, but should give more relevant results). Remove the old o->wo normalization, as the database is more consistent now.
author: Yorhel <git@yorhel.nl> 2021-07-24 10:23:18 +0200
committer: Yorhel <git@yorhel.nl> 2021-07-24 10:23:21 +0200
commit: 302f3ac336647da12192ed2f9c435f00b589a02f (patch)
tree: c6b61c9ce83f61eddb4b8f70c2a6b665739cf09a /lib
parent: 408550a8a9d6f6b9d2ba09d0a2efca855ec65826 (diff)
1 files changed, 4 insertions, 9 deletions
diff --git a/lib/VNDB/Func.pm b/lib/VNDB/Func.pm
index 7dfd011c..68b76f36 100644
--- a/lib/VNDB/Func.pm
+++ b/lib/VNDB/Func.pm
@@ -104,9 +104,8 @@ sub normalize {
   tr/\r\n\t,_\-.~～〜∼ー῀:[]()%+!?#$"'`♥★☆♪†「」『』【】・‟“”‛’‘‚„«‹»›//d;
   tr/@/a/;
   tr/ı/i/; # Turkish lowercase i
+  tr/×/x/;
   s/&/and/;
-  # Consider wo and o the same thing (when used as separate word)
-  s/(?:^| )o(?:$| )/wo/g;
   # Remove spaces. We're doing substring search, so let it cross word boundary to find more stuff
   tr/ //d;
   # remove commonly used release titles ("x Edition" and "x Version")
@@ -130,20 +129,16 @@ sub normalize {
 # normalizes each title and returns a concatenated string of unique titles
 sub normalize_titles {
   my %t = map +(normalize($_), 1), @_;
-  return join ' ', grep $_, keys %t;
+  return join ' ', grep length $_, sort keys %t;
 }
 
 
 sub normalize_query {
   my $q = shift;
-  # Consider wo and o the same thing (when used as separate word). Has to be
-  # done here (in addition to normalize()) to make it work in combination with
-  # double quote search.
-  $q =~ s/(^| )o($| )/$1wo$2/ig;
   # remove spaces within quotes, so that it's considered as one search word
   $q =~ s/"([^"]+)"/(my $s=$1)=~y{ }{}d;$s/ge;
-  # split into search words, normalize, and remove too short words
-  return map length($_)>=(/^[\x01-\x7F]+$/?2:1) ? quotemeta($_) : (), map normalize($_), split / /, $q;
+  # split into search words and normalize
+  return map quotemeta($_), grep length $_, map normalize($_), split / /, $q;
 }
author	Yorhel <git@yorhel.nl>	2021-07-24 10:23:18 +0200
committer	Yorhel <git@yorhel.nl>	2021-07-24 10:23:21 +0200
commit	302f3ac336647da12192ed2f9c435f00b589a02f (patch)
tree	c6b61c9ce83f61eddb4b8f70c2a6b665739cf09a /lib
parent	408550a8a9d6f6b9d2ba09d0a2efca855ec65826 (diff)