summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
author: Yorhel <git@yorhel.nl> 2021-07-24 10:23:18 +0200
committer: Yorhel <git@yorhel.nl> 2021-07-24 10:23:21 +0200
commit: 302f3ac336647da12192ed2f9c435f00b589a02f (patch)
tree: c6b61c9ce83f61eddb4b8f70c2a6b665739cf09a /lib
parent: 408550a8a9d6f6b9d2ba09d0a2efca855ec65826 (diff)
Minor improvements to VN search normalization
× -> x from https://vndb.org/t8242.636
Remove minimum character length for ascii (has the potential to slow things down, but should give more relevant results).
Remove old o->wo normalization, as the database is more consistent now.
Diffstat (limited to 'lib')
-rw-r--r-- lib/VNDB/Func.pm 13
1 files changed, 4 insertions, 9 deletions
diff --git a/lib/VNDB/Func.pm b/lib/VNDB/Func.pm
index 7dfd011c..68b76f36 100644
--- a/lib/VNDB/Func.pm
+++ b/lib/VNDB/Func.pm
@@ -104,9 +104,8 @@ sub normalize {
tr/\r\n\t,_\-.~~〜∼ー῀:[]()%+!?#$"'`♥★☆♪†「」『』【】・‟“”‛’‘‚„«‹»›//d;
tr/@/a/;
tr/ı/i/; # Turkish lowercase i
+ tr/×/x/;
s/&/and/;
- # Consider wo and o the same thing (when used as separate word)
- s/(?:^| )o(?:$| )/wo/g;
# Remove spaces. We're doing substring search, so let it cross word boundary to find more stuff
tr/ //d;
# remove commonly used release titles ("x Edition" and "x Version")
@@ -130,20 +129,16 @@ sub normalize {
# normalizes each title and returns a concatenated string of unique titles
sub normalize_titles {
my %t = map +(normalize($_), 1), @_;
- return join ' ', grep $_, keys %t;
+ return join ' ', grep length $_, sort keys %t;
}
sub normalize_query {
my $q = shift;
- # Consider wo and o the same thing (when used as separate word). Has to be
- # done here (in addition to normalize()) to make it work in combination with
- # double quote search.
- $q =~ s/(^| )o($| )/$1wo$2/ig;
# remove spaces within quotes, so that it's considered as one search word
$q =~ s/"([^"]+)"/(my $s=$1)=~y{ }{}d;$s/ge;
- # split into search words, normalize, and remove too short words
- return map length($_)>=(/^[\x01-\x7F]+$/?2:1) ? quotemeta($_) : (), map normalize($_), split / /, $q;
+ # split into search words and normalize
+ return map quotemeta($_), grep length $_, map normalize($_), split / /, $q;
}