From e56db50017f1dd3d93f50448bb0f0abe3055d13a Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sat, 14 Jul 2018 15:40:12 +0200 Subject: Some VN search normalization improvements https://vndb.org/t2520.265 - 270. --- lib/VNDBUtil.pm | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/VNDBUtil.pm b/lib/VNDBUtil.pm index a534856f..1c0629e4 100644 --- a/lib/VNDBUtil.pm +++ b/lib/VNDBUtil.pm @@ -6,10 +6,10 @@ use strict; use warnings; use Exporter 'import'; use Encode 'encode_utf8'; -use Unicode::Normalize 'NFKD'; +use Unicode::Normalize 'NFKD', 'compose'; use Socket 'inet_pton', 'inet_ntop', 'AF_INET', 'AF_INET6'; -our @EXPORT = qw|shorten gtintype normalize normalize_titles normalize_query imgsize norm_ip|; +our @EXPORT = qw|shorten gtintype normalize_titles normalize_query imgsize norm_ip|; sub shorten { @@ -50,12 +50,12 @@ sub gtintype { # a rather aggressive normalization sub normalize { local $_ = lc shift; - # remove combining markings. assuming the string is in NFD or NFKD, - # this effectively removes all accents from the characters (e.g. é -> e) - s/\pM//g; - # remove some characters that have no significance when searching use utf8; - tr/\r\n\t,_\-.~~〜∼῀:[]()%+!?#$"'`♥★☆♪†「」『』【】・‟”‛’‘‚„«‹»›//d; + # Remove combining markings, except for kana. + # This effectively removes all accents from the characters (e.g. é -> e) + $_ = compose(NFKD($_) =~ s/(?<=[^ア-ンあ-ん])\pM//rg); + # remove some characters that have no significance when searching + tr/\r\n\t,_\-.~~〜∼ー῀:[]()%+!?#$"'`♥★☆♪†「」『』【】・‟”‛’‘‚„«‹»›//d; tr/@/a/; s/&/and/; # Consider wo and o the same thing (when used as separate word) @@ -82,13 +82,13 @@ sub normalize { # normalizes each title and returns a concatenated string of unique titles sub normalize_titles { - my %t = map +(normalize(NFKD($_)), 1), @_; + my %t = map +(normalize($_), 1), @_; return join ' ', grep $_, keys %t; } sub normalize_query { - my $q = NFKD shift; + my $q = shift; # Consider wo and o the same thing (when used as separate word). Has to be # done here (in addition to normalize()) to make it work in combination with # double quote search. -- cgit v1.2.3