From b8fbb458cdbf6635c45675a3797292110a739f31 Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sat, 13 Mar 2010 15:34:24 +0100 Subject: Improved VN search This adds a new column to the vn table: c_search, which holds the normalized titles for speedy search results using LIKE. Also split some functions from VNDB::Func that didn't require YAWF into a VNDBUtil module, so Multi can also make use of them. The normalization functions are the same for Multi and VNDB, after all. The API and Multi::IRC still use the old search, these should be updated as well. --- lib/VNDBUtil.pm | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 lib/VNDBUtil.pm (limited to 'lib/VNDBUtil.pm') diff --git a/lib/VNDBUtil.pm b/lib/VNDBUtil.pm new file mode 100644 index 00000000..0b70db4d --- /dev/null +++ b/lib/VNDBUtil.pm @@ -0,0 +1,202 @@ +# Misc. utility functions, do not rely on YAWF or POE and can be used from any script + +package VNDBUtil; + +use strict; +use warnings; +use Exporter 'import'; +use Unicode::Normalize 'NFKD'; + +our @EXPORT = qw|shorten bb2html gtintype normalize normalize_titles normalize_query|; + + +sub shorten { + my($str, $len) = @_; + return length($str) > $len ? substr($str, 0, $len-3).'...' : $str; +} + + +# Arguments: input, and optionally the maximum length +# Parses: +# [url=..] [/url] +# [raw] .. [/raw] +# [spoiler] .. [/spoiler] +# [quote] .. [/quote] +# [code] .. [/code] +# v+, v+.+ +# http://../ +sub bb2html { + my $raw = shift; + my $maxlength = shift; + $raw =~ s/\r//g; + $raw =~ s/\n{5,}/\n\n/g; + return '' if !$raw && $raw ne "0"; + + my($result, $length, $rmnewline, @open) = ('', 0, 0, 'first'); + + my $e = sub { + local $_ = shift; + s/&/&/g; + s/>/>/g; + s//g if !$maxlength; + s/\n/ /g if $maxlength; + return $_; + }; + + for (split /(\s|\n|\[[^\]]+\])/, $raw) { + next if !defined $_; + next if $_ eq ''; + + # (note to self: stop using unreadable hacks like these!) + $rmnewline-- && $_ eq "\n" && next if $rmnewline; + + my $lit = $_; + if($open[$#open] ne 'raw' && $open[$#open] ne 'code') { + if (lc$_ eq '[raw]') { push @open, 'raw'; next } + elsif (lc$_ eq '[spoiler]') { push @open, 'spoiler'; $result .= ''; next } + elsif (lc$_ eq '[quote]') { + push @open, 'quote'; + $result .= '
' if !$maxlength; + $rmnewline = 1; + next + } elsif (lc$_ eq '[code]') { + push @open, 'code'; + $result .= '
' if !$maxlength;
+        $rmnewline = 1;
+        next
+      } elsif (lc$_ eq '[/spoiler]') {
+        if($open[$#open] eq 'spoiler') {
+          $result .= '';
+          pop @open;
+        }
+        next;
+      } elsif (lc$_ eq '[/quote]') {
+        if($open[$#open] eq 'quote') {
+          $result .= '
' if !$maxlength; + $rmnewline = 1; + pop @open; + } + next; + } elsif(lc$_ eq '[/url]') { + if($open[$#open] eq 'url') { + $result .= ''; + pop @open; + } + next; + } elsif(s{\[url=((https?://|/)[^\]>]+)\]}{}i) { + $result .= $_; + push @open, 'url'; + next; + } elsif(!grep(/url/, @open) && + s{(.*)(http|https)://(.+[\d\w=/-])(.*)} + {$e->($1).qq|'.$e->('link').''.$e->($4)}e) { + $length += 4; + last if $maxlength && $length > $maxlength; + $result .= $_; + next; + } elsif(!grep(/url/, @open) && ( + s{^(.*[^\w]|)([tdvpr][1-9][0-9]*)\.([1-9][0-9]*)([^\w].*|)$}{$e->($1).qq|$2.$3|.$e->($4)}e || + s{^(.*[^\w]|)([tdvprug][1-9][0-9]*)([^\w].*|)$}{$e->($1).qq|$2|.$e->($3)}e)) { + $length += length $lit; + last if $maxlength && $length > $maxlength; + $result .= $_; + next; + } + } elsif($open[$#open] eq 'raw' && lc$_ eq '[/raw]') { + pop @open; + next; + } elsif($open[$#open] eq 'code' && lc$_ eq '[/code]') { + $result .= '' if !$maxlength; + pop @open; + next; + } + + # normal text processing + $length += length $_; + last if $maxlength && $length > $maxlength; + $result .= $e->($_); + } + + # close open tags + while((local $_ = pop @open) ne 'first') { + $result .= $_ eq 'url' ? '' : $_ eq 'spoiler' ? '' : ''; + $result .= $_ eq 'quote' ? '' : $_ eq 'code' ? '' : '' if !$maxlength; + } + $result .= '...' if $maxlength && $length > $maxlength; + + return $result; +} + + +# GTIN code as argument, +# Returns 'JAN', 'EAN', 'UPC' or undef, +# Also 'normalizes' the first argument in place +sub gtintype { + $_[0] =~ s/[^\d]+//g; + $_[0] =~ s/^0+//; + my $c = shift; + return undef if $c !~ /^[0-9]{12,13}$/; # only gtin-12 and 13 + $c = ('0'x(13-length $c)) . $c; # pad with zeros + + # calculate check digit according to + # http://www.gs1.org/productssolutions/barcodes/support/check_digit_calculator.html#how + my @n = reverse split //, $c; + my $n = shift @n; + $n += $n[$_] * ($_ % 2 != 0 ? 1 : 3) for (0..$#n); + return undef if $n % 10 != 0; + + # Do some rough guesses based on: + # http://www.gs1.org/productssolutions/barcodes/support/prefix_list.html + # and http://en.wikipedia.org/wiki/List_of_GS1_country_codes + local $_ = $c; + return 'JAN' if /^4[59]/; # prefix code 450-459 & 490-499 + return 'UPC' if /^(?:0[01]|0[6-9]|13|75[45])/; # prefix code 000-019 & 060-139 & 754-755 + return undef if /(?:0[2-5]|2|97[789]|9[6-9])/; # some codes we don't want: 020–059 & 200-299 & 977-999 + return 'EAN'; # let's just call everything else EAN :) +} + + +# a rather aggressive normalization +sub normalize { + local $_ = lc shift; + # remove combining markings. assuming the string is in NFD or NFKD, + # this effectively removes all accents from the characters (e.g. é -> e) + s/\pM//g; + # remove some characters that have no significance when searching + use utf8; + tr/\r\n\t ,_\-.~:[]()%+!?&#$"'`♥★☆♪†「」『』【】・”//d; + tr/@/a/; + # remove commonly used release titles ("x Edition" and "x Version") + # this saves some space and speeds up the search + s/(?: + first|firstpress|firstpresslimited|limited|regular|standard + |package|boxed|download|complete|popular + |lowprice|best|cheap|budget + |special|trial|allages|fullvoice + |cd|cdr|cdrom|dvdrom|dvd|dvdpack|dvdpg|windows + |初回限定|初回|限定|通常|廉価|パッケージ|ダウンロード + )(?:edition|version|版|生産)//xg; + no utf8; + return $_; +} + + +# normalizes each title and returns a concatenated string of unique titles +sub normalize_titles { + my %t = map +(normalize(NFKD($_)), 1), @_; + return join ' ', grep $_, keys %t; +} + + +sub normalize_query { + my $q = NFKD shift; + # remove spaces within quotes, so that it's considered as one search word + $q =~ s/"([^"]+)"/(my $s=$1)=~y{ }{}d;$s/ge; + # split into search words, normalize, and remove too short words + return map length($_)>(/^[\x01-\x7F]+$/?2:0) ? quotemeta($_) : (), map normalize($_), split / /, $q; +} + + +1; + -- cgit v1.2.3