From b8fbb458cdbf6635c45675a3797292110a739f31 Mon Sep 17 00:00:00 2001
From: Yorhel <git@yorhel.nl>
Date: Sat, 13 Mar 2010 15:34:24 +0100
Subject: Improved VN search

This adds a new column to the vn table: c_search, which holds the
normalized titles for speedy search results using LIKE.

Also split some functions from VNDB::Func that didn't require YAWF into
a VNDBUtil module, so Multi can also make use of them. The normalization
functions are the same for Multi and VNDB, after all.

The API and Multi::IRC still use the old search, these should be updated
as well.
---
 lib/VNDBUtil.pm | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 lib/VNDBUtil.pm

(limited to 'lib/VNDBUtil.pm')
diff --git a/lib/VNDBUtil.pm b/lib/VNDBUtil.pm
new file mode 100644
index 00000000..0b70db4d
--- /dev/null
+++ b/lib/VNDBUtil.pm
@@ -0,0 +1,202 @@
+# Misc. utility functions, do not rely on YAWF or POE and can be used from any script
+
+package VNDBUtil;
+
+use strict;
+use warnings;
+use Exporter 'import';
+use Unicode::Normalize 'NFKD';
+
+our @EXPORT = qw|shorten bb2html gtintype normalize normalize_titles normalize_query|;
+
+
+sub shorten {
+  my($str, $len) = @_;
+  return length($str) > $len ? substr($str, 0, $len-3).'...' : $str;
+}
+
+
+# Arguments: input, and optionally the maximum length
+# Parses:
+#  [url=..] [/url]
+#  [raw] .. [/raw]
+#  [spoiler] .. [/spoiler]
+#  [quote] .. [/quote]
+#  [code] .. [/code]
+#  v+,  v+.+
+#  http://../
+sub bb2html {
+  my $raw = shift;
+  my $maxlength = shift;
+  $raw =~ s/\r//g;
+  $raw =~ s/\n{5,}/\n\n/g;
+  return '' if !$raw && $raw ne "0";
+
+  my($result, $length, $rmnewline, @open) = ('', 0, 0, 'first');
+
+  my $e = sub {
+    local $_ = shift;
+    s/&/&amp;/g;
+    s/>/&gt;/g;
+    s/</&lt;/g;
+    s/\n/<br \/>/g if !$maxlength;
+    s/\n/ /g if $maxlength;
+    return $_;
+  };
+
+  for (split /(\s|\n|\[[^\]]+\])/, $raw) {
+    next if !defined $_;
+    next if $_ eq '';
+
+    # (note to self: stop using unreadable hacks like these!)
+    $rmnewline-- && $_ eq "\n" && next if $rmnewline;
+
+    my $lit = $_;
+    if($open[$#open] ne 'raw' && $open[$#open] ne 'code') {
+      if    (lc$_ eq '[raw]')      { push @open, 'raw'; next }
+      elsif (lc$_ eq '[spoiler]')  { push @open, 'spoiler'; $result .= '<b class="spoiler">'; next }
+      elsif (lc$_ eq '[quote]')    {
+        push @open, 'quote';
+        $result .= '<div class="quote">' if !$maxlength;
+        $rmnewline = 1;
+        next
+      } elsif (lc$_ eq '[code]') {
+        push @open, 'code';
+        $result .= '<pre>' if !$maxlength;
+        $rmnewline = 1;
+        next
+      } elsif (lc$_ eq '[/spoiler]') {
+        if($open[$#open] eq 'spoiler') {
+          $result .= '</b>';
+          pop @open;
+        }
+        next;
+      } elsif (lc$_ eq '[/quote]') {
+        if($open[$#open] eq 'quote') {
+          $result .= '</div>' if !$maxlength;
+          $rmnewline = 1;
+          pop @open;
+        }
+        next;
+      } elsif(lc$_ eq '[/url]') {
+        if($open[$#open] eq 'url') {
+          $result .= '</a>';
+          pop @open;
+        }
+        next;
+      } elsif(s{\[url=((https?://|/)[^\]>]+)\]}{<a href="$1" rel="nofollow">}i) {
+        $result .= $_;
+        push @open, 'url';
+        next;
+      } elsif(!grep(/url/, @open) &&
+           s{(.*)(http|https)://(.+[\d\w=/-])(.*)}
+            {$e->($1).qq|<a href="$2://|.$e->($3, 1).'" rel="nofollow">'.$e->('link').'</a>'.$e->($4)}e) {
+        $length += 4;
+        last if $maxlength && $length > $maxlength;
+        $result .= $_;
+        next;
+      } elsif(!grep(/url/, @open) && (
+          s{^(.*[^\w]|)([tdvpr][1-9][0-9]*)\.([1-9][0-9]*)([^\w].*|)$}{$e->($1).qq|<a href="/$2.$3">$2.$3</a>|.$e->($4)}e ||
+          s{^(.*[^\w]|)([tdvprug][1-9][0-9]*)([^\w].*|)$}{$e->($1).qq|<a href="/$2">$2</a>|.$e->($3)}e)) {
+        $length += length $lit;
+        last if $maxlength && $length > $maxlength;
+        $result .= $_;
+        next;
+      }
+    } elsif($open[$#open] eq 'raw' && lc$_ eq '[/raw]') {
+      pop @open;
+      next;
+    } elsif($open[$#open] eq 'code' && lc$_ eq '[/code]') {
+      $result .= '</pre>' if !$maxlength;
+      pop @open;
+      next;
+    }
+
+    # normal text processing
+    $length += length $_;
+    last if $maxlength && $length > $maxlength;
+    $result .= $e->($_);
+  }
+
+  # close open tags
+  while((local $_ = pop @open) ne 'first') {
+    $result .= $_ eq 'url' ? '</a>' : $_ eq 'spoiler' ? '</b>' : '';
+    $result .= $_ eq 'quote' ? '</div>' : $_ eq 'code' ? '</pre>' : '' if !$maxlength;
+  }
+  $result .= '...' if $maxlength && $length > $maxlength;
+
+  return $result;
+}
+
+
+# GTIN code as argument,
+# Returns 'JAN', 'EAN', 'UPC' or undef,
+# Also 'normalizes' the first argument in place
+sub gtintype {
+  $_[0] =~ s/[^\d]+//g;
+  $_[0] =~ s/^0+//;
+  my $c = shift;
+  return undef if $c !~ /^[0-9]{12,13}$/; # only gtin-12 and 13
+  $c = ('0'x(13-length $c)) . $c; # pad with zeros
+
+  # calculate check digit according to
+  #  http://www.gs1.org/productssolutions/barcodes/support/check_digit_calculator.html#how
+  my @n = reverse split //, $c;
+  my $n = shift @n;
+  $n += $n[$_] * ($_ % 2 != 0 ? 1 : 3) for (0..$#n);
+  return undef if $n % 10 != 0;
+
+  # Do some rough guesses based on:
+  #  http://www.gs1.org/productssolutions/barcodes/support/prefix_list.html
+  #  and http://en.wikipedia.org/wiki/List_of_GS1_country_codes
+  local $_ = $c;
+  return 'JAN' if /^4[59]/; # prefix code 450-459 & 490-499
+  return 'UPC' if /^(?:0[01]|0[6-9]|13|75[45])/; # prefix code 000-019 & 060-139 & 754-755
+  return  undef if /(?:0[2-5]|2|97[789]|9[6-9])/; # some codes we don't want: 020–059 & 200-299 & 977-999
+  return 'EAN'; # let's just call everything else EAN :)
+}
+
+
+# a rather aggressive normalization
+sub normalize {
+  local $_ = lc shift;
+  # remove combining markings. assuming the string is in NFD or NFKD,
+  #  this effectively removes all accents from the characters (e.g. é -> e)
+  s/\pM//g;
+  # remove some characters that have no significance when searching
+  use utf8;
+  tr/\r\n\t ,_\-.~:[]()%+!?&#$"'`♥★☆♪†「」『』【】・”//d;
+  tr/@/a/;
+  # remove commonly used release titles ("x Edition" and "x Version")
+  # this saves some space and speeds up the search
+  s/(?:
+    first|firstpress|firstpresslimited|limited|regular|standard
+   |package|boxed|download|complete|popular
+   |lowprice|best|cheap|budget
+   |special|trial|allages|fullvoice
+   |cd|cdr|cdrom|dvdrom|dvd|dvdpack|dvdpg|windows
+   |初回限定|初回|限定|通常|廉価|パッケージ|ダウンロード
+   )(?:edition|version|版|生産)//xg;
+  no utf8;
+  return $_;
+}
+
+
+# normalizes each title and returns a concatenated string of unique titles
+sub normalize_titles {
+  my %t = map +(normalize(NFKD($_)), 1), @_;
+  return join ' ', grep $_, keys %t;
+}
+
+
+sub normalize_query {
+  my $q = NFKD shift;
+  # remove spaces within quotes, so that it's considered as one search word
+  $q =~ s/"([^"]+)"/(my $s=$1)=~y{ }{}d;$s/ge;
+  # split into search words, normalize, and remove too short words
+  return map length($_)>(/^[\x01-\x7F]+$/?2:0) ? quotemeta($_) : (), map normalize($_), split / /, $q;
+}
+
+
+1;
+
-- 
cgit v1.2.3