From 5a74f0fe0564807657ced8e9773075c2391c5cce Mon Sep 17 00:00:00 2001 From: Yorhel Date: Fri, 26 Nov 2010 22:00:41 +0100 Subject: VNDBUtil: Partly rewrote bb2html() to be faster and better It does not use split() anymore; the input string is parsed in a single pass using a global regex. It's now a lot faster on larger input. The page generation time of /t937 went back from ~350ms to ~55ms (on the beta, the production server is slower). Also made several tiny improvements while I was at it: - multiple successive newlines aren't removed within [code] - truncating a message with $maxlength also removes trailing spaces and punctuation - multiple successive spaces are removed outside of [code] (and thus don't count towards the length of the message) The function should be mostly equivalent with regard to everything else, ignoring a few minor edge cases that weren't documented in the first place. The URL regex (and the idea of a global regex) came from bpaste: http://g.blicky.net/bpaste.git/commit/?id=ac7b16d0ec0f195d00a0a79698f67c3010e8cf7d --- lib/VNDBUtil.pm | 123 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 52 deletions(-) (limited to 'lib/VNDBUtil.pm') diff --git a/lib/VNDBUtil.pm b/lib/VNDBUtil.pm index f23811a9..82363bcf 100644 --- a/lib/VNDBUtil.pm +++ b/lib/VNDBUtil.pm @@ -29,95 +29,114 @@ sub bb2html { my $raw = shift; my $maxlength = shift; $raw =~ s/\r//g; - $raw =~ s/\n{5,}/\n\n/g; return '' if !$raw && $raw ne "0"; - my($result, $length, $rmnewline, @open) = ('', 0, 0, 'first'); + my($result, $last, $length, $rmnewline, @open) = ('', 0, 0, 0, 'first'); + # escapes, returns string, and takes care of $length and $maxlength; also + # takes care to remove newlines and double spaces when necessary my $e = sub { local $_ = shift; + s/^\n// if $rmnewline && $rmnewline--; + s/\n{5,}/\n\n/g if $open[$#open] ne 'code'; + s/ +/ /g if $open[$#open] ne 'code'; + $length += length $_; + if($maxlength && $length > 
$maxlength) { + $_ = substr($_, 0, $maxlength-$length); + s/[ \.,:;]+[^ \.,:;]*$//; # cleanly cut off on word boundary + } s/&/&/g; s/>/>/g; s//g if !$maxlength; - s/\n/ /g if $maxlength; + s/\n/ /g if $maxlength; return $_; }; - for (split /(\s|\n|\[[^\]]+\])/, $raw) { - next if !defined $_; - next if $_ eq ''; + while($raw =~ m{( + ([tdvpr][1-9][0-9]*\.[1-9][0-9]*) | # 2. exid + ([tdvprug][1-9][0-9]*) | # 3. id + (\[[^\s\]]+\]) | # 4. tag + ((?:https?|ftp)://[^><"\n\s\]\[]+[\d\w=/-]) # 5. url + )}xg) { + my($match, $exid, $id, $tag, $url) = ($1, $2, $3, $4, $5); - # (note to self: stop using unreadable hacks like these!) - $rmnewline-- && $_ eq "\n" && next if $rmnewline; + # add string before the match + $result .= $e->(substr $raw, $last, (pos($raw)-length($match))-$last); + last if $maxlength && $length > $maxlength; + $last = pos $raw; - my $lit = $_; if($open[$#open] ne 'raw' && $open[$#open] ne 'code') { - if (lc$_ eq '[raw]') { push @open, 'raw'; next } - elsif (lc$_ eq '[spoiler]') { push @open, 'spoiler'; $result .= ''; next } - elsif (lc$_ eq '[quote]') { - push @open, 'quote'; - $result .= '
' if !$maxlength; - $rmnewline = 1; - next - } elsif (lc$_ eq '[code]') { - push @open, 'code'; - $result .= '
' if !$maxlength;
-        $rmnewline = 1;
-        next
-      } elsif (lc$_ eq '[/spoiler]') {
-        if($open[$#open] eq 'spoiler') {
+      # handle tags
+      if($tag) {
+        $tag = lc $tag;
+        if($tag eq '[raw]') {
+          push @open, 'raw'
+        } elsif($tag eq '[spoiler]') {
+          push @open, 'spoiler';
+          $result .= ''
+        } elsif($tag eq '[quote]') {
+          push @open, 'quote';
+          $result .= '
' if !$maxlength; + $rmnewline = 1; + } elsif($tag eq '[code]') { + push @open, 'code'; + $result .= '
' if !$maxlength;
+          $rmnewline = 1;
+        } elsif($tag eq '[/spoiler]' && $open[$#open] eq 'spoiler') {
           $result .= '';
           pop @open;
-        }
-        next;
-      } elsif (lc$_ eq '[/quote]') {
-        if($open[$#open] eq 'quote') {
+        } elsif($tag eq '[/quote]' && $open[$#open] eq 'quote') {
           $result .= '
' if !$maxlength; $rmnewline = 1; - pop @open; - } - next; - } elsif(lc$_ eq '[/url]') { - if($open[$#open] eq 'url') { + } elsif($tag eq '[/url]' && $open[$#open] eq 'url') { $result .= ''; pop @open; + } elsif($tag =~ s{\[url=((https?://|/)[^\]>]+)\]}{}i) { + $result .= $tag; + push @open, 'url'; } next; - } elsif(s{\[url=((https?://|/)[^\]>]+)\]}{}i) { - $result .= $_; - push @open, 'url'; - next; - } elsif(!grep(/url/, @open) && - s{(.*)(http|https)://(.+[\d\w=/-])(.*)} - {$e->($1).qq|'.$e->('link').''.$e->($4)}e) { + } + # handle URLs + if($url && !grep(/url/, @open)) { $length += 4; last if $maxlength && $length > $maxlength; - $result .= $_; - next; - } elsif(!grep(/url/, @open) && ( - s{^(.*[^\w]|)([tdvpr][1-9][0-9]*)\.([1-9][0-9]*)([^\w].*|)$}{$e->($1).qq|$2.$3|.$e->($4)}e || - s{^(.*[^\w]|)([tdvprug][1-9][0-9]*)([^\w].*|)$}{$e->($1).qq|$2|.$e->($3)}e)) { - $length += length $lit; - last if $maxlength && $length > $maxlength; - $result .= $_; + $result .= sprintf 'link', $url; next; } - } elsif($open[$#open] eq 'raw' && lc$_ eq '[/raw]') { + # id + if($id || $exid) { + my $r = $id || $exid; + if(substr($raw, $last-1-length($r), 1) !~ /[\w]/ && substr($raw, $last, 1) !~ /[\w]/) { + $length += length $r; + last if $maxlength && $length > $maxlength; + $result .= sprintf '%1$s', $r; + next + } + } + } + + if($tag && $open[$#open] eq 'raw' && lc$tag eq '[/raw]') { pop @open; next; - } elsif($open[$#open] eq 'code' && lc$_ eq '[/code]') { + } + + if($tag && $open[$#open] eq 'code' && lc$tag eq '[/code]') { $result .= '
' if !$maxlength; pop @open; next; } - # normal text processing - $length += length $_; + # We'll only get here when the bbcode input isn't correct or something else + # didn't work out. In that case, just output whatever we've matched. + $result .= $e->($match); last if $maxlength && $length > $maxlength; - $result .= $e->($_); } + # the last unmatched part, just escape and output + $result .= $e->(substr $raw, $last); + # close open tags while((local $_ = pop @open) ne 'first') { $result .= $_ eq 'url' ? '' : $_ eq 'spoiler' ? '
' : ''; -- cgit v1.2.3