From 000cb5720d2db0a744797bc0b9c30df70efd3fb6 Mon Sep 17 00:00:00 2001 From: Yorhel Date: Sat, 6 Jan 2018 12:29:45 +0100 Subject: Rewrite bb2html() to be more flexible This is based on the API that I described in https://vndb.org/t5564.12 It's mostly bug-compatible with the old bb2html(), main differences: -
->
for no reason - Doesn't sporadically add a wrong - $rmwhitespace now also after [/code] Most of the test cases were contributed by flan --- lib/Multi/Feed.pm | 2 +- lib/VNDB/BBCode.pm | 229 ++++++++++++++++++++++++++++++++++++++++++++++++++++ lib/VNDB/Func.pm | 3 +- lib/VNDBUtil.pm | 143 +------------------------------- util/bbcode-test.pl | 185 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 418 insertions(+), 144 deletions(-) create mode 100644 lib/VNDB/BBCode.pm create mode 100755 util/bbcode-test.pl diff --git a/lib/Multi/Feed.pm b/lib/Multi/Feed.pm index 2c4144db..86f0ffa1 100644 --- a/lib/Multi/Feed.pm +++ b/lib/Multi/Feed.pm @@ -10,7 +10,7 @@ use warnings; use TUWF::XML; use Multi::Core; use POSIX 'strftime'; -use VNDBUtil 'bb2html'; +use VNDB::BBCode; my %stats; # key = feed, value = [ count, total, max ] diff --git a/lib/VNDB/BBCode.pm b/lib/VNDB/BBCode.pm new file mode 100644 index 00000000..ffa14d98 --- /dev/null +++ b/lib/VNDB/BBCode.pm @@ -0,0 +1,229 @@ +package VNDB::BBCode; + +use strict; +use warnings; +use Exporter 'import'; +use TUWF::XML 'xml_escape'; + +our @EXPORT = qw/bb2html/; + +# Supported BBCode: +# [spoiler] .. [/spoiler] +# [quote] .. [/quote] +# [code] .. [/code] +# [url=..] [/url] +# [raw] .. [/raw] +# link: http://../ +# dblink: v#, v#.#, d#.#.# +# +# Permitted nesting of formatting codes: +# spoiler -> url, raw, link, dblink +# quote -> anything +# code -> nothing +# url -> raw +# raw -> nothing + + +# State action function usage: +# _state_action \@stack, $match, $char_pre, $char_post +# Returns: ($token, @arg) on successful parse, () otherwise. + +# Trivial open and close actions +sub _spoiler_start { if(lc$_[1] eq '[spoiler]') { push @{$_[0]}, 'spoiler'; ('spoiler_start') } else { () } } +sub _quote_start { if(lc$_[1] eq '[quote]') { push @{$_[0]}, 'quote'; ('quote_start') } else { () } } +sub _code_start { if(lc$_[1] eq '[code]') { push @{$_[0]}, 'code'; ('code_start') } else { () } } +sub _raw_start { if(lc$_[1] eq '[raw]') { push @{$_[0]}, 'raw'; ('raw_start') } else { () } } +sub _spoiler_end { if(lc$_[1] eq '[/spoiler]') { pop @{$_[0]}; ('spoiler_end') } else { () } } +sub _quote_end { if(lc$_[1] eq '[/quote]' ) { pop @{$_[0]}; ('quote_end' ) } else { () } } +sub _code_end { if(lc$_[1] eq '[/code]' ) { pop @{$_[0]}; ('code_end' ) } else { () } } +sub _raw_end { if(lc$_[1] eq '[/raw]' ) { pop @{$_[0]}; ('raw_end' ) } else { () } } +sub _url_end { if(lc$_[1] eq '[/url]' ) { pop @{$_[0]}; ('url_end' ) } else { () } } + +sub _url_start { + if($_[1] =~ m{^\[url=((https?://|/)[^\]>]+)\]$}i) { + push @{$_[0]}, 'url'; + (url_start => $1) + } else { () } +} + +sub _link { + my(undef, $match, $char_pre, $char_post) = @_; + + # Tags arent links + return () if $match =~ /^\[/; + + # URLs (already "validated" in the parsing regex) + return ('link') if $match =~ /^[hf]t/; + + # Now we're left with various forms of IDs, just need to make sure it's not surrounded by word characters + return ('dblink') if $char_pre !~ /\w/ && $char_post !~ /\w/; + + (); +} + + +# Permitted actions to take in each state. The actions are run in order, if +# none succeed then the token is passed through as text. +# The "current state" is the most recent tag in the stack, or '' if no tags are open. +my %STATE = ( + '' => [ \&_link, \&_url_start, \&_raw_start, \&_spoiler_start, \&_quote_start, \&_code_start], + spoiler => [\&_spoiler_end, \&_link, \&_url_start, \&_raw_start], + quote => [\&_quote_end, \&_link, \&_url_start, \&_raw_start, \&_spoiler_start, \&_quote_start, \&_code_start], + code => [\&_code_end ], + url => [\&_url_end, \&_raw_start], + raw => [\&_raw_end ], +); + + +# Usage: +# +# parse $input, sub { +# my($raw, $token, @arg) = @_; +# return 1; # to continue processing, 0 to stop. (Note that _close tokens may still follow after stopping) +# }; +# +# $raw = the raw part that has been parsed +# $token = name of the parsed bbcode token, with some special cases (see below) +# @arg = $token-specific arguments. +# +# Tags: +# text -> literal text, $raw is the text to display +# spoiler_start -> start a spoiler +# spoiler_end -> end +# quote_start -> start a quote +# quote_end -> end +# code_start -> code block +# code_end -> end +# url_start -> [url=..], $arg[0] contains the url +# url_end -> [/url] +# raw_start -> [raw] +# raw_end -> [/raw] +# link -> http://.../, $raw is the link +# dblink -> v123, t13.1, etc. $raw is the dblink +# +# This function will ensure correct nesting of _start and _end tokens. +sub parse { + my($raw, $sub) = @_; + $raw =~ s/\r//g; + return if !$raw && $raw ne '0'; + + my $last = 0; + my @stack; + + while($raw =~ m{(?: + \[[^\s\]]+\] | # tag + d[1-9][0-9]*\.[1-9][0-9]*\.[1-9][0-9]* | # d#.#.# + [tdvprcs][1-9][0-9]*\.[1-9][0-9]* | # v#.# + [tdvprcsugi][1-9][0-9]* | # v# + (?:https?|ftp)://[^><"\n\s\]\[]+[\d\w=/-] # link + )}xg) { + my $token = $&; + my $pre = substr $raw, $last, (pos($raw)-length($&))-$last; + my $char_pre = $last ? substr $raw, pos($raw)-length($&)-1, 1 : ''; + $last = pos $raw; + my $char_post = substr $raw, $last, 1; + + # Pass through the unformatted text before the match + $sub->($pre, 'text') || goto FINAL if length $pre; + + # Call the state functions. Arguments to these functions are implicitely + # passed through @_, which avoids allocating a new stack for each function + # call. + my $state = $STATE{ $stack[$#stack]||'' }; + my @ret; + @_ = (\@stack, $token, $char_pre, $char_post); + for(@$state) { + @ret = &$_; + last if @ret; + } + $sub->($token, @ret ? @ret : ('text')) || goto FINAL; + } + + $sub->(substr($raw, $last), 'text') if $last < length $raw; + +FINAL: + # Close all tags. This code is a bit of a hack, as it bypasses the state actions. + $sub->('', "${_}_end") for reverse @stack; +} + + +sub bb2html { + my($input, $maxlength, $charspoil) = @_; + + my $incode = 0; + my $rmnewline = 0; + my $length = 0; + my $ret = ''; + + # escapes, returns string, and takes care of $length and $maxlength; also + # takes care to remove newlines and double spaces when necessary + my $e = sub { + local $_ = shift; + + s/^\n// if $rmnewline && $rmnewline--; + s/\n{5,}/\n\n/g if !$incode; + s/ +/ /g if !$incode; + $length += length $_; + if($maxlength && $length > $maxlength) { + $_ = substr($_, 0, $maxlength-$length); + s/\W+\w*$//; # cleanly cut off on word boundary + } + s/&/&/g; + s/>/>/g; + s//g if !$maxlength; + s/\n/ /g if $maxlength; + $_; + }; + + parse $input, sub { + my($raw, $tag, @arg) = @_; + + #$ret .= "$tag {$raw}\n"; + #return 1; + + if($tag eq 'text') { + $ret .= $e->($raw); + + } elsif($tag eq 'spoiler_start') { + $ret .= !$charspoil + ? '' + : '<hidden by spoiler settings>' : ''; + + } elsif($tag eq 'quote_start') { + $ret .= '
' if !$maxlength; + $rmnewline = 1; + } elsif($tag eq 'quote_end') { + $ret .= '
' if !$maxlength; + $rmnewline = 1; + + } elsif($tag eq 'code_start') { + $ret .= '
' if !$maxlength;
+      $rmnewline = 1;
+      $incode = 1;
+    } elsif($tag eq 'code_end') {
+      $ret .= '
' if !$maxlength; + $rmnewline = 1; + $incode = 0; + + } elsif($tag eq 'url_start') { + $ret .= sprintf '', xml_escape($arg[0]); + } elsif($tag eq 'url_end') { + $ret .= ''; + + } elsif($tag eq 'link') { + $ret .= sprintf '%s', xml_escape($raw), $e->('link'); + + } elsif($tag eq 'dblink') { + (my $link = $raw) =~ s/^d(\d+)\.(\d+)\.(\d+)$/d$1#$2.$3/; + $ret .= sprintf '%s', $link, $e->($raw); + } + + !$maxlength || $length < $maxlength; + }; + $ret; +} + +1; diff --git a/lib/VNDB/Func.pm b/lib/VNDB/Func.pm index 6af8f5bf..e46df5e2 100644 --- a/lib/VNDB/Func.pm +++ b/lib/VNDB/Func.pm @@ -8,7 +8,8 @@ use Exporter 'import'; use POSIX 'strftime', 'ceil', 'floor'; use JSON::XS; use VNDBUtil; -our @EXPORT = (@VNDBUtil::EXPORT, qw| +use VNDB::BBCode; +our @EXPORT = (@VNDBUtil::EXPORT, 'bb2html', qw| clearfloat cssicon tagscore mt minage fil_parse fil_serialize parenttags childtags charspoil imgpath imgurl fmtvote fmtmedia fmtvnlen fmtage fmtdatestr fmtdate fmtuser fmtrating fmtspoil diff --git a/lib/VNDBUtil.pm b/lib/VNDBUtil.pm index cd294d1b..c8cd1bc1 100644 --- a/lib/VNDBUtil.pm +++ b/lib/VNDBUtil.pm @@ -9,7 +9,7 @@ use Encode 'encode_utf8'; use Unicode::Normalize 'NFKD'; use Socket 'inet_pton', 'inet_ntop', 'AF_INET', 'AF_INET6'; -our @EXPORT = qw|shorten bb2html gtintype normalize normalize_titles normalize_query imgsize norm_ip|; +our @EXPORT = qw|shorten gtintype normalize normalize_titles normalize_query imgsize norm_ip|; sub shorten { @@ -18,147 +18,6 @@ sub shorten { } -# Arguments: input, and optionally the maximum length -# Parses: -# [url=..] [/url] -# [raw] .. [/raw] -# [spoiler] .. [/spoiler] -# [quote] .. [/quote] -# [code] .. [/code] -# v+, v+.+ -# http://../ -# XXX: Make sure to sync any changes in the formating with -# VNDB::Util::Misc::bbSubstLinks() if necessary. Or, alternatively, abstract -# parsing into a separate function as per http://beta.vndb.org/t5564.12 -sub bb2html { - my($raw, $maxlength, $charspoil) = @_; - $raw =~ s/\r//g; - return '' if !$raw && $raw ne "0"; - - my($result, $last, $length, $rmnewline, @open) = ('', 0, 0, 0, 'first'); - - # escapes, returns string, and takes care of $length and $maxlength; also - # takes care to remove newlines and double spaces when necessary - my $e = sub { - local $_ = shift; - s/^\n// if $rmnewline && $rmnewline--; - s/\n{5,}/\n\n/g if $open[$#open] ne 'code'; - s/ +/ /g if $open[$#open] ne 'code'; - $length += length $_; - if($maxlength && $length > $maxlength) { - $_ = substr($_, 0, $maxlength-$length); - s/[ \.,:;]+[^ \.,:;]*$//; # cleanly cut off on word boundary - } - s/&/&/g; - s/>/>/g; - s//g if !$maxlength; - s/\n/ /g if $maxlength; - return $_; - }; - - while($raw =~ m{( - (d[1-9][0-9]*\.[1-9][0-9]*\.[1-9][0-9]*) | # 2. longid - ([tdvprcs][1-9][0-9]*\.[1-9][0-9]*) | # 3. exid - ([tdvprcsugi][1-9][0-9]*) | # 4. id - (\[[^\s\]]+\]) | # 5. tag - ((?:https?|ftp)://[^><"\n\s\]\[]+[\d\w=/-]) # 6. url - )}xg) { - my($match, $longid, $exid, $id, $tag, $url) = ($1, $2, $3, $4, $5, $6); - - # add string before the match - $result .= $e->(substr $raw, $last, (pos($raw)-length($match))-$last); - last if $maxlength && $length > $maxlength; - $last = pos $raw; - - if($open[$#open] ne 'raw' && $open[$#open] ne 'code') { - # handle tags - if($tag) { - $tag = lc $tag; - if($tag eq '[raw]') { - push @open, 'raw'; - next; - } elsif($tag eq '[spoiler]') { - push @open, 'spoiler'; - $result .= !$charspoil ? '' - : '<hidden by spoiler settings>
' if !$maxlength; - $rmnewline = 1; - next; - } elsif($tag eq '[code]') { - push @open, 'code'; - $result .= '
' if !$maxlength;
-          $rmnewline = 1;
-          next;
-        } elsif($tag eq '[/spoiler]' && $open[$#open] eq 'spoiler') {
-          $result .= !$charspoil ? '' : '';
-          pop @open;
-          next;
-        } elsif($tag eq '[/quote]' && $open[$#open] eq 'quote') {
-          $result .= '
' if !$maxlength; - $rmnewline = 1; - next; - } elsif($tag eq '[/url]' && $open[$#open] eq 'url') { - $result .= ''; - pop @open; - next; - } elsif($match =~ s{\[url=((https?://|/)[^\]>]+)\]}{}i) { - $result .= $match; - push @open, 'url'; - next; - } - } - # handle URLs - if($url && !grep(/url/, @open)) { - $length += 4; - last if $maxlength && $length > $maxlength; - $result .= sprintf 'link', $url; - next; - } - # id - if(($id || $exid || $longid) && !grep(/url/, @open) && (!$result || substr($raw, $last-1-length($match), 1) !~ /[\w]/) && substr($raw, $last, 1) !~ /[\w]/) { - (my $lnk = $match) =~ s/^d(\d+)\.(\d+)\.(\d+)$/d$1#$2.$3/; - $length += length $lnk; - last if $maxlength && $length > $maxlength; - $result .= sprintf '%s', $lnk, $match; - next - } - } - - if($tag && $open[$#open] eq 'raw' && lc$tag eq '[/raw]') { - pop @open; - next; - } - - if($tag && $open[$#open] eq 'code' && lc$tag eq '[/code]') { - $result .= '' if !$maxlength; - pop @open; - next; - } - - # We'll only get here when the bbcode input isn't correct or something else - # didn't work out. In that case, just output whatever we've matched. - $result .= $e->($match); - last if $maxlength && $length > $maxlength; - } - - # the last unmatched part, just escape and output - $result .= $e->(substr $raw, $last); - - # close open tags - while((local $_ = pop @open) ne 'first') { - $result .= $_ eq 'url' ? '' : $_ eq 'spoiler' ? '' : ''; - $result .= $_ eq 'quote' ? '' : $_ eq 'code' ? '' : '' if !$maxlength; - } - $result .= '...' if $maxlength && $length > $maxlength; - - return $result; -} - - # GTIN code as argument, # Returns 'JAN', 'EAN', 'UPC' or undef, # Also 'normalizes' the first argument in place diff --git a/util/bbcode-test.pl b/util/bbcode-test.pl new file mode 100755 index 00000000..058fa937 --- /dev/null +++ b/util/bbcode-test.pl @@ -0,0 +1,185 @@ +#!/usr/bin/perl + +# This is a test & benchmark script for VNDB::BBCode. +# Call without arguments to run the test, with any argument to run the benchmark. + +use strict; +use warnings; +use Cwd 'abs_path'; +use Test::More; +use Benchmark 'timethese'; + +our($ROOT, %S); +BEGIN { ($ROOT = abs_path $0) =~ s{/util/bbcode-test\.pl$}{}; } +use lib "$ROOT/lib"; +use VNDB::BBCode; + + +my @tests = ( + '', + '', + + '[From [url=http://www.dlSITE.com/eng/]DLsite English[/url]]', + '[From DLsite English]', + + '[url=http://example.com/]some url[/url]', + 'some url', + + '[quote]some quote[/quote]', + '
some quote
', + + "[code]some code\n\nalso newlines;[/code]", + '
some code

also newlines;
', + + '[spoiler]some spoiler[/spoiler]', + 'some spoiler', + + "[raw][quote]not parsed\n[url=https://vndb.org/]valid url[/url]\n[url=asdf]invalid url[/url][/quote][/raw]", + "[quote]not parsed
[url=https://vndb.org/]valid url[/url]
[url=asdf]invalid url[/url][/quote]", + + '[quote]basic [spoiler]single[/spoiler]-line [spoiler][url=/g]tag[/url] nesting [raw](without [url=/v3333]special[/url] cases)[/raw][/spoiler][/quote]', + '
basic single-line tag nesting (without [url=/v3333]special[/url] cases)
', + + "[quote]rmnewline after closing tag[/quote]\n", + '
rmnewline after closing tag
', + + '[url=/v19]some vndb url[/url]', + 'some vndb url', + + "quite\n\n\n\n\n\n\na\n\n\n\n\n lot of\n\n\n\nunneeded whitespace", + 'quite

a

lot of



unneeded whitespace', + + "[quote]\nsimple\nrmnewline\ntest\n[/quote]", + '
simple
rmnewline
test
', + + # the new implementation doesn't special-case [code], as the first newline shouldn't matter either way + "[quote]\n\nhello, rmnewline test[code]\n#!/bin/sh\n\nfunction random_username() {\n /dev/null\n}\n[/code]\nsome text after the code tag\n[/quote]\n\n[spoiler]\nsome newlined spoiler\n[/spoiler]", + '

hello, rmnewline test
#!/bin/sh

function random_username() {
</dev/urandom tr -cd \'a-zA-Z0-9\' | dd bs=1 count=16 2>/dev/null
}
some text after the code tag


some newlined spoiler
', + + "[quote]\n[raw]\nrmnewline test with made-up elements\n[/raw]\nwelp\n[dumbtag]\nnone\n[/dumbtag]\n[/quote]", + '

rmnewline test with made-up elements

welp
[dumbtag]
none
[/dumbtag]
', + + '[url=http://example.com/]markup in [raw][url][/raw][/url]', + 'markup in [url]', + + '[url=http://192.168.1.1/some/path]ipv4 address in [url][/url]', + 'ipv4 address in [url]', + + 'http://192.168.1.1/some/path (literal ipv4 address)', + 'link (literal ipv4 address)', + + '[url=http://192.168.1.1:8080/some/path]ipv4 address (port included) in [url][/url]', + 'ipv4 address (port included) in [url]', + + 'http://192.168.1.1:8080/some/path (literal ipv4 address, port included)', + 'link (literal ipv4 address, port included)', + + '[Quote]non-lowercase tags [SpOILER]here[/sPOilER][/qUOTe]', + '
non-lowercase tags here
', + + 'some text [spoiler]with (v17) tags[/spoiler] and internal ids such as s1', + 'some text with (v17) tags and internal ids such as s1', + + 'r12.1 v6.3 s1.2', + 'r12.1 v6.3 s1.2', + + 'v17 text dds16v21 more text1 v9', + 'v17 text dds16v21 more text1 v9', + + # Not sure what to do here + #'http://some[raw].pointlessly[/raw].unusual.domain/', + #'link', + + #'[url=http://some[raw].pointlessly[/raw].unusual.domain/]hi[/url]', + #'hi', + + 'html escapes (&)', + '<tag>html escapes (&)</tag>', + + '[spoiler]stray open tag', + 'stray open tag', + + # TODO: This isn't ideal + '[quote][spoiler]stray open tag (nested)[/quote]', + '
stray open tag (nested)[/quote]
', + + '[quote][spoiler]two stray open tags', + '
two stray open tags
', + + "[url=https://cat.xyz/]that's [spoiler]some [quote]uncommon[/quote][/spoiler] combination[/url]", + 'that\'s [spoiler]some [quote]uncommon[/quote][/spoiler] combination', + + # > I don't see anyone using IPv6 URLs anytime soon, so I'm not worried too either way. + #'[url=http://[fedc:ba98:7654:3210:fedc:ba98:7654:3210]/some/path]ipv6 address in [url][/url]', + #'ipv6 address in [url]', + + #'http://[fedc:ba98:7654:3210:fedc:ba98:7654:3210]/some/path (literal ipv6 address)', + #'link (literal ipv6 address)', + + # test shortening + [ "[url=https://cat.xyz/]that's [spoiler]some [quote]uncommon[/quote][/spoiler] combination[/url]", 10 ], + 'that\'s ', + + [ "A https://blicky.net/ only takes 4 characters", 8 ], + 'A link', +); + + +# output should be the same as the input +my @invalid_syntax = ( + '[url="http://example.com/"]invalid argument to the "url" tag[/url]', + '[url=nicetext]simpler invalid param[/url]', + '[url]empty "url" tag[/url]', + '[tag]custom tag[/tag]', +); + + +# Chaining all the parse() raw arguments should generate the same string as the input +sub identity { + my $ret = ''; + VNDB::BBCode::parse $_[0], sub { + $ret .= $_[0]; + }; + $ret; +} + + +sub test { + push @tests, map +($_,$_), @invalid_syntax; + plan tests => scalar @tests; + + my @t = @tests; + while(@t) { + my $input = shift @t; + my $html = shift @t; + my @arg = ref $input ? @$input : ($input); + (my $msg = $arg[0]) =~ s/\n/\\n/g; + is identity($arg[0]), $arg[0], $msg; + is bb2html(@arg), $html, $msg; + } +} + + +# Performance comparison with old implementation +sub bench { + my $plain = "This isn't a terribly interesting [string]. "x1000; + my $short = "Nobody ev3r v10 uses v5 so s1 many [url=https://blicky.net/]x[raw]y[/raw][/url] tags. "; + my $heavy = $short x100; + timethese(0, { + short => sub { bb2html($short) }, + plain => sub { bb2html($plain) }, + heavy => sub { bb2html($heavy) }, + }); + # old: + # heavy: 3 wallclock secs ( 3.15 usr + 0.00 sys = 3.15 CPU) @ 357.46/s (n=1126) + # plain: 3 wallclock secs ( 3.20 usr + 0.00 sys = 3.20 CPU) @ 130.00/s (n=416) + # short: 3 wallclock secs ( 3.17 usr + 0.00 sys = 3.17 CPU) @ 31420.82/s (n=99604) + # new: + # heavy: 3 wallclock secs ( 3.23 usr + 0.00 sys = 3.23 CPU) @ 242.11/s (n=782) + # plain: 3 wallclock secs ( 3.12 usr + 0.00 sys = 3.12 CPU) @ 124.04/s (n=387) + # short: 3 wallclock secs ( 3.18 usr + 0.00 sys = 3.18 CPU) @ 21018.55/s (n=66839) + # That's a bit of a performance hit, but should still be fast enough. +} + +test if !@ARGV; +bench if @ARGV; -- cgit v1.2.3