From 000cb5720d2db0a744797bc0b9c30df70efd3fb6 Mon Sep 17 00:00:00 2001
From: Yorhel <git@yorhel.nl>
Date: Sat, 6 Jan 2018 12:29:45 +0100
Subject: Rewrite bb2html() to be more flexible

This is based on the API that I described in https://vndb.org/t5564.12

It's mostly bug-compatible with the old bb2html(), main differences:
- <br /> -> <br> for no reason
- Doesn't sporadically add a wrong </div>
- $rmwhitespace now also after [/code]

Most of the test cases were contributed by flan <flan@flande.re>
---
 lib/Multi/Feed.pm   |   2 +-
 lib/VNDB/BBCode.pm  | 229 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/VNDB/Func.pm    |   3 +-
 lib/VNDBUtil.pm     | 143 +-------------------------------
 util/bbcode-test.pl | 185 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 418 insertions(+), 144 deletions(-)
 create mode 100644 lib/VNDB/BBCode.pm
 create mode 100755 util/bbcode-test.pl
diff --git a/lib/Multi/Feed.pm b/lib/Multi/Feed.pm
index 2c4144db..86f0ffa1 100644
--- a/lib/Multi/Feed.pm
+++ b/lib/Multi/Feed.pm
@@ -10,7 +10,7 @@ use warnings;
 use TUWF::XML;
 use Multi::Core;
 use POSIX 'strftime';
-use VNDBUtil 'bb2html';
+use VNDB::BBCode;
 
 my %stats; # key = feed, value = [ count, total, max ]
 
diff --git a/lib/VNDB/BBCode.pm b/lib/VNDB/BBCode.pm
new file mode 100644
index 00000000..ffa14d98
--- /dev/null
+++ b/lib/VNDB/BBCode.pm
@@ -0,0 +1,229 @@
+package VNDB::BBCode;
+
+use strict;
+use warnings;
+use Exporter 'import';
+use TUWF::XML 'xml_escape';
+
+our @EXPORT = qw/bb2html/;
+
+# Supported BBCode:
+#  [spoiler] .. [/spoiler]
+#  [quote] .. [/quote]
+#  [code] .. [/code]
+#  [url=..] [/url]
+#  [raw] .. [/raw]
+#  link: http://../
+#  dblink: v#, v#.#, d#.#.#
+#
+# Permitted nesting of formatting codes:
+#  spoiler -> url, raw, link, dblink
+#  quote   -> anything
+#  code    -> nothing
+#  url     -> raw
+#  raw     -> nothing
+
+
+# State action function usage:
+#   _state_action \@stack, $match, $char_pre, $char_post
+# Returns: ($token, @arg) on successful parse, () otherwise.
+
+# Trivial open and close actions
+sub _spoiler_start { if(lc$_[1] eq '[spoiler]')  { push @{$_[0]}, 'spoiler'; ('spoiler_start') } else { () } }
+sub _quote_start   { if(lc$_[1] eq '[quote]')    { push @{$_[0]}, 'quote';   ('quote_start')   } else { () } }
+sub _code_start    { if(lc$_[1] eq '[code]')     { push @{$_[0]}, 'code';    ('code_start')    } else { () } }
+sub _raw_start     { if(lc$_[1] eq '[raw]')      { push @{$_[0]}, 'raw';     ('raw_start')     } else { () } }
+sub _spoiler_end   { if(lc$_[1] eq '[/spoiler]') { pop  @{$_[0]}; ('spoiler_end') } else { () } }
+sub _quote_end     { if(lc$_[1] eq '[/quote]'  ) { pop  @{$_[0]}; ('quote_end'  ) } else { () } }
+sub _code_end      { if(lc$_[1] eq '[/code]'   ) { pop  @{$_[0]}; ('code_end'   ) } else { () } }
+sub _raw_end       { if(lc$_[1] eq '[/raw]'    ) { pop  @{$_[0]}; ('raw_end'    ) } else { () } }
+sub _url_end       { if(lc$_[1] eq '[/url]'    ) { pop  @{$_[0]}; ('url_end'    ) } else { () } }
+
+sub _url_start {
+  if($_[1] =~ m{^\[url=((https?://|/)[^\]>]+)\]$}i) {
+    push @{$_[0]}, 'url';
+    (url_start => $1)
+  } else { () }
+}
+
+sub _link {
+  my(undef, $match, $char_pre, $char_post) = @_;
+
+  # Tags arent links
+  return () if $match =~ /^\[/;
+
+  # URLs (already "validated" in the parsing regex)
+  return ('link') if $match =~ /^[hf]t/;
+
+  # Now we're left with various forms of IDs, just need to make sure it's not surrounded by word characters
+  return ('dblink') if $char_pre !~ /\w/ && $char_post !~ /\w/;
+
+  ();
+}
+
+
+# Permitted actions to take in each state. The actions are run in order, if
+# none succeed then the token is passed through as text.
+# The "current state" is the most recent tag in the stack, or '' if no tags are open.
+my %STATE = (
+  ''      => [                \&_link, \&_url_start, \&_raw_start, \&_spoiler_start, \&_quote_start, \&_code_start],
+  spoiler => [\&_spoiler_end, \&_link, \&_url_start, \&_raw_start],
+  quote   => [\&_quote_end,   \&_link, \&_url_start, \&_raw_start, \&_spoiler_start, \&_quote_start, \&_code_start],
+  code    => [\&_code_end     ],
+  url     => [\&_url_end,     \&_raw_start],
+  raw     => [\&_raw_end      ],
+);
+
+
+# Usage:
+#
+#   parse $input, sub {
+#     my($raw, $token, @arg) = @_;
+#     return 1; # to continue processing, 0 to stop. (Note that _close tokens may still follow after stopping)
+#   };
+#
+#   $raw   = the raw part that has been parsed
+#   $token = name of the parsed bbcode token, with some special cases (see below)
+#   @arg   = $token-specific arguments.
+#
+# Tags:
+#   text           -> literal text, $raw is the text to display
+#   spoiler_start  -> start a spoiler
+#   spoiler_end    -> end
+#   quote_start    -> start a quote
+#   quote_end      -> end
+#   code_start     -> code block
+#   code_end       -> end
+#   url_start      -> [url=..], $arg[0] contains the url
+#   url_end        -> [/url]
+#   raw_start      -> [raw]
+#   raw_end        -> [/raw]
+#   link           -> http://.../, $raw is the link
+#   dblink         -> v123, t13.1, etc. $raw is the dblink
+#
+# This function will ensure correct nesting of _start and _end tokens.
+sub parse {
+  my($raw, $sub) = @_;
+  $raw =~ s/\r//g;
+  return if !$raw && $raw ne '0';
+
+  my $last = 0;
+  my @stack;
+
+  while($raw =~ m{(?:
+    \[[^\s\]]+\]                            |  # tag
+    d[1-9][0-9]*\.[1-9][0-9]*\.[1-9][0-9]*  |  # d#.#.#
+    [tdvprcs][1-9][0-9]*\.[1-9][0-9]*       |  # v#.#
+    [tdvprcsugi][1-9][0-9]*                 |  # v#
+    (?:https?|ftp)://[^><"\n\s\]\[]+[\d\w=/-]  # link
+  )}xg) {
+    my $token = $&;
+    my $pre = substr $raw, $last, (pos($raw)-length($&))-$last;
+    my $char_pre = $last ? substr $raw, pos($raw)-length($&)-1, 1 : '';
+    $last = pos $raw;
+    my $char_post = substr $raw, $last, 1;
+
+    # Pass through the unformatted text before the match
+    $sub->($pre, 'text') || goto FINAL if length $pre;
+
+    # Call the state functions. Arguments to these functions are implicitely
+    # passed through @_, which avoids allocating a new stack for each function
+    # call.
+    my $state = $STATE{ $stack[$#stack]||'' };
+    my @ret;
+    @_ = (\@stack, $token, $char_pre, $char_post);
+    for(@$state) {
+      @ret = &$_;
+      last if @ret;
+    }
+    $sub->($token, @ret ? @ret : ('text')) || goto FINAL;
+  }
+
+  $sub->(substr($raw, $last), 'text') if $last < length $raw;
+
+FINAL:
+  # Close all tags. This code is a bit of a hack, as it bypasses the state actions.
+  $sub->('', "${_}_end") for reverse @stack;
+}
+
+
+sub bb2html {
+  my($input, $maxlength, $charspoil) = @_;
+
+  my $incode = 0;
+  my $rmnewline = 0;
+  my $length = 0;
+  my $ret = '';
+
+  # escapes, returns string, and takes care of $length and $maxlength; also
+  # takes care to remove newlines and double spaces when necessary
+  my $e = sub {
+    local $_ = shift;
+
+    s/^\n//         if $rmnewline && $rmnewline--;
+    s/\n{5,}/\n\n/g if !$incode;
+    s/  +/ /g       if !$incode;
+    $length += length $_;
+    if($maxlength && $length > $maxlength) {
+      $_ = substr($_, 0, $maxlength-$length);
+      s/\W+\w*$//; # cleanly cut off on word boundary
+    }
+    s/&/&amp;/g;
+    s/>/&gt;/g;
+    s/</&lt;/g;
+    s/\n/<br>/g if !$maxlength;
+    s/\n/ /g       if $maxlength;
+    $_;
+  };
+
+  parse $input, sub {
+    my($raw, $tag, @arg) = @_;
+
+    #$ret .= "$tag {$raw}\n";
+    #return 1;
+
+    if($tag eq 'text') {
+      $ret .= $e->($raw);
+
+    } elsif($tag eq 'spoiler_start') {
+      $ret .= !$charspoil
+        ? '<b class="spoiler">'
+        : '<b class="grayedout charspoil charspoil_-1">&lt;hidden by spoiler settings&gt;</b><span class="charspoil charspoil_2 hidden">';
+    } elsif($tag eq 'spoiler_end') {
+      $ret .= !$charspoil ? '</b>' : '</span>';
+
+    } elsif($tag eq 'quote_start') {
+      $ret .= '<div class="quote">' if !$maxlength;
+      $rmnewline = 1;
+    } elsif($tag eq 'quote_end') {
+      $ret .= '</div>' if !$maxlength;
+      $rmnewline = 1;
+
+    } elsif($tag eq 'code_start') {
+      $ret .= '<pre>' if !$maxlength;
+      $rmnewline = 1;
+      $incode = 1;
+    } elsif($tag eq 'code_end') {
+      $ret .= '</pre>' if !$maxlength;
+      $rmnewline = 1;
+      $incode = 0;
+
+    } elsif($tag eq 'url_start') {
+      $ret .= sprintf '<a href="%s" rel="nofollow">', xml_escape($arg[0]);
+    } elsif($tag eq 'url_end') {
+      $ret .= '</a>';
+
+    } elsif($tag eq 'link') {
+      $ret .= sprintf '<a href="%s" rel="nofollow">%s</a>', xml_escape($raw), $e->('link');
+
+    } elsif($tag eq 'dblink') {
+      (my $link = $raw) =~ s/^d(\d+)\.(\d+)\.(\d+)$/d$1#$2.$3/;
+      $ret .= sprintf '<a href="/%s">%s</a>', $link, $e->($raw);
+    }
+
+    !$maxlength || $length < $maxlength;
+  };
+  $ret;
+}
+
+1;
diff --git a/lib/VNDB/Func.pm b/lib/VNDB/Func.pm
index 6af8f5bf..e46df5e2 100644
--- a/lib/VNDB/Func.pm
+++ b/lib/VNDB/Func.pm
@@ -8,7 +8,8 @@ use Exporter 'import';
 use POSIX 'strftime', 'ceil', 'floor';
 use JSON::XS;
 use VNDBUtil;
-our @EXPORT = (@VNDBUtil::EXPORT, qw|
+use VNDB::BBCode;
+our @EXPORT = (@VNDBUtil::EXPORT, 'bb2html', qw|
   clearfloat cssicon tagscore mt minage fil_parse fil_serialize parenttags
   childtags charspoil imgpath imgurl
   fmtvote fmtmedia fmtvnlen fmtage fmtdatestr fmtdate fmtuser fmtrating fmtspoil
diff --git a/lib/VNDBUtil.pm b/lib/VNDBUtil.pm
index cd294d1b..c8cd1bc1 100644
--- a/lib/VNDBUtil.pm
+++ b/lib/VNDBUtil.pm
@@ -9,7 +9,7 @@ use Encode 'encode_utf8';
 use Unicode::Normalize 'NFKD';
 use Socket 'inet_pton', 'inet_ntop', 'AF_INET', 'AF_INET6';
 
-our @EXPORT = qw|shorten bb2html gtintype normalize normalize_titles normalize_query imgsize norm_ip|;
+our @EXPORT = qw|shorten gtintype normalize normalize_titles normalize_query imgsize norm_ip|;
 
 
 sub shorten {
@@ -18,147 +18,6 @@ sub shorten {
 }
 
 
-# Arguments: input, and optionally the maximum length
-# Parses:
-#  [url=..] [/url]
-#  [raw] .. [/raw]
-#  [spoiler] .. [/spoiler]
-#  [quote] .. [/quote]
-#  [code] .. [/code]
-#  v+,  v+.+
-#  http://../
-# XXX: Make sure to sync any changes in the formating with
-#   VNDB::Util::Misc::bbSubstLinks() if necessary. Or, alternatively, abstract
-#   parsing into a separate function as per http://beta.vndb.org/t5564.12
-sub bb2html {
-  my($raw, $maxlength, $charspoil) = @_;
-  $raw =~ s/\r//g;
-  return '' if !$raw && $raw ne "0";
-
-  my($result, $last, $length, $rmnewline, @open) = ('', 0, 0, 0, 'first');
-
-  # escapes, returns string, and takes care of $length and $maxlength; also
-  # takes care to remove newlines and double spaces when necessary
-  my $e = sub {
-    local $_ = shift;
-    s/^\n//         if $rmnewline && $rmnewline--;
-    s/\n{5,}/\n\n/g if $open[$#open] ne 'code';
-    s/  +/ /g       if $open[$#open] ne 'code';
-    $length += length $_;
-    if($maxlength && $length > $maxlength) {
-      $_ = substr($_, 0, $maxlength-$length);
-      s/[ \.,:;]+[^ \.,:;]*$//; # cleanly cut off on word boundary
-    }
-    s/&/&amp;/g;
-    s/>/&gt;/g;
-    s/</&lt;/g;
-    s/\n/<br \/>/g if !$maxlength;
-    s/\n/ /g       if $maxlength;
-    return $_;
-  };
-
-  while($raw =~ m{(
-    (d[1-9][0-9]*\.[1-9][0-9]*\.[1-9][0-9]*)  | # 2. longid
-    ([tdvprcs][1-9][0-9]*\.[1-9][0-9]*)       | # 3. exid
-    ([tdvprcsugi][1-9][0-9]*)                 | # 4. id
-    (\[[^\s\]]+\])                            | # 5. tag
-    ((?:https?|ftp)://[^><"\n\s\]\[]+[\d\w=/-]) # 6. url
-  )}xg) {
-    my($match, $longid, $exid, $id, $tag, $url) = ($1, $2, $3, $4, $5, $6);
-
-    # add string before the match
-    $result .= $e->(substr $raw, $last, (pos($raw)-length($match))-$last);
-    last if $maxlength && $length > $maxlength;
-    $last = pos $raw;
-
-    if($open[$#open] ne 'raw' && $open[$#open] ne 'code') {
-      # handle tags
-      if($tag) {
-        $tag = lc $tag;
-        if($tag eq '[raw]') {
-          push @open, 'raw';
-          next;
-        } elsif($tag eq '[spoiler]') {
-          push @open, 'spoiler';
-          $result .= !$charspoil ? '<b class="spoiler">'
-            : '<b class="grayedout charspoil charspoil_-1">&lt;hidden by spoiler settings&gt;</b><span class="charspoil charspoil_2 hidden">';
-          next;
-        } elsif($tag eq '[quote]') {
-          push @open, 'quote';
-          $result .= '<div class="quote">' if !$maxlength;
-          $rmnewline = 1;
-          next;
-        } elsif($tag eq '[code]') {
-          push @open, 'code';
-          $result .= '<pre>' if !$maxlength;
-          $rmnewline = 1;
-          next;
-        } elsif($tag eq '[/spoiler]' && $open[$#open] eq 'spoiler') {
-          $result .= !$charspoil ? '</b>' : '</span>';
-          pop @open;
-          next;
-        } elsif($tag eq '[/quote]' && $open[$#open] eq 'quote') {
-          $result .= '</div>' if !$maxlength;
-          $rmnewline = 1;
-          next;
-        } elsif($tag eq '[/url]' && $open[$#open] eq 'url') {
-          $result .= '</a>';
-          pop @open;
-          next;
-        } elsif($match =~ s{\[url=((https?://|/)[^\]>]+)\]}{<a href="$1" rel="nofollow">}i) {
-          $result .= $match;
-          push @open, 'url';
-          next;
-        }
-      }
-      # handle URLs
-      if($url && !grep(/url/, @open)) {
-        $length += 4;
-        last if $maxlength && $length > $maxlength;
-        $result .= sprintf '<a href="%s" rel="nofollow">link</a>', $url;
-        next;
-      }
-      # id
-      if(($id || $exid || $longid) && !grep(/url/, @open) && (!$result || substr($raw, $last-1-length($match), 1) !~ /[\w]/) && substr($raw, $last, 1) !~ /[\w]/) {
-        (my $lnk = $match) =~ s/^d(\d+)\.(\d+)\.(\d+)$/d$1#$2.$3/;
-        $length += length $lnk;
-        last if $maxlength && $length > $maxlength;
-        $result .= sprintf '<a href="/%s">%s</a>', $lnk, $match;
-        next
-      }
-    }
-
-    if($tag && $open[$#open] eq 'raw' && lc$tag eq '[/raw]') {
-      pop @open;
-      next;
-    }
-
-    if($tag && $open[$#open] eq 'code' && lc$tag eq '[/code]') {
-      $result .= '</pre>' if !$maxlength;
-      pop @open;
-      next;
-    }
-
-    # We'll only get here when the bbcode input isn't correct or something else
-    # didn't work out. In that case, just output whatever we've matched.
-    $result .= $e->($match);
-    last if $maxlength && $length > $maxlength;
-  }
-
-  # the last unmatched part, just escape and output
-  $result .= $e->(substr $raw, $last);
-
-  # close open tags
-  while((local $_ = pop @open) ne 'first') {
-    $result .= $_ eq 'url' ? '</a>' : $_ eq 'spoiler' ? '</b>' : '';
-    $result .= $_ eq 'quote' ? '</div>' : $_ eq 'code' ? '</pre>' : '' if !$maxlength;
-  }
-  $result .= '...' if $maxlength && $length > $maxlength;
-
-  return $result;
-}
-
-
 # GTIN code as argument,
 # Returns 'JAN', 'EAN', 'UPC' or undef,
 # Also 'normalizes' the first argument in place
diff --git a/util/bbcode-test.pl b/util/bbcode-test.pl
new file mode 100755
index 00000000..058fa937
--- /dev/null
+++ b/util/bbcode-test.pl
@@ -0,0 +1,185 @@
+#!/usr/bin/perl
+
+# This is a test & benchmark script for VNDB::BBCode.
+# Call without arguments to run the test, with any argument to run the benchmark.
+
+use strict;
+use warnings;
+use Cwd 'abs_path';
+use Test::More;
+use Benchmark 'timethese';
+
+our($ROOT, %S);
+BEGIN { ($ROOT = abs_path $0) =~ s{/util/bbcode-test\.pl$}{}; }
+use lib "$ROOT/lib";
+use VNDB::BBCode;
+
+
+my @tests = (
+  '',
+  '',
+
+  '[From [url=http://www.dlSITE.com/eng/]DLsite English[/url]]',
+  '[From <a href="http://www.dlSITE.com/eng/" rel="nofollow">DLsite English</a>]',
+
+  '[url=http://example.com/]some url[/url]',
+  '<a href="http://example.com/" rel="nofollow">some url</a>',
+
+  '[quote]some quote[/quote]',
+  '<div class="quote">some quote</div>',
+
+  "[code]some code\n\nalso newlines;[/code]",
+  '<pre>some code<br><br>also newlines;</pre>',
+
+  '[spoiler]some spoiler[/spoiler]',
+  '<b class="spoiler">some spoiler</b>',
+
+  "[raw][quote]not parsed\n[url=https://vndb.org/]valid url[/url]\n[url=asdf]invalid url[/url][/quote][/raw]",
+  "[quote]not parsed<br>[url=https://vndb.org/]valid url[/url]<br>[url=asdf]invalid url[/url][/quote]",
+
+  '[quote]basic [spoiler]single[/spoiler]-line [spoiler][url=/g]tag[/url] nesting [raw](without [url=/v3333]special[/url] cases)[/raw][/spoiler][/quote]',
+  '<div class="quote">basic <b class="spoiler">single</b>-line <b class="spoiler"><a href="/g" rel="nofollow">tag</a> nesting (without [url=/v3333]special[/url] cases)</b></div>',
+
+  "[quote]rmnewline after closing tag[/quote]\n",
+  '<div class="quote">rmnewline after closing tag</div>',
+
+  '[url=/v19]some vndb url[/url]',
+  '<a href="/v19" rel="nofollow">some vndb url</a>',
+
+  "quite\n\n\n\n\n\n\na\n\n\n\n\n            lot             of\n\n\n\nunneeded             whitespace",
+  'quite<br><br>a<br><br> lot of<br><br><br><br>unneeded whitespace',
+
+  "[quote]\nsimple\nrmnewline\ntest\n[/quote]",
+  '<div class="quote">simple<br>rmnewline<br>test<br></div>',
+
+  # the new implementation doesn't special-case [code], as the first newline shouldn't matter either way
+  "[quote]\n\nhello, rmnewline test[code]\n#!/bin/sh\n\nfunction random_username() {\n    </dev/urandom tr -cd 'a-zA-Z0-9' | dd bs=1 count=16 2>/dev/null\n}\n[/code]\nsome text after the code tag\n[/quote]\n\n[spoiler]\nsome newlined spoiler\n[/spoiler]",
+  '<div class="quote"><br>hello, rmnewline test<pre>#!/bin/sh<br><br>function random_username() {<br>    &lt;/dev/urandom tr -cd \'a-zA-Z0-9\' | dd bs=1 count=16 2&gt;/dev/null<br>}<br></pre>some text after the code tag<br></div><br><b class="spoiler"><br>some newlined spoiler<br></b>',
+
+  "[quote]\n[raw]\nrmnewline test with made-up elements\n[/raw]\nwelp\n[dumbtag]\nnone\n[/dumbtag]\n[/quote]",
+  '<div class="quote"><br>rmnewline test with made-up elements<br><br>welp<br>[dumbtag]<br>none<br>[/dumbtag]<br></div>',
+
+  '[url=http://example.com/]markup in [raw][url][/raw][/url]',
+  '<a href="http://example.com/" rel="nofollow">markup in [url]</a>',
+
+  '[url=http://192.168.1.1/some/path]ipv4 address in [url][/url]',
+  '<a href="http://192.168.1.1/some/path" rel="nofollow">ipv4 address in [url]</a>',
+
+  'http://192.168.1.1/some/path (literal ipv4 address)',
+  '<a href="http://192.168.1.1/some/path" rel="nofollow">link</a> (literal ipv4 address)',
+
+  '[url=http://192.168.1.1:8080/some/path]ipv4 address (port included) in [url][/url]',
+  '<a href="http://192.168.1.1:8080/some/path" rel="nofollow">ipv4 address (port included) in [url]</a>',
+
+  'http://192.168.1.1:8080/some/path (literal ipv4 address, port included)',
+  '<a href="http://192.168.1.1:8080/some/path" rel="nofollow">link</a> (literal ipv4 address, port included)',
+
+  '[Quote]non-lowercase tags [SpOILER]here[/sPOilER][/qUOTe]',
+  '<div class="quote">non-lowercase tags <b class="spoiler">here</b></div>',
+
+  'some text [spoiler]with (v17) tags[/spoiler] and internal ids such as s1',
+  'some text <b class="spoiler">with (<a href="/v17">v17</a>) tags</b> and internal ids such as <a href="/s1">s1</a>',
+
+  'r12.1 v6.3 s1.2',
+  '<a href="/r12.1">r12.1</a> <a href="/v6.3">v6.3</a> <a href="/s1.2">s1.2</a>',
+
+  'v17 text dds16v21 more text1 v9',
+  '<a href="/v17">v17</a> text dds16v21 more text1 <a href="/v9">v9</a>',
+
+  # Not sure what to do here
+  #'http://some[raw].pointlessly[/raw].unusual.domain/',
+  #'<a href="http://some.pointlessly.unusual.domain/" rel="nofollow">link</a>',
+
+  #'[url=http://some[raw].pointlessly[/raw].unusual.domain/]hi[/url]',
+  #'<a href="http://some[raw].pointlessly[/raw].unusual.domain/" rel="nofollow">hi</a>',
+
+  '<tag>html escapes (&)</tag>',
+  '&lt;tag&gt;html escapes (&amp;)&lt;/tag&gt;',
+
+  '[spoiler]stray open tag',
+  '<b class="spoiler">stray open tag</b>',
+
+  # TODO: This isn't ideal
+  '[quote][spoiler]stray open tag (nested)[/quote]',
+  '<div class="quote"><b class="spoiler">stray open tag (nested)[/quote]</b></div>',
+
+  '[quote][spoiler]two stray open tags',
+  '<div class="quote"><b class="spoiler">two stray open tags</b></div>',
+
+  "[url=https://cat.xyz/]that's [spoiler]some [quote]uncommon[/quote][/spoiler] combination[/url]",
+  '<a href="https://cat.xyz/" rel="nofollow">that\'s [spoiler]some [quote]uncommon[/quote][/spoiler] combination</a>',
+
+  # > I don't see anyone using IPv6 URLs anytime soon, so I'm not worried too either way.
+  #'[url=http://[fedc:ba98:7654:3210:fedc:ba98:7654:3210]/some/path]ipv6 address in [url][/url]',
+  #'<a href="http://[fedc:ba98:7654:3210:fedc:ba98:7654:3210]/some/path" rel="nofollow">ipv6 address in [url]</a>',
+
+  #'http://[fedc:ba98:7654:3210:fedc:ba98:7654:3210]/some/path (literal ipv6 address)',
+  #'<a href="http://[fedc:ba98:7654:3210:fedc:ba98:7654:3210]/some/path" rel="nofollow">link</a> (literal ipv6 address)',
+
+  # test shortening
+  [ "[url=https://cat.xyz/]that's [spoiler]some [quote]uncommon[/quote][/spoiler] combination[/url]", 10 ],
+  '<a href="https://cat.xyz/" rel="nofollow">that\'s </a>',
+
+  [ "A https://blicky.net/ only takes 4 characters", 8 ],
+  'A <a href="https://blicky.net/" rel="nofollow">link</a>',
+);
+
+
+# output should be the same as the input
+my @invalid_syntax = (
+  '[url="http://example.com/"]invalid argument to the "url" tag[/url]',
+  '[url=nicetext]simpler invalid param[/url]',
+  '[url]empty "url" tag[/url]',
+  '[tag]custom tag[/tag]',
+);
+
+
+# Chaining all the parse() raw arguments should generate the same string as the input
+sub identity {
+  my $ret = '';
+  VNDB::BBCode::parse $_[0], sub {
+    $ret .= $_[0];
+  };
+  $ret;
+}
+
+
+sub test {
+  push @tests, map +($_,$_), @invalid_syntax;
+  plan tests => scalar @tests;
+
+  my @t = @tests;
+  while(@t) {
+    my $input  = shift @t;
+    my $html = shift @t;
+    my @arg = ref $input ? @$input : ($input);
+    (my $msg = $arg[0]) =~ s/\n/\\n/g;
+    is identity($arg[0]), $arg[0], $msg;
+    is bb2html(@arg), $html, $msg;
+  }
+}
+
+
+# Performance comparison with old implementation
+sub bench {
+    my $plain = "This isn't a terribly interesting [string]. "x1000;
+    my $short = "Nobody ev3r v10 uses v5 so s1 many [url=https://blicky.net/]x[raw]y[/raw][/url] tags. ";
+    my $heavy = $short x100;
+    timethese(0, {
+      short => sub { bb2html($short) },
+      plain => sub { bb2html($plain) },
+      heavy => sub { bb2html($heavy) },
+    });
+    # old:
+    #   heavy:  3 wallclock secs ( 3.15 usr +  0.00 sys =  3.15 CPU) @ 357.46/s (n=1126)
+    #   plain:  3 wallclock secs ( 3.20 usr +  0.00 sys =  3.20 CPU) @ 130.00/s (n=416)
+    #   short:  3 wallclock secs ( 3.17 usr +  0.00 sys =  3.17 CPU) @ 31420.82/s (n=99604)
+    # new:
+    #   heavy:  3 wallclock secs ( 3.23 usr +  0.00 sys =  3.23 CPU) @ 242.11/s (n=782)
+    #   plain:  3 wallclock secs ( 3.12 usr +  0.00 sys =  3.12 CPU) @ 124.04/s (n=387)
+    #   short:  3 wallclock secs ( 3.18 usr +  0.00 sys =  3.18 CPU) @ 21018.55/s (n=66839)
+    # That's a bit of a performance hit, but should still be fast enough.
+}
+
+test if !@ARGV;
+bench if @ARGV;
-- 
cgit v1.2.3