package VNDB::BBCode;
use strict;
use warnings;
use Exporter 'import';
use TUWF::XML 'xml_escape';
our @EXPORT = qw/bb2html bb2text/;
# Supported BBCode:
# [spoiler] .. [/spoiler]
# [quote] .. [/quote]
# [code] .. [/code]
# [url=..] [/url]
# [raw] .. [/raw]
# link: http://../
# dblink: v+, v+.+, d+#+, d+#+.+
#
# Permitted nesting of formatting codes:
# spoiler -> url, raw, link, dblink
# quote -> anything
# code -> nothing
# url -> raw
# raw -> nothing
# State action function usage:
# _state_action \@stack, $match, $char_pre, $char_post
# Returns: ($token, @arg) on successful parse, () otherwise.
# Trivial open and close actions
sub _spoiler_start { if(lc$_[1] eq '[spoiler]') { push @{$_[0]}, 'spoiler'; ('spoiler_start') } else { () } }
sub _quote_start { if(lc$_[1] eq '[quote]') { push @{$_[0]}, 'quote'; ('quote_start') } else { () } }
sub _code_start { if(lc$_[1] eq '[code]') { push @{$_[0]}, 'code'; ('code_start') } else { () } }
sub _raw_start { if(lc$_[1] eq '[raw]') { push @{$_[0]}, 'raw'; ('raw_start') } else { () } }
sub _spoiler_end { if(lc$_[1] eq '[/spoiler]') { pop @{$_[0]}; ('spoiler_end') } else { () } }
sub _quote_end { if(lc$_[1] eq '[/quote]' ) { pop @{$_[0]}; ('quote_end' ) } else { () } }
sub _code_end { if(lc$_[1] eq '[/code]' ) { pop @{$_[0]}; ('code_end' ) } else { () } }
sub _raw_end { if(lc$_[1] eq '[/raw]' ) { pop @{$_[0]}; ('raw_end' ) } else { () } }
sub _url_end { if(lc$_[1] eq '[/url]' ) { pop @{$_[0]}; ('url_end' ) } else { () } }
sub _url_start {
if($_[1] =~ m{^\[url=((https?://|/)[^\]>]+)\]$}i) {
push @{$_[0]}, 'url';
(url_start => $1)
} else { () }
}
sub _link {
my(undef, $match, $char_pre, $char_post) = @_;
# Tags arent links
return () if $match =~ /^\[/;
# URLs (already "validated" in the parsing regex)
return ('link') if $match =~ /^[hf]t/;
# Now we're left with various forms of IDs, just need to make sure it's not surrounded by word characters
return ('dblink') if $char_pre !~ /\w/ && $char_post !~ /\w/;
();
}
# Permitted actions to take in each state. The actions are run in order, if
# none succeed then the token is passed through as text.
# The "current state" is the most recent tag in the stack, or '' if no tags are open.
my %STATE = (
'' => [ \&_link, \&_url_start, \&_raw_start, \&_spoiler_start, \&_quote_start, \&_code_start],
spoiler => [\&_spoiler_end, \&_link, \&_url_start, \&_raw_start],
quote => [\&_quote_end, \&_link, \&_url_start, \&_raw_start, \&_spoiler_start, \&_quote_start, \&_code_start],
code => [\&_code_end ],
url => [\&_url_end, \&_raw_start],
raw => [\&_raw_end ],
);
# Usage:
#
# parse $input, sub {
# my($raw, $token, @arg) = @_;
# return 1; # to continue processing, 0 to stop. (Note that _close tokens may still follow after stopping)
# };
#
# $raw = the raw part that has been parsed
# $token = name of the parsed bbcode token, with some special cases (see below)
# @arg = $token-specific arguments.
#
# Tags:
# text -> literal text, $raw is the text to display
# spoiler_start -> start a spoiler
# spoiler_end -> end
# quote_start -> start a quote
# quote_end -> end
# code_start -> code block
# code_end -> end
# url_start -> [url=..], $arg[0] contains the url
# url_end -> [/url]
# raw_start -> [raw]
# raw_end -> [/raw]
# link -> http://.../, $raw is the link
# dblink -> v123, t13.1, etc. $raw is the dblink
#
# This function will ensure correct nesting of _start and _end tokens.
sub parse {
my($raw, $sub) = @_;
$raw =~ s/\r//g;
return if !$raw && $raw ne '0';
my $last = 0;
my @stack;
while($raw =~ m{(?:
\[ \/? (?i: spoiler|quote|code|url|raw ) [^\s\]]* \] | # tag
d[1-9][0-9]* \# [1-9][0-9]* (?: \.[1-9][0-9]* )? | # d+#+[.+]
[tdvprcs][1-9][0-9]*\.[1-9][0-9]* | # v+.+
[tdvprcsugi][1-9][0-9]* | # v+
(?:https?|ftp)://[^><"\n\s\]\[]+[\d\w=/-] # link
)}xg) {
my $token = $&;
my $pre = substr $raw, $last, $-[0]-$last;
my $char_pre = $-[0] ? substr $raw, $-[0]-1, 1 : '';
$last = pos $raw;
my $char_post = substr $raw, $last, 1;
# Pass through the unformatted text before the match
$sub->($pre, 'text') || goto FINAL if length $pre;
# Call the state functions. Arguments to these functions are implicitely
# passed through @_, which avoids allocating a new stack for each function
# call.
my $state = $STATE{ $stack[$#stack]||'' };
my @ret;
@_ = (\@stack, $token, $char_pre, $char_post);
for(@$state) {
@ret = &$_;
last if @ret;
}
$sub->($token, @ret ? @ret : ('text')) || goto FINAL;
}
$sub->(substr($raw, $last), 'text') if $last < length $raw;
FINAL:
# Close all tags. This code is a bit of a hack, as it bypasses the state actions.
$sub->('', "${_}_end") for reverse @stack;
}
sub bb2html {
my($input, $maxlength, $charspoil) = @_;
my $incode = 0;
my $rmnewline = 0;
my $length = 0;
my $ret = '';
# escapes, returns string, and takes care of $length and $maxlength; also
# takes care to remove newlines and double spaces when necessary
my $e = sub {
local $_ = shift;
s/^\n// if $rmnewline && $rmnewline--;
s/\n{5,}/\n\n/g if !$incode;
s/ +/ /g if !$incode;
$length += length $_;
if($maxlength && $length > $maxlength) {
$_ = substr($_, 0, $maxlength-$length);
s/\W+\w*$//; # cleanly cut off on word boundary
}
s/&/&/g;
s/>/>/g;
s/</g;
s/\n/
/g if !$maxlength;
s/\n/ /g if $maxlength;
$_;
};
parse $input, sub {
my($raw, $tag, @arg) = @_;
#$ret .= "$tag {$raw}\n";
#return 1;
if($tag eq 'text') {
$ret .= $e->($raw);
} elsif($tag eq 'spoiler_start') {
$ret .= !$charspoil
? ''
: '<hidden by spoiler settings> ' : '';
} elsif($tag eq 'quote_start') {
$ret .= '
' if !$maxlength; $rmnewline = 1; $incode = 1; } elsif($tag eq 'code_end') { $ret .= '' if !$maxlength; $rmnewline = 1; $incode = 0; } elsif($tag eq 'url_start') { $ret .= sprintf '', xml_escape($arg[0]); } elsif($tag eq 'url_end') { $ret .= ''; } elsif($tag eq 'link') { $ret .= sprintf '%s', xml_escape($raw), $e->('link'); } elsif($tag eq 'dblink') { (my $link = $raw) =~ s/^d(\d+)\.(\d+)\.(\d+)$/d$1#$2.$3/; $ret .= sprintf '%s', $link, $e->($raw); } !$maxlength || $length < $maxlength; }; $ret; } # Convert bbcode into plain text, stripping all tags and spoilers. [url] tags # only display the title. sub bb2text { my $input = shift; my $inspoil = 0; my $ret = ''; parse $input, sub { my($raw, $tag, @arg) = @_; if($tag eq 'spoiler_start') { $inspoil = 1; } elsif($tag eq 'spoiler_end') { $inspoil = 0; } else { $ret .= $raw if !$inspoil && $tag !~ /_(start|end)$/; } 1; }; $ret; } 1;