diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/Multi/Feed.pm | 2 | ||||
-rw-r--r-- | lib/VNDB/BBCode.pm | 229 | ||||
-rw-r--r-- | lib/VNDB/Func.pm | 3 | ||||
-rw-r--r-- | lib/VNDBUtil.pm | 143 |
4 files changed, 233 insertions, 144 deletions
diff --git a/lib/Multi/Feed.pm b/lib/Multi/Feed.pm index 2c4144db..86f0ffa1 100644 --- a/lib/Multi/Feed.pm +++ b/lib/Multi/Feed.pm @@ -10,7 +10,7 @@ use warnings; use TUWF::XML; use Multi::Core; use POSIX 'strftime'; -use VNDBUtil 'bb2html'; +use VNDB::BBCode; my %stats; # key = feed, value = [ count, total, max ] diff --git a/lib/VNDB/BBCode.pm b/lib/VNDB/BBCode.pm new file mode 100644 index 00000000..ffa14d98 --- /dev/null +++ b/lib/VNDB/BBCode.pm @@ -0,0 +1,229 @@ +package VNDB::BBCode; + +use strict; +use warnings; +use Exporter 'import'; +use TUWF::XML 'xml_escape'; + +our @EXPORT = qw/bb2html/; + +# Supported BBCode: +# [spoiler] .. [/spoiler] +# [quote] .. [/quote] +# [code] .. [/code] +# [url=..] [/url] +# [raw] .. [/raw] +# link: http://../ +# dblink: v#, v#.#, d#.#.# +# +# Permitted nesting of formatting codes: +# spoiler -> url, raw, link, dblink +# quote -> anything +# code -> nothing +# url -> raw +# raw -> nothing + + +# State action function usage: +# _state_action \@stack, $match, $char_pre, $char_post +# Returns: ($token, @arg) on successful parse, () otherwise. + +# Trivial open and close actions +sub _spoiler_start { if(lc$_[1] eq '[spoiler]') { push @{$_[0]}, 'spoiler'; ('spoiler_start') } else { () } } +sub _quote_start { if(lc$_[1] eq '[quote]') { push @{$_[0]}, 'quote'; ('quote_start') } else { () } } +sub _code_start { if(lc$_[1] eq '[code]') { push @{$_[0]}, 'code'; ('code_start') } else { () } } +sub _raw_start { if(lc$_[1] eq '[raw]') { push @{$_[0]}, 'raw'; ('raw_start') } else { () } } +sub _spoiler_end { if(lc$_[1] eq '[/spoiler]') { pop @{$_[0]}; ('spoiler_end') } else { () } } +sub _quote_end { if(lc$_[1] eq '[/quote]' ) { pop @{$_[0]}; ('quote_end' ) } else { () } } +sub _code_end { if(lc$_[1] eq '[/code]' ) { pop @{$_[0]}; ('code_end' ) } else { () } } +sub _raw_end { if(lc$_[1] eq '[/raw]' ) { pop @{$_[0]}; ('raw_end' ) } else { () } } +sub _url_end { if(lc$_[1] eq '[/url]' ) { pop @{$_[0]}; ('url_end' ) } else { () } } + +sub _url_start { + if($_[1] =~ m{^\[url=((https?://|/)[^\]>]+)\]$}i) { + push @{$_[0]}, 'url'; + (url_start => $1) + } else { () } +} + +sub _link { + my(undef, $match, $char_pre, $char_post) = @_; + + # Tags arent links + return () if $match =~ /^\[/; + + # URLs (already "validated" in the parsing regex) + return ('link') if $match =~ /^[hf]t/; + + # Now we're left with various forms of IDs, just need to make sure it's not surrounded by word characters + return ('dblink') if $char_pre !~ /\w/ && $char_post !~ /\w/; + + (); +} + + +# Permitted actions to take in each state. The actions are run in order, if +# none succeed then the token is passed through as text. +# The "current state" is the most recent tag in the stack, or '' if no tags are open. +my %STATE = ( + '' => [ \&_link, \&_url_start, \&_raw_start, \&_spoiler_start, \&_quote_start, \&_code_start], + spoiler => [\&_spoiler_end, \&_link, \&_url_start, \&_raw_start], + quote => [\&_quote_end, \&_link, \&_url_start, \&_raw_start, \&_spoiler_start, \&_quote_start, \&_code_start], + code => [\&_code_end ], + url => [\&_url_end, \&_raw_start], + raw => [\&_raw_end ], +); + + +# Usage: +# +# parse $input, sub { +# my($raw, $token, @arg) = @_; +# return 1; # to continue processing, 0 to stop. (Note that _close tokens may still follow after stopping) +# }; +# +# $raw = the raw part that has been parsed +# $token = name of the parsed bbcode token, with some special cases (see below) +# @arg = $token-specific arguments. +# +# Tags: +# text -> literal text, $raw is the text to display +# spoiler_start -> start a spoiler +# spoiler_end -> end +# quote_start -> start a quote +# quote_end -> end +# code_start -> code block +# code_end -> end +# url_start -> [url=..], $arg[0] contains the url +# url_end -> [/url] +# raw_start -> [raw] +# raw_end -> [/raw] +# link -> http://.../, $raw is the link +# dblink -> v123, t13.1, etc. $raw is the dblink +# +# This function will ensure correct nesting of _start and _end tokens. +sub parse { + my($raw, $sub) = @_; + $raw =~ s/\r//g; + return if !$raw && $raw ne '0'; + + my $last = 0; + my @stack; + + while($raw =~ m{(?: + \[[^\s\]]+\] | # tag + d[1-9][0-9]*\.[1-9][0-9]*\.[1-9][0-9]* | # d#.#.# + [tdvprcs][1-9][0-9]*\.[1-9][0-9]* | # v#.# + [tdvprcsugi][1-9][0-9]* | # v# + (?:https?|ftp)://[^><"\n\s\]\[]+[\d\w=/-] # link + )}xg) { + my $token = $&; + my $pre = substr $raw, $last, (pos($raw)-length($&))-$last; + my $char_pre = $last ? substr $raw, pos($raw)-length($&)-1, 1 : ''; + $last = pos $raw; + my $char_post = substr $raw, $last, 1; + + # Pass through the unformatted text before the match + $sub->($pre, 'text') || goto FINAL if length $pre; + + # Call the state functions. Arguments to these functions are implicitely + # passed through @_, which avoids allocating a new stack for each function + # call. + my $state = $STATE{ $stack[$#stack]||'' }; + my @ret; + @_ = (\@stack, $token, $char_pre, $char_post); + for(@$state) { + @ret = &$_; + last if @ret; + } + $sub->($token, @ret ? @ret : ('text')) || goto FINAL; + } + + $sub->(substr($raw, $last), 'text') if $last < length $raw; + +FINAL: + # Close all tags. This code is a bit of a hack, as it bypasses the state actions. + $sub->('', "${_}_end") for reverse @stack; +} + + +sub bb2html { + my($input, $maxlength, $charspoil) = @_; + + my $incode = 0; + my $rmnewline = 0; + my $length = 0; + my $ret = ''; + + # escapes, returns string, and takes care of $length and $maxlength; also + # takes care to remove newlines and double spaces when necessary + my $e = sub { + local $_ = shift; + + s/^\n// if $rmnewline && $rmnewline--; + s/\n{5,}/\n\n/g if !$incode; + s/ +/ /g if !$incode; + $length += length $_; + if($maxlength && $length > $maxlength) { + $_ = substr($_, 0, $maxlength-$length); + s/\W+\w*$//; # cleanly cut off on word boundary + } + s/&/&/g; + s/>/>/g; + s/</</g; + s/\n/<br>/g if !$maxlength; + s/\n/ /g if $maxlength; + $_; + }; + + parse $input, sub { + my($raw, $tag, @arg) = @_; + + #$ret .= "$tag {$raw}\n"; + #return 1; + + if($tag eq 'text') { + $ret .= $e->($raw); + + } elsif($tag eq 'spoiler_start') { + $ret .= !$charspoil + ? '<b class="spoiler">' + : '<b class="grayedout charspoil charspoil_-1"><hidden by spoiler settings></b><span class="charspoil charspoil_2 hidden">'; + } elsif($tag eq 'spoiler_end') { + $ret .= !$charspoil ? '</b>' : '</span>'; + + } elsif($tag eq 'quote_start') { + $ret .= '<div class="quote">' if !$maxlength; + $rmnewline = 1; + } elsif($tag eq 'quote_end') { + $ret .= '</div>' if !$maxlength; + $rmnewline = 1; + + } elsif($tag eq 'code_start') { + $ret .= '<pre>' if !$maxlength; + $rmnewline = 1; + $incode = 1; + } elsif($tag eq 'code_end') { + $ret .= '</pre>' if !$maxlength; + $rmnewline = 1; + $incode = 0; + + } elsif($tag eq 'url_start') { + $ret .= sprintf '<a href="%s" rel="nofollow">', xml_escape($arg[0]); + } elsif($tag eq 'url_end') { + $ret .= '</a>'; + + } elsif($tag eq 'link') { + $ret .= sprintf '<a href="%s" rel="nofollow">%s</a>', xml_escape($raw), $e->('link'); + + } elsif($tag eq 'dblink') { + (my $link = $raw) =~ s/^d(\d+)\.(\d+)\.(\d+)$/d$1#$2.$3/; + $ret .= sprintf '<a href="/%s">%s</a>', $link, $e->($raw); + } + + !$maxlength || $length < $maxlength; + }; + $ret; +} + +1; diff --git a/lib/VNDB/Func.pm b/lib/VNDB/Func.pm index 6af8f5bf..e46df5e2 100644 --- a/lib/VNDB/Func.pm +++ b/lib/VNDB/Func.pm @@ -8,7 +8,8 @@ use Exporter 'import'; use POSIX 'strftime', 'ceil', 'floor'; use JSON::XS; use VNDBUtil; -our @EXPORT = (@VNDBUtil::EXPORT, qw| +use VNDB::BBCode; +our @EXPORT = (@VNDBUtil::EXPORT, 'bb2html', qw| clearfloat cssicon tagscore mt minage fil_parse fil_serialize parenttags childtags charspoil imgpath imgurl fmtvote fmtmedia fmtvnlen fmtage fmtdatestr fmtdate fmtuser fmtrating fmtspoil diff --git a/lib/VNDBUtil.pm b/lib/VNDBUtil.pm index cd294d1b..c8cd1bc1 100644 --- a/lib/VNDBUtil.pm +++ b/lib/VNDBUtil.pm @@ -9,7 +9,7 @@ use Encode 'encode_utf8'; use Unicode::Normalize 'NFKD'; use Socket 'inet_pton', 'inet_ntop', 'AF_INET', 'AF_INET6'; -our @EXPORT = qw|shorten bb2html gtintype normalize normalize_titles normalize_query imgsize norm_ip|; +our @EXPORT = qw|shorten gtintype normalize normalize_titles normalize_query imgsize norm_ip|; sub shorten { @@ -18,147 +18,6 @@ sub shorten { } -# Arguments: input, and optionally the maximum length -# Parses: -# [url=..] [/url] -# [raw] .. [/raw] -# [spoiler] .. [/spoiler] -# [quote] .. [/quote] -# [code] .. [/code] -# v+, v+.+ -# http://../ -# XXX: Make sure to sync any changes in the formating with -# VNDB::Util::Misc::bbSubstLinks() if necessary. Or, alternatively, abstract -# parsing into a separate function as per http://beta.vndb.org/t5564.12 -sub bb2html { - my($raw, $maxlength, $charspoil) = @_; - $raw =~ s/\r//g; - return '' if !$raw && $raw ne "0"; - - my($result, $last, $length, $rmnewline, @open) = ('', 0, 0, 0, 'first'); - - # escapes, returns string, and takes care of $length and $maxlength; also - # takes care to remove newlines and double spaces when necessary - my $e = sub { - local $_ = shift; - s/^\n// if $rmnewline && $rmnewline--; - s/\n{5,}/\n\n/g if $open[$#open] ne 'code'; - s/ +/ /g if $open[$#open] ne 'code'; - $length += length $_; - if($maxlength && $length > $maxlength) { - $_ = substr($_, 0, $maxlength-$length); - s/[ \.,:;]+[^ \.,:;]*$//; # cleanly cut off on word boundary - } - s/&/&/g; - s/>/>/g; - s/</</g; - s/\n/<br \/>/g if !$maxlength; - s/\n/ /g if $maxlength; - return $_; - }; - - while($raw =~ m{( - (d[1-9][0-9]*\.[1-9][0-9]*\.[1-9][0-9]*) | # 2. longid - ([tdvprcs][1-9][0-9]*\.[1-9][0-9]*) | # 3. exid - ([tdvprcsugi][1-9][0-9]*) | # 4. id - (\[[^\s\]]+\]) | # 5. tag - ((?:https?|ftp)://[^><"\n\s\]\[]+[\d\w=/-]) # 6. url - )}xg) { - my($match, $longid, $exid, $id, $tag, $url) = ($1, $2, $3, $4, $5, $6); - - # add string before the match - $result .= $e->(substr $raw, $last, (pos($raw)-length($match))-$last); - last if $maxlength && $length > $maxlength; - $last = pos $raw; - - if($open[$#open] ne 'raw' && $open[$#open] ne 'code') { - # handle tags - if($tag) { - $tag = lc $tag; - if($tag eq '[raw]') { - push @open, 'raw'; - next; - } elsif($tag eq '[spoiler]') { - push @open, 'spoiler'; - $result .= !$charspoil ? '<b class="spoiler">' - : '<b class="grayedout charspoil charspoil_-1"><hidden by spoiler settings></b><span class="charspoil charspoil_2 hidden">'; - next; - } elsif($tag eq '[quote]') { - push @open, 'quote'; - $result .= '<div class="quote">' if !$maxlength; - $rmnewline = 1; - next; - } elsif($tag eq '[code]') { - push @open, 'code'; - $result .= '<pre>' if !$maxlength; - $rmnewline = 1; - next; - } elsif($tag eq '[/spoiler]' && $open[$#open] eq 'spoiler') { - $result .= !$charspoil ? '</b>' : '</span>'; - pop @open; - next; - } elsif($tag eq '[/quote]' && $open[$#open] eq 'quote') { - $result .= '</div>' if !$maxlength; - $rmnewline = 1; - next; - } elsif($tag eq '[/url]' && $open[$#open] eq 'url') { - $result .= '</a>'; - pop @open; - next; - } elsif($match =~ s{\[url=((https?://|/)[^\]>]+)\]}{<a href="$1" rel="nofollow">}i) { - $result .= $match; - push @open, 'url'; - next; - } - } - # handle URLs - if($url && !grep(/url/, @open)) { - $length += 4; - last if $maxlength && $length > $maxlength; - $result .= sprintf '<a href="%s" rel="nofollow">link</a>', $url; - next; - } - # id - if(($id || $exid || $longid) && !grep(/url/, @open) && (!$result || substr($raw, $last-1-length($match), 1) !~ /[\w]/) && substr($raw, $last, 1) !~ /[\w]/) { - (my $lnk = $match) =~ s/^d(\d+)\.(\d+)\.(\d+)$/d$1#$2.$3/; - $length += length $lnk; - last if $maxlength && $length > $maxlength; - $result .= sprintf '<a href="/%s">%s</a>', $lnk, $match; - next - } - } - - if($tag && $open[$#open] eq 'raw' && lc$tag eq '[/raw]') { - pop @open; - next; - } - - if($tag && $open[$#open] eq 'code' && lc$tag eq '[/code]') { - $result .= '</pre>' if !$maxlength; - pop @open; - next; - } - - # We'll only get here when the bbcode input isn't correct or something else - # didn't work out. In that case, just output whatever we've matched. - $result .= $e->($match); - last if $maxlength && $length > $maxlength; - } - - # the last unmatched part, just escape and output - $result .= $e->(substr $raw, $last); - - # close open tags - while((local $_ = pop @open) ne 'first') { - $result .= $_ eq 'url' ? '</a>' : $_ eq 'spoiler' ? '</b>' : ''; - $result .= $_ eq 'quote' ? '</div>' : $_ eq 'code' ? '</pre>' : '' if !$maxlength; - } - $result .= '...' if $maxlength && $length > $maxlength; - - return $result; -} - - # GTIN code as argument, # Returns 'JAN', 'EAN', 'UPC' or undef, # Also 'normalizes' the first argument in place |