summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r--lib/Multi/Feed.pm2
-rw-r--r--lib/VNDB/BBCode.pm229
-rw-r--r--lib/VNDB/Func.pm3
-rw-r--r--lib/VNDBUtil.pm143
4 files changed, 233 insertions, 144 deletions
diff --git a/lib/Multi/Feed.pm b/lib/Multi/Feed.pm
index 2c4144db..86f0ffa1 100644
--- a/lib/Multi/Feed.pm
+++ b/lib/Multi/Feed.pm
@@ -10,7 +10,7 @@ use warnings;
use TUWF::XML;
use Multi::Core;
use POSIX 'strftime';
-use VNDBUtil 'bb2html';
+use VNDB::BBCode;
my %stats; # key = feed, value = [ count, total, max ]
diff --git a/lib/VNDB/BBCode.pm b/lib/VNDB/BBCode.pm
new file mode 100644
index 00000000..ffa14d98
--- /dev/null
+++ b/lib/VNDB/BBCode.pm
@@ -0,0 +1,229 @@
+package VNDB::BBCode;
+
+use strict;
+use warnings;
+use Exporter 'import';
+use TUWF::XML 'xml_escape';
+
+our @EXPORT = qw/bb2html/;
+
+# Supported BBCode:
+# [spoiler] .. [/spoiler]
+# [quote] .. [/quote]
+# [code] .. [/code]
+# [url=..] [/url]
+# [raw] .. [/raw]
+# link: http://../
+# dblink: v#, v#.#, d#.#.#
+#
+# Permitted nesting of formatting codes:
+# spoiler -> url, raw, link, dblink
+# quote -> anything
+# code -> nothing
+# url -> raw
+# raw -> nothing
+
+
+# State action function usage:
+# _state_action \@stack, $match, $char_pre, $char_post
+# Returns: ($token, @arg) on successful parse, () otherwise.
+
+# Trivial open and close actions
+sub _spoiler_start { if(lc$_[1] eq '[spoiler]') { push @{$_[0]}, 'spoiler'; ('spoiler_start') } else { () } }
+sub _quote_start { if(lc$_[1] eq '[quote]') { push @{$_[0]}, 'quote'; ('quote_start') } else { () } }
+sub _code_start { if(lc$_[1] eq '[code]') { push @{$_[0]}, 'code'; ('code_start') } else { () } }
+sub _raw_start { if(lc$_[1] eq '[raw]') { push @{$_[0]}, 'raw'; ('raw_start') } else { () } }
+sub _spoiler_end { if(lc$_[1] eq '[/spoiler]') { pop @{$_[0]}; ('spoiler_end') } else { () } }
+sub _quote_end { if(lc$_[1] eq '[/quote]' ) { pop @{$_[0]}; ('quote_end' ) } else { () } }
+sub _code_end { if(lc$_[1] eq '[/code]' ) { pop @{$_[0]}; ('code_end' ) } else { () } }
+sub _raw_end { if(lc$_[1] eq '[/raw]' ) { pop @{$_[0]}; ('raw_end' ) } else { () } }
+sub _url_end { if(lc$_[1] eq '[/url]' ) { pop @{$_[0]}; ('url_end' ) } else { () } }
+
+sub _url_start {
+ if($_[1] =~ m{^\[url=((https?://|/)[^\]>]+)\]$}i) {
+ push @{$_[0]}, 'url';
+ (url_start => $1)
+ } else { () }
+}
+
+sub _link {
+ my(undef, $match, $char_pre, $char_post) = @_;
+
+ # Tags arent links
+ return () if $match =~ /^\[/;
+
+ # URLs (already "validated" in the parsing regex)
+ return ('link') if $match =~ /^[hf]t/;
+
+ # Now we're left with various forms of IDs, just need to make sure it's not surrounded by word characters
+ return ('dblink') if $char_pre !~ /\w/ && $char_post !~ /\w/;
+
+ ();
+}
+
+
+# Permitted actions to take in each state. The actions are run in order, if
+# none succeed then the token is passed through as text.
+# The "current state" is the most recent tag in the stack, or '' if no tags are open.
+my %STATE = (
+ '' => [ \&_link, \&_url_start, \&_raw_start, \&_spoiler_start, \&_quote_start, \&_code_start],
+ spoiler => [\&_spoiler_end, \&_link, \&_url_start, \&_raw_start],
+ quote => [\&_quote_end, \&_link, \&_url_start, \&_raw_start, \&_spoiler_start, \&_quote_start, \&_code_start],
+ code => [\&_code_end ],
+ url => [\&_url_end, \&_raw_start],
+ raw => [\&_raw_end ],
+);
+
+
+# Usage:
+#
+# parse $input, sub {
+# my($raw, $token, @arg) = @_;
+# return 1; # to continue processing, 0 to stop. (Note that _close tokens may still follow after stopping)
+# };
+#
+# $raw = the raw part that has been parsed
+# $token = name of the parsed bbcode token, with some special cases (see below)
+# @arg = $token-specific arguments.
+#
+# Tags:
+# text -> literal text, $raw is the text to display
+# spoiler_start -> start a spoiler
+# spoiler_end -> end
+# quote_start -> start a quote
+# quote_end -> end
+# code_start -> code block
+# code_end -> end
+# url_start -> [url=..], $arg[0] contains the url
+# url_end -> [/url]
+# raw_start -> [raw]
+# raw_end -> [/raw]
+# link -> http://.../, $raw is the link
+# dblink -> v123, t13.1, etc. $raw is the dblink
+#
+# This function will ensure correct nesting of _start and _end tokens.
+sub parse {
+ my($raw, $sub) = @_;
+ $raw =~ s/\r//g;
+ return if !$raw && $raw ne '0';
+
+ my $last = 0;
+ my @stack;
+
+ while($raw =~ m{(?:
+ \[[^\s\]]+\] | # tag
+ d[1-9][0-9]*\.[1-9][0-9]*\.[1-9][0-9]* | # d#.#.#
+ [tdvprcs][1-9][0-9]*\.[1-9][0-9]* | # v#.#
+ [tdvprcsugi][1-9][0-9]* | # v#
+ (?:https?|ftp)://[^><"\n\s\]\[]+[\d\w=/-] # link
+ )}xg) {
+ my $token = $&;
+ my $pre = substr $raw, $last, (pos($raw)-length($&))-$last;
+ my $char_pre = $last ? substr $raw, pos($raw)-length($&)-1, 1 : '';
+ $last = pos $raw;
+ my $char_post = substr $raw, $last, 1;
+
+ # Pass through the unformatted text before the match
+ $sub->($pre, 'text') || goto FINAL if length $pre;
+
+ # Call the state functions. Arguments to these functions are implicitely
+ # passed through @_, which avoids allocating a new stack for each function
+ # call.
+ my $state = $STATE{ $stack[$#stack]||'' };
+ my @ret;
+ @_ = (\@stack, $token, $char_pre, $char_post);
+ for(@$state) {
+ @ret = &$_;
+ last if @ret;
+ }
+ $sub->($token, @ret ? @ret : ('text')) || goto FINAL;
+ }
+
+ $sub->(substr($raw, $last), 'text') if $last < length $raw;
+
+FINAL:
+ # Close all tags. This code is a bit of a hack, as it bypasses the state actions.
+ $sub->('', "${_}_end") for reverse @stack;
+}
+
+
+sub bb2html {
+ my($input, $maxlength, $charspoil) = @_;
+
+ my $incode = 0;
+ my $rmnewline = 0;
+ my $length = 0;
+ my $ret = '';
+
+ # escapes, returns string, and takes care of $length and $maxlength; also
+ # takes care to remove newlines and double spaces when necessary
+ my $e = sub {
+ local $_ = shift;
+
+ s/^\n// if $rmnewline && $rmnewline--;
+ s/\n{5,}/\n\n/g if !$incode;
+ s/ +/ /g if !$incode;
+ $length += length $_;
+ if($maxlength && $length > $maxlength) {
+ $_ = substr($_, 0, $maxlength-$length);
+ s/\W+\w*$//; # cleanly cut off on word boundary
+ }
+ s/&/&amp;/g;
+ s/>/&gt;/g;
+ s/</&lt;/g;
+ s/\n/<br>/g if !$maxlength;
+ s/\n/ /g if $maxlength;
+ $_;
+ };
+
+ parse $input, sub {
+ my($raw, $tag, @arg) = @_;
+
+ #$ret .= "$tag {$raw}\n";
+ #return 1;
+
+ if($tag eq 'text') {
+ $ret .= $e->($raw);
+
+ } elsif($tag eq 'spoiler_start') {
+ $ret .= !$charspoil
+ ? '<b class="spoiler">'
+ : '<b class="grayedout charspoil charspoil_-1">&lt;hidden by spoiler settings&gt;</b><span class="charspoil charspoil_2 hidden">';
+ } elsif($tag eq 'spoiler_end') {
+ $ret .= !$charspoil ? '</b>' : '</span>';
+
+ } elsif($tag eq 'quote_start') {
+ $ret .= '<div class="quote">' if !$maxlength;
+ $rmnewline = 1;
+ } elsif($tag eq 'quote_end') {
+ $ret .= '</div>' if !$maxlength;
+ $rmnewline = 1;
+
+ } elsif($tag eq 'code_start') {
+ $ret .= '<pre>' if !$maxlength;
+ $rmnewline = 1;
+ $incode = 1;
+ } elsif($tag eq 'code_end') {
+ $ret .= '</pre>' if !$maxlength;
+ $rmnewline = 1;
+ $incode = 0;
+
+ } elsif($tag eq 'url_start') {
+ $ret .= sprintf '<a href="%s" rel="nofollow">', xml_escape($arg[0]);
+ } elsif($tag eq 'url_end') {
+ $ret .= '</a>';
+
+ } elsif($tag eq 'link') {
+ $ret .= sprintf '<a href="%s" rel="nofollow">%s</a>', xml_escape($raw), $e->('link');
+
+ } elsif($tag eq 'dblink') {
+ (my $link = $raw) =~ s/^d(\d+)\.(\d+)\.(\d+)$/d$1#$2.$3/;
+ $ret .= sprintf '<a href="/%s">%s</a>', $link, $e->($raw);
+ }
+
+ !$maxlength || $length < $maxlength;
+ };
+ $ret;
+}
+
+1;
diff --git a/lib/VNDB/Func.pm b/lib/VNDB/Func.pm
index 6af8f5bf..e46df5e2 100644
--- a/lib/VNDB/Func.pm
+++ b/lib/VNDB/Func.pm
@@ -8,7 +8,8 @@ use Exporter 'import';
use POSIX 'strftime', 'ceil', 'floor';
use JSON::XS;
use VNDBUtil;
-our @EXPORT = (@VNDBUtil::EXPORT, qw|
+use VNDB::BBCode;
+our @EXPORT = (@VNDBUtil::EXPORT, 'bb2html', qw|
clearfloat cssicon tagscore mt minage fil_parse fil_serialize parenttags
childtags charspoil imgpath imgurl
fmtvote fmtmedia fmtvnlen fmtage fmtdatestr fmtdate fmtuser fmtrating fmtspoil
diff --git a/lib/VNDBUtil.pm b/lib/VNDBUtil.pm
index cd294d1b..c8cd1bc1 100644
--- a/lib/VNDBUtil.pm
+++ b/lib/VNDBUtil.pm
@@ -9,7 +9,7 @@ use Encode 'encode_utf8';
use Unicode::Normalize 'NFKD';
use Socket 'inet_pton', 'inet_ntop', 'AF_INET', 'AF_INET6';
-our @EXPORT = qw|shorten bb2html gtintype normalize normalize_titles normalize_query imgsize norm_ip|;
+our @EXPORT = qw|shorten gtintype normalize normalize_titles normalize_query imgsize norm_ip|;
sub shorten {
@@ -18,147 +18,6 @@ sub shorten {
}
-# Arguments: input, and optionally the maximum length
-# Parses:
-# [url=..] [/url]
-# [raw] .. [/raw]
-# [spoiler] .. [/spoiler]
-# [quote] .. [/quote]
-# [code] .. [/code]
-# v+, v+.+
-# http://../
-# XXX: Make sure to sync any changes in the formating with
-# VNDB::Util::Misc::bbSubstLinks() if necessary. Or, alternatively, abstract
-# parsing into a separate function as per http://beta.vndb.org/t5564.12
-sub bb2html {
- my($raw, $maxlength, $charspoil) = @_;
- $raw =~ s/\r//g;
- return '' if !$raw && $raw ne "0";
-
- my($result, $last, $length, $rmnewline, @open) = ('', 0, 0, 0, 'first');
-
- # escapes, returns string, and takes care of $length and $maxlength; also
- # takes care to remove newlines and double spaces when necessary
- my $e = sub {
- local $_ = shift;
- s/^\n// if $rmnewline && $rmnewline--;
- s/\n{5,}/\n\n/g if $open[$#open] ne 'code';
- s/ +/ /g if $open[$#open] ne 'code';
- $length += length $_;
- if($maxlength && $length > $maxlength) {
- $_ = substr($_, 0, $maxlength-$length);
- s/[ \.,:;]+[^ \.,:;]*$//; # cleanly cut off on word boundary
- }
- s/&/&amp;/g;
- s/>/&gt;/g;
- s/</&lt;/g;
- s/\n/<br \/>/g if !$maxlength;
- s/\n/ /g if $maxlength;
- return $_;
- };
-
- while($raw =~ m{(
- (d[1-9][0-9]*\.[1-9][0-9]*\.[1-9][0-9]*) | # 2. longid
- ([tdvprcs][1-9][0-9]*\.[1-9][0-9]*) | # 3. exid
- ([tdvprcsugi][1-9][0-9]*) | # 4. id
- (\[[^\s\]]+\]) | # 5. tag
- ((?:https?|ftp)://[^><"\n\s\]\[]+[\d\w=/-]) # 6. url
- )}xg) {
- my($match, $longid, $exid, $id, $tag, $url) = ($1, $2, $3, $4, $5, $6);
-
- # add string before the match
- $result .= $e->(substr $raw, $last, (pos($raw)-length($match))-$last);
- last if $maxlength && $length > $maxlength;
- $last = pos $raw;
-
- if($open[$#open] ne 'raw' && $open[$#open] ne 'code') {
- # handle tags
- if($tag) {
- $tag = lc $tag;
- if($tag eq '[raw]') {
- push @open, 'raw';
- next;
- } elsif($tag eq '[spoiler]') {
- push @open, 'spoiler';
- $result .= !$charspoil ? '<b class="spoiler">'
- : '<b class="grayedout charspoil charspoil_-1">&lt;hidden by spoiler settings&gt;</b><span class="charspoil charspoil_2 hidden">';
- next;
- } elsif($tag eq '[quote]') {
- push @open, 'quote';
- $result .= '<div class="quote">' if !$maxlength;
- $rmnewline = 1;
- next;
- } elsif($tag eq '[code]') {
- push @open, 'code';
- $result .= '<pre>' if !$maxlength;
- $rmnewline = 1;
- next;
- } elsif($tag eq '[/spoiler]' && $open[$#open] eq 'spoiler') {
- $result .= !$charspoil ? '</b>' : '</span>';
- pop @open;
- next;
- } elsif($tag eq '[/quote]' && $open[$#open] eq 'quote') {
- $result .= '</div>' if !$maxlength;
- $rmnewline = 1;
- next;
- } elsif($tag eq '[/url]' && $open[$#open] eq 'url') {
- $result .= '</a>';
- pop @open;
- next;
- } elsif($match =~ s{\[url=((https?://|/)[^\]>]+)\]}{<a href="$1" rel="nofollow">}i) {
- $result .= $match;
- push @open, 'url';
- next;
- }
- }
- # handle URLs
- if($url && !grep(/url/, @open)) {
- $length += 4;
- last if $maxlength && $length > $maxlength;
- $result .= sprintf '<a href="%s" rel="nofollow">link</a>', $url;
- next;
- }
- # id
- if(($id || $exid || $longid) && !grep(/url/, @open) && (!$result || substr($raw, $last-1-length($match), 1) !~ /[\w]/) && substr($raw, $last, 1) !~ /[\w]/) {
- (my $lnk = $match) =~ s/^d(\d+)\.(\d+)\.(\d+)$/d$1#$2.$3/;
- $length += length $lnk;
- last if $maxlength && $length > $maxlength;
- $result .= sprintf '<a href="/%s">%s</a>', $lnk, $match;
- next
- }
- }
-
- if($tag && $open[$#open] eq 'raw' && lc$tag eq '[/raw]') {
- pop @open;
- next;
- }
-
- if($tag && $open[$#open] eq 'code' && lc$tag eq '[/code]') {
- $result .= '</pre>' if !$maxlength;
- pop @open;
- next;
- }
-
- # We'll only get here when the bbcode input isn't correct or something else
- # didn't work out. In that case, just output whatever we've matched.
- $result .= $e->($match);
- last if $maxlength && $length > $maxlength;
- }
-
- # the last unmatched part, just escape and output
- $result .= $e->(substr $raw, $last);
-
- # close open tags
- while((local $_ = pop @open) ne 'first') {
- $result .= $_ eq 'url' ? '</a>' : $_ eq 'spoiler' ? '</b>' : '';
- $result .= $_ eq 'quote' ? '</div>' : $_ eq 'code' ? '</pre>' : '' if !$maxlength;
- }
- $result .= '...' if $maxlength && $length > $maxlength;
-
- return $result;
-}
-
-
# GTIN code as argument,
# Returns 'JAN', 'EAN', 'UPC' or undef,
# Also 'normalizes' the first argument in place