diff options
author | Yorhel <git@yorhel.nl> | 2017-01-15 17:07:03 +0100 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2017-01-15 17:07:03 +0100 |
commit | 1ccc86ce86760f8e1c93ea9a690d28865a8daddd (patch) | |
tree | f591d3cad9fe5abe0b430c1cfeabac02a84ab1ac /web | |
parent | 6114b17389990845abbeac38da7a40657425b4d8 (diff) |
Whole bunch of HTML conversion improvements
- Grotty escape sequences are now better interpreted. I feel rather
stupid for not realizing the idea behind how those codes are supposed
to work earlier. It finally hit me when I read the BSD ul(1) source
code.
- URL end detection is slightly better (much better than the old C code)
- Man page references with : are recognized now (common in Perl modules).
- More efficient HTML escaping, no need to escape > and ".
There's still a bunch of improvements to make, but I have much more
confidence in the current implementation already.
Diffstat (limited to 'web')
-rw-r--r-- | web/src/lib.rs | 89 |
1 files changed, 46 insertions, 43 deletions
diff --git a/web/src/lib.rs b/web/src/lib.rs index fc0d13b..367b69e 100644 --- a/web/src/lib.rs +++ b/web/src/lib.rs @@ -15,51 +15,51 @@ enum FmtChar { } -/* Simple state machine to parse the following grammar: +/* Simple state machine to interpret the BACKSPACE codes generated by grotty. The format is + * described as "old behaviour" in grotty(1). Roughly: * - * fmtchar = escape | double-escape | char - * escape = tag ESC char - * double-escape = ESC tag ESC char - * tag = "_" # italic - * | char # bold + * '_' BACKSPACE 'x' -> 'x' is italic + * 'x' BACKSPACE 'x' -> 'x' is bold + * '_' BACKSPACE 'x' BACKSPACE 'x' -> 'x' is bold and italic * - * This format is described as "old behaviour" in grotty(1). The double-escape - * seems to be a weird glitch, and can be interpreted as - * "(tag ESC char) ESC (tag ESC char)". This parser simply skips over any such - * sequence starting with ESC. */ + * And other combinations are possible. The BACKSPACE character basically says "combine the + * following character with previous token". Where "combining" means: + * + * a == b -> bold + * a == _ -> b is italic + * b == _ -> a is italic + * + * See the BSD ul(1) utility for a full interpreter of the format. Fortunately we only have to + * handle the (limited) output that grotty generates, we don't have to be fully compatible with + * ul(1). 
+ */ enum CharParse { Start, - One(char), // Seen a single character (either 'char' or 'escape') - Escape(char), // Seen a single character + escape - DoubleEsc(u32), // Inside a double-escape, indicates number of characters left to skip + Token(char, FmtChar), + Escape(char, FmtChar), } impl CharParse { fn update(&mut self, chr: char) -> Option<(char, FmtChar)> { match *self { - CharParse::Start => { - *self = if chr == 8 as char { CharParse::DoubleEsc(2) } else { CharParse::One(chr) }; + *self = CharParse::Token(chr, FmtChar::Regular); None }, - CharParse::One(c) => + CharParse::Token(c, f) => if chr == 8 as char { - *self = CharParse::Escape(c); + *self = CharParse::Escape(c, f); None } else { - *self = CharParse::One(chr); - Some((c, FmtChar::Regular)) + *self = CharParse::Token(chr, FmtChar::Regular); + Some((c, f)) }, - CharParse::Escape(c) => { - *self = CharParse::Start; - Some((chr, if c == '_' { FmtChar::Italic } else { FmtChar::Bold })) - }, - - CharParse::DoubleEsc(n) => { - *self = if n == 0 { CharParse::Start } else { CharParse::DoubleEsc(n-1) }; + CharParse::Escape(c, _) => { + // TODO: Handle combination of bold & italic + *self = CharParse::Token(chr, if c == '_' { FmtChar::Italic } else { FmtChar::Bold }); None }, } @@ -107,11 +107,12 @@ impl FmtBuf { self.fmt.push((self.buf.len(), self.lastfmt)); self.lastfmt = fmt; } + // WARNING: The '"' character is not escaped, so care must be taken when copying a slice + // into an attribute value! (In the current implementation, " is simply never part of an + // attribute value) match chr { - '>' => self.buf.push_str("&gt;"), '<' => self.buf.push_str("&lt;"), '&' => self.buf.push_str("&amp;"), - // '"' => self.buf.push_str("&quot;"), // TEMPORARILY disabled for comparison with old code _ => self.buf.push(chr), // <- 30% of the entire processing time is spent here. 
} } @@ -138,19 +139,19 @@ impl FmtBuf { // Consume the input buffer until 'end' without generating output fn flush_skip(&self, st: &mut Flush, end: usize) { st.idx = end; - while st.fmt.peek().unwrap().0 <= st.idx { + while st.idx < self.buf.len() && st.fmt.peek().unwrap().0 <= st.idx { st.fmt.next(); } } fn flush_include(&self, st: &mut Flush, start: usize, end: usize) { lazy_static!( - static ref REF: Regex = Regex::new(r"^((?:[^\s\]]*/)?([^\s/\]]+))\]\]\]").unwrap(); + static ref REF: Regex = Regex::new(r#"^((?:[^"\s\]]*/)?([^"\s/\]]+))\]\]\]"#).unwrap(); ); let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return }; self.flush_to(st, start); - st.out.push_str("\n>> Included man page: <a href=\"/"); + st.out.push_str(">> Included manual page: <a href=\"/"); // Replace ‐ (U+2010) with - (U+2d). ASCII dashes are replaced with an Unicode dash // when passed through groff, which we need to revert in order to get the link working. // (Apparently it recognizes man page references and URLs, as it doesn't do this @@ -167,12 +168,10 @@ impl FmtBuf { fn flush_url(&self, st: &mut Flush, start: usize) { lazy_static!( // Some characters considered to never be part of a URL. 
- // (Note that we can't match literal ><" because of the HTML escaping done previously) - static ref URLEND: Regex = Regex::new("(?:\"|&quot;|&gt;|&lt;|\\s)").unwrap(); + // (Note that we can't match literal '<' because of the HTML escaping done previously) + static ref URLEND: Regex = Regex::new("(?:\"|&lt;|>|\\s)").unwrap(); ); let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return }; - - self.flush_to(st, start); let url = &self.buf[start..(start + urlend.start())]; // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.: @@ -180,10 +179,14 @@ impl FmtBuf { // - https://manned.org/pass/78413b49 // - https://manned.org/empathy-accounts/8c05b2c1 // - https://manned.org/urn/8cb83e85 - // TODO: Check the character before the start of the URL, and only remove ) if there is a - // starting ( before it. - let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩'); + // TODO: Add heuristic to only remove ) at the end of the URL if there is no matching ( + // inside the URL. + let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩').trim_right_matches('\''); + if url.len() < 10 { + return; + } + self.flush_to(st, start); write!(st.out, "<a href=\"{0}\" rel=\"nofollow\">{0}</a>", url).unwrap(); self.flush_skip(st, start + url.len()); } @@ -192,13 +195,13 @@ impl FmtBuf { // We know where the closing bracket is in the string, so this regex is used to search // backwards from there and find the start of the reference. 
lazy_static!( - static ref REF: Regex = Regex::new(r"([A-Za-z0-9\._-]+)\(([1-8nl])\)$").unwrap(); + static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl])\)$").unwrap(); ); // Disallow some characters following a reference if self.buf.len() > end { let ch = self.buf[end..].chars().next().unwrap(); - if ch == '-' || ch == '_' || ch.is_alphanumeric() { + if ch == '_' || ch.is_alphanumeric() { return; } } @@ -220,7 +223,7 @@ impl FmtBuf { // This regex is used to quickly *find* interesting patterns, any further validation // and processing is done afterwards by the (slower) specialized flush_ methods. lazy_static!( - static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-8nl]\))").unwrap(); + static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-9nl]\))").unwrap(); ); let mut st = Flush{ @@ -269,8 +272,8 @@ pub fn grotty2html(input: &str) -> String { //} } } - if let CharParse::One(chr) = state { - buf.push(chr, FmtChar::Regular); + if let CharParse::Token(chr, fmt) = state { + buf.push(chr, fmt); } let mut out = String::with_capacity(input.len()); |