summaryrefslogtreecommitdiff
path: root/web
diff options
context:
space:
mode:
author Yorhel <git@yorhel.nl> 2017-01-15 17:07:03 +0100
committer Yorhel <git@yorhel.nl> 2017-01-15 17:07:03 +0100
commit 1ccc86ce86760f8e1c93ea9a690d28865a8daddd (patch)
tree f591d3cad9fe5abe0b430c1cfeabac02a84ab1ac /web
parent 6114b17389990845abbeac38da7a40657425b4d8 (diff)
Whole bunch of HTML conversion improvements
- Grotty escape sequences are now better interpreted. I feel rather stupid for not realizing the idea behind how those codes are supposed to work earlier. It finally hit me when I read the BSD ul(1) source code.
- URL end detection is slightly better (much better than the old C code)
- Man page references with : are recognized now (common in Perl modules).
- More efficient HTML escaping, no need to escape > and ".

There's still a bunch of improvements to make, but I have much more confidence in the current implementation already.
Diffstat (limited to 'web')
-rw-r--r-- web/src/lib.rs | 89
1 files changed, 46 insertions, 43 deletions
diff --git a/web/src/lib.rs b/web/src/lib.rs
index fc0d13b..367b69e 100644
--- a/web/src/lib.rs
+++ b/web/src/lib.rs
@@ -15,51 +15,51 @@ enum FmtChar {
}
-/* Simple state machine to parse the following grammar:
+/* Simple state machine to interpret the BACKSPACE codes generated by grotty. The format is
+ * described as "old behaviour" in grotty(1). Roughly:
*
- * fmtchar = escape | double-escape | char
- * escape = tag ESC char
- * double-escape = ESC tag ESC char
- * tag = "_" # italic
- * | char # bold
+ * '_' BACKSPACE 'x' -> 'x' is italic
+ * 'x' BACKSPACE 'x' -> 'x' is bold
+ * '_' BACKSPACE 'x' BACKSPACE 'x' -> 'x' is bold and italic
*
- * This format is described as "old behaviour" in grotty(1). The double-escape
- * seems to be a weird glitch, and can be interpreted as
- * "(tag ESC char) ESC (tag ESC char)". This parser simply skips over any such
- * sequence starting with ESC. */
+ * And other combinations are possible. The BACKSPACE character basically says "combine the
+ * following character with previous token". Where "combining" means:
+ *
+ * a == b -> bold
+ * a == _ -> b is italic
+ * b == _ -> a is italic
+ *
+ * See the BSD ul(1) utility for a full interpreter of the format. Fortunately we only have to
+ * handle the (limited) output that grotty generates, we don't have to be fully compatible with
+ * ul(1).
+ */
enum CharParse {
Start,
- One(char), // Seen a single character (either 'char' or 'escape')
- Escape(char), // Seen a single character + escape
- DoubleEsc(u32), // Inside a double-escape, indicates number of characters left to skip
+ Token(char, FmtChar),
+ Escape(char, FmtChar),
}
impl CharParse {
fn update(&mut self, chr: char) -> Option<(char, FmtChar)> {
match *self {
-
CharParse::Start => {
- *self = if chr == 8 as char { CharParse::DoubleEsc(2) } else { CharParse::One(chr) };
+ *self = CharParse::Token(chr, FmtChar::Regular);
None
},
- CharParse::One(c) =>
+ CharParse::Token(c, f) =>
if chr == 8 as char {
- *self = CharParse::Escape(c);
+ *self = CharParse::Escape(c, f);
None
} else {
- *self = CharParse::One(chr);
- Some((c, FmtChar::Regular))
+ *self = CharParse::Token(chr, FmtChar::Regular);
+ Some((c, f))
},
- CharParse::Escape(c) => {
- *self = CharParse::Start;
- Some((chr, if c == '_' { FmtChar::Italic } else { FmtChar::Bold }))
- },
-
- CharParse::DoubleEsc(n) => {
- *self = if n == 0 { CharParse::Start } else { CharParse::DoubleEsc(n-1) };
+ CharParse::Escape(c, _) => {
+ // TODO: Handle combination of bold & italic
+ *self = CharParse::Token(chr, if c == '_' { FmtChar::Italic } else { FmtChar::Bold });
None
},
}
@@ -107,11 +107,12 @@ impl FmtBuf {
self.fmt.push((self.buf.len(), self.lastfmt));
self.lastfmt = fmt;
}
+ // WARNING: The '"' character is not escaped, so care must be taken when copying a slice
+ // into an attribute value! (In the current implementation, " is simply never part of an
+ // attribute value)
match chr {
- '>' => self.buf.push_str("&gt;"),
'<' => self.buf.push_str("&lt;"),
'&' => self.buf.push_str("&amp;"),
- // '"' => self.buf.push_str("&quot;"), // TEMPORARILY disabled for comparison with old code
_ => self.buf.push(chr), // <- 30% of the entire processing time is spent here.
}
}
@@ -138,19 +139,19 @@ impl FmtBuf {
// Consume the input buffer until 'end' without generating output
fn flush_skip(&self, st: &mut Flush, end: usize) {
st.idx = end;
- while st.fmt.peek().unwrap().0 <= st.idx {
+ while st.idx < self.buf.len() && st.fmt.peek().unwrap().0 <= st.idx {
st.fmt.next();
}
}
fn flush_include(&self, st: &mut Flush, start: usize, end: usize) {
lazy_static!(
- static ref REF: Regex = Regex::new(r"^((?:[^\s\]]*/)?([^\s/\]]+))\]\]\]").unwrap();
+ static ref REF: Regex = Regex::new(r#"^((?:[^"\s\]]*/)?([^"\s/\]]+))\]\]\]"#).unwrap();
);
let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return };
self.flush_to(st, start);
- st.out.push_str("\n&gt;&gt; Included man page: <a href=\"/");
+ st.out.push_str(">> Included manual page: <a href=\"/");
// Replace ‐ (U+2010) with - (U+2d). ASCII dashes are replaced with an Unicode dash
// when passed through groff, which we need to revert in order to get the link working.
// (Apparently it recognizes man page references and URLs, as it doesn't do this
@@ -167,12 +168,10 @@ impl FmtBuf {
fn flush_url(&self, st: &mut Flush, start: usize) {
lazy_static!(
// Some characters considered to never be part of a URL.
- // (Note that we can't match literal ><" because of the HTML escaping done previously)
- static ref URLEND: Regex = Regex::new("(?:\"|&quot;|&gt;|&lt;|\\s)").unwrap();
+ // (Note that we can't match literal '<' because of the HTML escaping done previously)
+ static ref URLEND: Regex = Regex::new("(?:\"|&lt;|>|\\s)").unwrap();
);
let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return };
-
- self.flush_to(st, start);
let url = &self.buf[start..(start + urlend.start())];
// Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
@@ -180,10 +179,14 @@ impl FmtBuf {
// - https://manned.org/pass/78413b49
// - https://manned.org/empathy-accounts/8c05b2c1
// - https://manned.org/urn/8cb83e85
- // TODO: Check the character before the start of the URL, and only remove ) if there is a
- // starting ( before it.
- let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩');
+ // TODO: Add heuristic to only remove ) at the end of the URL if there is no matching (
+ // inside the URL.
+ let url = url.trim_right_matches('.').trim_right_matches(',').trim_right_matches(';').trim_right_matches(')').trim_right_matches('⟩').trim_right_matches('\'');
+ if url.len() < 10 {
+ return;
+ }
+ self.flush_to(st, start);
write!(st.out, "<a href=\"{0}\" rel=\"nofollow\">{0}</a>", url).unwrap();
self.flush_skip(st, start + url.len());
}
@@ -192,13 +195,13 @@ impl FmtBuf {
// We know where the closing bracket is in the string, so this regex is used to search
// backwards from there and find the start of the reference.
lazy_static!(
- static ref REF: Regex = Regex::new(r"([A-Za-z0-9\._-]+)\(([1-8nl])\)$").unwrap();
+ static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl])\)$").unwrap();
);
// Disallow some characters following a reference
if self.buf.len() > end {
let ch = self.buf[end..].chars().next().unwrap();
- if ch == '-' || ch == '_' || ch.is_alphanumeric() {
+ if ch == '_' || ch.is_alphanumeric() {
return;
}
}
@@ -220,7 +223,7 @@ impl FmtBuf {
// This regex is used to quickly *find* interesting patterns, any further validation
// and processing is done afterwards by the (slower) specialized flush_ methods.
lazy_static!(
- static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-8nl]\))").unwrap();
+ static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\([1-9nl]\))").unwrap();
);
let mut st = Flush{
@@ -269,8 +272,8 @@ pub fn grotty2html(input: &str) -> String {
//}
}
}
- if let CharParse::One(chr) = state {
- buf.push(chr, FmtChar::Regular);
+ if let CharParse::Token(chr, fmt) = state {
+ buf.push(chr, fmt);
}
let mut out = String::with_capacity(input.len());