// path: root/web/src/lib.rs
// blob: f14d38737764083828569d01a7bb0590c4a3dfb3
//It's really unfortunate that benchmarking is still unstable.
//#![feature(test)]
//extern crate test;
extern crate regex;
#[macro_use] extern crate lazy_static;

use std::fmt::Write;
use regex::Regex;


// Formatting applied to a single character of grotty output.
#[derive(Debug,Clone,Copy,PartialEq,Eq)]
enum FmtChar {
    Regular,
    Italic,
    Bold,
    // Bold and italic at the same time; rendered with a single <em> element.
    Both,
}

impl FmtChar {
    // Combine two formats into the format that carries the styles of both.
    //
    // The operation is commutative: Regular is the identity, combining a format with itself
    // yields that format, and any other combination involves both bold and italic and
    // therefore yields Both.
    fn add(self, b: Self) -> Self {
        match (self, b) {
            (FmtChar::Regular, x) |
            (x, FmtChar::Regular) => x,
            (FmtChar::Italic, FmtChar::Italic) => FmtChar::Italic,
            (FmtChar::Bold, FmtChar::Bold) => FmtChar::Bold,
            // Every remaining pair either mixes Italic with Bold or already contains Both.
            // (Previously a catch-all returned `self`, so Italic.add(Both) lost the bold.)
            _ => FmtChar::Both,
        }
    }

    // The HTML tag that starts text in this format; empty for Regular.
    fn open(self) -> &'static str {
        match self {
            FmtChar::Regular => "",
            FmtChar::Italic  => "<i>",
            FmtChar::Bold    => "<b>",
            FmtChar::Both    => "<em>",
        }
    }

    // The HTML tag that ends text in this format; empty for Regular.
    fn close(self) -> &'static str {
        match self {
            FmtChar::Regular => "",
            FmtChar::Italic  => "</i>",
            FmtChar::Bold    => "</b>",
            FmtChar::Both    => "</em>",
        }
    }
}


/* Simple state machine to interpret the BACKSPACE codes generated by grotty. The format is
 * described as "old behaviour" in grotty(1). Roughly:
 *
 *   '_' BACKSPACE 'x'               -> 'x' is italic
 *   'x' BACKSPACE 'x'               -> 'x' is bold
 *   '_' BACKSPACE 'x' BACKSPACE 'x' -> 'x' is bold and italic
 *
 * And other combinations are possible. The BACKSPACE character basically says "combine the
 * following character with previous token". Where "combining" means:
 *
 *   a == b   -> bold
 *   a == _   -> b is italic
 *   b == _   -> a is italic
 *
 * See the BSD ul(1) utility for a full interpreter of the format. Fortunately we only have to
 * handle the (limited) output that grotty generates, we don't have to be fully compatible with
 * ul(1).
 */
enum CharParse {
    // Nothing has been read yet.
    Start,
    // A character and the formatting gathered for it so far. It is held back because a
    // BACKSPACE may still follow and change its formatting.
    Token(char, FmtChar),
    // The held-back character was followed by a BACKSPACE; the next character is combined
    // with it.
    Escape(char, FmtChar),
}


impl CharParse {
    // Feed one input character into the state machine.
    //
    // Returns the previously held-back character with its final formatting once it is known
    // that no BACKSPACE follows it, and None otherwise. The character held back when the input
    // ends is never returned from here; the caller has to read it from the final state.
    fn update(&mut self, chr: char) -> Option<(char, FmtChar)> {
        const BACKSPACE: char = '\u{8}';

        let (next, finished) = match *self {
            CharParse::Start => (CharParse::Token(chr, FmtChar::Regular), None),

            CharParse::Token(prev, fmt) => {
                if chr == BACKSPACE {
                    (CharParse::Escape(prev, fmt), None)
                } else {
                    (CharParse::Token(chr, FmtChar::Regular), Some((prev, fmt)))
                }
            },

            CharParse::Escape(prev, fmt) => {
                // Decide which of the two overstruck characters survives and which style the
                // overstrike adds. An underscore on either side means italic; anything else
                // is treated as the same character struck twice, i.e. bold.
                let (glyph, extra) = match (prev, chr) {
                    ('_', _) => (chr, FmtChar::Italic),
                    (_, '_') => (prev, FmtChar::Italic),
                    _ => (chr, FmtChar::Bold),
                };
                (CharParse::Token(glyph, fmt.add(extra)), None)
            },
        };

        *self = next;
        finished
    }
}


// Write the tags needed to switch the output from format `old` to format `new`: the closing
// tag of `old` followed by the opening tag of `new`. Nothing is written when both are equal.
fn pushfmt(out: &mut String, old: FmtChar, new: FmtChar) {
    if new == old {
        return;
    }
    for tag in [old.close(), new.open()].iter() {
        out.push_str(tag);
    }
}


// Intermediate text buffer. This buffer contains the entire HTML-escaped man page and a list of
// offsets where text formatting changes are performed.
struct FmtBuf {
    // The page text, with '<' and '&' already replaced by their HTML entities (see push()).
    buf: String,
    // List of formatting chunks. The number is the byte offset into `buf` where the chunk
    // ends (exclusive). E.g. [(5,Regular),(10,Bold),(15,Italic)] means:
    //   [0..5] is Regular
    //   [5..10] is Bold
    //   [10..15] is Italic
    fmt: Vec<(usize,FmtChar)>,
    // Formatting of the chunk that is currently being appended to, i.e. of the text that
    // follows the last entry recorded in `fmt`.
    lastfmt: FmtChar,
}

// Output state, shared by the FmtBuf::flush_* methods while a buffer is written out.
struct Flush<'a, 'b> {
    out: &'a mut String, // Destination the generated HTML is appended to
    idx: usize, // Byte offset into FmtBuf.buf of the first byte that has not been processed yet
    fmt: std::iter::Peekable<std::slice::Iter<'b, (usize,FmtChar)>>, // Iterator over FmtBuf.fmt
}


impl FmtBuf {
    // Append one character with the given formatting to the buffer. A chunk boundary is
    // recorded in `self.fmt` whenever the formatting changes, and '<' and '&' are stored as
    // HTML entities.
    fn push(&mut self, chr: char, fmt: FmtChar) {
        // Consider spaces and underscores to have the same
        // formatting as the previous character; This generates smaller
        // HTML, and you can't see the difference anyway.
        if self.lastfmt != fmt && !(chr == ' ' || chr == '_') {
            self.fmt.push((self.buf.len(), self.lastfmt));
            self.lastfmt = fmt;
        }
        // WARNING: The '"' character is not escaped, so care must be taken when copying a slice
        // into an attribute value! (In the current implementation, " is simply never part of an
        // attribute value)
        match chr {
            '<' => self.buf.push_str("&lt;"),
            '&' => self.buf.push_str("&amp;"),
            _   => self.buf.push(chr), // <- 30% of the entire processing time is spent here.
        }
    }

    // Flush all unprocessed bytes until 'end' to the output, wrapped in the tags of the chunks
    // they belong to. Every call starts out in Regular formatting and closes the tag that is
    // still open when it returns, so the output of one call never leaves a tag open.
    fn flush_to(&self, st: &mut Flush, end: usize) {
        let mut lastfmt = FmtChar::Regular;
        while st.idx < end {
            // flush() terminates the chunk list with an entry at buf.len(), so there is always
            // a chunk left as long as there are unprocessed bytes.
            let &&(chunk, fmt) = st.fmt.peek().unwrap();
            // Only consume the chunk when it is written out completely; a chunk that extends
            // past 'end' stays in the iterator for the next call.
            let chunk = if chunk > end {
                end
            } else {
                st.fmt.next();
                chunk
            };
            pushfmt(st.out, lastfmt, fmt);
            st.out.push_str(&self.buf[st.idx..chunk]);
            st.idx = chunk;
            lastfmt = fmt;
        }
        st.out.push_str(lastfmt.close());
    }

    // Consume the input buffer until 'end' without generating output
    fn flush_skip(&self, st: &mut Flush, end: usize) {
        st.idx = end;
        // Drop the chunks that end inside the skipped range. At the very end of the buffer the
        // terminating chunk is left in place.
        while st.idx < self.buf.len() && st.fmt.peek().unwrap().0 <= st.idx {
            st.fmt.next();
        }
    }

    // Handle an include marker. 'start' is the offset of the marker and 'end' the offset
    // directly after the "MANNEDINCLUDE" keyword. If the keyword is followed by a path and
    // "]]]", the whole marker is replaced with a link to the included page; otherwise nothing
    // is written and the marker is left to be output as regular text.
    fn flush_include(&self, st: &mut Flush, start: usize, end: usize) {
        lazy_static!(
            // Group 1 is the complete path, group 2 its last component.
            static ref REF: Regex = Regex::new(r#"^((?:[^"\s\]]*/)?([^"\s/\]]+))\]\]\]"#).unwrap();
        );
        let m = match REF.captures(&self.buf[end..]) { Some(x) => x, None => return };

        self.flush_to(st, start);
        st.out.push_str(">> Included manual page: <a href=\"/");
        // Replace ‐ (U+2010) with - (U+2d). ASCII dashes are replaced with an Unicode dash
        // when passed through groff, which we need to revert in order to get the link working.
        // (Apparently it recognizes man page references and URLs, as it doesn't do this
        // replacement in those situations.)
        for c in m[2].chars() {
            st.out.push(if c == '‐' { '-' } else { c });
        }
        st.out.push_str("\">");
        st.out.push_str(&m[1]);
        st.out.push_str("</a>");
        self.flush_skip(st, end + m[0].len());
    }

    // Turn the URL starting at offset 'start' into a link. Nothing is written when no end of
    // the URL can be found or when the URL is shorter than 10 bytes.
    fn flush_url(&self, st: &mut Flush, start: usize) {
        lazy_static!(
            // Some characters considered to never be part of a URL.
            // (Note that we can't match literal '<' because of the HTML escaping done previously)
            static ref URLEND: Regex = Regex::new("(?:\"|&lt;|>|\\s)").unwrap();
        );
        let urlend = match URLEND.find(&self.buf[start..]) { Some(x) => x, None => return };
        let url = &self.buf[start..(start + urlend.start())];

        // Strip trailing punctuation that is most likely not part of the URL.
        // Also catch a Unicode '⟩', which is how groff sometimes ends a .UR, e.g.:
        // - https://manned.org/troff/c4467840
        // - https://manned.org/pass/78413b49
        // - https://manned.org/empathy-accounts/8c05b2c1
        // - https://manned.org/urn/8cb83e85
        // TODO: Add heuristic to only remove ) at the end of the URL if there is no matching (
        // inside the URL.
        let url = url.trim_right_matches(|c|
            match c { '.' | ',' | ';' | ')' | '⟩' | '\'' | ':' | ']' | '}' => true, _ => false }
        );
        if url.len() < 10 {
            return;
        }

        self.flush_to(st, start);
        // The URL cannot contain '"' (URLEND stops at it), so it is safe inside the attribute.
        write!(st.out, "<a href=\"{0}\" rel=\"nofollow\">{0}</a>", url).unwrap();
        self.flush_skip(st, start + url.len());
    }

    // Turn the man page reference that ends at offset 'end' (directly after the closing
    // parenthesis) into a link. Nothing is written when the reference is directly followed by
    // an underscore or an alphanumeric character.
    fn flush_ref(&self, st: &mut Flush, end: usize) {
        // We know where the closing bracket is in the string, so this regex is used to search
        // backwards from there and find the start of the reference.
        // There are a lot of 'special' multi-character section names, so it might not make sense
        // to parse all of them. Here's an estimate of a few 'special' section references, in
        // number of man pages using the reference (using ~ '%(3pm)%' on the 2017-01-14 database):
        // - 3pm    17810
        // - 3w      8729 (just a few packages)
        // - 3tcl    2000
        // - 3tk      758
        // - 3p       309
        // - 3perl    268
        // - 3ssl     198
        lazy_static!(
            // XXX: Make sure to keep this regex in sync with the one in flush()
            static ref REF: Regex = Regex::new(r"([A-Za-z0-9:\._-]+)\(([1-9nl]|3tcl|3pm|3tk)\)$").unwrap();
        );

        // Disallow some characters following a reference
        if self.buf.len() > end {
            let ch = self.buf[end..].chars().next().unwrap();
            if ch == '_' || ch.is_alphanumeric() {
                return;
            }
        }

        // The search regex in flush() already matched a (shorter) reference ending at 'end',
        // so this regex is expected to match as well.
        let m = REF.captures(&self.buf[..end]).unwrap();
        self.flush_to(st, end - m[0].len());
        self.flush_skip(st, end);
        write!(st.out, "<a href=\"/{}.{}\">{}</a>", &m[1], &m[2], &m[0]).unwrap();
    }

    // Write the whole buffer to 'out' as HTML, with formatting tags applied and include
    // markers, URLs and man page references turned into links.
    fn flush(&mut self, out: &mut String) {
        // Terminate the chunk list. The text after the last recorded boundary has the format
        // in `lastfmt`; closing it as Regular would silently drop the formatting of input that
        // ends in bold or italic text.
        let tailfmt = self.lastfmt;
        self.fmt.push((self.buf.len(), tailfmt));

        // Find the indices where the first line ends, and the last line starts. These are used to
        // efficiently disable reference formatting on the first and last line.
        let firstlineend = self.buf.find('\n').unwrap_or(self.buf.len());
        let lastlinestart = self.buf.trim_right_matches('\n').rfind('\n').unwrap_or(0);

        // This regex is used to quickly *find* interesting patterns, any further validation
        // and processing is done afterwards by the (slower) specialized flush_ methods.
        lazy_static!(
            static ref SEARCH: Regex = Regex::new(r"(?m)(^\[\[\[MANNEDINCLUDE|https?://|[A-Za-z0-9]+\(([1-9nl]|3tcl|3pm|3tk)\))").unwrap();
        );

        let mut st = Flush{
            out: out,
            idx: 0,
            fmt: self.fmt.iter().peekable(),
        };

        for i in SEARCH.find_iter(&self.buf) {
            // This can happen with overlapping detections, e.g. when something inside a URL looks
            // like a man page reference.
            if st.idx > i.start() {
                continue;
            }
            let allowref = i.start() > firstlineend && i.start() < lastlinestart;
            // The three alternatives of the search regex end in different bytes ('E', '/' and
            // ')'), so the last matched byte tells which one was found.
            match self.buf.as_bytes()[i.end()-1] {
                0x45 /* E */ => self.flush_include(&mut st, i.start(), i.end()),
                0x2F /* / */ if allowref => self.flush_url(&mut st, i.start()),
                _            if allowref => self.flush_ref(&mut st, i.end()),
                _ => {}
            }
        }
        // Everything after the last link is regular page text.
        self.flush_to(&mut st, self.buf.len());
    }
}


/// Converts grotty output that uses BACKSPACE overstrike sequences into HTML.
///
/// The input is decoded character by character into a formatted buffer, which is then written
/// out as a single HTML string.
pub fn grotty2html(input: &str) -> String {
    let mut parser = CharParse::Start;
    let mut page = FmtBuf{
        buf: String::with_capacity(128),
        fmt: Vec::with_capacity(128),
        lastfmt: FmtChar::Regular,
    };

    // The parser holds every character back until it has seen what follows it, so it only
    // yields a character now and then.
    for (glyph, style) in input.chars().filter_map(|c| parser.update(c)) {
        page.push(glyph, style);
    }
    // Whatever is still held back at the end of the input has to be added by hand. A pending
    // escape (input ending in a BACKSPACE) is discarded.
    match parser {
        CharParse::Token(glyph, style) => page.push(glyph, style),
        CharParse::Start | CharParse::Escape(..) => {}
    }

    let mut html = String::with_capacity(input.len());
    page.flush(&mut html);
    html
}



use std::os::raw::c_ulonglong;

/// The raw parts of a byte buffer allocated by Rust, passed across the C ABI by value.
///
/// `grotty2html_wrap` fills this in from a `Vec<u8>` that it deliberately does not drop, and
/// `grotty2html_free` rebuilds that `Vec<u8>` from the three fields in order to release it.
#[repr(C)]
pub struct StringWrap {
    buf: *mut u8,      // Start of the buffer
    len: c_ulonglong,  // Number of bytes in use
    cap: c_ulonglong,  // Number of bytes allocated
}

/// C entry point for `grotty2html`.
///
/// Converts the `in_len` bytes starting at `in_buf` and returns the resulting HTML as a raw
/// buffer. The buffer is owned by the caller and must be released with `grotty2html_free`.
/// A NULL `in_buf` or an `in_len` of zero is treated as empty input.
///
/// # Safety
///
/// Unless it is NULL, `in_buf` must point to `in_len` readable bytes that stay valid for the
/// duration of the call. The bytes must be valid UTF-8; this is NOT verified here.
#[no_mangle]
pub extern fn grotty2html_wrap(in_buf: *const u8, in_len: c_ulonglong) -> StringWrap {
    // slice::from_raw_parts requires a non-null pointer even for a zero-length slice, and C
    // callers commonly represent an empty buffer as NULL, so that case is handled separately.
    let input = if in_buf.is_null() || in_len == 0 {
        ""
    } else {
        // SAFETY: the pointer is non-null; the caller guarantees that it refers to `in_len`
        // readable bytes of valid UTF-8 (see the function documentation).
        unsafe { std::str::from_utf8_unchecked( std::slice::from_raw_parts(in_buf, in_len as usize) ) }
    };
    let mut out = grotty2html(input).into_bytes();
    let r = StringWrap {
        buf: out.as_mut_ptr(),
        len: out.len() as c_ulonglong,
        cap: out.capacity() as c_ulonglong,
    };
    // Ownership of the allocation moves to the caller; it is reclaimed in grotty2html_free().
    std::mem::forget(out);
    r
}

/// Releases a buffer that was returned by `grotty2html_wrap`.
///
/// A `StringWrap` whose `buf` is NULL is ignored.
///
/// # Safety
///
/// Apart from the NULL case, `buf` must be a value returned by `grotty2html_wrap`, with its
/// fields unmodified, and it must not be freed more than once.
#[no_mangle]
pub extern fn grotty2html_free(buf: StringWrap) {
    // Vec::from_raw_parts must never be given a null pointer, so a zeroed struct is a no-op.
    if buf.buf.is_null() {
        return;
    }
    // SAFETY: the caller guarantees that the three fields are the raw parts of the Vec<u8>
    // that grotty2html_wrap() leaked. Rebuilding the Vec and dropping it frees the allocation.
    drop(unsafe { Vec::from_raw_parts(buf.buf, buf.len as usize, buf.cap as usize) });
}


/*
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Read;
    use test::Bencher;

    fn bench_file(b: &mut Bencher, f: &str) {
        let mut f = std::fs::File::open(f).unwrap();
        let mut buf = String::new();
        f.read_to_string(&mut buf).unwrap();

        b.iter(|| {
            test::black_box(grotty2html(&buf));
        });
    }

    #[bench]
    fn bench_rsync(b: &mut test::Bencher) {
        bench_file(b, "t/rsync.1.output");
    }

    #[bench]
    fn bench_ncdu(b: &mut test::Bencher) {
        bench_file(b, "t/ncdu.1.output");
    }

    #[bench]
    fn bench_javadoc(b: &mut test::Bencher) {
        bench_file(b, "t/javadoc.1.output");
    }

    /*
    #[bench]
    fn bench_wfilter(b: &mut test::Bencher) {
        bench_file(b, "t/wfilter.4.output");
    }
    */
}*/