diff options
author | Yorhel <git@yorhel.nl> | 2016-12-29 09:27:19 +0100 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2016-12-29 09:27:19 +0100 |
commit | 8d6e7bc2d8f66ef002fe4ef90c6e216a43cce538 (patch) | |
tree | 07c61058337fee8b74bb9114594bde2c6cac9a0d /indexer | |
parent | eac4b6ac774b7a69ae7f52ba1854575c2eecb1cb (diff) |
indexer: Prioritize 7bit encodings when decoding man pages
Fixes parsing of https://manned.org/xshisen/ae5d469f
Diffstat (limited to 'indexer')
-rw-r--r-- | indexer/src/man.rs | 32 |
1 files changed, 15 insertions, 17 deletions
diff --git a/indexer/src/man.rs b/indexer/src/man.rs index 90dfa03..9df594d 100644 --- a/indexer/src/man.rs +++ b/indexer/src/man.rs @@ -108,7 +108,8 @@ fn codec_from_tag(data: &Vec<u8>) -> Option<EncodingRef> { let tag = str::from_utf8(cap.at(1).unwrap()).unwrap().to_lowercase(); match &tag[..] { - // Deny some common UTF-8-compatible encodings. These tags are obviously incorrect. + // Deny some common UTF-8-compatible encodings. These tags are irrelevant because we're + // testing for UTF-8 anyway.. "us-ascii" | "ascii" | "utf8" | "utf-8" | "utf-8-unix" => None, // latin-1 isn't in the whatwg spec under that name @@ -205,27 +206,24 @@ pub fn decode(paths: &[&str], ent: &mut Read) -> io::Result<(digest::Digest,&'st let dig = digest::digest(&digest::SHA1, &data); - // TODO: Handle BOM? UTF-16? - // TODO: This fails badly for ISO-2022-JP. How the hell do we cleanly fix that? - // If it passes as UTF-8, then just consider it UTF-8. - if let Ok(_) = str::from_utf8(&data) { - return Ok((dig, "utf8", unsafe { String::from_utf8_unchecked(data) } )); + // Create a list of encodings to try, starting with UTF-8 + let mut encs : Vec<EncodingRef> = vec![all::UTF_8]; + encs.extend(codec_from_tag(&data)); + encs.extend(paths.iter().filter_map(|&e| codec_from_path(e))); + + // ISO-2022-JP is a 7bit encoding, and must be tested before UTF-8 + if encs.iter().any(|&e| e.name() == (all::ISO_2022_JP as EncodingRef).name()) { + encs.insert(0, all::ISO_2022_JP); } - // Otherwise, look for a coding tag in the contents - if let Some(e) = codec_from_tag(&data) { + + // Try the encodings in order, use the first one that succeeds + for e in encs { if let Ok(s) = e.decode(&data, encoding::DecoderTrap::Strict) { return Ok((dig, e.name(), s)); } } - // If that fails as well, look for clues in the file path. - for path in paths { - if let Some(e) = codec_from_path(path) { - if let Ok(s) = e.decode(&data, encoding::DecoderTrap::Strict) { - return Ok((dig, e.name(), s)); - } - } - } - // If all else fails, use a lossy iso-8859-1 + + // Fall back to lossy ISO-8859-1 if all else failed Ok((dig, "iso-8859-1", (all::ISO_8859_1 as EncodingRef).decode(&data, encoding::DecoderTrap::Ignore).unwrap() )) } |