diff options
author | Yorhel <git@yorhel.nl> | 2016-10-26 18:26:06 +0200 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2016-10-29 09:33:39 +0200 |
commit | c8bb4da246e949b5ec1b5ebc3b260105fce88132 (patch) | |
tree | a276918274e2bf1ba60933a93e413b76a8a88b90 /indexer | |
parent | 022e9acc4f7e42b807bb63e021655cc79ce9f398 (diff) |
Use libarchive3-sys crate directly + improve archread API
This all should offer a more convenient and robust interface to handle
all sorts of archives.
Diffstat (limited to 'indexer')
-rw-r--r-- | indexer/Cargo.lock | 30 | ||||
-rw-r--r-- | indexer/Cargo.toml | 4 | ||||
-rw-r--r-- | indexer/src/archive.rs | 535 | ||||
-rw-r--r-- | indexer/src/archread.rs | 363 | ||||
-rw-r--r-- | indexer/src/main.rs | 92 | ||||
-rw-r--r-- | indexer/src/man.rs | 85 | ||||
-rwxr-xr-x | indexer/tests/mktar.sh | 13 | ||||
-rw-r--r-- | indexer/tests/simpletest.tar.gz | bin | 0 -> 247 bytes | |||
-rw-r--r-- | indexer/tests/testarchive.tar.xz | bin | 616 -> 616 bytes |
9 files changed, 703 insertions, 419 deletions
diff --git a/indexer/Cargo.lock b/indexer/Cargo.lock index 53c701d..dbd3057 100644 --- a/indexer/Cargo.lock +++ b/indexer/Cargo.lock @@ -3,9 +3,11 @@ name = "indexer" version = "0.1.0" dependencies = [ "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", - "libarchive 0.1.1 (git+https://github.com/17dec/libarchive-rust)", + "lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", + "libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -22,7 +24,7 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -35,13 +37,9 @@ dependencies = [ ] [[package]] -name = "libarchive" -version = "0.1.1" -source = "git+https://github.com/17dec/libarchive-rust#3f723cf0064561f21f0cebbd534a75076e6dbcaa" -dependencies = [ - "libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)", -] +name = "lazy_static" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "libarchive3-sys" @@ -77,19 +75,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "regex" -version = "0.1.77" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 0.1.11 
(registry+https://github.com/rust-lang/crates.io-index)", - "regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", "thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "regex-syntax" -version = "0.3.7" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -128,14 +126,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" -"checksum libarchive 0.1.1 (git+https://github.com/17dec/libarchive-rust)" = "<none>" +"checksum lazy_static 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "49247ec2a285bb3dcb23cbd9c35193c025e7251bfce77c1d5da97e6362dffe7f" "checksum libarchive3-sys 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3cd3beae8f59a4c7a806523269b5392037577c150446e88d684dfa6de6031ca7" "checksum libc 0.2.17 (registry+https://github.com/rust-lang/crates.io-index)" = "044d1360593a78f5c8e5e710beccdc24ab71d1f01bc19a29bcacdba22e8475d8" "checksum log 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ab83497bf8bf4ed2a74259c1c802351fcd67a65baa86394b6ba73c36f4838054" "checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" "checksum pkg-config 0.3.8 
(registry+https://github.com/rust-lang/crates.io-index)" = "8cee804ecc7eaf201a4a207241472cc870e825206f6c031e3ee2a72fa425f2fa" -"checksum regex 0.1.77 (registry+https://github.com/rust-lang/crates.io-index)" = "64b03446c466d35b42f2a8b203c8e03ed8b91c0f17b56e1f84f7210a257aa665" -"checksum regex-syntax 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "48f0573bcee95a48da786f8823465b5f2a1fae288a55407aca991e5b3e0eae11" +"checksum regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)" = "4fd4ace6a8cf7860714a2c2280d6c1f7e6a413486c13298bbc86fd3da019402f" +"checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957" "checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" "checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" "checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml index 7b1b918..4fd665f 100644 --- a/indexer/Cargo.toml +++ b/indexer/Cargo.toml @@ -7,4 +7,6 @@ authors = ["Yorhel <git@yorhel.nl>"] regex = "0.1.77" log = "0.3.6" env_logger = "0.3.5" -libarchive = { git = "https://github.com/17dec/libarchive-rust" } +lazy_static = "0.2.1" +libc = "0.2.17" +libarchive3-sys = "0.1.2" diff --git a/indexer/src/archive.rs b/indexer/src/archive.rs index f1b9403..17e85c4 100644 --- a/indexer/src/archive.rs +++ b/indexer/src/archive.rs @@ -1,368 +1,273 @@ -use std::path::Path; -use std::collections::HashMap; -use libarchive::reader::Reader as ArchiveReader; -use libarchive::reader::{FileReader,Builder}; -use libarchive::archive::{Entry,FileType,ReadFormat,ReadFilter}; -use libarchive::error::ArchiveResult; - - 
-pub fn open_file<T: AsRef<Path>>(path: T) -> ArchiveResult<FileReader> { - let mut builder = Builder::new(); - try!(builder.support_format(ReadFormat::All)); - try!(builder.support_filter(ReadFilter::All)); - builder.open_file(path) +use std::str; +use std::ptr; +use std::error::Error as ErrorTrait; +use std::io::{Result,Error,Read}; +use std::ffi::{CStr,CString}; + +use libc::{c_void,ssize_t}; +use libarchive3_sys::ffi; + + +/* This is a safe, limited and opinionated wrapper around the libarchive C bindings. + * I initially used the libarchive crate, but it has several issues. Some of which are not fixable + * without a complete rewrite. + * - Panics on non-UTF8 path names + * - Panics on hard links (PR #6) + * - API is far too flexible, easy to misuse and get panics/segfaults + * - Impossible to correctly read files from an archive (issue #7) + * - Does not provide a convenient Read interface for files + * + * Barring any unexpected behaviour or bugs in libarchive, the API below should not panic or + * segfault for any archive or usage pattern. + */ + +pub struct Archive<'a> { + a: *mut ffi::Struct_archive, + rd: &'a mut Read, + buf: Vec<u8>, + err: Option<Error>, } -#[derive(Clone,Debug,PartialEq,Eq)] -pub enum EntryType { - // Regular file that has been handled/indexed - Handled, - // Regular file that hasn't been handled because the caller wasn't interested in it. Could - // still be an interesting file if it is referenced from an interesting path. - Regular, - // Link to another file (interesting or not is irrelevant) - Link(String), - // Directory; need this information when resolving links - Directory, - // Something that couldn't be a an interesting file (chardev/socket/etc); If any link resolves - // to this we know we're done. - Other, +pub struct ArchiveEntry<'a> { + a: Box<Archive<'a>>, + e: *mut ffi::Struct_archive_entry, } -/* - * I had hoped that reading man pages from an archive would just be a simple: - * - * 1. 
Walk through all files in the archive in a streaming fashion - * 2. Parse/index man pages - * - * But alas, it was not to be. Symlinks and hardlinks have ruined it. Now we have to... - * - * 1. Walk through all entries in the archive in a streaming fashion - * 2. Parse/index regular file man pages - * 3. Keep track of all paths in the archive - * 4. Use the result of step (3) to resolve symlinks/hardlinks to their actual file - * 5. Read the entire damn archive again if one of the links resolved to a file that was not - * recognized as a man page in step (2). Luckily, this isn't very common. - * - * And this doesn't even cover the problem of duplicate entries in a tar, which is also quite - * annoying to handle. - * - * What annoys me the most about all of this is that it's not possible to stream an archive from - * the network and read/index the entire thing in a single step. Now we have to buffer packages to - * disk in order to be able to read the archive a second time. - * - * (Note that it is possible to resolve links while walking through the entries, which will allow - * us to match files found later in the archive against links found earlier, thus potentially - * saving the need to read the archive a second time. This is merely a performance improvement for - * an uncommon case, and it certainly won't simplify the code) - * - * (Note that it's also possible to just flush all files <10MB* to disk to completely avoid the - * need for a second archive read, but that's going to significantly slow down the common case in - * order to handle a rare case. It's possible to further optimize this using some heuristics to - * determine whether a file is potentially a man page, but that's both complex and may not even - * save much) - * - * (* So apparently some man pages are close to 10MB...) - */ -pub struct Reader { - // List of seen files. 
This is used to resolve links - seen: HashMap<String, EntryType>, - // List of interesting links - links: Vec<String>, - // List of files we have to read in a second walk through the archive - missedfiles: HashMap<String, Vec<String>>, +#[derive(Debug,PartialEq,Eq)] +pub enum FileType { + File, + Directory, + Link(String), + Other, // Also includes Link(<non-utf8-path>) } -// Generalized API: -// 1. Read once -// reader.read(file, interest_cb, file_cb) -> Error -// file: A libarchive::Reader -// interest_cb(path) -> bool -// Called on every file/link name, should return whether it's a file the caller is interested -// in. (e.g. parse_path(), but also +DESC and other metadata). -// file_cb(path, reader, entry) -> Error -// Called on every interesting (actual) file, given the (normalized?) path, the -// libarchive::Reader and a ReaderEntry -// -// 2. Read links -// reader.links(link_cb) -> Error -// link_cb(path, dest) -> Error -// Called on every link which has as 'dest' a file path that has already been given to -// file_cb() before. -// -// 3. (Optionally) read a second time -// if reader.need_reread() { -// reader.reread(file, file_cb) -// } -impl Reader { - pub fn new() -> Reader { - Reader { - seen: HashMap::new(), - links: Vec::new(), - missedfiles: HashMap::new(), +unsafe extern "C" fn archive_read_cb(_: *mut ffi::Struct_archive, data: *mut c_void, buf: *mut *const c_void) -> ssize_t { + let arch: &mut Archive = &mut *(data as *mut Archive); + *buf = arch.buf.as_mut_ptr() as *mut c_void; + match arch.rd.read(&mut arch.buf[..]) { + Ok(s) => s as ssize_t, + Err(e) => { + let desc = CString::new(e.description()).unwrap(); + let fmt = CString::new("%s").unwrap(); + ffi::archive_set_error(arch.a, e.raw_os_error().unwrap_or(0), fmt.as_ptr(), desc.as_ptr()); + arch.err = Some(e); + -1 } } +} - // Convenience function to read the path/type/link from the next header. 
- fn read_header(rd: &mut ArchiveReader) -> Option<(String, EntryType)> { - let ent = match rd.next_header() { - Some(x) => x, - None => return None, - }; - let path = ent.pathname().trim_left_matches('/').trim_left_matches("./").trim_right_matches('/').to_string(); - - // Hard links are apparently relative to the root of the archive. - let link = ent.hardlink().map(|x| format!("/{}", x)) - .or(ent.symlink().map(str::to_string)); - - let(fts, ret) = match ent.filetype() { - FileType::BlockDevice => ("blk", EntryType::Other), - FileType::SymbolicLink => ("sym", match link { Some(l) => EntryType::Link(l), _ => EntryType::Other }), - FileType::Socket => ("sck", EntryType::Other), - FileType::CharacterDevice => ("chr", EntryType::Other), - FileType::Directory => ("dir", EntryType::Directory), - FileType::NamedPipe => ("fif", EntryType::Other), - FileType::Mount => ("mnt", EntryType::Other), - FileType::RegularFile => ("reg", EntryType::Regular), - FileType::Unknown => ("unk", match link { Some(l) => EntryType::Link(l), _ => EntryType::Other }), - }; - trace!("Archive entry: {}{:10} bytes, path={:?} type={:?}", fts, ent.size(), path, ret); - Some((path, ret)) +impl<'a> Archive<'a> { + fn new(rd: &mut Read, a: *mut ffi::Struct_archive) -> Result<Box<Archive>> { + let bufsize = 64*1024; + let mut buf = Vec::with_capacity(bufsize); + unsafe { buf.set_len(bufsize) }; + let mut ret = Box::new(Archive { a: a, rd: rd, buf: buf, err: None }); + + let aptr: *mut c_void = &mut *ret as *mut Archive as *mut c_void; + let r = unsafe { ffi::archive_read_open(a, aptr, None, Some(archive_read_cb), None) }; + if r == ffi::ARCHIVE_FATAL { + return Err(ret.error()); + } + Ok(ret) } - pub fn read<F,G>(&mut self, rd: &mut ArchiveReader, interest_cb: F, mut file_cb: G) -> ArchiveResult<()> - where F: Fn(&str) -> bool, G: FnMut(&[&str], &mut ArchiveReader) -> ArchiveResult<()> - { - while let Some((path, t)) = Self::read_header(rd) { - // We ought to throw away the result of the previous 
entry with the same name and use - // this new entry instead, but fuck it. This case is too rare, so let's just warn! it. - if let Some(_) = self.seen.get(&path) { - warn!("Duplicate file entry: {}", path); - continue; - } + fn error(&mut self) -> Error { + // TODO: Do something with the description + self.err.take().unwrap_or_else(|| + Error::from_raw_os_error(unsafe { ffi::archive_errno(self.a) }) + ) + } - let mut newt = t; - match newt { - EntryType::Regular if interest_cb(&path) => { - let pathv = [&path as &str]; - try!(file_cb(&pathv[..], rd)); - newt = EntryType::Handled - }, - EntryType::Link(_) if interest_cb(&path) => { - self.links.push(path.clone()); - }, - _ => () - }; - self.seen.insert(path, newt); + fn entry(self: Box<Self>) -> Result<Option<ArchiveEntry<'a>>> { + let mut ent = ArchiveEntry { + a: self, + e: ptr::null_mut() + }; + let res = unsafe { ffi::archive_read_next_header(ent.a.a, &mut ent.e) }; + match res { + ffi::ARCHIVE_EOF => Ok(None), + ffi::ARCHIVE_FATAL => Err(ent.a.error()), + _ => Ok(Some(ent)) } - Ok(()) } - // This is basically realpath(), using the virtual filesystem in self.seen. - // This method is not particularly efficient, it allocates like crazy. 
- fn resolve_link(&self, base: &str, path: &str, depth: usize) -> Option<(EntryType, Vec<String>)> { - if depth < 1 { - warn!("Unresolved link: {} -> {}; Recursion depth exceeded", base, path); - return None + fn read(&mut self, buf: &mut [u8]) -> Result<usize> { + let cbuf = buf.as_mut_ptr() as *mut c_void; + let n = unsafe { ffi::archive_read_data(self.a, cbuf, buf.len()) }; + if n >= 0 { + Ok(n as usize) + } else { + Err(self.error()) } + } - // Remove filename from the base - let basedir = if let Some(i) = base.rfind('/') { base.split_at(i).0 } else { return None }; - - let comp : Vec<&str> = - if path.starts_with('/') { path.split('/').collect() } - else { basedir.split('/').chain(path.split('/')).collect() }; + pub fn open_archive(rd: &mut Read) -> Result<Option<ArchiveEntry>> { + let a = unsafe { + let a = ffi::archive_read_new(); + ffi::archive_read_support_filter_all(a); + ffi::archive_read_support_format_all(a); + a + }; + try!(Self::new(rd, a)).entry() + } +} - let mut dest = Vec::new(); - for (i, &c) in comp.iter().enumerate() { - if c == "" || c == "." { - continue; - } - if c == ".." { - if dest.len() > 1 { - dest.pop(); - } - continue; - } - dest.push(c.to_string()); - let curpath = dest.join("/"); - match self.seen.get(&curpath) { - - // If it's a directory, we're good - Some(&EntryType::Directory) => (), - - // If it's a file or man page, it must be the last item. - Some(& ref x@ EntryType::Regular) | - Some(& ref x@ EntryType::Handled) => return - if i == comp.len()-1 { - Some((x.clone(), dest)) - } else { - warn!("Unresolved link: {} -> {}; Non-directory component", base, path); - None - }, - - // Links... 
Ugh - Some(&EntryType::Link(ref d)) => { - match self.resolve_link(&curpath, &d, depth-1) { - // Same as above, with dirs we can continue, files have to be last - Some((EntryType::Directory, d)) => dest = d, - x@Some((EntryType::Regular, _)) | - x@Some((EntryType::Handled, _)) => return - if i == comp.len()-1 { x } - else { - warn!("Unresolved link: {} -> {}; Non-directory link component", base, path); - None - }, - _ => return None, - } - }, - - // Don't care about anything else, just stop. - _ => { - warn!("Unresolved link: {} -> {}; Special or non-existing file", base, path); - return None - } - } +impl<'a> Drop for Archive<'a> { + fn drop(&mut self) { + unsafe { + ffi::archive_read_free(self.a); } - Some((EntryType::Directory, dest)) } +} + - pub fn links<F>(&mut self, mut cb: F) where F: FnMut(&str, &str) { - for p in self.links.iter() { - let dest = match self.seen.get(p) { Some(&EntryType::Link(ref x)) => x, _ => unreachable!() }; - - match self.resolve_link(&p, dest, 32) { - Some((EntryType::Handled, d)) => { - let dstr = d.join("/"); - cb(&p, &dstr) - }, - Some((EntryType::Regular, d)) => { - let dstr = d.join("/"); - self.missedfiles.entry(dstr).or_insert_with(Vec::new).push(p.to_string()); - } - _ => {}, +impl<'a> ArchiveEntry<'a> { + pub fn next(self) -> Result<Option<ArchiveEntry<'a>>> { + self.a.entry() + } + + // Returns None in NULL (when does that even happen?) or on invalid UTF-8. + pub fn path(&self) -> Option<&str> { + let c_str: &CStr = unsafe { + let ptr = ffi::archive_entry_pathname(self.e); + if ptr.is_null() { + return None; } - } - // We can reclaim this memory early. - self.links = Vec::new(); - self.seen = HashMap::new(); + CStr::from_ptr(ptr) + }; + str::from_utf8(c_str.to_bytes()).ok() + // Perform some simple opinionated normalization. Full normalization might be better, + // but also slower and more complex. This solution covers the most important cases. 
+ .map(|s| s.trim_left_matches('/').trim_left_matches("./").trim_right_matches('/')) } - pub fn need_reread(&self) -> bool { - self.missedfiles.len() > 0 + pub fn size(&self) -> usize { + unsafe { ffi::archive_entry_size(self.e) as usize } } - pub fn reread<G>(&mut self, rd: &mut ArchiveReader, mut file_cb: G) -> ArchiveResult<()> - where G: FnMut(&[&str], &mut ArchiveReader) -> ArchiveResult<()> - { - while let Some((path, _)) = Self::read_header(rd) { - if let Some(f) = self.missedfiles.remove(&path) { - let v: Vec<&str> = f.iter().map(|x| x as &str).collect(); - try!(file_cb(&v, rd)) + fn symlink(&self) -> Option<String> { + let c_str: &CStr = unsafe { + let ptr = ffi::archive_entry_symlink(self.e); + if ptr.is_null() { + return None; } - if self.missedfiles.len() < 1 { - break; + CStr::from_ptr(ptr) + }; + str::from_utf8(c_str.to_bytes()).map(str::to_string).ok() + } + + fn hardlink(&self) -> Option<String> { + let c_str: &CStr = unsafe { + let ptr = ffi::archive_entry_hardlink(self.e); + if ptr.is_null() { + return None; } + CStr::from_ptr(ptr) + }; + // Hard links have the same name as an earlier pathname(), and those typically don't have a + // preceding slash. Add this slash here so that the same resolution logic can be used for + // both hardlinks and symlinks. I really don't care about the difference between these two. + str::from_utf8(c_str.to_bytes()).map(|p| format!("/{}", p)).ok() + } + + pub fn filetype(&self) -> FileType { + // If it has a symlink/hardlink path, then just consider it a link regardless of what + // _filetype() says. 
+ if let Some(l) = self.symlink().or(self.hardlink()) { + return FileType::Link(l); + } + match unsafe { ffi::archive_entry_filetype(self.e) } { + ffi::AE_IFDIR => FileType::Directory, + ffi::AE_IFREG => FileType::File, + _ => FileType::Other, } - Ok(()) } } +impl<'a> Read for ArchiveEntry<'a> { + fn read(&mut self, buf: &mut [u8]) -> Result<usize> { + self.a.read(buf) + } +} + + +// We can't provide an Iterator object for ArchiveEntries because Rust doesn't support streaming +// iterators. Let's instead provide a walk function for convenience. +// cb should return Ok(true) to continue, Ok(false) to break +pub fn walk<F>(ent: Option<ArchiveEntry>, mut cb: F) -> Result<()> + where F: FnMut(&mut ArchiveEntry) -> Result<bool> +{ + let mut ent = ent; + while let Some(mut e) = ent { + if !try!(cb(&mut e)) { + break; + } + ent = try!(e.next()); + } + Ok(()) +} + #[cfg(test)] mod tests { use super::*; - use env_logger; - - fn test_read(r: &mut Reader) { - let mut f = open_file("tests/testarchive.tar.xz").unwrap(); - let mut files = Vec::new(); - r.read(&mut f, - |p| p.starts_with("man/man"), - |p,_| { files.extend(p.iter().map(|x| x.to_string())); Ok(()) } - ).unwrap(); - assert_eq!(files, vec!["man/man3/helloworld.3".to_string()]); - } + use std; + use std::io::Read; + use std::fs::File; - fn test_resolve_links(r: &mut Reader) { - let res = |p| { - if let Some(&EntryType::Link(ref l)) = r.seen.get(p) { - r.resolve_link(p, &l, 5) - } else { - panic!("Not found or not a link: {}", p); - } - }; - let helloworld = Some((EntryType::Handled, vec!["man".to_string(), "man3".to_string(), "helloworld.3".to_string()])); - - assert_eq!(res("man/mans"), Some((EntryType::Directory, vec!["man".to_string(), "man3".to_string()]))); - assert_eq!(res("man/man6/hardlink.6"), helloworld); - assert_eq!(res("man/man1/symlinkbefore.1"), helloworld); - assert_eq!(res("man/man6/symlinkafter.6"), helloworld); - - assert_eq!(res("man/man1/badsymlink1.1"), None); - 
assert_eq!(res("man/man1/badsymlink2.1"), None); - assert_eq!(res("man/man1/badsymlink3.1"), None); - assert_eq!(res("man/man1/badsymlink4.1"), None); - assert_eq!(res("man/man1/badsymlink5.1"), None); - - assert_eq!(res("man/man1/doublesymlink1.1"), helloworld); - assert_eq!(res("man/man1/doublesymlink2.1"), helloworld); - assert_eq!(res("man/man1/triplesymlink.1"), helloworld); - assert_eq!(res("man/man1/infinitesymlink.1"), None); + #[test] + fn invalid_archive() { + let mut r = std::io::repeat(0x0a).take(64*1024); + let ent = Archive::open_archive(&mut r); + assert!(ent.is_err()); } - fn test_links(r: &mut Reader) { - let mut links = Vec::new(); - r.links(|p,d| links.push((p.to_string(), d.to_string()))); - links.sort(); - - { - let mut res = |p:&str| { - let r = links.remove(0); - assert_eq!(r.0, p.to_string()); - assert_eq!(r.1, "man/man3/helloworld.3".to_string()); - }; - res("man/man1/doublesymlink1.1"); - res("man/man1/doublesymlink2.1"); - res("man/man1/symlinkbefore.1"); - res("man/man1/triplesymlink.1"); - res("man/man6/hardlink.6"); - res("man/man6/symlinkafter.6"); - } - assert_eq!(links.len(), 0); + #[test] + fn zerolength_archive() { + let mut r = std::io::empty(); + let ent = Archive::open_archive(&mut r); + // I expected an error here rather than None, whatever. 
+ assert!(ent.unwrap().is_none()); } - fn test_reread(r: &mut Reader) { - assert!(r.need_reread()); + #[test] + fn read() { + let mut f = File::open("tests/simpletest.tar.gz").unwrap(); + let mut ent = Archive::open_archive(&mut f).unwrap().unwrap(); + + let t = |e:&mut ArchiveEntry, path, size, ft, cont| { + assert_eq!(e.path(), path); + assert_eq!(e.size(), size); + assert_eq!(e.filetype(), ft); + let mut contents = String::new(); + assert_eq!(e.read_to_string(&mut contents).unwrap(), size); + assert_eq!(&contents, cont); + }; - let mut f = open_file("tests/testarchive.tar.xz").unwrap(); - let mut files = Vec::new(); - r.reread(&mut f, - |p,_| { files.extend(p.iter().map(|x| x.to_string())); Ok(()) } - ).unwrap(); + t(&mut ent, Some("simple"), 0, FileType::Directory, ""); - files.sort(); - assert_eq!(files, vec![ - "man/man3/needreread.3".to_string(), - "man/man6/needreread.6".to_string() - ]); - } + ent = ent.next().unwrap().unwrap(); + t(&mut ent, Some("simple/file"), 3, FileType::File, "Hi\n"); - #[test] - fn test_reader() { - env_logger::init().unwrap(); - - let mut r = Reader::new(); - test_read(&mut r); - test_resolve_links(&mut r); - test_links(&mut r); - test_reread(&mut r); + ent = ent.next().unwrap().unwrap(); + t(&mut ent, Some("simple/link"), 0, FileType::Link("file".to_string()), ""); + + ent = ent.next().unwrap().unwrap(); + t(&mut ent, Some("simple/hardlink"), 0, FileType::Link("/simple/file".to_string()), ""); + + ent = ent.next().unwrap().unwrap(); + t(&mut ent, Some("simple/fifo"), 0, FileType::Other, ""); + + ent = ent.next().unwrap().unwrap(); + t(&mut ent, None, 0, FileType::File, ""); + + assert!(ent.next().unwrap().is_none()); } } diff --git a/indexer/src/archread.rs b/indexer/src/archread.rs new file mode 100644 index 0000000..22086f8 --- /dev/null +++ b/indexer/src/archread.rs @@ -0,0 +1,363 @@ +use std::io::Result; +use std::collections::HashMap; + +use archive::{walk,ArchiveEntry,FileType}; + +/* I had hoped that reading man pages from 
an archive would just be a simple: + * + * 1. Walk through all files in the archive in a streaming fashion + * 2. Parse/index man pages + * + * But alas, it was not to be. Symlinks and hardlinks have ruined it. Now we have to... + * + * 1. Walk through all entries in the archive in a streaming fashion + * 2. Parse/index regular file man pages + * 3. Keep track of all paths in the archive + * 4. Use the result of step (3) to resolve symlinks/hardlinks to their actual file + * 5. Read the entire damn archive again if one of the links resolved to a file that was not + * recognized as a man page in step (2). Luckily, this isn't very common. + * + * And this doesn't even cover the problem of duplicate entries in a tar, which is also quite + * annoying to handle. + * + * What annoys me the most about all of this is that it's not possible to stream an archive from + * the network and read/index the entire thing in a single step. Now we either have to buffer + * packages to disk or redownload the archive in order to be able to follow all links to man pages. + * + * (Note that it is possible to resolve links while walking through the entries, which will allow + * us to match files found later in the archive against links found earlier, thus potentially + * saving the need to read the archive a second time. This is merely a performance improvement for + * an uncommon case, and it certainly won't simplify the code) + * + * (Note that it's also possible to just flush all files <10MB* to disk to completely avoid the + * need for a second archive read, but that's going to significantly slow down the common case in + * order to handle a rare case. It's possible to further optimize this using some heuristics to + * determine whether a file is potentially a man page, but that's both complex and may not even + * save much) + * + * (* So apparently some man pages are close to 10MB...) 
+ */ + + +#[derive(Clone,Debug,PartialEq,Eq)] +pub enum EntryType { + // Regular file that has been handled/indexed + Handled, + // Regular file that hasn't been handled because the caller wasn't interested in it. Could + // still be an interesting file if it is referenced from an interesting path. + Regular, + // Link to another file (interesting or not is irrelevant) + Link(String), + // Directory; need this information when resolving links + Directory, + // Something that couldn't be an interesting file (chardev/socket/etc); If any link resolves to + // this we know we're done. + Other, +} + +pub struct FileList { + // List of seen files. This is used to resolve links + seen: HashMap<String, EntryType>, + // List of interesting links + links: Vec<String>, +} + +pub struct MissedFiles(HashMap<String, Vec<String>>); + + +impl FileList { + + /* Read an archive until the end. Accepts two callbacks: + * + * interest_cb: Called on every path in the archive, should return whether the file is + * interesting (i.e. whether we want to know its contents). + * file_cb: Called on every regular file for which interest_cb() showed an interest. + * The callback accepts multiple path names, but this function will only provide one. + * + * Returns a FileList struct that can be used to retrieve all interesting non-regular files. + */ + pub fn read<F,G>(ent: Option<ArchiveEntry>, interest_cb: F, mut file_cb: G) -> Result<FileList> + where F: Fn(&str) -> bool, G: FnMut(&[&str], &mut ArchiveEntry) -> Result<()> + { + let mut fl = FileList { + seen: HashMap::new(), + links: Vec::new(), + }; + + try!(walk(ent, |mut e| { + let path = match e.path() { + Some(x) => x.to_string(), + None => { warn!("Invalid UTF-8 filename in archive"); return Ok(true) } + }; + let ft = e.filetype(); + trace!("Archive entry: {:10} {} {:?}", e.size(), path, ft); + + // We ought to throw away the result of the previous entry with the same name and use + // this new entry instead, but fuck it. 
This case is too rare, so let's just warn. + if let Some(_) = fl.seen.get(&path) { + warn!("Duplicate file entry: {}", path); + return Ok(true); + } + + let et = match ft { + FileType::File => { + if interest_cb(&path) { + let pathv = [&path as &str]; + try!(file_cb(&pathv[..], &mut e)); + EntryType::Handled + } else { + EntryType::Regular + } + }, + FileType::Link(l) => { + if interest_cb(&path) { + fl.links.push(path.clone()); + } + EntryType::Link(l) + }, + FileType::Directory => EntryType::Directory, + FileType::Other => EntryType::Other, + }; + + fl.seen.insert(path, et); + Ok(true) + })); + Ok(fl) + } + + + // This is basically realpath(), using the virtual filesystem in self.seen. + // This method is not particularly efficient, it allocates like crazy. + fn resolve_link(&self, base: &str, path: &str, depth: usize) -> Option<(EntryType, Vec<String>)> { + if depth < 1 { + warn!("Unresolved link: {} -> {}; Recursion depth exceeded", base, path); + return None + } + + // Remove filename from the base + let basedir = if let Some(i) = base.rfind('/') { base.split_at(i).0 } else { return None }; + + let comp : Vec<&str> = + if path.starts_with('/') { path.split('/').collect() } + else { basedir.split('/').chain(path.split('/')).collect() }; + + let mut dest = Vec::new(); + + for (i, &c) in comp.iter().enumerate() { + if c == "" || c == "." { + continue; + } + if c == ".." { + if dest.len() > 1 { + dest.pop(); + } + continue; + } + dest.push(c.to_string()); + let curpath = dest.join("/"); + match self.seen.get(&curpath) { + + // If it's a directory, we're good + Some(&EntryType::Directory) => (), + + // If it's a file or man page, it must be the last item. + Some(& ref x@ EntryType::Regular) | + Some(& ref x@ EntryType::Handled) => return + if i == comp.len()-1 { + Some((x.clone(), dest)) + } else { + warn!("Unresolved link: {} -> {}; Non-directory component", base, path); + None + }, + + // Links... 
Ugh + Some(&EntryType::Link(ref d)) => { + match self.resolve_link(&curpath, &d, depth-1) { + // Same as above, with dirs we can continue, files have to be last + Some((EntryType::Directory, d)) => dest = d, + x@Some((EntryType::Regular, _)) | + x@Some((EntryType::Handled, _)) => return + if i == comp.len()-1 { x } + else { + warn!("Unresolved link: {} -> {}; Non-directory link component", base, path); + None + }, + _ => return None, + } + }, + + // Don't care about anything else, just stop. + _ => { + warn!("Unresolved link: {} -> {}; Special or non-existing file", base, path); + return None + } + } + } + Some((EntryType::Directory, dest)) + } + + /* Calls cb() on every 'interesting' link to a file that has already been passed to a file_cb() + * in FileList::read(). + * If there are any interesting links that have not yet been passed to file_cb(), a MissedFiles + * struct is returned that can be used to retrieve those files by re-reading the archive. + */ + pub fn links<F>(self, mut cb: F) -> Option<MissedFiles> where F: FnMut(&str, &str) { + let mut missed = HashMap::new(); + + for p in self.links.iter() { + let dest = match self.seen.get(p) { Some(&EntryType::Link(ref x)) => x, _ => unreachable!() }; + + match self.resolve_link(&p, dest, 32) { + Some((EntryType::Handled, d)) => { + let dstr = d.join("/"); + cb(&p, &dstr); + }, + Some((EntryType::Regular, d)) => { + let dstr = d.join("/"); + missed.entry(dstr).or_insert_with(Vec::new).push(p.to_string()); + } + _ => (), + } + } + + if missed.len() > 0 { + Some(MissedFiles(missed)) + } else { + None + } + } +} + + +impl MissedFiles { + /* Reads the archive again and calls file_cb() on every interesting file that was missed during + * the first read of the archive (using FileList::{read,links}). file_cb is exactly the same as + * in FileList::read, but this time it can actually get multiple paths as first argument; which + * happens when multiple interesting links point to the same file. 
*/ + pub fn read<G>(mut self, ent: Option<ArchiveEntry>, mut file_cb: G) -> Result<()> + where G: FnMut(&[&str], &mut ArchiveEntry) -> Result<()> + { + walk(ent, |mut e| { + if let Some(f) = e.path().and_then(|p| self.0.remove(p)) { + let v: Vec<&str> = f.iter().map(|x| x as &str).collect(); + try!(file_cb(&v, &mut e)) + } + Ok(self.0.len() > 0) + }) + } +} + + +#[cfg(test)] +mod tests { + use super::*; + use archive::Archive; + use std::io::Read; + use std::fs::File; + + fn test_read() -> FileList { + let mut f = File::open("tests/testarchive.tar.xz").unwrap(); + let arch = Archive::open_archive(&mut f).unwrap(); + let mut cnt = 0; + FileList::read(arch, + |p| p.starts_with("man/man"), + |p,e| { + assert_eq!(cnt, 0); + cnt += 1; + assert_eq!(p, &["man/man3/helloworld.3"][..]); + assert_eq!(e.size(), 12); + + let mut cont = String::new(); + e.read_to_string(&mut cont).unwrap(); + assert_eq!(&cont, "Hello World\n"); + Ok(()) + } + ).unwrap() + } + + fn test_resolve_links(r: &FileList) { + let res = |p| { + if let Some(&EntryType::Link(ref l)) = r.seen.get(p) { + r.resolve_link(p, &l, 5) + } else { + panic!("Not found or not a link: {}", p); + } + }; + let helloworld = Some((EntryType::Handled, vec!["man".to_string(), "man3".to_string(), "helloworld.3".to_string()])); + + assert_eq!(res("man/mans"), Some((EntryType::Directory, vec!["man".to_string(), "man3".to_string()]))); + assert_eq!(res("man/man6/hardlink.6"), helloworld); + assert_eq!(res("man/man1/symlinkbefore.1"), helloworld); + assert_eq!(res("man/man6/symlinkafter.6"), helloworld); + + assert_eq!(res("man/man1/badsymlink1.1"), None); + assert_eq!(res("man/man1/badsymlink2.1"), None); + assert_eq!(res("man/man1/badsymlink3.1"), None); + assert_eq!(res("man/man1/badsymlink4.1"), None); + assert_eq!(res("man/man1/badsymlink5.1"), None); + + assert_eq!(res("man/man1/doublesymlink1.1"), helloworld); + assert_eq!(res("man/man1/doublesymlink2.1"), helloworld); + assert_eq!(res("man/man1/triplesymlink.1"), 
helloworld); + assert_eq!(res("man/man1/infinitesymlink.1"), None); + } + + fn test_links(r: FileList) -> Option<MissedFiles> { + let mut links = Vec::new(); + let missed = r.links(|p,d| links.push((p.to_string(), d.to_string()))); + links.sort(); + + { + let mut res = |p:&str| { + let r = links.remove(0); + assert_eq!(r.0, p.to_string()); + assert_eq!(r.1, "man/man3/helloworld.3".to_string()); + }; + res("man/man1/doublesymlink1.1"); + res("man/man1/doublesymlink2.1"); + res("man/man1/symlinkbefore.1"); + res("man/man1/triplesymlink.1"); + res("man/man6/hardlink.6"); + res("man/man6/symlinkafter.6"); + } + assert_eq!(links.len(), 0); + missed + } + + fn test_reread(r: MissedFiles) { + let mut f = File::open("tests/testarchive.tar.xz").unwrap(); + let ent = Archive::open_archive(&mut f).unwrap(); + let mut files = Vec::new(); + r.read(ent, + |p,e| { + let mut cont = String::new(); + e.read_to_string(&mut cont).unwrap(); + files.extend(p.iter().map(|x| (x.to_string(), cont.clone()) )); + Ok(()) + } + ).unwrap(); + files.sort(); + + { + let mut res = |a:&str, b:&str| { + let r = files.remove(0); + assert_eq!(&r.0, a); + assert_eq!(&r.1, b); + }; + res("man/man3/needreread.3", "Potentially interesting file\n"); + res("man/man6/needreread.6", "Potentially interesting file\n"); + } + assert_eq!(files.len(), 0); + } + + #[test] + fn test_reader() { + //use env_logger; + //env_logger::init().unwrap(); + + let r = test_read(); + test_resolve_links(&r); + let l = test_links(r).unwrap(); + test_reread(l); + } +} diff --git a/indexer/src/main.rs b/indexer/src/main.rs index d0eb4e2..5661568 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -1,97 +1,15 @@ #[macro_use] extern crate log; +#[macro_use] extern crate lazy_static; extern crate env_logger; -extern crate libarchive; extern crate regex; - -use regex::Regex; +extern crate libarchive3_sys; +extern crate libc; mod archive; - - -// Checks a path for a man page candidate. 
Returns None if it doesn't seem like a man page -// location, otherwise Some((manPageName, Section, Locale)). -fn parse_path(path: &str) -> Option<(&str, &str, &str)> { - // Roughly: man[/locale]/man1/manpage.section[.compression]+ - // TODO: lazy_static - let re = Regex::new(r"(?x) - man - (?: / ([^/]+) )? # Optional locale - /man[a-z0-9]/ # Subdir - ([^/]+?) # Man page name (non-greedy) - \. ([^/\.]+) # Section - (?: \. (?: gz|lzma|bz2|xz ))* $ # Any number of compression extensions - ").unwrap(); - - let cap = match re.captures(path) { Some(x) => x, None => return None }; - let locale = cap.at(1).unwrap_or(""); - let name = cap.at(2).unwrap(); - let section = cap.at(3).unwrap(); - - // Not everything matching the regex is necessarily a man page, exclude some special cases. - match (name, section, locale) { - // Files that totally aren't man pages - ("Makefile", "in", _) | - ("Makefile", "am", _) | - (".cvsignore", _, _) | - (_, "gz", _) | - (_, "lzma", _) | - (_, "bz2", _) | - (_, "xz", _) | - (_, "html", _) => None, - // Some weird directories that happen to match the locale - (n, s, "5man") | - (n, s, "c") | - (n, s, "man1") | - (n, s, "man2") | - (n, s, "man3") | - (n, s, "man4") | - (n, s, "man5") | - (n, s, "man6") | - (n, s, "man7") | - (n, s, "man8") | - (n, s, "Man-Part1") | - (n, s, "Man-Part2") => Some((n, s, "")), - // Nothing special! 
- x => Some(x) - } -} - +mod archread; +mod man; fn main() { env_logger::init().unwrap(); info!("Hello, world!"); } - - -#[test] -fn test_parse_path() { - // Generic tests - assert_eq!(parse_path("/"), None); - assert_eq!(parse_path("/man1/ncdu.1"), None); - assert_eq!(parse_path("/man/man?/ncdu.1"), None); - assert_eq!(parse_path("/man/man1/ncdu.1"), Some(("ncdu", "1", ""))); - assert_eq!(parse_path("/man/man1/ncdu.1.gz.lzma.xz.bz2.gz"), Some(("ncdu", "1", ""))); // This stuff happens - assert_eq!(parse_path("/man/en_US.UTF-8/man1/ncdu.1"), Some(("ncdu", "1", "en_US.UTF-8"))); - - // Special cases - assert_eq!(parse_path("/usr/share/man/man1/INDEX"), None); - assert_eq!(parse_path("/usr/share/man/man1/Makefile"), None); - assert_eq!(parse_path("/usr/share/man/man1/Makefile.am"), None); - assert_eq!(parse_path("/usr/share/man/man1/Makefile.in"), None); - assert_eq!(parse_path("/usr/share/man/man1/.cvsignore"), None); - assert_eq!(parse_path("/usr/share/man/man1/.cvsignore.gz"), None); - - // Some actual locations - assert_eq!(parse_path("/usr/local/man/man1/list_audio_tracks.1.gz"), Some(("list_audio_tracks", "1", ""))); - assert_eq!(parse_path("/usr/local/lib/perl5/site_perl/man/man3/DBIx::Class::Helper::ResultSet::DateMethods1::Announcement.3.gz"), Some(("DBIx::Class::Helper::ResultSet::DateMethods1::Announcement", "3", ""))); - assert_eq!(parse_path("/usr/man/man3/exit.3tk"), Some(("exit", "3tk", ""))); - assert_eq!(parse_path("/usr/local/brlcad/share/man/mann/exit.nged.gz"), Some(("exit", "nged", ""))); - assert_eq!(parse_path("/usr/X11R6/man/man3/intro.3xglut.gz"), Some(("intro", "3xglut", ""))); - assert_eq!(parse_path("/usr/local/share/man/ko_KR.eucKR/man3/intro.3.gz"), Some(("intro", "3", "ko_KR.eucKR"))); - - assert_eq!(parse_path("/usr/lib/scilab/man/Man-Part1/man1/ans.1"), Some(("ans", "1", ""))); - assert_eq!(parse_path("/heirloom/usr/share/man/5man/man1/chgrp.1.gz"), Some(("chgrp", "1", ""))); - - 
assert_eq!(parse_path("/usr/local/plan9/man/man8/index.html"), None); - assert_eq!(parse_path("/usr/local/share/doc/gmt/html/man/grdpaste.html"), None); -} diff --git a/indexer/src/man.rs b/indexer/src/man.rs new file mode 100644 index 0000000..b268fe8 --- /dev/null +++ b/indexer/src/man.rs @@ -0,0 +1,85 @@ +use regex::Regex; + + +// Checks a path for a man page candidate. Returns None if it doesn't seem like a man page +// location, otherwise Some((manPageName, Section, Locale)). +fn parse_path(path: &str) -> Option<(&str, &str, &str)> { + // Roughly: man[/locale]/man1/manpage.section[.compression]+ + lazy_static! { + static ref RE: Regex = Regex::new(r"(?x) + man + (?: / ([^/]+) )? # Optional locale + /man[a-z0-9]/ # Subdir + ([^/]+?) # Man page name (non-greedy) + \. ([^/\.]+) # Section + (?: \. (?: gz|lzma|bz2|xz ))* $ # Any number of compression extensions + ").unwrap(); + } + + let cap = match RE.captures(path) { Some(x) => x, None => return None }; + let locale = cap.at(1).unwrap_or(""); + let name = cap.at(2).unwrap(); + let section = cap.at(3).unwrap(); + + // Not everything matching the regex is necessarily a man page, exclude some special cases. + match (name, section, locale) { + // Files that totally aren't man pages + ("Makefile", "in", _) | + ("Makefile", "am", _) | + (".cvsignore", _, _) | + (_, "gz", _) | + (_, "lzma", _) | + (_, "bz2", _) | + (_, "xz", _) | + (_, "html", _) => None, + // Some weird directories that happen to match the locale + (n, s, "5man") | + (n, s, "c") | + (n, s, "man1") | + (n, s, "man2") | + (n, s, "man3") | + (n, s, "man4") | + (n, s, "man5") | + (n, s, "man6") | + (n, s, "man7") | + (n, s, "man8") | + (n, s, "Man-Part1") | + (n, s, "Man-Part2") => Some((n, s, "")), + // Nothing special! 
+ x => Some(x) + } +} + + +#[test] +fn test_parse_path() { + // Generic tests + assert_eq!(parse_path("/"), None); + assert_eq!(parse_path("/man1/ncdu.1"), None); + assert_eq!(parse_path("/man/man?/ncdu.1"), None); + assert_eq!(parse_path("/man/man1/ncdu.1"), Some(("ncdu", "1", ""))); + assert_eq!(parse_path("/man/man1/ncdu.1.gz.lzma.xz.bz2.gz"), Some(("ncdu", "1", ""))); // This stuff happens + assert_eq!(parse_path("/man/en_US.UTF-8/man1/ncdu.1"), Some(("ncdu", "1", "en_US.UTF-8"))); + + // Special cases + assert_eq!(parse_path("/usr/share/man/man1/INDEX"), None); + assert_eq!(parse_path("/usr/share/man/man1/Makefile"), None); + assert_eq!(parse_path("/usr/share/man/man1/Makefile.am"), None); + assert_eq!(parse_path("/usr/share/man/man1/Makefile.in"), None); + assert_eq!(parse_path("/usr/share/man/man1/.cvsignore"), None); + assert_eq!(parse_path("/usr/share/man/man1/.cvsignore.gz"), None); + + // Some actual locations + assert_eq!(parse_path("/usr/local/man/man1/list_audio_tracks.1.gz"), Some(("list_audio_tracks", "1", ""))); + assert_eq!(parse_path("/usr/local/lib/perl5/site_perl/man/man3/DBIx::Class::Helper::ResultSet::DateMethods1::Announcement.3.gz"), Some(("DBIx::Class::Helper::ResultSet::DateMethods1::Announcement", "3", ""))); + assert_eq!(parse_path("/usr/man/man3/exit.3tk"), Some(("exit", "3tk", ""))); + assert_eq!(parse_path("/usr/local/brlcad/share/man/mann/exit.nged.gz"), Some(("exit", "nged", ""))); + assert_eq!(parse_path("/usr/X11R6/man/man3/intro.3xglut.gz"), Some(("intro", "3xglut", ""))); + assert_eq!(parse_path("/usr/local/share/man/ko_KR.eucKR/man3/intro.3.gz"), Some(("intro", "3", "ko_KR.eucKR"))); + + assert_eq!(parse_path("/usr/lib/scilab/man/Man-Part1/man1/ans.1"), Some(("ans", "1", ""))); + assert_eq!(parse_path("/heirloom/usr/share/man/5man/man1/chgrp.1.gz"), Some(("chgrp", "1", ""))); + + assert_eq!(parse_path("/usr/local/plan9/man/man8/index.html"), None); + assert_eq!(parse_path("/usr/local/share/doc/gmt/html/man/grdpaste.html"), 
None); +} diff --git a/indexer/tests/mktar.sh b/indexer/tests/mktar.sh index 3f5ba4e..9f8844d 100755 --- a/indexer/tests/mktar.sh +++ b/indexer/tests/mktar.sh @@ -4,6 +4,19 @@ # way. The tests will fail quite badly if hardlink.6 is considered the # "original" version. + +mkdir simple +echo Hi >simple/file +ln -s file simple/link +ln simple/file simple/hardlink +mkfifo simple/fifo +badfn=`echo 'Héllö.txt' | iconv -t ISO-8859-1` +touch $badfn +tar -czf simpletest.tar.gz simple $badfn +rm -rf $badfn simple + + + mkdir man cd man diff --git a/indexer/tests/simpletest.tar.gz b/indexer/tests/simpletest.tar.gz Binary files differ new file mode 100644 index 0000000..06a535c --- /dev/null +++ b/indexer/tests/simpletest.tar.gz diff --git a/indexer/tests/testarchive.tar.xz b/indexer/tests/testarchive.tar.xz Binary files differ index 0fe9760..55b69f7 100644 --- a/indexer/tests/testarchive.tar.xz +++ b/indexer/tests/testarchive.tar.xz |