diff options
author | Yorhel <git@yorhel.nl> | 2017-01-18 13:07:26 +0100 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2017-01-18 13:07:42 +0100 |
commit | 8235fb28b89750c0b0492e9a6973c8bfe237affb (patch) | |
tree | 074b0c115e8ba146e8396e1fa458fb3eb0b9eaf2 | |
parent | 608f79eb93749aa0b335f64163cf62f1af20e654 (diff) |
indexer: Fix link resolution and hardlink handling for rpm
Unlike tar, cpio does not have a separate entry for each directory, so
the link resolution can't assume that directory entries exist for each
path component.
I also mistakenly assumed that cpio handled hardlinks similarly to tar,
but that's clearly not the case. libarchive does help a bit, but these
differences still suck.
-rw-r--r-- | indexer/src/archive.rs | 4 | ||||
-rw-r--r-- | indexer/src/archread.rs | 79 | ||||
-rw-r--r-- | indexer/src/pkg.rs | 5 | ||||
-rwxr-xr-x | indexer/tests/mkarchives.sh | 3 |
4 files changed, 79 insertions, 12 deletions
diff --git a/indexer/src/archive.rs b/indexer/src/archive.rs index d651566..f4e91c0 100644 --- a/indexer/src/archive.rs +++ b/indexer/src/archive.rs @@ -253,6 +253,10 @@ impl<'a> ArchiveEntry<'a> { _ => FileType::Other, } } + + pub fn nlink(&self) -> u32 { + unsafe { ffi::archive_entry_nlink(self.e) } + } } diff --git a/indexer/src/archread.rs b/indexer/src/archread.rs index b9543e4..d9c71f3 100644 --- a/indexer/src/archread.rs +++ b/indexer/src/archread.rs @@ -38,6 +38,40 @@ use archive::{walk,ArchiveEntry,FileType}; * (* So apparently some man pages are close to 10MB...) */ +/* How links are handled by libarchive and how we deal with that: + * + * symlinks: + * Simply seen as a file entry containing a path that can be resolved to another file entry. Path + * can be absolute or relative. Symlink can come before or after the referenced file. + * (Handling of this is described above) + * + * tar hardlinks: + * The first occurrence of the file is seen as a regular file. For any future occurrences, + * libarchive will give us the path of the first occurrence in archive_entry_hardlink(), which + * archive.rs will then merge into a Link() entry. + * Basically, hard links can be handled in the exact same way as symlinks. + * + * cpio hardlinks (old): Hard links are duplicated. No special handling needed. + * + * cpio hardlinks (new): + * A. The first occurrence of the file is seen as a regular file with size=0, nlinks>1 + * B. Any later occurrences of the file are seen as a hardlink to the first occurrence. + * C. The last occurrence of the file is also seen as a hardlink to the first occurrence, but + * actually has the correct file size and body. + * How to handle: + * - Entry (A) is inserted in the FileList as an 'Hardlink' + * - Entry (B) is inserted as a normal 'Link' to (A) + * - Entry (C), detected by having size>0, and handled as follows: + * - The entry itself is inserted as a normal 'Link' to (A) + * - If (A) is interesting, the contents of the entry are processed with (A)'s path, and (A)'s + * entry is changed to 'Handled' + * - Then you end up with one 'Handled' entry and several 'Link' entries pointing to it. The link + * resolution mechanism will handle the rest. + * Note that this will not work if (A)'s path itself is not interesting. In that case the archive + * will be read a second time, but all entries pointing to (A), either through hardlinks or + * symlinks pointing to hardlinks, will resolve to the empty file entry. I strongly doubt there + * is a package archive like that. + */ #[derive(Clone,Debug,PartialEq,Eq)] pub enum EntryType { @@ -48,6 +82,8 @@ pub enum EntryType { Regular, // Link to another file (interesting or not is irrelevant) Link(String), + // A hardlink for which we don't have the file contents (yet) + Hardlink, // Directory; need this information when resolving links Directory, // Something that couldn't be an interesting file (chardev/socket/etc); If any link resolves to @@ -71,13 +107,14 @@ impl FileList { * * interest_cb: Called on every path in the archive, should return whether the file is * interesting (i.e. whether we want to know its contents). + * entry_cb: Called once on every entry in the archive. * file_cb: Called on every regular file for which interest_cb() showed an interest. * The callback accepts multiple path names, but this function will only provide one. * * Returns a FileList struct that can be used to retreive all interesting non-regular files. */ - pub fn read<F,G>(ent: Option<ArchiveEntry>, mut interest_cb: F, mut file_cb: G) -> Result<FileList> - where F: FnMut(&ArchiveEntry) -> bool, G: FnMut(&[&str], &mut ArchiveEntry) -> Result<()> + pub fn read<F,G,H>(ent: Option<ArchiveEntry>, interest_cb: F, mut entry_cb: G, mut file_cb: H) -> Result<FileList> + where F: Fn(&str) -> bool, G: FnMut(&ArchiveEntry), H: FnMut(&[&str], &mut ArchiveEntry) -> Result<()> { let mut fl = FileList { seen: HashMap::new(), @@ -90,7 +127,7 @@ impl FileList { None => { warn!("Invalid UTF-8 filename in archive"); return Ok(true) } }; let ft = e.filetype(); - trace!("Archive entry: {:10} {} {:?}", e.size(), path, ft); + trace!("Archive entry: {:10} #{} {} {:?}", e.size(), e.nlink(), path, ft); // We ought to throw away the result of the previous entry with the same name and use // this new entry instead, but fuck it. This case is too rare, so let's just warn. @@ -99,9 +136,13 @@ impl FileList { return Ok(true); } + entry_cb(&e); + let et = match ft { FileType::File => { - if interest_cb(&e) { + if e.size() == 0 && e.nlink() > 1 { + EntryType::Hardlink + } else if interest_cb(&path) { let pathv = [&path as &str]; try!(file_cb(&pathv[..], &mut e)); EntryType::Handled @@ -110,7 +151,18 @@ impl FileList { } }, FileType::Link(l) => { - if interest_cb(&e) { + // cpio hard link for which we have the file contents + if e.size() > 0 && e.nlink() > 1 { + if let Some((EntryType::Hardlink, ocomp)) = fl.resolve_link(&path, &l, 32) { + let opath = ocomp.join("/"); + if interest_cb(&opath) { + let pathv = [&opath as &str]; + try!(file_cb(&pathv[..], &mut e)); + *fl.seen.get_mut(&opath).unwrap() = EntryType::Handled; + } + } + } + if interest_cb(&path) { fl.links.push(path.clone()); } EntryType::Link(l) @@ -160,8 +212,14 @@ impl FileList { // If it's a directory, we're good Some(&EntryType::Directory) => (), + // If we didn't find anything but this isn't the last item, then it's possible that + // this is a directory for which no explicit entry was created. This happens with + // RPM files. Just continue and see if we find anything if we add more components. + None if i != comp.len()-1 => (), + // If it's a file or man page, it must be the last item. Some(& ref x@ EntryType::Regular) | + Some(& ref x@ EntryType::Hardlink) | Some(& ref x@ EntryType::Handled) => return if i == comp.len()-1 { Some((x.clone(), dest)) @@ -176,6 +234,7 @@ impl FileList { // Same as above, with dirs we can continue, files have to be last Some((EntryType::Directory, d)) => dest = d, x@Some((EntryType::Regular, _)) | + x@Some((EntryType::Hardlink, _)) | x@Some((EntryType::Handled, _)) => return if i == comp.len()-1 { x } else { @@ -215,7 +274,9 @@ impl FileList { Some((EntryType::Regular, d)) => { let dstr = d.join("/"); missed.entry(dstr).or_insert_with(Vec::new).push(p.to_string()); - } + }, + Some((EntryType::Hardlink, _)) => + error!("Interesting link resolved to uninteresting hardlink: {}-> {}", p, dest), _ => (), } } @@ -260,7 +321,8 @@ mod tests { let arch = Archive::open_archive(&mut f).unwrap(); let mut cnt = 0; FileList::read(arch, - |p| p.path().unwrap().starts_with("man/man"), + |p| p.starts_with("man/man"), + |_| (), |p,e| { assert_eq!(cnt, 0); cnt += 1; @@ -290,7 +352,7 @@ mod tests { assert_eq!(res("man/man1/symlinkbefore.1"), helloworld); assert_eq!(res("man/man6/symlinkafter.6"), helloworld); - assert_eq!(res("man/man1/badsymlink1.1"), None); + //assert_eq!(res("man/man1/badsymlink1.1"), None); assert_eq!(res("man/man1/badsymlink2.1"), None); assert_eq!(res("man/man1/badsymlink3.1"), None); assert_eq!(res("man/man1/badsymlink4.1"), None); @@ -313,6 +375,7 @@ mod tests { assert_eq!(r.0, p.to_string()); assert_eq!(r.1, "man/man3/helloworld.3".to_string()); }; + res("man/man1/badsymlink1.1"); res("man/man1/doublesymlink1.1"); res("man/man1/doublesymlink2.1"); res("man/man1/symlinkbefore.1"); diff --git a/indexer/src/pkg.rs b/indexer/src/pkg.rs index 907b670..2d725f9 100644 --- a/indexer/src/pkg.rs +++ b/indexer/src/pkg.rs @@ -184,10 +184,7 @@ fn index_pkg(tr: &postgres::GenericConnection, mut opt: PkgOpt, verid: i32) -> s }; let missed = with_pkg(&mut opt, |e, opt| { - archread::FileList::read(e, |ent: &ArchiveEntry| { - opt.date.update(ent); - man::ismanpath(ent.path().unwrap()) - }, &indexfunc) + archread::FileList::read(e, man::ismanpath, |ent| opt.date.update(ent), &indexfunc) })?.links(|src, dest| { insert_link(tr, verid, src, dest) }); if let Some(missed) = missed { diff --git a/indexer/tests/mkarchives.sh b/indexer/tests/mkarchives.sh index 3d6299a..62d995f 100755 --- a/indexer/tests/mkarchives.sh +++ b/indexer/tests/mkarchives.sh @@ -44,6 +44,9 @@ ln man3/helloworld.3 man6/hardlink.6 ln -s ../man3/helloworld.3 man1/symlinkbefore.1 ln -s ../man3/helloworld.3 man6/symlinkafter.6 +# Technically badsymlink1.1 is bad, but we can't detect this because package +# archives don't necessarily list all directories. So it will have to be +# considered valid. ln -s notadir/../../man3/helloworld.3 man1/badsymlink1.1 ln -s man3/helloworld.3 man1/badsymlink2.1 ln -s ../man3/helloworld.3/. man1/badsymlink3.1 |