diff options
author | Yorhel <git@yorhel.nl> | 2016-11-26 16:55:47 +0100 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2016-11-26 16:57:05 +0100 |
commit | eb15b6e2c7749e90be443816d2deb0f2a645e691 (patch) | |
tree | 2ea390d96ebae7deb7f4c94d0e1dca1f760444fc | |
parent | de28175cd31f6af26de968d608ec9b42a42cdb16 (diff) |
indexer: Improve Debian Contents file parsing performance by 5.2x
Further improvements can be gained by caching the results of
get_contents(), since the same Contents file is often parsed multiple
times in a single cron run. But this is already a significant
achievement.
-rw-r--r-- | indexer/src/sys_deb.rs | 34 |
1 file changed, 19 insertions, 15 deletions
diff --git a/indexer/src/sys_deb.rs b/indexer/src/sys_deb.rs index 45a40f8..4937b5c 100644 --- a/indexer/src/sys_deb.rs +++ b/indexer/src/sys_deb.rs @@ -18,24 +18,28 @@ fn get_contents(f: Option<open::Path>) -> Result<HashSet<String>> { let rd = archive::Archive::open_raw(&mut fd)?; let brd = BufReader::new(rd); let mut pkgs = HashSet::new(); - let mut filecnt = 0; + let mut filecnt = -1; let mut mancnt = 0; - // Run the regex on bytes instead of strings, as paths aren't always UTF-8. This regex will - // not match non-UTF-8 paths. - let re = Regex::new(r"^(?u:([^\s].*?))\s+(?u:([^\s]+))\s*$").unwrap(); - for line in brd.split(b'\n') { - re.captures(&line?).map(|cap| { - filecnt += 1; - let path = str::from_utf8(cap.at(1).unwrap()).unwrap(); - if man::ismanpath(path) { - mancnt += 1; - pkgs.extend( str::from_utf8(cap.at(2).unwrap()).unwrap().split(',').map(|e| { - e.split('/').last().unwrap().to_string() - }) ); - } - }); + let line = line?; + let line = match str::from_utf8(&line) { Ok(x) => x, _ => continue }; + if line.starts_with("FILE ") { + filecnt = 0; + continue; + } else if filecnt < 0 { + continue; + } + filecnt += 1; + let mut it = line.split(' '); + let pkg = it.next_back().unwrap(); + let path = it.fold(String::new(), |acc, x| acc + " " + x); + if man::ismanpath(&path.trim()) { + mancnt += 1; + pkgs.extend( pkg.split(',').map(|e| { + e.split('/').last().unwrap().to_string() + }) ); + } } debug!("Found {}/{} man files in {} relevant packages from {}", mancnt, filecnt, pkgs.len(), f.path); |