summary refs log tree commit diff
diff options
context:
space:
mode:
author Yorhel <git@yorhel.nl> 2016-11-26 16:55:47 +0100
committer Yorhel <git@yorhel.nl> 2016-11-26 16:57:05 +0100
commit eb15b6e2c7749e90be443816d2deb0f2a645e691 (patch)
tree 2ea390d96ebae7deb7f4c94d0e1dca1f760444fc
parent de28175cd31f6af26de968d608ec9b42a42cdb16 (diff)
indexer: Improve Debian Contents file parsing performance by 5.2x
Further improvements can be gained by caching the results of get_contents(), since the same Contents file is often parsed multiple times in a single cron run. But this is already a significant achievement.
-rw-r--r-- indexer/src/sys_deb.rs 34
1 file changed, 19 insertions, 15 deletions
diff --git a/indexer/src/sys_deb.rs b/indexer/src/sys_deb.rs
index 45a40f8..4937b5c 100644
--- a/indexer/src/sys_deb.rs
+++ b/indexer/src/sys_deb.rs
@@ -18,24 +18,28 @@ fn get_contents(f: Option<open::Path>) -> Result<HashSet<String>> {
let rd = archive::Archive::open_raw(&mut fd)?;
let brd = BufReader::new(rd);
let mut pkgs = HashSet::new();
- let mut filecnt = 0;
+ let mut filecnt = -1;
let mut mancnt = 0;
- // Run the regex on bytes instead of strings, as paths aren't always UTF-8. This regex will
- // not match non-UTF-8 paths.
- let re = Regex::new(r"^(?u:([^\s].*?))\s+(?u:([^\s]+))\s*$").unwrap();
-
for line in brd.split(b'\n') {
- re.captures(&line?).map(|cap| {
- filecnt += 1;
- let path = str::from_utf8(cap.at(1).unwrap()).unwrap();
- if man::ismanpath(path) {
- mancnt += 1;
- pkgs.extend( str::from_utf8(cap.at(2).unwrap()).unwrap().split(',').map(|e| {
- e.split('/').last().unwrap().to_string()
- }) );
- }
- });
+ let line = line?;
+ let line = match str::from_utf8(&line) { Ok(x) => x, _ => continue };
+ if line.starts_with("FILE ") {
+ filecnt = 0;
+ continue;
+ } else if filecnt < 0 {
+ continue;
+ }
+ filecnt += 1;
+ let mut it = line.split(' ');
+ let pkg = it.next_back().unwrap();
+ let path = it.fold(String::new(), |acc, x| acc + " " + x);
+ if man::ismanpath(&path.trim()) {
+ mancnt += 1;
+ pkgs.extend( pkg.split(',').map(|e| {
+ e.split('/').last().unwrap().to_string()
+ }) );
+ }
}
debug!("Found {}/{} man files in {} relevant packages from {}", mancnt, filecnt, pkgs.len(), f.path);