diff options
author | Yorhel <git@yorhel.nl> | 2016-11-06 13:34:22 +0100 |
---|---|---|
committer | Yorhel <git@yorhel.nl> | 2016-11-06 13:34:22 +0100 |
commit | 1ca43665a19453b128ab7a29009032f93b4d268a (patch) | |
tree | 026c3493633695743bcba70d99d175352f17a36e /indexer | |
parent | 35fab522d6b36c4a151d51a0e79e2650b20d29d6 (diff) |
indexer: Add file caching + Arch Linux indexing
Diffstat (limited to 'indexer')
-rw-r--r-- | indexer/Cargo.lock | 48 | ||||
-rw-r--r-- | indexer/Cargo.toml | 2 | ||||
-rw-r--r-- | indexer/src/main.rs | 20 | ||||
-rw-r--r-- | indexer/src/open.rs | 82 | ||||
-rw-r--r-- | indexer/src/pkg.rs | 54 | ||||
-rw-r--r-- | indexer/src/sys_arch.rs | 128 |
6 files changed, 295 insertions, 39 deletions
diff --git a/indexer/Cargo.lock b/indexer/Cargo.lock index 501d8d3..5b22fa7 100644 --- a/indexer/Cargo.lock +++ b/indexer/Cargo.lock @@ -2,6 +2,7 @@ name = "indexer" version = "0.1.0" dependencies = [ + "chrono 0.2.25 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)", "encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", @@ -13,6 +14,7 @@ dependencies = [ "postgres 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)", "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", "ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "url 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -44,6 +46,15 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] +name = "chrono" +version = "0.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)", + "time 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] name = "clap" version = "2.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -283,6 +294,38 @@ dependencies = [ ] [[package]] +name = "num" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)", + "num-iter 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-integer" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-iter" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)", + "num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "num-traits" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] name = "num_cpus" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -584,6 +627,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d" "checksum bufstream 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7b48dbe2ff0e98fa2f03377d204a9637d3c9816cd431bfe05a8abbd0ea11d074" "checksum byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855" +"checksum chrono 0.2.25 (registry+https://github.com/rust-lang/crates.io-index)" = "9213f7cd7c27e95c2b57c49f0e69b1ea65b27138da84a170133fd21b07659c00" "checksum clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)" = "27dac76762fb56019b04aed3ccb43a770a18f80f9c2eb62ee1a18d9fb4ea2430" "checksum cookie 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0e3d6405328b6edb412158b3b7710e2634e23f3614b9bb1c412df7952489a626" "checksum encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)" = "<none>" @@ -614,6 +658,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum md5 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7df230903ccdffd6b3b4ec21624498ea64c912ce50297846907f0b8e1bb249dd" "checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" "checksum mime 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b5c93a4bd787ddc6e7833c519b73a50883deb5863d76d9b71eb8216fb7f94e66" +"checksum num 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "bde7c03b09e7c6a301ee81f6ddf66d7a28ec305699e3d3b056d2fc56470e3120" +"checksum num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "fb24d9bfb3f222010df27995441ded1e954f8f69cd35021f6bef02ca9552fb92" +"checksum num-iter 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "287a1c9969a847055e1122ec0ea7a5c5d6f72aad97934e131c83d5c08ab4e45c" +"checksum num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "a16a42856a256b39c6d3484f097f6713e14feacd9bfb02290917904fae46c81c" "checksum num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8890e6084723d57d0df8d2720b0d60c6ee67d6c93e7169630e4371e88765dcad" "checksum openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)" = "c4117b6244aac42ed0150a6019b4d953d28247c5dd6ae6f46ae469b5f2318733" "checksum openssl-sys 0.7.17 (registry+https://github.com/rust-lang/crates.io-index)" = "89c47ee94c352eea9ddaf8e364be7f978a3bb6d66d73176572484238dd5a5c3f" diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml index d8ce4a0..a15db65 100644 --- a/indexer/Cargo.toml +++ b/indexer/Cargo.toml @@ -15,3 +15,5 @@ ring = "0.5.3" postgres = "0.12.0" clap = "2.16.3" hyper = "0.9.11" +url = "1.2.3" +chrono = "0.2.25" diff --git a/indexer/src/main.rs b/indexer/src/main.rs index 49c7079..1083559 100644 --- a/indexer/src/main.rs +++ b/indexer/src/main.rs @@ -9,11 +9,15 @@ extern crate ring; extern crate encoding; extern crate postgres; extern crate hyper; +extern crate url; +extern crate chrono; mod archive; mod archread; mod man; +mod open; mod pkg; +mod sys_arch; // Convenience function to get a system id by short-name. Panics if the system doesn't exist. @@ -40,6 +44,12 @@ fn main() { (@arg date: --date +required +takes_value "Package release date") (@arg FILE: +required "Package file") ) + (@subcommand arch => + (about: "Index an Arch Linux repository") + (@arg sys: --sys +required +takes_value "System short-name") + (@arg mirror: --mirror +required +takes_value "Mirror URL") + (@arg repo: --repo +required +takes_value "Repository name") + ) ).get_matches(); let verbose = arg.occurrences_of("v"); @@ -71,7 +81,15 @@ fn main() { pkg: matches.value_of("pkg").unwrap(), ver: matches.value_of("ver").unwrap(), date: matches.value_of("date").unwrap(), - file: matches.value_of("FILE").unwrap() + file: open::Path{ path: matches.value_of("FILE").unwrap(), cache: false, canbelocal: true}, }); } + + if let Some(matches) = arg.subcommand_matches("arch") { + sys_arch::sync(&db, + sysbyshort(&db, matches.value_of("sys").unwrap()), + matches.value_of("mirror").unwrap(), + matches.value_of("repo").unwrap() + ); + } } diff --git a/indexer/src/open.rs b/indexer/src/open.rs new file mode 100644 index 0000000..6919fc4 --- /dev/null +++ b/indexer/src/open.rs @@ -0,0 +1,82 @@ +use std::io::{Read,Result,Error,ErrorKind,copy}; +use std::fs::{File,create_dir_all,metadata}; +use std::hash::{Hash,Hasher,SipHasher}; +use std::time::{Duration,SystemTime}; +use url::Url; +use hyper; + + +const CACHE_PATH: &'static str = "/var/tmp/manned-indexer"; +const CACHE_TIME: u64 = 24*3600; + + +pub struct Path<'a> { + pub path: &'a str, + pub cache: bool, + pub canbelocal: bool, +} + + +fn cache_fn(url: &Url) -> String { + let name = url.path_segments().unwrap().last().unwrap(); + let name = if name == "" { "index" } else { name }; + + let mut hash = SipHasher::new(); + url.hash(&mut hash); + format!("{}/{}-{}-{:x}", CACHE_PATH, url.host_str().unwrap(), name, hash.finish()) +} + + +fn fetch(url: &str) -> Result<Box<Read>> { + let res = try!(hyper::Client::new() + .get(url) + .header(hyper::header::UserAgent("Man page crawler (info@manned.org; https://manned.org/)".to_owned())) + .send() + .map_err(|e| Error::new(ErrorKind::Other, format!("Hyper: {}", e))) + ); + if !res.status.is_success() { + return Err(Error::new(ErrorKind::Other, format!("HTTP: {}", res.status) )); + } + Ok(Box::new(res) as Box<Read>) +} + + +fn file(path: &str) -> Result<Box<Read>> { + Ok(Box::new(try!(File::open(path))) as Box<Read>) +} + + +impl<'a> Path<'a> { + pub fn open(&self) -> Result<Box<Read>> { + if let Ok(url) = Url::parse(self.path) { + if url.scheme() != "http" && url.scheme() != "https" { + return Err(Error::new(ErrorKind::Other, "Invalid scheme")); + } + + if self.cache { + let cfn = cache_fn(&url); + if let Ok(m) = metadata(&cfn) { + if m.modified().unwrap() > SystemTime::now() - Duration::from_secs(CACHE_TIME) { + return file(&cfn); + } + } + try!(create_dir_all(CACHE_PATH)); + { + let mut rd = try!(fetch(url.as_str())); + let mut wr = try!(File::create(&cfn)); + try!(copy(&mut rd, &mut wr)); + } + file(&cfn) + + } else { + fetch(url.as_str()) + } + + } else if self.canbelocal { + file(self.path) + + } else { + Err(Error::new(ErrorKind::Other, "Invalid URL")) + } + } +} diff --git a/indexer/src/pkg.rs b/indexer/src/pkg.rs index 4120988..4d3379d 100644 --- a/indexer/src/pkg.rs +++ b/indexer/src/pkg.rs @@ -1,12 +1,11 @@ use std; use std::io::Read; use postgres; -use hyper; -use archive; +use open; use archread; use man; -use archive::Archive; +use archive::{Archive,ArchiveEntry}; pub struct PkgOpt<'a> { pub force: bool, @@ -15,7 +14,7 @@ pub struct PkgOpt<'a> { pub pkg: &'a str, pub ver: &'a str, pub date: &'a str, // TODO: Option to extract date from package metadata itself - pub file: &'a str + pub file: open::Path<'a> } @@ -32,19 +31,19 @@ fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option<i Ok(r) => r.get(0).get(0), }; - let q = "SELECT id FROM package_versions WHERE package = $1 AND version = $2 AND released = $3::text::date"; - let res = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap(); + let q = "SELECT id FROM package_versions WHERE package = $1 AND version = $2"; + let res = tr.query(q, &[&pkgid, &opt.ver]).unwrap(); let verid : i32; if res.is_empty() { let q = "INSERT INTO package_versions (package, version, released) VALUES($1, $2, $3::text::date) RETURNING id"; verid = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap().get(0).get(0); - trace!("New package pkgid {} verid {}", pkgid, verid); + info!("New package pkgid {} verid {}", pkgid, verid); Some(verid) } else if opt.force { verid = res.get(0).get(0); - trace!("Overwriting package pkgid {} verid {}", pkgid, verid); + info!("Overwriting package pkgid {} verid {}", pkgid, verid); tr.query("DELETE FROM man WHERE package = $1", &[&verid]).unwrap(); Some(verid) @@ -103,50 +102,29 @@ fn insert_link(tr: &postgres::GenericConnection, verid: i32, src: &str, dest: &s } -fn with_pkg<T,F>(file: &str, cb: F) -> std::io::Result<T> - where F: FnOnce(Option<archive::ArchiveEntry>) -> std::io::Result<T> -{ - // TODO: .deb support - - if file.starts_with("http://") || file.starts_with("https://") { - let mut res = try!( - hyper::Client::new().get(file).send() - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("Hyper: {}", e))) - ); - if !res.status.is_success() { - return Err(std::io::Error::new(std::io::ErrorKind::Other, format!("HTTP: {}", res.status) )); - } - let ent = try!(Archive::open_archive(&mut res)); - cb(ent) - - } else { - let mut res = try!(std::fs::File::open(file)); - let ent = try!(Archive::open_archive(&mut res)); - cb(ent) - } -} - - fn index_pkg(tr: &postgres::GenericConnection, opt: &PkgOpt, verid: i32) -> std::io::Result<()> { - let indexfunc = |paths: &[&str], ent: &mut archive::ArchiveEntry| { + let indexfunc = |paths: &[&str], ent: &mut ArchiveEntry| { insert_man(tr, verid, paths, ent); Ok(()) /* Don't propagate errors, continue handling other man pages */ }; - let missed = try!( - with_pkg(opt.file, |ent| { archread::FileList::read(ent, man::ismanpath, &indexfunc) }) - ).links(|src, dest| { insert_link(tr, verid, src, dest) }); + let mut rd = try!(opt.file.open()); + let missed = try!(archread::FileList::read( + try!(Archive::open_archive(&mut rd)), + man::ismanpath, &indexfunc)) + .links(|src, dest| { insert_link(tr, verid, src, dest) }); if let Some(missed) = missed { warn!("Some links were missed, reading package again"); - try!(with_pkg(opt.file, |ent| { missed.read(ent, indexfunc) })) + let mut rd = try!(opt.file.open()); + try!(missed.read(try!(Archive::open_archive(&mut rd)), indexfunc)); } Ok(()) } pub fn pkg(conn: &postgres::GenericConnection, opt: PkgOpt) { - info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file); + info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file.path); let tr = conn.transaction().unwrap(); tr.set_rollback(); diff --git a/indexer/src/sys_arch.rs b/indexer/src/sys_arch.rs new file mode 100644 index 0000000..7a0bf1f --- /dev/null +++ b/indexer/src/sys_arch.rs @@ -0,0 +1,128 @@ +use std::str::FromStr; +use std::io::{Read,BufRead,BufReader,Result}; +use regex::Regex; +use chrono::NaiveDateTime; +use postgres; + +use archive; +use open; +use man; +use pkg; + + +struct Meta { + filename: String, + name: String, + version: String, + date: String, +} + + +fn read_files<T: Read>(lst: T) -> Result<bool> { + let rd = BufReader::new(lst); + for line in rd.lines() { + let line = try!(line); + if man::ismanpath(&line) { + return Ok(true); + } + } + Ok(false) +} + + +fn read_desc(rd: &mut archive::ArchiveEntry) -> Result<Option<Meta>> { + let mut data = String::new(); + try!(rd.take(64*1024).read_to_string(&mut data)); + + let path = rd.path().unwrap(); + lazy_static! { + static ref RE: Regex = Regex::new(r"\s*%([^%]+)%\s*\n\s*([^\n]+)\s*\n").unwrap(); + } + + let mut filename = None; + let mut name = None; + let mut version = None; + let mut builddate = None; + + for kv in RE.captures_iter(&data) { + let key = kv.at(1).unwrap(); + let val = kv.at(2).unwrap(); + trace!("{}: {} = {}", path, key, val); + match key { + "FILENAME" => filename = Some(val), + "NAME" => name = Some(val), + "VERSION" => version = Some(val), + "BUILDDATE" => builddate = i64::from_str(val).ok(), + _ => {}, + } + } + + if filename.is_some() && name.is_some() && version.is_some() && builddate.is_some() { + Ok(Some(Meta { + filename: filename.unwrap().to_string(), + name: name.unwrap().to_string(), + version: version.unwrap().to_string(), + date: NaiveDateTime::from_timestamp(builddate.unwrap(), 0).format("%Y-%m-%d").to_string(), + })) + } else { + warn!("Metadata missing from package description: {}", path); + Ok(None) + } +} + + +// TODO: Switch to x86_64 instead of i686 +pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, repo: &str) { + info!("Reading packages from {} {}", mirror, repo); + + let path = format!("{}/{}/os/i686/{1:}.files.tar.gz", mirror, repo); + let path = open::Path{ path: &path, cache: true, canbelocal: false }; + let mut index = match path.open() { + Err(e) => { error!("Can't read package index: {}", e); return }, + Ok(x) => x, + }; + + let ent = match archive::Archive::open_archive(&mut index) { + Err(e) => { error!("Can't read package index: {}", e); return }, + Ok(x) => x, + }; + + let mut hasman = false; + let mut meta = None; + let r = archive::walk(ent, |x| { + if x.filetype() == archive::FileType::Directory { + hasman = false; + meta = None; + } else if x.path().unwrap().ends_with("/files") { + hasman = try!(read_files(x)); + } else if x.path().unwrap().ends_with("/desc") { + meta = try!(read_desc(x)); + } + + if hasman && meta.is_some() { + hasman = false; + let m = meta.take().unwrap(); + + let p = format!("{}/{}/os/i686/{}", mirror, repo, m.filename); + pkg::pkg(pg, pkg::PkgOpt{ + force: false, + sys: sys, + cat: repo, + pkg: &m.name, + ver: &m.version, + date: &m.date, + file: open::Path{ + path: &p, + cache: false, + canbelocal: false, + }, + }); + } + + Ok(true) + }); + + if let Err(e) = r { + error!("Error reading package index: {}", e); + } +} |