summaryrefslogtreecommitdiff
path: root/indexer
diff options
context:
space:
mode:
authorYorhel <git@yorhel.nl>2016-11-06 13:34:22 +0100
committerYorhel <git@yorhel.nl>2016-11-06 13:34:22 +0100
commit1ca43665a19453b128ab7a29009032f93b4d268a (patch)
tree026c3493633695743bcba70d99d175352f17a36e /indexer
parent35fab522d6b36c4a151d51a0e79e2650b20d29d6 (diff)
indexer: Add file caching + Arch Linux indexing
Diffstat (limited to 'indexer')
-rw-r--r--indexer/Cargo.lock48
-rw-r--r--indexer/Cargo.toml2
-rw-r--r--indexer/src/main.rs20
-rw-r--r--indexer/src/open.rs82
-rw-r--r--indexer/src/pkg.rs54
-rw-r--r--indexer/src/sys_arch.rs128
6 files changed, 295 insertions, 39 deletions
diff --git a/indexer/Cargo.lock b/indexer/Cargo.lock
index 501d8d3..5b22fa7 100644
--- a/indexer/Cargo.lock
+++ b/indexer/Cargo.lock
@@ -2,6 +2,7 @@
name = "indexer"
version = "0.1.0"
dependencies = [
+ "chrono 0.2.25 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)",
"encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)",
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -13,6 +14,7 @@ dependencies = [
"postgres 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)",
"ring 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
+ "url 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -44,6 +46,15 @@ version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
+name = "chrono"
+version = "0.2.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "num 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
+ "time 0.1.35 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
name = "clap"
version = "2.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -283,6 +294,38 @@ dependencies = [
]
[[package]]
+name = "num"
+version = "0.1.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
+ "num-iter 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
+ "num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "num-integer"
+version = "0.1.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "num-iter"
+version = "0.1.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
+ "num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.1.36"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
name = "num_cpus"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -584,6 +627,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d"
"checksum bufstream 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7b48dbe2ff0e98fa2f03377d204a9637d3c9816cd431bfe05a8abbd0ea11d074"
"checksum byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855"
+"checksum chrono 0.2.25 (registry+https://github.com/rust-lang/crates.io-index)" = "9213f7cd7c27e95c2b57c49f0e69b1ea65b27138da84a170133fd21b07659c00"
"checksum clap 2.17.1 (registry+https://github.com/rust-lang/crates.io-index)" = "27dac76762fb56019b04aed3ccb43a770a18f80f9c2eb62ee1a18d9fb4ea2430"
"checksum cookie 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "0e3d6405328b6edb412158b3b7710e2634e23f3614b9bb1c412df7952489a626"
"checksum encoding 0.3.0-dev (git+https://github.com/lifthrasiir/rust-encoding)" = "<none>"
@@ -614,6 +658,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum md5 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7df230903ccdffd6b3b4ec21624498ea64c912ce50297846907f0b8e1bb249dd"
"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20"
"checksum mime 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b5c93a4bd787ddc6e7833c519b73a50883deb5863d76d9b71eb8216fb7f94e66"
+"checksum num 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "bde7c03b09e7c6a301ee81f6ddf66d7a28ec305699e3d3b056d2fc56470e3120"
+"checksum num-integer 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "fb24d9bfb3f222010df27995441ded1e954f8f69cd35021f6bef02ca9552fb92"
+"checksum num-iter 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "287a1c9969a847055e1122ec0ea7a5c5d6f72aad97934e131c83d5c08ab4e45c"
+"checksum num-traits 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "a16a42856a256b39c6d3484f097f6713e14feacd9bfb02290917904fae46c81c"
"checksum num_cpus 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8890e6084723d57d0df8d2720b0d60c6ee67d6c93e7169630e4371e88765dcad"
"checksum openssl 0.7.14 (registry+https://github.com/rust-lang/crates.io-index)" = "c4117b6244aac42ed0150a6019b4d953d28247c5dd6ae6f46ae469b5f2318733"
"checksum openssl-sys 0.7.17 (registry+https://github.com/rust-lang/crates.io-index)" = "89c47ee94c352eea9ddaf8e364be7f978a3bb6d66d73176572484238dd5a5c3f"
diff --git a/indexer/Cargo.toml b/indexer/Cargo.toml
index d8ce4a0..a15db65 100644
--- a/indexer/Cargo.toml
+++ b/indexer/Cargo.toml
@@ -15,3 +15,5 @@ ring = "0.5.3"
postgres = "0.12.0"
clap = "2.16.3"
hyper = "0.9.11"
+url = "1.2.3"
+chrono = "0.2.25"
diff --git a/indexer/src/main.rs b/indexer/src/main.rs
index 49c7079..1083559 100644
--- a/indexer/src/main.rs
+++ b/indexer/src/main.rs
@@ -9,11 +9,15 @@ extern crate ring;
extern crate encoding;
extern crate postgres;
extern crate hyper;
+extern crate url;
+extern crate chrono;
mod archive;
mod archread;
mod man;
+mod open;
mod pkg;
+mod sys_arch;
// Convenience function to get a system id by short-name. Panics if the system doesn't exist.
@@ -40,6 +44,12 @@ fn main() {
(@arg date: --date +required +takes_value "Package release date")
(@arg FILE: +required "Package file")
)
+ (@subcommand arch =>
+ (about: "Index an Arch Linux repository")
+ (@arg sys: --sys +required +takes_value "System short-name")
+ (@arg mirror: --mirror +required +takes_value "Mirror URL")
+ (@arg repo: --repo +required +takes_value "Repository name")
+ )
).get_matches();
let verbose = arg.occurrences_of("v");
@@ -71,7 +81,15 @@ fn main() {
pkg: matches.value_of("pkg").unwrap(),
ver: matches.value_of("ver").unwrap(),
date: matches.value_of("date").unwrap(),
- file: matches.value_of("FILE").unwrap()
+ file: open::Path{ path: matches.value_of("FILE").unwrap(), cache: false, canbelocal: true},
});
}
+
+ if let Some(matches) = arg.subcommand_matches("arch") {
+ sys_arch::sync(&db,
+ sysbyshort(&db, matches.value_of("sys").unwrap()),
+ matches.value_of("mirror").unwrap(),
+ matches.value_of("repo").unwrap()
+ );
+ }
}
diff --git a/indexer/src/open.rs b/indexer/src/open.rs
new file mode 100644
index 0000000..6919fc4
--- /dev/null
+++ b/indexer/src/open.rs
@@ -0,0 +1,82 @@
+use std::io::{Read,Result,Error,ErrorKind,copy};
+use std::fs::{File,create_dir_all,metadata};
+use std::hash::{Hash,Hasher,SipHasher};
+use std::time::{Duration,SystemTime};
+use url::Url;
+use hyper;
+
+
+const CACHE_PATH: &'static str = "/var/tmp/manned-indexer";
+const CACHE_TIME: u64 = 24*3600;
+
+
+// Describes something that can be opened for reading: an http(s) URL or,
+// when permitted, a local file. See `Path::open` for how the fields are used.
+pub struct Path<'a> {
+ // URL or filesystem path. `open` first tries to parse it as a URL.
+ pub path: &'a str,
+ // When true, URL contents are read from / written to a file under CACHE_PATH.
+ pub cache: bool,
+ // When true, a `path` that does not parse as a URL is opened as a local file.
+ pub canbelocal: bool,
+}
+
+
+// Returns the cache file path for `url`, of the form
+// CACHE_PATH/<host>-<last path segment>-<hex hash of the URL>.
+// Panics (via unwrap) if the URL has no path segments or no host; the caller
+// in `Path::open` only passes URLs whose scheme is http or https.
+fn cache_fn(url: &Url) -> String {
+ // Use the last path segment as a readable name; a URL ending in '/' has an
+ // empty last segment, which is replaced by "index".
+ let name = url.path_segments().unwrap().last().unwrap();
+ let name = if name == "" { "index" } else { name };
+
+ // The hash of the whole URL is appended to the host and name.
+ let mut hash = SipHasher::new();
+ url.hash(&mut hash);
+ format!("{}/{}-{}-{:x}", CACHE_PATH, url.host_str().unwrap(), name, hash.finish())
+}
+
+
+// Performs an HTTP GET on `url` with a fixed User-Agent header and returns
+// the response as a boxed reader.
+// Errors: a hyper failure or a non-success HTTP status is returned as an
+// io::Error of kind Other ("Hyper: ..." / "HTTP: ...").
+fn fetch(url: &str) -> Result<Box<Read>> {
+ // A new client is created for every request.
+ let res = try!(hyper::Client::new()
+ .get(url)
+ .header(hyper::header::UserAgent("Man page crawler (info@manned.org; https://manned.org/)".to_owned()))
+ .send()
+ .map_err(|e| Error::new(ErrorKind::Other, format!("Hyper: {}", e)))
+ );
+ // Turn a non-success status into an error instead of returning its body.
+ if !res.status.is_success() {
+ return Err(Error::new(ErrorKind::Other, format!("HTTP: {}", res.status) ));
+ }
+ Ok(Box::new(res) as Box<Read>)
+}
+
+
+// Opens the local file at `path` and returns it as a boxed reader.
+// Any error from File::open is propagated unchanged.
+fn file(path: &str) -> Result<Box<Read>> {
+ Ok(Box::new(try!(File::open(path))) as Box<Read>)
+}
+
+
+impl<'a> Path<'a> {
+    // Opens the resource described by this `Path` and returns a reader over it.
+    //
+    // - If `path` parses as a URL it must use the http or https scheme,
+    //   otherwise an "Invalid scheme" error is returned.
+    //   NOTE(review): a local path that happens to parse as a URL (e.g. a file
+    //   name containing ':') may be rejected here even when `canbelocal` is
+    //   set; confirm whether that is intended.
+    // - With `cache` set, a cache file younger than CACHE_TIME seconds is
+    //   reused; otherwise the URL is downloaded into the cache and the cached
+    //   copy is returned.
+    // - Without `cache`, the HTTP response itself is returned.
+    // - If `path` is not a URL it is opened as a local file when `canbelocal`
+    //   is set, and rejected with "Invalid URL" otherwise.
+    //
+    // Errors are io::Errors from the filesystem, or of kind Other for HTTP
+    // and validation failures.
+    pub fn open(&self) -> Result<Box<Read>> {
+        if let Ok(url) = Url::parse(self.path) {
+            if url.scheme() != "http" && url.scheme() != "https" {
+                return Err(Error::new(ErrorKind::Other, "Invalid scheme"));
+            }
+
+            if self.cache {
+                let cfn = cache_fn(&url);
+
+                // A missing file or an unavailable modification time counts
+                // as a cache miss rather than a panic.
+                let fresh = metadata(&cfn)
+                    .and_then(|m| m.modified())
+                    .map(|t| t > SystemTime::now() - Duration::from_secs(CACHE_TIME))
+                    .unwrap_or(false);
+                if fresh {
+                    return file(&cfn);
+                }
+
+                try!(create_dir_all(CACHE_PATH));
+
+                // Download into a temporary file and rename it into place only
+                // after the copy succeeded. Writing directly to `cfn` would
+                // leave a truncated file with a fresh mtime behind on a failed
+                // transfer, which later calls would serve as a valid cache hit.
+                let tmp = format!("{}.tmp", cfn);
+                let downloaded = fetch(url.as_str()).and_then(|mut rd| {
+                    let mut wr = try!(File::create(&tmp));
+                    copy(&mut rd, &mut wr).map(|_| ())
+                });
+                if let Err(e) = downloaded {
+                    // Best effort cleanup; the temp file may not exist at all.
+                    let _ = ::std::fs::remove_file(&tmp);
+                    return Err(e);
+                }
+                try!(::std::fs::rename(&tmp, &cfn));
+                file(&cfn)
+
+            } else {
+                fetch(url.as_str())
+            }
+
+        } else if self.canbelocal {
+            file(self.path)
+
+        } else {
+            Err(Error::new(ErrorKind::Other, "Invalid URL"))
+        }
+    }
+}
diff --git a/indexer/src/pkg.rs b/indexer/src/pkg.rs
index 4120988..4d3379d 100644
--- a/indexer/src/pkg.rs
+++ b/indexer/src/pkg.rs
@@ -1,12 +1,11 @@
use std;
use std::io::Read;
use postgres;
-use hyper;
-use archive;
+use open;
use archread;
use man;
-use archive::Archive;
+use archive::{Archive,ArchiveEntry};
pub struct PkgOpt<'a> {
pub force: bool,
@@ -15,7 +14,7 @@ pub struct PkgOpt<'a> {
pub pkg: &'a str,
pub ver: &'a str,
pub date: &'a str, // TODO: Option to extract date from package metadata itself
- pub file: &'a str
+ pub file: open::Path<'a>
}
@@ -32,19 +31,19 @@ fn insert_pkg(tr: &postgres::transaction::Transaction, opt: &PkgOpt) -> Option<i
Ok(r) => r.get(0).get(0),
};
- let q = "SELECT id FROM package_versions WHERE package = $1 AND version = $2 AND released = $3::text::date";
- let res = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap();
+ let q = "SELECT id FROM package_versions WHERE package = $1 AND version = $2";
+ let res = tr.query(q, &[&pkgid, &opt.ver]).unwrap();
let verid : i32;
if res.is_empty() {
let q = "INSERT INTO package_versions (package, version, released) VALUES($1, $2, $3::text::date) RETURNING id";
verid = tr.query(q, &[&pkgid, &opt.ver, &opt.date]).unwrap().get(0).get(0);
- trace!("New package pkgid {} verid {}", pkgid, verid);
+ info!("New package pkgid {} verid {}", pkgid, verid);
Some(verid)
} else if opt.force {
verid = res.get(0).get(0);
- trace!("Overwriting package pkgid {} verid {}", pkgid, verid);
+ info!("Overwriting package pkgid {} verid {}", pkgid, verid);
tr.query("DELETE FROM man WHERE package = $1", &[&verid]).unwrap();
Some(verid)
@@ -103,50 +102,29 @@ fn insert_link(tr: &postgres::GenericConnection, verid: i32, src: &str, dest: &s
}
-fn with_pkg<T,F>(file: &str, cb: F) -> std::io::Result<T>
- where F: FnOnce(Option<archive::ArchiveEntry>) -> std::io::Result<T>
-{
- // TODO: .deb support
-
- if file.starts_with("http://") || file.starts_with("https://") {
- let mut res = try!(
- hyper::Client::new().get(file).send()
- .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("Hyper: {}", e)))
- );
- if !res.status.is_success() {
- return Err(std::io::Error::new(std::io::ErrorKind::Other, format!("HTTP: {}", res.status) ));
- }
- let ent = try!(Archive::open_archive(&mut res));
- cb(ent)
-
- } else {
- let mut res = try!(std::fs::File::open(file));
- let ent = try!(Archive::open_archive(&mut res));
- cb(ent)
- }
-}
-
-
fn index_pkg(tr: &postgres::GenericConnection, opt: &PkgOpt, verid: i32) -> std::io::Result<()> {
- let indexfunc = |paths: &[&str], ent: &mut archive::ArchiveEntry| {
+ let indexfunc = |paths: &[&str], ent: &mut ArchiveEntry| {
insert_man(tr, verid, paths, ent);
Ok(()) /* Don't propagate errors, continue handling other man pages */
};
- let missed = try!(
- with_pkg(opt.file, |ent| { archread::FileList::read(ent, man::ismanpath, &indexfunc) })
- ).links(|src, dest| { insert_link(tr, verid, src, dest) });
+ let mut rd = try!(opt.file.open());
+ let missed = try!(archread::FileList::read(
+ try!(Archive::open_archive(&mut rd)),
+ man::ismanpath, &indexfunc))
+ .links(|src, dest| { insert_link(tr, verid, src, dest) });
if let Some(missed) = missed {
warn!("Some links were missed, reading package again");
- try!(with_pkg(opt.file, |ent| { missed.read(ent, indexfunc) }))
+ let mut rd = try!(opt.file.open());
+ try!(missed.read(try!(Archive::open_archive(&mut rd)), indexfunc));
}
Ok(())
}
pub fn pkg(conn: &postgres::GenericConnection, opt: PkgOpt) {
- info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file);
+ info!("Handling pkg: {} / {} / {} - {} @ {} @ {}", opt.sys, opt.cat, opt.pkg, opt.ver, opt.date, opt.file.path);
let tr = conn.transaction().unwrap();
tr.set_rollback();
diff --git a/indexer/src/sys_arch.rs b/indexer/src/sys_arch.rs
new file mode 100644
index 0000000..7a0bf1f
--- /dev/null
+++ b/indexer/src/sys_arch.rs
@@ -0,0 +1,128 @@
+use std::str::FromStr;
+use std::io::{Read,BufRead,BufReader,Result};
+use regex::Regex;
+use chrono::NaiveDateTime;
+use postgres;
+
+use archive;
+use open;
+use man;
+use pkg;
+
+
+// Package metadata collected by `read_desc` from one package description.
+struct Meta {
+ // Value of the %FILENAME% field.
+ filename: String,
+ // Value of the %NAME% field.
+ name: String,
+ // Value of the %VERSION% field.
+ version: String,
+ // %BUILDDATE% (a Unix timestamp) formatted as "%Y-%m-%d".
+ date: String,
+}
+
+
+// Scans `lst` line by line and returns Ok(true) as soon as one line satisfies
+// man::ismanpath, or Ok(false) if no line does. Read errors are propagated.
+fn read_files<T: Read>(lst: T) -> Result<bool> {
+ let rd = BufReader::new(lst);
+ for line in rd.lines() {
+ let line = try!(line);
+ // Stop at the first match; the remaining lines are not read.
+ if man::ismanpath(&line) {
+ return Ok(true);
+ }
+ }
+ Ok(false)
+}
+
+
+// Parses a package description entry into a `Meta`.
+//
+// At most 64 KiB of the entry is read. The text is scanned for
+// "%KEY%\nvalue\n" pairs; FILENAME, NAME, VERSION and BUILDDATE are used and
+// all other keys are ignored.
+//
+// Returns Ok(Some(meta)) when all four fields are present and BUILDDATE is a
+// representable Unix timestamp, Ok(None) (after logging a warning) otherwise,
+// and Err when reading the entry fails.
+fn read_desc(rd: &mut archive::ArchiveEntry) -> Result<Option<Meta>> {
+    let mut data = String::new();
+    try!(rd.take(64*1024).read_to_string(&mut data));
+
+    let path = rd.path().unwrap();
+    lazy_static! {
+        static ref RE: Regex = Regex::new(r"\s*%([^%]+)%\s*\n\s*([^\n]+)\s*\n").unwrap();
+    }
+
+    let mut filename = None;
+    let mut name = None;
+    let mut version = None;
+    let mut builddate = None;
+
+    for kv in RE.captures_iter(&data) {
+        let key = kv.at(1).unwrap();
+        let val = kv.at(2).unwrap();
+        trace!("{}: {} = {}", path, key, val);
+        match key {
+            "FILENAME" => filename = Some(val),
+            "NAME" => name = Some(val),
+            "VERSION" => version = Some(val),
+            "BUILDDATE" => builddate = i64::from_str(val).ok(),
+            _ => {},
+        }
+    }
+
+    // The timestamp comes from the repository index, so it must not be allowed
+    // to panic the indexer: from_timestamp_opt yields None for out-of-range
+    // values, which is then handled like a missing BUILDDATE.
+    let date = builddate
+        .and_then(|ts| NaiveDateTime::from_timestamp_opt(ts, 0))
+        .map(|dt| dt.format("%Y-%m-%d").to_string());
+
+    match (filename, name, version, date) {
+        (Some(filename), Some(name), Some(version), Some(date)) => Ok(Some(Meta {
+            filename: filename.to_string(),
+            name: name.to_string(),
+            version: version.to_string(),
+            date: date,
+        })),
+        _ => {
+            warn!("Metadata missing from package description: {}", path);
+            Ok(None)
+        }
+    }
+}
+
+
+// Indexes the packages of one Arch Linux repository.
+//
+// pg:     database connection handed on to pkg::pkg.
+// sys:    system id the packages are stored under.
+// mirror: base URL of the mirror.
+// repo:   repository name; also used as the package category.
+//
+// Downloads <mirror>/<repo>/os/i686/<repo>.files.tar.gz, walks its entries and
+// calls pkg::pkg for every package whose file list contains a man page path.
+// Failures are logged with error! and end the sync; nothing is returned.
+// TODO: Switch to x86_64 instead of i686
+pub fn sync(pg: &postgres::GenericConnection, sys: i32, mirror: &str, repo: &str) {
+ info!("Reading packages from {} {}", mirror, repo);
+
+ // `{1:}` refers to positional argument 1 (repo) a second time.
+ let path = format!("{}/{}/os/i686/{1:}.files.tar.gz", mirror, repo);
+ // The index is fetched through the on-disk cache and must be a URL.
+ let path = open::Path{ path: &path, cache: true, canbelocal: false };
+ let mut index = match path.open() {
+ Err(e) => { error!("Can't read package index: {}", e); return },
+ Ok(x) => x,
+ };
+
+ let ent = match archive::Archive::open_archive(&mut index) {
+ Err(e) => { error!("Can't read package index: {}", e); return },
+ Ok(x) => x,
+ };
+
+ // State for the package currently being walked: whether its file list
+ // contains a man page, and its parsed description.
+ let mut hasman = false;
+ let mut meta = None;
+ let r = archive::walk(ent, |x| {
+ if x.filetype() == archive::FileType::Directory {
+ // A directory entry resets the per-package state.
+ // NOTE(review): presumably each package has its own directory in the
+ // index, so this marks the start of the next package -- confirm.
+ hasman = false;
+ meta = None;
+ } else if x.path().unwrap().ends_with("/files") {
+ hasman = try!(read_files(x));
+ } else if x.path().unwrap().ends_with("/desc") {
+ meta = try!(read_desc(x));
+ }
+
+ // Index the package once both pieces are known; clearing hasman and
+ // taking meta ensures it is handled only once.
+ if hasman && meta.is_some() {
+ hasman = false;
+ let m = meta.take().unwrap();
+
+ let p = format!("{}/{}/os/i686/{}", mirror, repo, m.filename);
+ // Package files are fetched without caching and must be URLs.
+ pkg::pkg(pg, pkg::PkgOpt{
+ force: false,
+ sys: sys,
+ cat: repo,
+ pkg: &m.name,
+ ver: &m.version,
+ date: &m.date,
+ file: open::Path{
+ path: &p,
+ cache: false,
+ canbelocal: false,
+ },
+ });
+ }
+
+ // NOTE(review): presumably tells archive::walk to continue with the next
+ // entry -- confirm against archive::walk.
+ Ok(true)
+ });
+
+ if let Err(e) = r {
+ error!("Error reading package index: {}", e);
+ }
+}