storkcli: Process streams from stork async instead of generating a hierarchy synchronously
I'm sure the hierarchical output will come back in a future update, but for now
this massively speeds up link gathering using storkcli.
Diff
storkcli/src/main.rs | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------
1 file changed, 52 insertions(+), 24 deletions(-)
@@ -1,6 +1,11 @@
use std::hash::{Hash, Hasher};
use futures::{pin_mut, StreamExt};
use std::collections::VecDeque;
use stork_http::{HttpStorkable, Link};
use failure::Fallible;
use stork::FilterSet;
use stork_http::{filters::*, HttpStorkable, Link};
#[derive(argh::FromArgs)]
@@ -11,31 +16,52 @@
max_depth: Option<usize>,
#[argh(switch, short = 'o')]
same_origin: bool,
#[argh(positional)]
url: Link,
}
/// Builds a closure that pairs each stream item with a fixed `depth`.
///
/// The returned closure takes one `Fallible<HttpStorkable>` and returns it
/// unchanged as the first element of a tuple, with the captured `depth` as the
/// second element. `depth` is moved into the closure, so the closure does not
/// borrow from the caller.
fn make_tuple_fn(
    depth: usize,
) -> impl Fn(failure::Fallible<HttpStorkable>) -> (Fallible<HttpStorkable>, usize) {
    // `usize` is `Copy`, so the captured depth can be handed out on every call.
    move |outcome: Fallible<HttpStorkable>| (outcome, depth)
}
#[tokio::main]
async fn main() -> failure::Fallible<()> {
let args: Args = argh::from_env();
let url = args.url;
let mut filters = FilterSet::default();
if args.same_origin {
filters = filters.add_filter(DomainFilter::new(url.url().host().unwrap().to_string()));
}
let stream = HttpStorkable::new(url).exec();
pin_mut!(stream);
let queue = futures::stream::SelectAll::new();
pin_mut!(queue);
let mut queue = stream.map(|v| (v, 0)).collect::<VecDeque<_>>().await;
queue.push(Box::pin(
HttpStorkable::new(url)
.with_filters(filters)
.exec()
.map(make_tuple_fn(0)),
));
if queue.is_empty() {
panic!("Failed to find any links on the page!");
}
let mut seen = Vec::new();
loop {
if queue.is_empty() {
let value = queue.next().await;
if value.is_none() {
break;
}
let (link, depth) = queue.pop_front().unwrap();
let (link, depth) = value.unwrap();
if let Err(e) = link {
eprintln!("Failed to grab a link: {}", e);
@@ -43,8 +69,21 @@
}
let link = link.unwrap();
let hash = {
let mut hash = twox_hash::XxHash64::default();
link.val().hash(&mut hash);
hash.finish()
};
if seen.contains(&hash) {
continue;
} else {
seen.push(hash);
}
println!("{}↳ {}", " ".repeat(depth), link.val().url());
println!("{}", link.val().url());
if let Some(max_depth) = args.max_depth {
if depth >= max_depth {
@@ -54,18 +93,7 @@
let children = link.exec();
pin_mut!(children);
while let Some(v) = children.next().await {
queue.push_front((v, depth + 1));
}
queue.push(Box::pin(link.exec().map(make_tuple_fn(depth + 1))));
}
Ok(())