//! `stork` is a simple futures-based library to recursively crawl
//! sources in a search engine-like fashion. stork was designed from the
//! ground to have a simple API that is easy to use and can be reused
//! across multiple protocols, yielding each result giving end users the
//! freedom to do BFS, DFS or any type of search they may so wish.
//!
//! Your entry point into stork is the [Storkable::new] function. Have
//! a look through the [Storkable] struct's documentation for your
//! entry into the world of storking.
//!
//! *Note: you're probably not looking for this library on its own but
//! a protocol implementation of it. See below for some first-party
//! implementations:*
//! - [stork_http](../../../stork_http/)
#![recursion_limit = "256"]
#[macro_use]
extern crate failure_derive;
pub mod errors;
pub mod filters;
pub use errors::StorkError;
pub use filters::FilterSet;
use async_stream::try_stream;
use futures::prelude::*;
use std::pin::Pin;
use std::sync::{Arc, RwLock};
use failure::Error;
use failure::ResultExt;
use std::hash::{Hash, Hasher};
/// A [Storkable] represents a "thing" which is traversable ("storkable").
///
/// To start "storking" an initial [Storkable] can be constructed with
/// with [Storkable::new], once initialised filters can be added using
/// [Storkable::with_filters].
///
/// After a [Storkable] has been initialised, the storking can begin
/// with a call to [Storkable::exec] which will return a
/// stream of more [Storkable]s (with the filters from the parent
/// [Storkable] copied) which in turn can also be storked if necessary.
///
/// A Storkable derives its functionality from its two generics,
/// `T` and `C: StorkClient<T>`. The `StorkClient` implementation will
/// be called with a value of `T`, and is expected to return all the
/// values of `T` that can be found on the given `T`.
#[derive(Debug, Clone)]
pub struct Storkable<T: Unpin + PartialEq + Hash, C: StorkClient<T>> {
value: T,
filters: FilterSet<T>,
client: Arc<C>,
parent: Option<Arc<Storkable<T, C>>>,
seen: Arc<RwLock<Vec<u64>>>,
}
impl<'a, T: Unpin + PartialEq + Hash + 'a, C: StorkClient<T> + 'a> Storkable<T, C> {
/// Instantiates a new [Storkable] from a T, storking can then
/// begin on the given entrypoint using the [Storkable::exec] method.
pub fn new(val: T) -> Self {
Self {
value: val,
filters: FilterSet::default(),
client: Arc::new(C::default()),
parent: None,
seen: Arc::new(RwLock::new(Vec::new())),
}
}
/// Attaches a [FilterSet] to this [Storkable] and any children
/// found after executing this one.
pub fn with_filters(mut self, filters: FilterSet<T>) -> Self {
self.filters = filters;
self
}
/// Replaces the default [StorkClient] with a new one accepting
/// and returning the same type for this [Storkable].
pub fn with_client(mut self, client: C) -> Self {
self.client = Arc::new(client);
self
}
// Grab a reference to the filters set on this [Storkable].
pub fn filters(&self) -> &FilterSet<T> {
&self.filters
}
/// Get the value of this [Storkable].
pub fn val(&self) -> &T {
&self.value
}
/// Get the [Storkable] from which this [Storkable] was found on.
pub fn parent(&self) -> Option<&Storkable<T, C>> {
// map to Arc::as_ref to hide the underlying Arc implementation
self.parent.as_ref().map(Arc::as_ref)
}
/// Checks if this Storkable, or any parent Storkables have the same
/// value as the one given.
fn check_parent_is(&self, value: &T) -> bool {
// loop through all parents (starting with ourselves) to see if
// they happen to have the same value.
let mut current_parent = Some(self);
while let Some(parent) = current_parent {
if &parent.value == value {
return true;
}
current_parent = parent.parent();
}
false
}
/// Checks if this Storkable has seen this `value` before. If it
/// hasn't, this method will return false but any subsequent calls
/// with the same value will return true.
fn check_has_seen(&self, value: &T) -> bool {
let mut hasher = twox_hash::XxHash64::default();
value.hash(&mut hasher);
let hash = hasher.finish();
return if self.seen.read().unwrap().contains(&hash) {
true
} else {
self.seen.write().unwrap().push(hash);
false
};
}
/// Start storking this [Storkable].
///
/// Finds all the followable links on this [Storkable] and returns
/// a stream of more [Storkable]s with the same filters and the
/// `parent` set to a reference of the current [Storkable].
pub fn exec<'b>(self) -> impl futures::Stream<Item = Result<Storkable<T, C>, Error>> + 'a {
let this = Arc::new(self);
try_stream! {
let mut children = this.client.run(this.val());
while let Some(child) = children.next().await {
let child = child.context(StorkError::ClientError)?;
if !this.filters.matches(&child) {
continue;
}
// ensure we haven't returned this link before from this
// Storkable
if this.check_has_seen(&child) {
continue;
}
// ensure we're not going to cause a recursive loop by
// checking that the page we're about to yield isn't a
// parent of it
if this.check_parent_is(&child) {
continue;
}
yield Storkable {
value: child,
client: Arc::clone(&this.client),
filters: this.filters.clone(),
parent: Some(Arc::clone(&this)),
seen: Arc::new(RwLock::new(Vec::new())),
};
}
}
}
}
/// A [StorkClient] is an underlying implementation of a storker. When a
/// [Storkable] is initialised a [StorkClient] will be created using
/// [Default::default] and the instance will be shared between all child
/// [Storkable]s.
///
/// The default [StorkClient] initialised by the [Storkable] can be
/// replaced using [Storkable::with_client].
///
/// [StorkClient]s may be used across threads and *must* be thread-safe.
pub trait StorkClient<T>: Default {
/// Makes a call to `T` and returns the child `T`s it can find on the
/// page.
fn run(&self, src: &T) -> Pin<Box<dyn futures::Stream<Item = Result<T, Error>>>>;
}