// 🏡 index : ~doyle/stork.git

//! `stork` is a simple library to recursively crawl websites for links
//! in a search engine-like fashion. stork was designed from the ground
//! up to have a simple API that is easy to use.
//!
//! Your entry point into stork is the [Storkable::new] function. Have
//! a look through the [Storkable] struct's documentation for your
//! entry into the world of storking.
#![recursion_limit = "512"]

#[macro_use]
extern crate failure_derive;

pub mod errors;
pub mod filters;

pub use errors::StorkError;
pub use filters::FilterSet;

pub use url::Url;

use select::document::Document;
use select::predicate::{And, Attr, Name, Not};

use async_stream::try_stream;
use futures::pin_mut;
use futures::prelude::*;
use std::sync::Arc;

use failure::Error;
use failure::ResultExt;

/// A [Storkable] represents a "thing" (currently just a website link)
/// which is traversable.
///
/// To start "storking" a website an initial [Storkable] can be
/// constructed with [Storkable::new], once initialised filters can be
/// added using [Storkable::with_filters].
///
/// After a [Storkable] has been initialised, the storking can begin
/// with a call to [Storkable::exec] which will return a
/// stream of more [Storkable]s (with the filters from the parent
/// [Storkable] copied) which in turn can also be storked if necessary.
///
/// Example usage:
///
/// ```
/// # use failure::err_msg;
/// # use stork::{Storkable, FilterSet, filters::{UrlFilter, UrlFilterType}};
/// # use futures::StreamExt;
/// #
/// # #[tokio::main]
/// # async fn main() -> failure::Fallible<()> {
/// let stream = Storkable::new("https://example.com/".parse()?)
///     .with_filters(
///         FilterSet::default()
///             .add_url_filter(UrlFilter::new(UrlFilterType::Domain, String::from("www.iana.org")))
///             .add_url_filter(UrlFilter::new(UrlFilterType::Scheme, String::from("https")))
///     )
///     .exec();
/// # futures::pin_mut!(stream); // needed for iteration
/// let first_link: Storkable = stream.next().await.ok_or(err_msg("no links on page"))??;
/// assert_eq!(first_link.url().as_str(), "https://www.iana.org/domains/example");
/// assert_eq!(first_link.parent().unwrap().url().as_str(), "https://example.com/");
///
/// let stream = first_link.exec();
/// # futures::pin_mut!(stream); // needed for iteration
/// let inner_link = stream.next().await.ok_or(err_msg("no links on page"))??;
/// assert_eq!(inner_link.url().as_str(), "https://www.iana.org/");
/// assert_eq!(inner_link.parent().unwrap().url().as_str(), "https://www.iana.org/domains/example");
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct Storkable {
    /// The URL this [Storkable] points at; fetched when it is storked.
    url: Url,
    /// Filters a discovered link must match to be yielded. Held in an
    /// [Arc] so children can share the same set without copying it.
    filters: Arc<FilterSet>,
    /// HTTP client used to fetch `url`. Held in an [Arc] so children
    /// can share the same client.
    client: Arc<reqwest::Client>,
    /// The [Storkable] this one was discovered on, or `None` when it
    /// was created directly through [Storkable::new].
    parent: Option<Arc<Storkable>>,
}
impl Storkable {
    /// Instantiates a new [Storkable] from a [Url], storking can then
    /// begin on the given [Url] using the [Storkable::exec] method.
    ///
    /// The returned [Storkable] has no parent, uses the default
    /// [FilterSet], and owns a freshly built [reqwest::Client] whose
    /// user agent is `<crate name>/<crate version>` (taken from this
    /// crate's Cargo metadata at compile time).
    ///
    /// # Panics
    ///
    /// Panics if the default [reqwest::Client] cannot be built. The
    /// signature is infallible, so a build failure cannot be reported
    /// to the caller in any other way.
    pub fn new(url: Url) -> Self {
        let client = reqwest::Client::builder()
            .user_agent(concat!(
                env!("CARGO_PKG_NAME"),
                "/",
                env!("CARGO_PKG_VERSION")
            ))
            .build()
            .expect("failed to build the default reqwest client");

        Self {
            url,
            filters: Arc::new(FilterSet::default()),
            client: Arc::new(client),
            parent: None,
        }
    }

    /// Attaches a [FilterSet] to this [Storkable], replacing whichever
    /// set it held before. Children produced by [Storkable::exec]
    /// share the same filters.
    pub fn with_filters(mut self, filters: FilterSet) -> Self {
        self.filters = Arc::new(filters);
        self
    }

    /// Sets a custom [reqwest::Client] to use with this [Storkable],
    /// replacing the one it held before. Children produced by
    /// [Storkable::exec] share the same client.
    pub fn with_client(mut self, client: reqwest::Client) -> Self {
        self.client = Arc::new(client);
        self
    }

    /// Get the URL of this [Storkable].
    pub fn url(&self) -> &Url {
        &self.url
    }

    /// Get the [Storkable] on which this [Storkable] was found, or
    /// `None` if it has no parent (e.g. it came from [Storkable::new]).
    pub fn parent(&self) -> Option<&Storkable> {
        // map to Arc::as_ref to hide the underlying Arc implementation
        self.parent.as_ref().map(Arc::as_ref)
    }

    /// Start storking this [Storkable].
    ///
    /// Finds all the followable links on this [Storkable] and returns
    /// a stream of more [Storkable]s with the same filters and client,
    /// and the `parent` set to a reference of the current [Storkable].
    /// Links whose URL does not match the attached [FilterSet] are
    /// skipped.
    ///
    /// Nothing is fetched by this call itself; the work happens as the
    /// returned stream is polled.
    ///
    /// # Errors
    ///
    /// Any error produced while fetching or parsing the page is
    /// surfaced as an `Err` item of the stream.
    pub fn exec<'a>(self) -> impl futures::Stream<Item = Result<Storkable, Error>> + 'a {
        // Wrapped in an Arc so every yielded child can cheaply hold a
        // handle back to the same parent.
        let this = Arc::new(self);

        try_stream! {
            let links = get_all_links_from_page(&this);
            pin_mut!(links); // needed for iteration

            while let Some(link) = links.next().await {
                let link = link?;

                if !this.filters.matches_url(&link.url) {
                    continue;
                }

                yield Storkable {
                    url: link.url,
                    client: Arc::clone(&this.client),
                    filters: Arc::clone(&this.filters),
                    parent: Some(Arc::clone(&this)),
                };
            }
        }
    }
}

/// A single link scraped from a page by [get_all_links_from_page].
struct PageLink {
    /// Text content of the anchor element, with surrounding whitespace
    /// trimmed.
    pub name: String,
    /// Target of the link as a parsed [Url].
    pub url: Url,
}

/// Sends a request to the [Storkable::url] and grabs all followable
/// links from it.
///
/// The page is fetched with the [Storkable]'s client, parsed as HTML,
/// and every `<a>` element that carries an `href` and is not marked
/// `rel="nofollow"` is yielded as a [PageLink]. Each `href` is resolved
/// against the page URL, so relative and absolute references are both
/// handled.
///
/// # Errors
///
/// The stream yields an `Err` wrapping:
/// - [StorkError::HttpError] if sending the request or reading the
///   body fails,
/// - [StorkError::HtmlParseError] if the body cannot be read into a
///   document,
/// - [StorkError::UrlParseError] if an `href` cannot be resolved into
///   a [Url].
fn get_all_links_from_page<'a>(
    storkable: &'a Storkable,
) -> impl futures::Stream<Item = Result<PageLink, Error>> + 'a {
    try_stream! {
        let root = &storkable.url;

        // TODO: can we get this to stream into the Document? need some
        // TODO: compat layer between futures and std::io::Read
        let body = storkable.client.get(root.clone())
            .send().await.context(StorkError::HttpError)?
            .bytes().await.context(StorkError::HttpError)?;
        let document = Document::from_read(&body[..]).context(StorkError::HtmlParseError)?;

        // NOTE(review): `Attr("rel", "nofollow")` looks like an exact
        // value match, so multi-token values such as
        // `rel="nofollow noopener"` would still be followed — confirm
        // against the `select` documentation.
        for node in document.find(And(Name("a"), Not(Attr("rel", "nofollow")))) {
            if let Some(href) = node.attr("href") {
                // `Url::join` resolves relative references against the
                // page URL and returns absolute URLs as-is, so no
                // relative-vs-absolute guess is needed. The earlier
                // `contains("://")` check sent relative links such as
                // `page?next=http://example.com` to `Url::parse`,
                // which cannot parse them without a base.
                let url = root.join(href).context(StorkError::UrlParseError)?;

                yield PageLink {
                    name: node.text().trim().to_string(),
                    url,
                };
            }
        }
    }
}