author    Jordan Doyle <jordan@doyle.la>  2020-02-12 21:09:24 +00:00
committer Jordan Doyle <jordan@doyle.la>  2020-02-13 05:28:46 +00:00
commit    c485221c479e4c7b43f505219e444220abd262e3 [patch]
tree      36309972cc78ac3af5a0f92fa7fed51121c4004f
parent    f67610735390f18285b4c9794223b697281dbe58
download  c485221c479e4c7b43f505219e444220abd262e3.tar.gz

Add some basic rustdocs and run clippy over codebase



Diff

 README.md          |  2 ++
 crawler/Cargo.toml |  5 ++++-
 src/main.rs        | 21 +++------------------
 crawler/src/lib.rs | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 91 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9a9eb08
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# stork.rs
+`stork` is a basic website scraper written in Rust.
diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index b14032b..da97daf 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -17,4 +17,7 @@
 
 digest = ""
 meowhash = ""
-generic-array = ""
+generic-array = ""
+
+[dev-dependencies]
+tokio = { version = "0.2", features = ["full"] }
diff --git a/src/main.rs b/src/main.rs
index cf8e9f5..067506d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,22 +7,11 @@
     let args: Vec<String> = std::env::args().collect();
     let url = args.get(1).expect("Expecting URL parameter").parse().unwrap();
 
-    let stream = stork::Storkable::new(url)
-//        .with_filters(
-//            stork::Filters::default()
-//                .add_url_filter(UrlFilter::new(
-//                    UrlFilterType::Domain,
-//                    "stackoverflow.blog".to_string()))
-//        )
-        .exec();
+    let stream = stork::Storkable::new(url).exec();
     pin_mut!(stream); // needed for iteration
 
     while let Some(link) = stream.next().await {
-        if let Err(err) = link {
-            eprintln!("{:#?}", err);
-            continue;
-        }
-        let link = link.unwrap();
+        let link = link?;
 
         println!("{}", link.url());
 
@@ -30,11 +19,7 @@
         pin_mut!(stream); // needed for iteration
 
         while let Some(link) = stream.next().await {
-            if let Err(err) = link {
-                eprintln!("{:#?}", err);
-                continue;
-            }
-            println!("> {}", link.unwrap().url());
+            println!("> {}", link?.url());
         }
     }
 
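A side note on the change above: replacing the `if let Err` / `unwrap` dance with `?` only compiles because `main` returns a `Result`. A minimal sketch of the updated binary as a whole, assuming a `#[tokio::main]` entry point and a `failure::Fallible<()>` return type (both plausible for this tree but not shown in the hunks; the nested crawl mirrors the second hunk's context):

    use futures::{pin_mut, StreamExt};

    #[tokio::main]
    async fn main() -> failure::Fallible<()> {
        let args: Vec<String> = std::env::args().collect();
        let url = args.get(1).expect("Expecting URL parameter").parse().unwrap();

        let stream = stork::Storkable::new(url).exec();
        pin_mut!(stream); // needed for iteration

        while let Some(link) = stream.next().await {
            // `?` propagates crawl errors out of main instead of
            // logging them and continuing
            let link = link?;
            println!("{}", link.url());

            // follow the links found on each discovered page
            let stream = link.exec();
            pin_mut!(stream); // needed for iteration
            while let Some(link) = stream.next().await {
                println!("> {}", link?.url());
            }
        }

        Ok(())
    }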
diff --git a/crawler/src/lib.rs b/crawler/src/lib.rs
index e046c81..0d1fb19 100644
--- a/crawler/src/lib.rs
+++ b/crawler/src/lib.rs
@@ -1,9 +1,17 @@
#![recursion_limit="512"]
//! `stork` is a simple library to recursively crawl websites for links

//! in a search engine-like fashion. stork was designed from the ground

//! to have a simple API that is easy to use.

//!

//! Your entry point into stork is the [Storkable::new] function. Have

//! a look through the [Storkable] struct's documentation for your

//! entry into the world of storking.

#![recursion_limit = "512"]

#[macro_use]
extern crate failure_derive;

#[macro_use] extern crate failure_derive;

pub mod filters;
pub mod errors;
pub mod filters;

pub use errors::StorkError;
pub use filters::FilterSet;
@@ -11,17 +19,58 @@
 pub use url::Url;
 
 use select::document::Document;
-use select::predicate::{Attr, Name, And, Not};
-use futures::prelude::*;
-use futures::pin_mut;
+use select::predicate::{And, Attr, Name, Not};
+
 use async_stream::try_stream;
+use futures::pin_mut;
+use futures::prelude::*;
 use std::sync::Arc;
 
 use failure::Error;
 use failure::ResultExt;
 
-/// A `Storkable` represents a website link which is traversable.
+/// A [Storkable] represents a "thing" (currently just a website link)
+/// which is traversable.
+///
+/// To start "storking" a website, an initial [Storkable] can be
+/// constructed with [Storkable::new]; once initialised, filters can be
+/// added using [Storkable::with_filters].
+///
+/// After a [Storkable] has been initialised, the storking can begin
+/// with a call to [Storkable::exec], which will return a
+/// stream of more [Storkable]s (with the filters from the parent
+/// [Storkable] copied) which in turn can also be storked if necessary.
+///
+/// Example usage:
+///
+/// ```
+/// # use failure::err_msg;
+/// # use stork::{Storkable, FilterSet, filters::{UrlFilter, UrlFilterType}};
+/// # use futures::StreamExt;
+/// #
+/// # #[tokio::main]
+/// # async fn main() -> failure::Fallible<()> {
+/// let stream = Storkable::new("https://example.com/".parse()?)
+///     .with_filters(
+///         FilterSet::default()
+///             .add_url_filter(UrlFilter::new(UrlFilterType::Domain, String::from("www.iana.org")))
+///             .add_url_filter(UrlFilter::new(UrlFilterType::Scheme, String::from("https")))
+///     )
+///     .exec();
+/// # futures::pin_mut!(stream); // needed for iteration
+/// let first_link: Storkable = stream.next().await.ok_or(err_msg("no links on page"))??;
+/// assert_eq!(first_link.url().as_str(), "https://www.iana.org/domains/example");
+/// assert_eq!(first_link.parent().unwrap().url().as_str(), "https://example.com/");
+///
+/// let stream = first_link.exec();
+/// # futures::pin_mut!(stream); // needed for iteration
+/// let inner_link = stream.next().await.ok_or(err_msg("no links on page"))??;
+/// assert_eq!(inner_link.url().as_str(), "https://www.iana.org/");
+/// assert_eq!(inner_link.parent().unwrap().url().as_str(), "https://www.iana.org/domains/example");
+/// # Ok(())
+/// # }
+/// ```
 #[derive(Debug, Clone)]
 pub struct Storkable {
     url: Url,
     filters: Arc<FilterSet>,
@@ -29,6 +78,8 @@
     parent: Option<Arc<Storkable>>,
 }
 impl Storkable {
+    /// Instantiates a new [Storkable] from a [Url]; storking can then
+    /// begin on the given [Url] using the [Storkable::exec] method.
     pub fn new(url: Url) -> Self {
         Self {
             url,
@@ -50,18 +101,32 @@
+    /// Attaches a [FilterSet] to this, and child, [Storkable]s.
     pub fn with_filters(mut self, filters: FilterSet) -> Self {
         self.filters = Arc::new(filters);
         self
     }
 
+    /// Set a custom [reqwest::Client] to use with this, and child,
+    /// [Storkable]s.
     pub fn with_client(mut self, client: reqwest::Client) -> Self {
         self.client = Arc::new(client);
         self
     }
 
+    /// Get the URL of this [Storkable].
     pub fn url(&self) -> &Url {
         &self.url
     }
 
+    /// Get the [Storkable] that this [Storkable] was found on.
     pub fn parent(&self) -> Option<&Storkable> {
         // map to Arc::as_ref to hide the underlying Arc implementation
         self.parent.as_ref().map(Arc::as_ref)
     }
 
+    /// Start storking this [Storkable].
+    ///
+    /// Finds all the followable links on this [Storkable] and returns
+    /// a stream of more [Storkable]s with the same filters and the
+    /// `parent` set to a reference of the current [Storkable].
     pub fn exec<'a>(self) -> impl futures::Stream<Item = Result<Storkable, Error>> + 'a {
         let this = Arc::new(self);
 
@@ -89,14 +154,19 @@
 
 struct PageLink {
     pub name: String,
-    pub url: Url
+    pub url: Url,
 }
-fn get_all_links_from_page<'a>(storkable: &'a Storkable) -> impl futures::Stream<Item = Result<PageLink, Error>> + 'a {
+
+/// Sends a request to the [Storkable::url] and grabs all followable
+/// links from it.
+fn get_all_links_from_page<'a>(
+    storkable: &'a Storkable,
+) -> impl futures::Stream<Item = Result<PageLink, Error>> + 'a {
     try_stream! {
         let root = storkable.url.clone();
 
-        // TODO: can we get this to stream into the Document? need some compat layer
-        // TODO: between futures and std::io::Read
+        // TODO: can we get this to stream into the Document? need some
+        // TODO: compat layer between futures and std::io::Read
         let doc = storkable.client.get(root.clone())
             .send().await.context(StorkError::HttpError)?
             .bytes().await.context(StorkError::HttpError)?;
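One pattern worth highlighting from the lib.rs changes: `Storkable::parent()` maps through the `Arc` with `Arc::as_ref`, so callers borrow the parent without ever seeing the reference counting. A standalone sketch of the same trick, using a hypothetical `Node` type in place of `Storkable`:

    use std::sync::Arc;

    struct Node {
        name: String,
        parent: Option<Arc<Node>>,
    }

    impl Node {
        // Option<Arc<Node>> -> Option<&Node>, hiding the Arc from callers
        fn parent(&self) -> Option<&Node> {
            self.parent.as_ref().map(Arc::as_ref)
        }
    }

    fn main() {
        let root = Arc::new(Node { name: "root".into(), parent: None });
        let leaf = Node { name: "leaf".into(), parent: Some(Arc::clone(&root)) };
        assert_eq!(leaf.parent().unwrap().name, "root");
    }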