~doyle/stork.git

author    Jordan Doyle <jordan@doyle.la>  2020-02-12 21:09:24 +00:00
committer Jordan Doyle <jordan@doyle.la>  2020-02-13 05:28:46 +00:00
commit    c485221c479e4c7b43f505219e444220abd262e3 [patch]
tree      36309972cc78ac3af5a0f92fa7fed51121c4004f
parent    f67610735390f18285b4c9794223b697281dbe58
download  c485221c479e4c7b43f505219e444220abd262e3.tar.gz

Add some basic rustdocs and run clippy over codebase



Diff

 README.md          |  2 ++
 crawler/Cargo.toml |  5 ++-
 crawler/src/lib.rs | 94 +++++++++++++++++++++++++++++++++++++++++++++++--------
 src/main.rs        | 21 +-----------
 4 files changed, 91 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9a9eb08
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# stork.rs
+`stork` is a basic website scraper written in Rust.
\ No newline at end of file
diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index b14032b..da97daf 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -17,4 +17,7 @@ async-stream = ""
 
 digest = ""
 meowhash = ""
-generic-array = ""
\ No newline at end of file
+generic-array = ""
+
+[dev-dependencies]
+tokio = { version = "0.2", features = ["full"] }
\ No newline at end of file
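
The new [dev-dependencies] section is what makes the doctest added to crawler/src/lib.rs below runnable: doctests compile as stand-alone test binaries, and dev-dependencies are visible to them, so the example's hidden #[tokio::main] scaffolding can pull in the async runtime. A minimal sketch of the pattern (illustrative only, not code from this commit):

/// ```
/// # #[tokio::main]
/// # async fn main() {
/// // the `# `-prefixed lines above and below are compiled and run
/// // by `cargo test`, but hidden from rendered rustdoc output
/// assert_eq!(1 + 1, 2);
/// # }
/// ```
pub fn documented() {}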
diff --git a/crawler/src/lib.rs b/crawler/src/lib.rs
index e046c81..0d1fb19 100644
--- a/crawler/src/lib.rs
+++ b/crawler/src/lib.rs
@@ -1,9 +1,17 @@
-#![recursion_limit="512"]
+//! `stork` is a simple library to recursively crawl websites for links
+//! in a search engine-like fashion. stork was designed from the ground up
+//! to have a simple API that is easy to use.
+//!
+//! Your entry point into stork is the [Storkable::new] function. Have
+//! a look through the [Storkable] struct's documentation for your
+//! entry into the world of storking.
+#![recursion_limit = "512"]
 
-#[macro_use] extern crate failure_derive;
+#[macro_use]
+extern crate failure_derive;
 
-pub mod filters;
 pub mod errors;
+pub mod filters;
 
 pub use errors::StorkError;
 pub use filters::FilterSet;
@@ -11,17 +19,58 @@ pub use filters::FilterSet;
 pub use url::Url;
 
 use select::document::Document;
-use select::predicate::{Attr, Name, And, Not};
-use futures::prelude::*;
-use futures::pin_mut;
+use select::predicate::{And, Attr, Name, Not};
+
 use async_stream::try_stream;
+use futures::pin_mut;
+use futures::prelude::*;
 use std::sync::Arc;
 
 use failure::Error;
 use failure::ResultExt;
 
-/// A `Storkable` represents a website link which is traversable.
+/// A [Storkable] represents a "thing" (currently just a website link)
+/// which is traversable.
+///
+/// To start "storking" a website, an initial [Storkable] can be
+/// constructed with [Storkable::new]; once initialised, filters can be
+/// added using [Storkable::with_filters].
+///
+/// After a [Storkable] has been initialised, the storking can begin
+/// with a call to [Storkable::exec] which will return a
+/// stream of more [Storkable]s (with the filters from the parent
+/// [Storkable] copied) which in turn can also be storked if necessary.
+///
+/// Example usage:
+///
+/// ```
+/// # use failure::err_msg;
+/// # use stork::{Storkable, FilterSet, filters::{UrlFilter, UrlFilterType}};
+/// # use futures::StreamExt;
+/// #
+/// # #[tokio::main]
+/// # async fn main() -> failure::Fallible<()> {
+/// let stream = Storkable::new("https://example.com/".parse()?)
+///     .with_filters(
+///         FilterSet::default()
+///             .add_url_filter(UrlFilter::new(UrlFilterType::Domain, String::from("www.iana.org")))
+///             .add_url_filter(UrlFilter::new(UrlFilterType::Scheme, String::from("https")))
+///     )
+///     .exec();
+/// # futures::pin_mut!(stream); // needed for iteration
+/// let first_link: Storkable = stream.next().await.ok_or(err_msg("no links on page"))??;
+/// assert_eq!(first_link.url().as_str(), "https://www.iana.org/domains/example");
+/// assert_eq!(first_link.parent().unwrap().url().as_str(), "https://example.com/");
+///
+/// let stream = first_link.exec();
+/// # futures::pin_mut!(stream); // needed for iteration
+/// let inner_link = stream.next().await.ok_or(err_msg("no links on page"))??;
+/// assert_eq!(inner_link.url().as_str(), "https://www.iana.org/");
+/// assert_eq!(inner_link.parent().unwrap().url().as_str(), "https://www.iana.org/domains/example");
+/// # Ok(())
+/// # }
+/// ```
 #[derive(Debug, Clone)]
 pub struct Storkable {
     url: Url,
     filters: Arc<FilterSet>,
@@ -29,6 +78,8 @@ pub struct Storkable {
     parent: Option<Arc<Storkable>>,
 }
 impl Storkable {
+    /// Instantiates a new [Storkable] from a [Url]; storking can then
+    /// begin on the given [Url] using the [Storkable::exec] method.
     pub fn new(url: Url) -> Self {
         Self {
             url,
@@ -53,15 +104,29 @@ impl Storkable {
         self
     }
 
+    /// Set a custom [reqwest::Client] to use with this, and child,
+    /// [Storkable]s.
     pub fn with_client(mut self, client: reqwest::Client) -> Self {
         self.client = Arc::new(client);
         self
     }
 
+    /// Get the URL of this [Storkable].
     pub fn url(&self) -> &Url {
         &self.url
     }
 
+    /// Get the [Storkable] on which this [Storkable] was found.
+    pub fn parent(&self) -> Option<&Storkable> {
+        // map to Arc::as_ref to hide the underlying Arc implementation
+        self.parent.as_ref().map(Arc::as_ref)
+    }
+
+    /// Start storking this [Storkable].
+    ///
+    /// Finds all the followable links on this [Storkable] and returns
+    /// a stream of more [Storkable]s with the same filters and the
+    /// `parent` set to a reference to the current [Storkable].
     pub fn exec<'a>(self) -> impl futures::Stream<Item = Result<Storkable, Error>> + 'a {
         let this = Arc::new(self);
@@ -89,14 +154,19 @@ impl Storkable {
 
 struct PageLink {
     pub name: String,
-    pub url: Url
+    pub url: Url,
 }
-fn get_all_links_from_page<'a>(storkable: &'a Storkable) -> impl futures::Stream<Item = Result<PageLink, Error>> + 'a {
+
+/// Sends a request to the [Storkable::url] and grabs all followable
+/// links from it.
+fn get_all_links_from_page<'a>(
+    storkable: &'a Storkable,
+) -> impl futures::Stream<Item = Result<PageLink, Error>> + 'a {
     try_stream! {
         let root = storkable.url.clone();
 
-        // TODO: can we get this to stream into the Document? need some compat layer
-        // TODO: between futures and std::io::Read
+        // TODO: can we get this to stream into the Document? need some
+        // TODO: compat layer between futures and std::io::Read
         let doc = storkable.client.get(root.clone())
             .send().await.context(StorkError::HttpError)?
             .bytes().await.context(StorkError::HttpError)?;
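
The docs added to with_client above mark the intended extension point: hand the crawler a pre-configured reqwest::Client before calling exec, and child Storkables share it through the Arc. A minimal sketch against the API as documented in this diff (the client configuration and seed URL are illustrative assumptions):

use failure::Fallible;
use futures::{pin_mut, StreamExt};
use stork::Storkable;

#[tokio::main]
async fn main() -> Fallible<()> {
    // illustrative: any reqwest::Client works here
    let client = reqwest::Client::new();

    let stream = Storkable::new("https://example.com/".parse()?)
        .with_client(client)
        .exec();
    pin_mut!(stream); // streams must be pinned before iteration

    while let Some(link) = stream.next().await {
        println!("{}", link?.url());
    }
    Ok(())
}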
diff --git a/src/main.rs b/src/main.rs
index cf8e9f5..067506d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,22 +7,11 @@ async fn main() -> failure::Fallible<()> {
     let args: Vec<String> = std::env::args().collect();
     let url = args.get(1).expect("Expecting URL parameter").parse().unwrap();
 
-    let stream = stork::Storkable::new(url)
-//        .with_filters(
-//            stork::Filters::default()
-//                .add_url_filter(UrlFilter::new(
-//                    UrlFilterType::Domain,
-//                    "stackoverflow.blog".to_string()))
-//        )
-        .exec();
+    let stream = stork::Storkable::new(url).exec();
     pin_mut!(stream); // needed for iteration
 
     while let Some(link) = stream.next().await {
-        if let Err(err) = link {
-            eprintln!("{:#?}", err);
-            continue;
-        }
-        let link = link.unwrap();
+        let link = link?;
 
         println!("{}", link.url());

@@ -30,11 +19,7 @@ async fn main() -> failure::Fallible<()> {
         pin_mut!(stream); // needed for iteration
 
         while let Some(link) = stream.next().await {
-            if let Err(err) = link {
-                eprintln!("{:#?}", err);
-                continue;
-            }
-            println!("> {}", link.unwrap().url());
+            println!("> {}", link?.url());
         }
     }
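
Taken together, the two hunks leave src/main.rs reading roughly as below. The use lines, the #[tokio::main] attribute, and the inner link.exec() call sit outside the diff context, so they are reconstructed assumptions rather than verbatim file contents:

use futures::{pin_mut, StreamExt};

#[tokio::main] // assumed; the hunk header only shows `async fn main`
async fn main() -> failure::Fallible<()> {
    let args: Vec<String> = std::env::args().collect();
    let url = args.get(1).expect("Expecting URL parameter").parse().unwrap();

    let stream = stork::Storkable::new(url).exec();
    pin_mut!(stream); // needed for iteration

    while let Some(link) = stream.next().await {
        let link = link?;

        println!("{}", link.url());

        // inferred from the second hunk's context: stork each link
        // one level deeper
        let stream = link.exec();
        pin_mut!(stream); // needed for iteration

        while let Some(link) = stream.next().await {
            println!("> {}", link?.url());
        }
    }

    Ok(())
}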