From c485221c479e4c7b43f505219e444220abd262e3 Mon Sep 17 00:00:00 2001 From: Jordan Doyle Date: Wed, 12 Feb 2020 21:09:24 +0000 Subject: [PATCH] Add some basic rustdocs and run clippy over codebase --- README.md | 2 ++ crawler/Cargo.toml | 5 ++++- crawler/src/lib.rs | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------ src/main.rs | 21 +++------------------ 4 files changed, 91 insertions(+), 31 deletions(-) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..9a9eb08 --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# stork.rs +`stork` is a basic website scraper written in Rust. \ No newline at end of file diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml index b14032b..da97daf 100644 --- a/crawler/Cargo.toml +++ b/crawler/Cargo.toml @@ -17,4 +17,7 @@ async-stream = "" digest = "" meowhash = "" -generic-array = "" \ No newline at end of file +generic-array = "" + +[dev-dependencies] +tokio = { version = "0.2", features = ["full"] } \ No newline at end of file diff --git a/crawler/src/lib.rs b/crawler/src/lib.rs index e046c81..0d1fb19 100644 --- a/crawler/src/lib.rs +++ b/crawler/src/lib.rs @@ -1,9 +1,17 @@ -#![recursion_limit="512"] +//! `stork` is a simple library to recursively crawl websites for links +//! in a search engine-like fashion. stork was designed from the ground +//! up to have a simple API that is easy to use. +//! +//! Your entry point into stork is the [Storkable::new] function. Have +//! a look through the [Storkable] struct's documentation for your +//! entry into the world of storking. 
+#![recursion_limit = "512"] + +#[macro_use] +extern crate failure_derive; -#[macro_use] extern crate failure_derive; - -pub mod filters; pub mod errors; +pub mod filters; pub use errors::StorkError; pub use filters::FilterSet; @@ -11,17 +19,58 @@ pub use filters::FilterSet; pub use url::Url; use select::document::Document; -use select::predicate::{Attr, Name, And, Not}; +use select::predicate::{And, Attr, Name, Not}; -use futures::prelude::*; -use futures::pin_mut; use async_stream::try_stream; +use futures::pin_mut; +use futures::prelude::*; use std::sync::Arc; use failure::Error; use failure::ResultExt; -/// A `Storkable` represents a website link which is traversable. +/// A [Storkable] represents a "thing" (currently just a website link) +/// which is traversable. +/// +/// To start "storking" a website an initial [Storkable] can be +/// constructed with [Storkable::new], once initialised filters can be +/// added using [Storkable::with_filters]. +/// +/// After a [Storkable] has been initialised, the storking can begin +/// with a call to [Storkable::exec] which will return a +/// stream of more [Storkable]s (with the filters from the parent +/// [Storkable] copied) which in turn can also be storked if necessary. +/// +/// Example usage: +/// +/// ``` +/// # use failure::err_msg; +/// # use stork::{Storkable, FilterSet, filters::{UrlFilter, UrlFilterType}}; +/// # use futures::StreamExt; +/// # +/// # #[tokio::main] +/// # async fn main() -> failure::Fallible<()> { +/// let stream = Storkable::new("https://example.com/".parse()?) 
+/// .with_filters( +/// FilterSet::default() +/// .add_url_filter(UrlFilter::new(UrlFilterType::Domain, String::from("www.iana.org"))) +/// .add_url_filter(UrlFilter::new(UrlFilterType::Scheme, String::from("https"))) +/// ) +/// .exec(); +/// # futures::pin_mut!(stream); // needed for iteration +/// let first_link: Storkable = stream.next().await.ok_or(err_msg("no links on page"))??; +/// assert_eq!(first_link.url().as_str(), "https://www.iana.org/domains/example"); +/// assert_eq!(first_link.parent().unwrap().url().as_str(), "https://example.com/"); +/// +/// let stream = first_link.exec(); +/// # futures::pin_mut!(stream); // needed for iteration +/// let inner_link = stream.next().await.ok_or(err_msg("no links on page"))??; +/// assert_eq!(inner_link.url().as_str(), "https://www.iana.org/"); +/// assert_eq!(inner_link.parent().unwrap().url().as_str(), "https://www.iana.org/domains/example"); +/// # Ok(()) +/// # } +/// ``` +#[derive(Debug, Clone)] pub struct Storkable { url: Url, filters: Arc, @@ -29,6 +78,8 @@ pub struct Storkable { parent: Option>, } impl Storkable { + /// Instantiates a new [Storkable] from a [Url]; storking can then + /// begin on the given [Url] using the [Storkable::exec] method. pub fn new(url: Url) -> Self { Self { url, @@ -53,15 +104,29 @@ impl Storkable { self } + /// Set a custom [reqwest::Client] to use with this, and child, + /// [Storkable]s. + pub fn with_client(mut self, client: reqwest::Client) -> Self { + self.client = Arc::new(client); + self + } + + /// Get the URL of this [Storkable]. pub fn url(&self) -> &Url { &self.url } + /// Get the [Storkable] on which this [Storkable] was found. pub fn parent(&self) -> Option<&Storkable> { // map to Arc::as_ref to hide the underlying Arc implementation self.parent.as_ref().map(Arc::as_ref) } + /// Start storking this [Storkable]. 
+ /// + /// Finds all the followable links on this [Storkable] and returns + /// a stream of more [Storkable]s with the same filters and the + /// `parent` set to a reference of the current [Storkable]. pub fn exec<'a>(self) -> impl futures::Stream> + 'a { let this = Arc::new(self); @@ -89,14 +154,19 @@ impl Storkable { struct PageLink { pub name: String, - pub url: Url + pub url: Url, } -fn get_all_links_from_page<'a>(storkable: &'a Storkable) -> impl futures::Stream> + 'a { + +/// Sends a request to the [Storkable::url] and grabs all followable +/// links from it. +fn get_all_links_from_page<'a>( + storkable: &'a Storkable, +) -> impl futures::Stream> + 'a { try_stream! { let root = storkable.url.clone(); - // TODO: can we get this to stream into the Document? need some compat layer - // TODO: between futures and std::io::Read + // TODO: can we get this to stream into the Document? need some + // TODO: compat layer between futures and std::io::Read let doc = storkable.client.get(root.clone()) .send().await.context(StorkError::HttpError)? 
.bytes().await.context(StorkError::HttpError)?; diff --git a/src/main.rs b/src/main.rs index cf8e9f5..067506d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,22 +7,11 @@ async fn main() -> failure::Fallible<()> { let args: Vec = std::env::args().collect(); let url = args.get(1).expect("Expecting URL parameter").parse().unwrap(); - let stream = stork::Storkable::new(url) -// .with_filters( -// stork::Filters::default() -// .add_url_filter(UrlFilter::new( -// UrlFilterType::Domain, -// "stackoverflow.blog".to_string())) -// ) - .exec(); + let stream = stork::Storkable::new(url).exec(); pin_mut!(stream); // needed for iteration while let Some(link) = stream.next().await { - if let Err(err) = link { - eprintln!("{:#?}", err); - continue; - } - let link = link.unwrap(); + let link = link?; println!("{}", link.url()); @@ -30,11 +19,7 @@ async fn main() -> failure::Fallible<()> { pin_mut!(stream); // needed for iteration while let Some(link) = stream.next().await { - if let Err(err) = link { - eprintln!("{:#?}", err); - continue; - } - println!("> {}", link.unwrap().url()); + println!("> {}", link?.url()); } } -- libgit2 1.7.2