author | Jordan Doyle <jordan@doyle.la> | 2020-02-12 21:09:24.0 +00:00:00
---|---|---
committer | Jordan Doyle <jordan@doyle.la> | 2020-02-13 5:28:46.0 +00:00:00
commit | c485221c479e4c7b43f505219e444220abd262e3 [patch]
tree | 36309972cc78ac3af5a0f92fa7fed51121c4004f
parent | f67610735390f18285b4c9794223b697281dbe58
download | c485221c479e4c7b43f505219e444220abd262e3.tar.gz
Add some basic rustdocs and run clippy over codebase
Diff
README.md          |  2 ++
crawler/Cargo.toml |  5 ++++-
src/main.rs        | 21 +++------------------
crawler/src/lib.rs | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------
4 files changed, 91 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9a9eb08
--- /dev/null
+++ b/README.md
@@ -1,0 +1,2 @@
+# stork.rs
+`stork` is a basic website scraper written in Rust.

diff --git a/crawler/Cargo.toml b/crawler/Cargo.toml
index b14032b..da97daf 100644
--- a/crawler/Cargo.toml
+++ b/crawler/Cargo.toml
@@ -17,4 +17,7 @@
 digest = ""
 meowhash = ""
-generic-array = ""
+generic-array = ""
+
+[dev-dependencies]
+tokio = { version = "0.2", features = ["full"] }

diff --git a/src/main.rs b/src/main.rs
index cf8e9f5..067506d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,22 +7,11 @@
     let args: Vec<String> = std::env::args().collect();
     let url = args.get(1).expect("Expecting URL parameter").parse().unwrap();
 
-    let stream = stork::Storkable::new(url)
-    //    .with_filters(
-    //        stork::Filters::default()
-    //            .add_url_filter(UrlFilter::new(
-    //                UrlFilterType::Domain,
-    //                "stackoverflow.blog".to_string()))
-    //    )
-        .exec();
+    let stream = stork::Storkable::new(url).exec();
     pin_mut!(stream); // needed for iteration
 
     while let Some(link) = stream.next().await {
-        if let Err(err) = link {
-            eprintln!("{:#?}", err);
-            continue;
-        }
-        let link = link.unwrap();
+        let link = link?;
 
         println!("{}", link.url());
@@ -30,11 +19,7 @@
         pin_mut!(stream); // needed for iteration
 
         while let Some(link) = stream.next().await {
-            if let Err(err) = link {
-                eprintln!("{:#?}", err);
-                continue;
-            }
-            println!("> {}", link.unwrap().url());
+            println!("> {}", link?.url());
         }
     }
 }

diff --git a/crawler/src/lib.rs b/crawler/src/lib.rs
index e046c81..0d1fb19 100644
--- a/crawler/src/lib.rs
+++ b/crawler/src/lib.rs
@@ -1,9 +1,17 @@
-#![recursion_limit="512"]
+//! `stork` is a simple library to recursively crawl websites for links
+//! in a search engine-like fashion. stork was designed from the ground
+//! to have a simple API that is easy to use.
+//!
+//! Your entry point into stork is the [Storkable::new] function. Have
+//! a look through the [Storkable] struct's documentation for your
+//! entry into the world of storking.
+#![recursion_limit = "512"]
 
-#[macro_use] extern crate failure_derive;
+#[macro_use]
+extern crate failure_derive;
 
-pub mod filters;
 pub mod errors;
+pub mod filters;
 
 pub use errors::StorkError;
 pub use filters::FilterSet;
@@ -11,17 +19,58 @@
 pub use url::Url;
 
 use select::document::Document;
-use select::predicate::{Attr, Name, And, Not};
+use select::predicate::{And, Attr, Name, Not};
 
-use futures::prelude::*;
-use futures::pin_mut;
 use async_stream::try_stream;
+use futures::pin_mut;
+use futures::prelude::*;
 
 use std::sync::Arc;
 
 use failure::Error;
 use failure::ResultExt;
 
-/// A `Storkable` represents a website link which is traversable.
+/// A [Storkable] represents a "thing" (currently just a website link)
+/// which is traversable.
+///
+/// To start "storking" a website an initial [Storkable] can be
+/// constructed with [Storkable::new], once initialised filters can be
+/// added using [Storkable::with_filters].
+///
+/// After a [Storkable] has been initialised, the storking can begin
+/// with a call to [Storkable::exec] which will return a
+/// stream of more [Storkable]s (with the filters from the parent
+/// [Storkable] copied) which in turn can also be storked if necessary.
+///
+/// Example usage:
+///
+/// ```
+/// # use failure::err_msg;
+/// # use stork::{Storkable, FilterSet, filters::{UrlFilter, UrlFilterType}};
+/// # use futures::StreamExt;
+/// #
+/// # #[tokio::main]
+/// # async fn main() -> failure::Fallible<()> {
+/// let stream = Storkable::new("https://example.com/".parse()?)
+///     .with_filters(
+///         FilterSet::default()
+///             .add_url_filter(UrlFilter::new(UrlFilterType::Domain, String::from("www.iana.org")))
+///             .add_url_filter(UrlFilter::new(UrlFilterType::Scheme, String::from("https")))
+///     )
+///     .exec();
+/// # futures::pin_mut!(stream); // needed for iteration
+/// let first_link: Storkable = stream.next().await.ok_or(err_msg("no links on page"))??;
+/// assert_eq!(first_link.url().as_str(), "https://www.iana.org/domains/example");
+/// assert_eq!(first_link.parent().unwrap().url().as_str(), "https://example.com/");
+///
+/// let stream = first_link.exec();
+/// # futures::pin_mut!(stream); // needed for iteration
+/// let inner_link = stream.next().await.ok_or(err_msg("no links on page"))??;
+/// assert_eq!(inner_link.url().as_str(), "https://www.iana.org/");
+/// assert_eq!(inner_link.parent().unwrap().url().as_str(), "https://www.iana.org/domains/example");
+/// # Ok(())
+/// # }
+/// ```
 #[derive(Debug, Clone)]
 pub struct Storkable {
     url: Url,
     filters: Arc<FilterSet>,
@@ -29,6 +78,8 @@
     parent: Option<Arc<Storkable>>,
 }
 
 impl Storkable {
+    /// Instantiates a new [Storkable] from a [Url], storking can then
+    /// begin on the given [Url] using the [Storkable::exec] method.
     pub fn new(url: Url) -> Self {
         Self {
             url,
@@ -50,18 +101,32 @@
+    /// Attaches a [FilterSet] to this, and child, [Storkable]s.
     pub fn with_filters(mut self, filters: FilterSet) -> Self {
         self.filters = Arc::new(filters);
         self
     }
 
+    /// Set a custom [reqwest::Client] to use with this, and child,
+    /// [Storkable]s.
     pub fn with_client(mut self, client: reqwest::Client) -> Self {
         self.client = Arc::new(client);
         self
     }
 
+    /// Get the URL of this [Storkable].
     pub fn url(&self) -> &Url {
         &self.url
     }
 
+    /// Get the [Storkable] from which this [Storkable] was found on.
     pub fn parent(&self) -> Option<&Storkable> {
         // map to Arc::as_ref to hide the underlying Arc implementation
         self.parent.as_ref().map(Arc::as_ref)
     }
 
+    /// Start storking this [Storkable].
+    ///
+    /// Finds all the followable links on this [Storkable] and returns
+    /// a stream of more [Storkable]s with the same filters and the
+    /// `parent` set to a reference of the current [Storkable].
     pub fn exec<'a>(self) -> impl futures::Stream<Item = Result<Storkable, Error>> + 'a {
         let this = Arc::new(self);
@@ -89,14 +154,19 @@
 struct PageLink {
     pub name: String,
-    pub url: Url
+    pub url: Url,
 }
 
-fn get_all_links_from_page<'a>(storkable: &'a Storkable) -> impl futures::Stream<Item = Result<PageLink, Error>> + 'a {
+/// Sends a request to the [Storkable::url] and grabs all followable
+/// links from it.
+fn get_all_links_from_page<'a>(
+    storkable: &'a Storkable,
+) -> impl futures::Stream<Item = Result<PageLink, Error>> + 'a {
     try_stream! {
         let root = storkable.url.clone();
 
-        // TODO: can we get this to stream into the Document? need some compat layer
-        // TODO: between futures and std::io::Read
+        // TODO: can we get this to stream into the Document? need some
+        // TODO: compat layer between futures and std::io::Read
         let doc = storkable.client.get(root.clone())
             .send().await.context(StorkError::HttpError)?
             .bytes().await.context(StorkError::HttpError)?;
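For readers skimming the diff, here is a condensed, self-contained sketch of the usage pattern this commit documents, adapted from the new rustdoc example and the simplified `src/main.rs`. The seed URL and the `Domain`/`Scheme` filter values are illustrative assumptions, not anything this commit prescribes.

```rust
// A minimal sketch of driving the documented API; the seed URL and
// filter values below are assumptions for illustration.
use failure::Fallible;
use futures::{pin_mut, StreamExt};
use stork::{
    filters::{UrlFilter, UrlFilterType},
    FilterSet, Storkable,
};

#[tokio::main]
async fn main() -> Fallible<()> {
    // Filters attached here are copied to every child Storkable.
    let stream = Storkable::new("https://example.com/".parse()?)
        .with_filters(
            FilterSet::default()
                .add_url_filter(UrlFilter::new(
                    UrlFilterType::Domain,
                    String::from("example.com"),
                ))
                .add_url_filter(UrlFilter::new(
                    UrlFilterType::Scheme,
                    String::from("https"),
                )),
        )
        .exec();
    pin_mut!(stream); // needed for iteration

    // Each yielded item is itself a Storkable whose exec() streams
    // that page's links in turn.
    while let Some(link) = stream.next().await {
        println!("{}", link?.url());
    }
    Ok(())
}
```

Note how `?` propagates stream errors straight out of `main`, which is exactly what the `let link = link?;` change in `src/main.rs` relies on.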
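The final hunk also hints at where link extraction happens: the fetched body is parsed into a `select` `Document` and anchors are matched with predicates. A rough sketch of that step follows; the exact predicate combination (skipping `rel="nofollow"` anchors) is an assumption inferred from the `And`/`Attr`/`Name`/`Not` imports, not code visible in this diff.

```rust
// A rough sketch of link extraction with the `select` crate: pull
// href attributes from anchors that look followable. The nofollow
// exclusion is an assumed example of combining these predicates.
use select::document::Document;
use select::predicate::{And, Attr, Name, Not};

fn extract_hrefs(body: &str) -> Vec<String> {
    Document::from(body)
        // match <a> tags that aren't marked rel="nofollow"
        .find(And(Name("a"), Not(Attr("rel", "nofollow"))))
        .filter_map(|node| node.attr("href"))
        .map(str::to_owned)
        .collect()
}

fn main() {
    let html = r#"<a href="/a">a</a><a rel="nofollow" href="/b">b</a>"#;
    assert_eq!(extract_hrefs(html), vec!["/a".to_string()]);
}
```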