//! # stork_http
//! This is a [stork](../../../stork/) implementation for the HTTP
//! protocol and specifically HTML-based web scraping. Given an initial
//! page to scrape, stork_http will find all indexable links on the page
//! and yield them back to you - ready to scrape again immediately or to
//! store for later, all using futures to allow for parallel processing.
//!
//! At this time `rel="nofollow"` is strictly enforced and cannot be
//! disabled, although this will become configurable as more filters are
//! added.
//!
//! Example usage:
//!
//! ```
//! # use stork::FilterSet;
//! # use failure::err_msg;
//! # use stork_http::{HttpStorkable, filters::*};
//! # use futures::StreamExt;
//! #
//! # #[tokio::main]
//! # async fn main() -> failure::Fallible<()> {
//! // start scanning https://example.com/ for links with the given filters
//! let stream = HttpStorkable::new("https://example.com/".parse()?)
//!     .with_filters(
//!         FilterSet::default()
//!             .add_filter(DomainFilter::new("www.iana.org"))
//!             .add_filter(SchemeFilter::new("https"))
//!     )
//!     .exec();
//! # futures::pin_mut!(stream); // needed for iteration
//! // get the first link from example.com and ensure it's the one we expected
//! let first_link_on_example: HttpStorkable = match stream.next().await {
//!     Some(Ok(link)) => {
//!         assert_eq!(link.val().text(), Some("More information...".to_string()));
//!         assert_eq!(link.val().url().as_str(), "https://www.iana.org/domains/example");
//!         assert_eq!(link.parent().unwrap().val().url().as_str(), "https://example.com/");
//!         link
//!     },
//!     _ => panic!("failed to get links from page")
//! };
//!
//! // add another filter looking for the root path and start scanning for links
//! let filters = first_link_on_example.filters().clone()
//!     .add_filter(PathFilter::new(FilterType::Equals, "/"));
//! let stream = first_link_on_example
//!     .with_filters(filters)
//!     .exec();
//! # futures::pin_mut!(stream); // needed for iteration
//! // get the first link from the stream and ensure it's a link to the homepage
//! match stream.next().await {
//!     Some(Ok(link)) => {
//!         assert_eq!(link.val().url().as_str(), "https://www.iana.org/");
//!         assert_eq!(link.parent().unwrap().val().url().as_str(), "https://www.iana.org/domains/example");
//!         assert_eq!(link.parent().unwrap().parent().unwrap().val().url().as_str(), "https://example.com/")
//!     },
//!     _ => panic!("failed to get links from page")
//! }
//! // ensure there are no other unexpected links on the homepage
//! assert!(stream.next().await.is_none(), "should've been only one homepage link on the page!");
//! # Ok(())
//! # }
//! ```
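//!
//! A custom [`ReqwestClient`] can also be supplied when you need different
//! timeouts, proxies or headers. The sketch below is illustrative only: it
//! assumes a `with_client` builder method on [`Storkable`](stork::Storkable),
//! so check the stork crate for the exact API before relying on it:
//!
//! ```ignore
//! use stork_http::{HttpStorkable, HttpStorkClient, ReqwestClient};
//!
//! let client = HttpStorkClient::new(
//!     ReqwestClient::builder()
//!         .user_agent("my-scraper/0.1") // hypothetical user agent
//!         .build()?,
//! );
//!
//! let stream = HttpStorkable::new("https://example.com/".parse()?)
//!     .with_client(client) // assumed builder method from the stork crate
//!     .exec();
//! ```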
#![recursion_limit = "256"]

#[macro_use]
extern crate failure_derive;

mod errors;
pub mod filters;

pub use errors::StorkHttpError;
pub use url::Url;

use stork::{StorkClient, Storkable};

use std::pin::Pin;

use select::document::Document;
use select::predicate::{And, Attr, Name, Not};

use async_stream::try_stream;
use futures::Stream;

use failure::Error;
use failure::ResultExt;

use std::sync::Arc;

pub use reqwest::Client as ReqwestClient;

use std::hash::{Hash, Hasher};

/// A [`Storkable`] that extracts [`Link`]s from HTML pages over HTTP.
pub type HttpStorkable = Storkable<Link, HttpStorkClient>;

/// A link discovered on a scraped page: the resolved URL plus the anchor
/// text it was found with, if any.
#[derive(Debug)]
pub struct Link {
    url: Url,
    text: Option<String>,
}

impl Link {
    pub fn url(&self) -> &Url {
        &self.url
    }

    pub fn text(&self) -> Option<String> {
        self.text.clone()
    }
}

/// Two links are considered equal if they point at the same URL,
/// regardless of their anchor text.
impl PartialEq for Link {
    fn eq(&self, other: &Self) -> bool {
        self.url() == other.url()
    }
}

impl Hash for Link {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.url().hash(state)
    }
}

impl std::str::FromStr for Link {
    type Err = failure::Error;

    fn from_str(input: &str) -> Result<Self, Self::Err> {
        Ok(Self {
            url: Url::parse(input).context(StorkHttpError::UrlParseError)?,
            text: None,
        })
    }
}

impl From<Url> for Link {
    fn from(url: Url) -> Self {
        Self { url, text: None }
    }
}

/// A [`StorkClient`] backed by [reqwest] that fetches pages over HTTP and
/// parses them for links.
pub struct HttpStorkClient {
    client: Arc<ReqwestClient>,
}

impl HttpStorkClient {
    pub fn new(client: ReqwestClient) -> Self {
        Self {
            client: Arc::new(client),
        }
    }
}

impl Default for HttpStorkClient {
    fn default() -> Self {
        Self {
            client: Arc::new(
                reqwest::Client::builder()
                    .user_agent(concat!(
                        env!("CARGO_PKG_NAME"),
                        "/",
                        env!("CARGO_PKG_VERSION")
                    ))
                    .build()
                    .unwrap(),
            ),
        }
    }
}

impl StorkClient<Link> for HttpStorkClient {
    fn run(&self, src: &Link) -> Pin<Box<dyn Stream<Item = Result<Link, Error>>>> {
        let root = src.url.clone();
        let client = Arc::clone(&self.client);

        Box::pin(try_stream! {
            // TODO: can we get this to stream into the Document? need some
            // TODO: compat layer between futures and std::io::Read
            let doc = client.get(root.clone())
                .send().await.context(StorkHttpError::HttpError)?
                .bytes().await.context(StorkHttpError::HttpError)?;
            let document = Document::from_read(&doc[..]).context(StorkHttpError::HtmlParseError)?;

            // grab every anchor that isn't explicitly marked `rel="nofollow"`
            for node in document.find(And(Name("a"), Not(Attr("rel", "nofollow")))) {
                let title = node.text().trim().to_string();
                let href = node.attr("href");

                if let Some(href) = href {
                    // if this looks like a relative url append it to the root
                    let mut href = if href.starts_with('/') || !href.contains("://") {
                        root.join(href).context(StorkHttpError::UrlParseError)?
                    } else {
                        Url::parse(href).context(StorkHttpError::UrlParseError)?
                    };
                    href.set_fragment(None);

                    yield Link {
                        url: href,
                        text: Some(title).filter(|x| !x.is_empty())
                    };
                }
            }
        })
    }
}
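
// A minimal sketch of unit coverage for `Link`; the cases below only
// exercise the parsing and equality behaviour defined in this file and are
// illustrative additions rather than part of the original crate.
#[cfg(test)]
mod tests {
    use super::Link;

    #[test]
    fn parses_link_from_str() {
        let link: Link = "https://example.com/path".parse().unwrap();
        assert_eq!(link.url().as_str(), "https://example.com/path");
        // links parsed from plain strings carry no anchor text
        assert_eq!(link.text(), None);
    }

    #[test]
    fn equality_ignores_anchor_text() {
        // `PartialEq` for `Link` compares URLs only, so two links to the
        // same URL are equal regardless of where they were found
        let a: Link = "https://example.com/".parse().unwrap();
        let b: Link = "https://example.com/".parse().unwrap();
        assert_eq!(a, b);
    }
}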