#![recursion_limit = "256"]
#[macro_use]
extern crate failure_derive;
mod errors;
pub mod filters;
pub use errors::StorkHttpError;
pub use url::Url;
use stork::{StorkClient, Storkable};
use std::pin::Pin;
use select::document::Document;
use select::predicate::{And, Attr, Name, Not};
use async_stream::try_stream;
use failure::Error;
use failure::ResultExt;
use std::sync::Arc;
pub use reqwest::Client as ReqwestClient;
/// Convenience alias for a [`Storkable`] parameterised over [`Link`] items
/// and driven by an [`HttpStorkClient`].
pub type HttpStorkable = Storkable<Link, HttpStorkClient>;
/// A URL paired with optional human-readable text.
///
/// Values built through `FromStr` or `From<Url>` carry no text; links
/// produced by [`HttpStorkClient`] carry the trimmed anchor text when it is
/// non-empty.
#[derive(Debug)]
pub struct Link {
/// Target URL of the link.
url: Url,
/// Text associated with the link, or `None` when there is none.
text: Option<String>,
}
impl Link {
pub fn url(&self) -> &Url {
&self.url
}
pub fn text(&self) -> Option<String> {
self.text.clone()
}
}
impl std::str::FromStr for Link {
type Err = failure::Error;
fn from_str(input: &str) -> Result<Link, Error> {
Ok(Self {
url: Url::parse(input).context(StorkHttpError::UrlParseError)?,
text: None,
})
}
}
impl From<Url> for Link {
fn from(url: Url) -> Self {
Self { url, text: None }
}
}
/// [`StorkClient`] implementation that fetches pages over HTTP with `reqwest`
/// and yields the links found in them.
pub struct HttpStorkClient {
/// Shared handle to the underlying `reqwest` client; it is cloned into each
/// stream returned by `run`.
client: Arc<reqwest::Client>,
}
impl HttpStorkClient {
    /// Builds a stork client around a caller-configured `reqwest` client.
    ///
    /// The supplied client is placed behind an `Arc` so it can be shared
    /// with the streams this type creates.
    pub fn new(client: ReqwestClient) -> Self {
        let client = Arc::new(client);
        Self { client }
    }
}
impl Default for HttpStorkClient {
    /// Creates a client whose `User-Agent` is `<crate name>/<crate version>`,
    /// taken from Cargo's compile-time package metadata.
    ///
    /// # Panics
    ///
    /// Panics if `reqwest::ClientBuilder::build` returns an error. `Default`
    /// cannot report failure, so the panic message states what went wrong.
    fn default() -> Self {
        let user_agent = concat!(env!("CARGO_PKG_NAME"), "/", env!("CARGO_PKG_VERSION"));
        let client = reqwest::Client::builder()
            .user_agent(user_agent)
            .build()
            .expect("failed to build the default reqwest client");

        Self {
            client: Arc::new(client),
        }
    }
}
impl StorkClient<Link> for HttpStorkClient {
    /// Fetches `src` over HTTP and streams the hyperlinks found in the response.
    ///
    /// The response body is parsed as HTML. Every `<a>` element that passes
    /// the `Not(Attr("rel", "nofollow"))` filter and has an `href` attribute
    /// is yielded as a [`Link`]: its URL is the `href` resolved against the
    /// URL of `src`, and its text is the trimmed anchor text (`None` when
    /// that text is empty).
    ///
    /// # Errors
    ///
    /// The stream yields an error and then ends when:
    /// - the request or reading the body fails (`StorkHttpError::HttpError`),
    /// - the body cannot be read as an HTML document
    ///   (`StorkHttpError::HtmlParseError`),
    /// - an `href` cannot be resolved into a URL
    ///   (`StorkHttpError::UrlParseError`).
    fn run(&self, src: &Link) -> Pin<Box<dyn futures::Stream<Item = Result<Link, Error>>>> {
        let root = src.url.clone();
        let client = Arc::clone(&self.client);

        Box::pin(try_stream! {
            let body = client.get(root.clone())
                .send().await.context(StorkHttpError::HttpError)?
                .bytes().await.context(StorkHttpError::HttpError)?;
            let document = Document::from_read(&body[..]).context(StorkHttpError::HtmlParseError)?;

            // NOTE(review): `Attr("rel", "nofollow")` looks like an exact-value
            // match, so multi-token values such as `rel="nofollow noopener"`
            // may not be excluded — confirm against the `select` crate.
            for node in document.find(And(Name("a"), Not(Attr("rel", "nofollow")))) {
                if let Some(href) = node.attr("href") {
                    // `Url::join` resolves relative references against `root` and
                    // returns absolute URLs as-is, so no scheme sniffing is needed.
                    // The previous `contains("://")` heuristic sent relative hrefs
                    // such as `next?return=http://example.com/` to `Url::parse`,
                    // which rejects them and aborted the whole stream.
                    // NOTE(review): relative links are resolved against the
                    // requested URL, not the final URL after redirects — confirm
                    // that is intended.
                    let url = root.join(href).context(StorkHttpError::UrlParseError)?;

                    // Build the text only for anchors that are actually yielded.
                    let text = Some(node.text().trim().to_string()).filter(|t| !t.is_empty());

                    yield Link { url, text };
                }
            }
        })
    }
}