From 5d3d72522eb212a2771085f1f0fac2d8fb5f7e9e Mon Sep 17 00:00:00 2001 From: Jordan Doyle Date: Fri, 14 Feb 2020 13:08:42 +0000 Subject: [PATCH] Fixes #1, stops the same Storkable being yielded multiple times by a single Storkable --- stork/Cargo.toml | 2 ++ stork/src/lib.rs | 30 ++++++++++++++++++++++++++++-- stork_http/src/lib.rs | 11 +++++++++-- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/stork/Cargo.toml b/stork/Cargo.toml index 8982023..8394ade 100644 --- a/stork/Cargo.toml +++ b/stork/Cargo.toml @@ -16,5 +16,7 @@ dyn-clone = "1.0.1" futures = "0.3.4" async-stream = "0.2.1" +twox-hash = "" + [dev-dependencies] tokio = { version = "0.2", features = ["full"] } \ No newline at end of file diff --git a/stork/src/lib.rs b/stork/src/lib.rs index 364722b..4f3212a 100644 --- a/stork/src/lib.rs +++ b/stork/src/lib.rs @@ -27,10 +27,11 @@ use async_stream::try_stream; use futures::prelude::*; use std::pin::Pin; -use std::sync::Arc; +use std::sync::{Arc, RwLock}; use failure::Error; use failure::ResultExt; +use std::hash::{Hash, Hasher}; /// A [Storkable] represents a "thing" which is traversable ("storkable"). /// @@ -53,9 +54,10 @@ pub struct Storkable> { filters: FilterSet, client: Arc, parent: Option>>, + seen: Arc>>, } -impl<'a, T: Unpin + PartialEq + 'a, C: StorkClient + 'a> Storkable { +impl<'a, T: Unpin + PartialEq + Hash + 'a, C: StorkClient + 'a> Storkable { /// Instantiates a new [Storkable] from a T, storking can then /// begin on the given entrypoint using the [Storkable::exec] method. pub fn new(val: T) -> Self { @@ -64,6 +66,7 @@ impl<'a, T: Unpin + PartialEq + 'a, C: StorkClient + 'a> Storkable { filters: FilterSet::default(), client: Arc::new(C::default()), parent: None, + seen: Arc::new(RwLock::new(Vec::new())), } } @@ -113,6 +116,22 @@ impl<'a, T: Unpin + PartialEq + 'a, C: StorkClient + 'a> Storkable { false } + /// Checks if this Storkable has seen this `value` before. If it + /// hasn't, this method will return false but any subsequent calls + /// with the same value will return true. + fn check_has_seen(&self, value: &T) -> bool { + let mut hasher = twox_hash::XxHash64::default(); + value.hash(&mut hasher); + let hash = hasher.finish(); + + return if self.seen.read().unwrap().contains(&hash) { + true + } else { + self.seen.write().unwrap().push(hash); + false + }; + } + /// Start storking this [Storkable]. /// /// Finds all the followable links on this [Storkable] and returns @@ -131,6 +150,12 @@ impl<'a, T: Unpin + PartialEq + 'a, C: StorkClient + 'a> Storkable { continue; } + // ensure we haven't returned this link before from this + // Storkable + if this.check_has_seen(&child) { + continue; + } + // ensure we're not going to cause a recursive loop by // checking that the page we're about to yield isn't a // parent of it @@ -143,6 +168,7 @@ impl<'a, T: Unpin + PartialEq + 'a, C: StorkClient + 'a> Storkable { client: Arc::clone(&this.client), filters: this.filters.clone(), parent: Some(Arc::clone(&this)), + seen: Arc::new(RwLock::new(Vec::new())), }; } } diff --git a/stork_http/src/lib.rs b/stork_http/src/lib.rs index 5de75ef..246f4b1 100644 --- a/stork_http/src/lib.rs +++ b/stork_http/src/lib.rs @@ -88,6 +88,7 @@ use failure::ResultExt; use std::sync::Arc; pub use reqwest::Client as ReqwestClient; +use std::hash::{Hash, Hasher}; pub type HttpStorkable = Storkable; @@ -107,7 +108,12 @@ impl Link { } impl PartialEq for Link { fn eq(&self, other: &Self) -> bool { - self.url().as_str() == other.url().as_str() + self.url() == other.url() + } +} +impl Hash for Link { + fn hash(&self, state: &mut H) { + self.url().hash(state) } } impl std::str::FromStr for Link { @@ -174,11 +180,12 @@ impl StorkClient for HttpStorkClient { if let Some(href) = href { // if this looks like a relative url append it to the root - let href = if href.starts_with('/') || !href.contains("://") { + let mut href = if href.starts_with('/') || !href.contains("://") { root.join(href).context(StorkHttpError::UrlParseError)? } else { Url::parse(href).context(StorkHttpError::UrlParseError)? }; + href.set_fragment(None); yield Link { url: href, -- libgit2 1.7.2