🏡 index : ~doyle/stork.git

author Jordan Doyle <jordan@doyle.la> 2020-02-14 13:08:42.0 +00:00:00
committer Jordan Doyle <jordan@doyle.la> 2020-02-14 13:08:42.0 +00:00:00
commit
5d3d72522eb212a2771085f1f0fac2d8fb5f7e9e [patch]
tree
75d5270368244630f948832827e93f8aa889b9f6
parent
cb677ad04c59235ce73e1ba4b953b7d87fd405a8
download
5d3d72522eb212a2771085f1f0fac2d8fb5f7e9e.tar.gz

Fixes #1, stops the same Storkable being yielded multiple times by a single Storkable



Diff

 stork/Cargo.toml      |  2 ++
 stork/src/lib.rs      | 30 ++++++++++++++++++++++++++++--
 stork_http/src/lib.rs | 11 +++++++++--
 3 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/stork/Cargo.toml b/stork/Cargo.toml
index 8982023..8394ade 100644
--- a/stork/Cargo.toml
+++ b/stork/Cargo.toml
@@ -16,5 +16,7 @@ dyn-clone = "1.0.1"
futures = "0.3.4"
async-stream = "0.2.1"

twox-hash = ""

[dev-dependencies]
tokio = { version = "0.2", features = ["full"] }
\ No newline at end of file
diff --git a/stork/src/lib.rs b/stork/src/lib.rs
index 364722b..4f3212a 100644
--- a/stork/src/lib.rs
+++ b/stork/src/lib.rs
@@ -27,10 +27,11 @@ use async_stream::try_stream;
use futures::prelude::*;

use std::pin::Pin;
use std::sync::Arc;
use std::sync::{Arc, RwLock};

use failure::Error;
use failure::ResultExt;
use std::hash::{Hash, Hasher};

/// A [Storkable] represents a "thing" which is traversable ("storkable").
///
@@ -53,9 +54,10 @@ pub struct Storkable<T: Unpin + PartialEq + Hash, C: StorkClient<T>> {
    filters: FilterSet<T>,
    client: Arc<C>,
    parent: Option<Arc<Storkable<T, C>>>,
    seen: Arc<RwLock<Vec<u64>>>,
}

impl<'a, T: Unpin + PartialEq + 'a, C: StorkClient<T> + 'a> Storkable<T, C> {
impl<'a, T: Unpin + PartialEq + Hash + 'a, C: StorkClient<T> + 'a> Storkable<T, C> {
    /// Instantiates a new [Storkable] from a T, storking can then
    /// begin on the given entrypoint using the [Storkable::exec] method.
    pub fn new(val: T) -> Self {
@@ -64,6 +66,7 @@ impl<'a, T: Unpin + PartialEq + 'a, C: StorkClient<T> + 'a> Storkable<T, C> {
            filters: FilterSet::default(),
            client: Arc::new(C::default()),
            parent: None,
            seen: Arc::new(RwLock::new(Vec::new())),
        }
    }

@@ -113,6 +116,22 @@ impl<'a, T: Unpin + PartialEq + 'a, C: StorkClient<T> + 'a> Storkable<T, C> {
        false
    }

    /// Records `value` as seen by this Storkable and reports whether it
    /// had already been recorded.
    ///
    /// Returns `false` the first time a given value is passed in and
    /// `true` on every subsequent call with a value that hashes the same.
    /// Only the 64-bit XxHash of the value is stored, not the value itself.
    ///
    /// The lookup and the insertion happen under a single write lock so
    /// that two concurrent callers can't both observe "not seen" for the
    /// same value.
    ///
    /// # Panics
    ///
    /// Panics if the `seen` lock has been poisoned.
    fn check_has_seen(&self, value: &T) -> bool {
        let mut hasher = twox_hash::XxHash64::default();
        value.hash(&mut hasher);
        let hash = hasher.finish();

        // Take the write lock up front: checking under a read lock and
        // then re-locking for the push would leave a window in which
        // another caller could record the same hash.
        let mut seen = self.seen.write().unwrap();
        if seen.contains(&hash) {
            true
        } else {
            seen.push(hash);
            false
        }
    }

    /// Start storking this [Storkable].
    ///
    /// Finds all the followable links on this [Storkable] and returns
@@ -131,6 +150,12 @@ impl<'a, T: Unpin + PartialEq + 'a, C: StorkClient<T> + 'a> Storkable<T, C> {
                    continue;
                }

                // ensure we haven't returned this link before from this
                // Storkable
                if this.check_has_seen(&child) {
                    continue;
                }

                // ensure we're not going to cause a recursive loop by
                // checking that the page we're about to yield isn't a
                // parent of it
@@ -143,6 +168,7 @@ impl<'a, T: Unpin + PartialEq + 'a, C: StorkClient<T> + 'a> Storkable<T, C> {
                    client: Arc::clone(&this.client),
                    filters: this.filters.clone(),
                    parent: Some(Arc::clone(&this)),
                    seen: Arc::new(RwLock::new(Vec::new())),
                };
            }
        }
diff --git a/stork_http/src/lib.rs b/stork_http/src/lib.rs
index 5de75ef..246f4b1 100644
--- a/stork_http/src/lib.rs
+++ b/stork_http/src/lib.rs
@@ -88,6 +88,7 @@ use failure::ResultExt;
use std::sync::Arc;

pub use reqwest::Client as ReqwestClient;
use std::hash::{Hash, Hasher};

pub type HttpStorkable = Storkable<Link, HttpStorkClient>;

@@ -107,7 +108,12 @@ impl Link {
}
impl PartialEq for Link {
    fn eq(&self, other: &Self) -> bool {
        self.url().as_str() == other.url().as_str()
        self.url() == other.url()
    }
}
impl Hash for Link {
    /// Feeds only this link's URL into `state`.
    fn hash<H: Hasher>(&self, state: &mut H) {
        let url = self.url();
        url.hash(state);
    }
}
impl std::str::FromStr for Link {
@@ -174,11 +180,12 @@ impl StorkClient<Link> for HttpStorkClient {

                if let Some(href) = href {
                    // if this looks like a relative url append it to the root
                    let href = if href.starts_with('/') || !href.contains("://") {
                    let mut href = if href.starts_with('/') || !href.contains("://") {
                        root.join(href).context(StorkHttpError::UrlParseError)?
                    } else {
                        Url::parse(href).context(StorkHttpError::UrlParseError)?
                    };
                    href.set_fragment(None);

                    yield Link {
                        url: href,