🏡 index : ~doyle/gitlab-cargo-shim.git

author Alex Butler <alexheretic@gmail.com> 2024-02-23 13:10:28.0 +00:00:00
committer Alex Butler <alexheretic@gmail.com> 2024-02-23 13:10:28.0 +00:00:00
commit
3b453237f1611ce0e85a794747ebf555f6b19c4f [patch]
tree
44b55d914ab5672e4b9c95e0ffed0d15d2e64cbe
parent
881a4ea21ab6b87fb0bb8c49b39e40568c2a6cbd
download
3b453237f1611ce0e85a794747ebf555f6b19c4f.tar.gz

Add file checksum fetch caching



Diff

 CHANGELOG.md                      |   1 +-
 Cargo.lock                        |  27 +++++++++-
 Cargo.toml                        |  10 +--
 config.toml                       |   9 +++-
 src/config.rs                     |  42 ++++++++++++--
 src/providers/gitlab.rs           | 117 +++++++++++++++++++++++++--------------
 src/providers/gitlab/checksums.rs |  40 +++++++++++++-
 src/providers/mod.rs              |   2 +-
 8 files changed, 197 insertions(+), 51 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a7990e5..ba672bd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@
- Add info logs for release & metadata fetch latency.
- When fetching all releases handle 429 by backing off.
- Improve fetch error logging.
- Add file checksum fetch caching controlled by `cache-releases-older-than` config.

# v0.1.4

diff --git a/Cargo.lock b/Cargo.lock
index 385a938..0ec7183 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -681,6 +681,7 @@ dependencies = [
 "clap",
 "futures",
 "hex",
 "humantime-serde",
 "indexmap",
 "indoc",
 "itoa",
@@ -693,6 +694,7 @@ dependencies = [
 "serde",
 "serde_json",
 "shlex",
 "smol_str",
 "thrussh",
 "thrussh-keys",
 "thrussh-libsodium",
@@ -797,6 +799,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"

[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"

[[package]]
name = "humantime-serde"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57a3db5ea5923d99402c94e9feb261dc5ee9b4efa158b0315f788cf549cc200c"
dependencies = [
 "humantime",
 "serde",
]

[[package]]
name = "hyper"
version = "0.14.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1590,6 +1608,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7"

[[package]]
name = "smol_str"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49"
dependencies = [
 "serde",
]

[[package]]
name = "socket2"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
index 51c248c..7f411ce 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,9 +2,7 @@
name = "gitlab-cargo-shim"
version = "0.1.4"
edition = "2021"
authors = [
    "Jordan Doyle <jordan@doyl.ee>"
]
authors = ["Jordan Doyle <jordan@doyl.ee>"]

[dependencies]
anyhow = "1"
@@ -17,6 +15,7 @@ cargo-platform = "0.1"
clap = { version = "4", features = ["derive", "cargo", "wrap_help"] }
futures = "0.3"
hex = "0.4"
humantime-serde = "1.1.1"
indexmap = "2"
indoc = "2.0"
itoa = "1.0"
@@ -26,13 +25,14 @@ parse_link_header = "0.3"
percent-encoding = "2.3"
reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] }
semver = "1.0"
serde = { version = "1.0", features = ["derive"] }
serde = { version = "1.0", features = ["derive", "rc"] }
serde_json = "1"
shlex = "1.1"
smol_str = { version = "0.2.1", features = ["serde"] }
thrussh = "0.34"
thrussh-keys = "0.22"
thrussh-libsodium = "=0.2.1" # 0.2.2 causes dynamic linking by enabling use-pkg-config
time = { version = "0.3", features = ["serde"] }
time = { version = "0.3", features = ["serde", "parsing"] }
tokio = { version = "1.17", features = ["full"] }
tokio-util = { version = "0.7", features = ["codec"] }
toml = "0.5"
diff --git a/config.toml b/config.toml
index a3299a0..e4d15fe 100644
--- a/config.toml
+++ b/config.toml
@@ -23,3 +23,12 @@ uri = "http://127.0.0.1:3000"
## The correct format must be available in the package registry for all
## packages.
# metadata-format = "json"

## Cache file checksum fetches for all releases older than this value.
##
## If omitted no caching will occur.
##
## Note: Caching shouldn't be used if published releases are expected to be mutated.
## However, a grace period can allow the majority of crates to benefit from caching
## but handle mutation of recently published crates.
# cache-releases-older-than = "2 days"
diff --git a/src/config.rs b/src/config.rs
index 0121075..301bff4 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -3,8 +3,7 @@
use crate::providers::gitlab::handle_error;
use clap::Parser;
use serde::{de::DeserializeOwned, Deserialize};
use std::{io, net::SocketAddr, path::PathBuf, str::FromStr};
use time::Duration;
use std::{io, net::SocketAddr, path::PathBuf, str::FromStr, time::Duration};
use url::Url;

#[derive(Parser)]
@@ -36,19 +35,23 @@ pub struct GitlabConfig {
    pub uri: Url,
    /// If absent personal access tokens must be provided.
    pub admin_token: Option<String>,
    // TODO use humantime-serde?
    #[serde(default = "GitlabConfig::default_token_expiry")]
    pub token_expiry: Duration,
    pub token_expiry: time::Duration,
    #[serde(default)]
    pub ssl_cert: Option<String>,
    /// Metadata format for fetching.
    #[serde(default)]
    pub metadata_format: MetadataFormat,
    /// Cache file checksum fetches for all releases older than this value.
    #[serde(default, with = "humantime_serde")]
    pub cache_releases_older_than: Option<Duration>,
}

impl GitlabConfig {
    #[must_use]
    const fn default_token_expiry() -> Duration {
        Duration::days(30)
    const fn default_token_expiry() -> time::Duration {
        time::Duration::days(30)
    }
}

@@ -95,3 +98,32 @@ pub fn from_toml_path<T: DeserializeOwned>(path: &str) -> Result<T, std::io::Err
    let contents = std::fs::read(path)?;
    toml::from_slice(&contents).map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
}

/// Deserializes a representative TOML document into [`Config`] and verifies both the
/// explicitly provided keys and the defaults applied to the keys it leaves out.
#[test]
fn deser_config() {
    // "2 days" expressed in seconds, as expected from the humantime-style duration.
    const TWO_DAYS_SECS: u64 = 2 * 24 * 60 * 60;

    let raw = r#"
        listen-address = "[::]:2222"
        state-directory = "/var/lib/gitlab-cargo-shim"
        [gitlab]
        uri = "http://127.0.0.1:3000"
        metadata-format = "json.zst"
        cache-releases-older-than = "2 days""#;

    let parsed: Config = toml::from_str(raw).unwrap();

    // Top-level settings.
    assert_eq!(parsed.listen_address.to_string(), "[::]:2222");
    assert_eq!(
        parsed.state_directory.to_string_lossy(),
        "/var/lib/gitlab-cargo-shim"
    );

    let gitlab = parsed.gitlab;

    // Keys present in the `[gitlab]` table.
    assert_eq!(gitlab.uri.as_str(), "http://127.0.0.1:3000/");
    assert_eq!(gitlab.metadata_format, MetadataFormat::JsonZst);
    assert_eq!(
        gitlab.cache_releases_older_than,
        Some(Duration::from_secs(TWO_DAYS_SECS))
    );

    // Keys absent from the document fall back to their defaults.
    assert!(gitlab.admin_token.is_none());
    assert!(gitlab.ssl_cert.is_none());
    assert_eq!(gitlab.token_expiry, GitlabConfig::default_token_expiry());
}
diff --git a/src/providers/gitlab.rs b/src/providers/gitlab.rs
index ae3a14d..2c9e863 100644
--- a/src/providers/gitlab.rs
+++ b/src/providers/gitlab.rs
@@ -1,5 +1,6 @@
// blocks_in_conditions: didn't work with `#[instrument...]` usage
#![allow(clippy::module_name_repetitions, clippy::blocks_in_conditions)]
mod checksums;

use crate::{
    config::{GitlabConfig, MetadataFormat},
@@ -8,12 +9,14 @@ use crate::{
use anyhow::Context;
use async_trait::async_trait;
use backoff::backoff::Backoff;
use checksums::ChecksumCache;
use futures::{stream::FuturesUnordered, StreamExt, TryStreamExt};
use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
use reqwest::{header, Certificate};
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use time::{Duration, OffsetDateTime};
use smol_str::{format_smolstr, SmolStr};
use std::{sync::Arc, time::Duration};
use time::OffsetDateTime;
use tokio::sync::Semaphore;
use tracing::{debug, info_span, instrument, Instrument};
use url::Url;
@@ -24,9 +27,11 @@ const PARALLEL_PACKAGE_FILES_GETS: usize = 32;
pub struct Gitlab {
    client: reqwest::Client,
    base_url: Url,
    token_expiry: Duration,
    token_expiry: time::Duration,
    metadata_format: MetadataFormat,
    admin_token: Option<String>,
    checksums: ChecksumCache,
    cache_checksums_older_than: Option<Duration>,
}

impl Gitlab {
@@ -45,8 +50,49 @@ impl Gitlab {
            token_expiry: config.token_expiry,
            metadata_format: config.metadata_format,
            admin_token: config.admin_token.clone(),
            checksums: <_>::default(),
            cache_checksums_older_than: config.cache_releases_older_than,
        })
    }

    async fn fetch_checksum(
        &self,
        key: checksums::Key,
        do_as: &User,
    ) -> anyhow::Result<Option<Arc<str>>> {
        if let Some(chksum) = self.checksums.get(&key) {
            return Ok(Some(chksum));
        }

        let package_files: Vec<GitlabPackageFilesResponse> = handle_error(
            self.client
                .get(key.fetch_url())
                .user_or_admin_token(do_as, &self.admin_token)
                .send_retry_429()
                .await?,
        )
        .await?
        .json()
        .await?;

        let Some(file) = package_files
            .into_iter()
            .find(|package_file| package_file.file_name == key.file_name)
        else {
            return Ok(None);
        };

        // if `cache_checksums_older_than` is configured and this file is old enough
        // cache the checksum to avoid having to fetch again
        if let Some(cache_older_than) = self.cache_checksums_older_than {
            let cache_max_created = OffsetDateTime::now_utc() - cache_older_than;
            if file.created_at < cache_max_created {
                self.checksums.set(key, Arc::clone(&file.file_sha256));
            }
        }

        Ok(Some(file.file_sha256))
    }
}

#[async_trait]
@@ -219,7 +265,7 @@ impl super::PackageProvider for Gitlab {
                            let mut splitter = release.links.web_path.splitn(2, "/-/packages/");
                            match (splitter.next(), splitter.next()) {
                                (Some(project), Some(package)) => (&project[1..], package),
                                _ => return Ok(None),
                                _ => return anyhow::Ok(None),
                            }
                        };

@@ -229,40 +275,29 @@ impl super::PackageProvider for Gitlab {
                                .to_string(),
                        });

                        let package_files: Vec<GitlabPackageFilesResponse> = handle_error(
                            this.client
                                .get(format!(
                                    "{}/projects/{}/packages/{}/package_files",
                                    this.base_url,
                                    utf8_percent_encode(project, NON_ALPHANUMERIC),
                                    utf8_percent_encode(package, NON_ALPHANUMERIC),
                                ))
                                .user_or_admin_token(&do_as, &this.admin_token)
                                .send_retry_429()
                                .await?,
                        )
                        .await?
                        .json()
                        .await?;

                        let expected_file_name =
                            format!("{}-{}.crate", release.name, release.version);

                        Ok::<_, anyhow::Error>(
                            package_files
                                .into_iter()
                                .find(|package_file| package_file.file_name == expected_file_name)
                                .map(move |package_file| {
                                    (
                                        Arc::clone(&package_path),
                                        Release {
                                            name: Arc::from(release.name),
                                            version: release.version,
                                            checksum: package_file.file_sha256,
                                        },
                                    )
                                }),
                        )
                        let key = checksums::Key {
                            base_url: this.base_url.as_str().into(),
                            project: project.into(),
                            package: package.into(),
                            file_name: format_smolstr!(
                                "{}-{}.crate",
                                release.name,
                                release.version
                            ),
                        };

                        let checksum = this.fetch_checksum(key, &do_as).await?;

                        Ok(checksum.map(|checksum| {
                            (
                                Arc::clone(&package_path),
                                Release {
                                    name: Arc::from(release.name),
                                    version: release.version,
                                    checksum,
                                },
                            )
                        }))
                    }
                    .instrument(info_span!("fetch_package_files")),
                );
@@ -357,8 +392,10 @@ pub struct GitlabImpersonationTokenResponse {

#[derive(Deserialize)]
pub struct GitlabPackageFilesResponse {
    pub file_name: String,
    pub file_sha256: String,
    pub file_name: SmolStr,
    #[serde(with = "time::serde::rfc3339")]
    pub created_at: time::OffsetDateTime,
    pub file_sha256: Arc<str>,
}

#[derive(Deserialize)]
diff --git a/src/providers/gitlab/checksums.rs b/src/providers/gitlab/checksums.rs
new file mode 100644
index 0000000..06f2ef7
--- /dev/null
+++ b/src/providers/gitlab/checksums.rs
@@ -0,0 +1,40 @@
use parking_lot::RwLock;
use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
use smol_str::SmolStr;
use std::{collections::HashMap, sync::Arc};

/// Cache of `/package_files` checksums fetched from
/// <https://docs.gitlab.com/ee/api/packages.html#list-package-files>
///
/// Entries live in a [`HashMap`] keyed by [`Key`] behind an [`RwLock`], so the cache
/// can be read and written through a shared reference.
#[derive(Debug, Default)]
pub struct ChecksumCache {
    /// Checksum per [`Key`]; stored as `Arc<str>` so lookups return a cloned handle
    /// instead of copying the string.
    checksums: RwLock<HashMap<Key, Arc<str>>>,
}

impl ChecksumCache {
    /// Looks up the checksum stored for `key`.
    ///
    /// Returns a clone of the stored `Arc<str>`, or `None` if `key` has no entry.
    pub fn get(&self, key: &Key) -> Option<Arc<str>> {
        let entries = self.checksums.read();
        entries.get(key).map(Arc::clone)
    }

    /// Stores `checksum` for `key`, replacing any entry already present.
    pub fn set(&self, key: Key, checksum: Arc<str>) {
        let mut entries = self.checksums.write();
        entries.insert(key, checksum);
    }
}

/// Identifies a single package file whose checksum may be cached.
///
/// All four fields take part in equality and hashing, so two keys are the same
/// cache entry only if every field matches.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Key {
    /// URL prefix placed in front of the `/projects/...` path by [`Key::fetch_url`].
    pub base_url: SmolStr,
    /// Project identifier; percent-encoded when building the fetch URL.
    pub project: SmolStr,
    /// Package identifier; percent-encoded when building the fetch URL.
    pub package: SmolStr,
    /// Name of the file within the package; not part of the fetch URL.
    pub file_name: SmolStr,
}

impl Key {
    /// Builds the URL of the `package_files` listing for this key's project and
    /// package.
    ///
    /// `project` and `package` are percent-encoded with [`NON_ALPHANUMERIC`];
    /// `base_url` is inserted as-is and `file_name` is not used.
    pub fn fetch_url(&self) -> String {
        let encoded_project = utf8_percent_encode(self.project.as_str(), NON_ALPHANUMERIC);
        let encoded_package = utf8_percent_encode(self.package.as_str(), NON_ALPHANUMERIC);

        format!(
            "{}/projects/{}/packages/{}/package_files",
            self.base_url, encoded_project, encoded_package,
        )
    }
}
diff --git a/src/providers/mod.rs b/src/providers/mod.rs
index 6c5a33c..0b22d99 100644
--- a/src/providers/mod.rs
+++ b/src/providers/mod.rs
@@ -50,5 +50,5 @@ pub type ReleaseName = Arc<str>;
pub struct Release {
    pub name: ReleaseName,
    pub version: String,
    pub checksum: String,
    pub checksum: Arc<str>,
}