From 3b453237f1611ce0e85a794747ebf555f6b19c4f Mon Sep 17 00:00:00 2001 From: Alex Butler Date: Fri, 23 Feb 2024 13:10:28 +0000 Subject: [PATCH] Add file checksum fetch caching --- CHANGELOG.md | 1 + Cargo.lock | 27 +++++++++++++++++++++++++++ Cargo.toml | 10 +++++----- config.toml | 9 +++++++++ src/config.rs | 42 +++++++++++++++++++++++++++++++++++++----- src/providers/gitlab.rs | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------------- src/providers/gitlab/checksums.rs | 40 ++++++++++++++++++++++++++++++++++++++++ src/providers/mod.rs | 2 +- 8 files changed, 197 insertions(+), 51 deletions(-) create mode 100644 src/providers/gitlab/checksums.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index a7990e5..ba672bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - Add info logs for release & metadata fetch latency. - When fetching all releases handle 429 by backing off. - Improve fetch error logging. +- Add file checksum fetch caching controlled by `cache-releases-older-than` config. # v0.1.4 diff --git a/Cargo.lock b/Cargo.lock index 385a938..0ec7183 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -681,6 +681,7 @@ dependencies = [ "clap", "futures", "hex", + "humantime-serde", "indexmap", "indoc", "itoa", @@ -693,6 +694,7 @@ dependencies = [ "serde", "serde_json", "shlex", + "smol_str", "thrussh", "thrussh-keys", "thrussh-libsodium", @@ -797,6 +799,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "humantime-serde" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57a3db5ea5923d99402c94e9feb261dc5ee9b4efa158b0315f788cf549cc200c" +dependencies = [ + "humantime", + "serde", +] + +[[package]] name = "hyper" version = "0.14.28" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1590,6 +1608,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] +name = "smol_str" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49" +dependencies = [ + "serde", +] + +[[package]] name = "socket2" version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" diff --git a/Cargo.toml b/Cargo.toml index 51c248c..7f411ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,9 +2,7 @@ name = "gitlab-cargo-shim" version = "0.1.4" edition = "2021" -authors = [ - "Jordan Doyle " -] +authors = ["Jordan Doyle "] [dependencies] anyhow = "1" @@ -17,6 +15,7 @@ cargo-platform = "0.1" clap = { version = "4", features = ["derive", "cargo", "wrap_help"] } futures = "0.3" hex = "0.4" +humantime-serde = "1.1.1" indexmap = "2" indoc = "2.0" itoa = "1.0" @@ -26,13 +25,14 @@ parse_link_header = "0.3" percent-encoding = "2.3" reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] } semver = "1.0" -serde = { version = "1.0", features = ["derive"] } +serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1" shlex = "1.1" +smol_str = { version = "0.2.1", features = ["serde"] } thrussh = "0.34" thrussh-keys = "0.22" thrussh-libsodium = "=0.2.1" # 0.2.2 causes dynamic linking by enabling use-pkg-config -time = { version = "0.3", features = ["serde"] } +time = { version = "0.3", features = ["serde", "parsing"] } tokio = { version = "1.17", features = ["full"] } tokio-util = { version = "0.7", features = ["codec"] } toml = "0.5" diff --git a/config.toml b/config.toml index a3299a0..e4d15fe 100644 --- a/config.toml +++ b/config.toml @@ -23,3 +23,12 @@ uri = "http://127.0.0.1:3000" ## The correct format must be available in the package registry for all ## packages. # metadata-format = "json" + +## Cache file checksum fetches for all release older than this value. +## +## If omitted no caching will occur. +## +## Note: Caching shouldn't be used if published releases are expected to be mutated. +## However, a grace period can allow the majority of crates to benefit from caching +## but handle mutation of recently published crates. +# cache-releases-older-than = "2 days" diff --git a/src/config.rs b/src/config.rs index 0121075..301bff4 100644 --- a/src/config.rs +++ b/src/config.rs @@ -3,8 +3,7 @@ use crate::providers::gitlab::handle_error; use clap::Parser; use serde::{de::DeserializeOwned, Deserialize}; -use std::{io, net::SocketAddr, path::PathBuf, str::FromStr}; -use time::Duration; +use std::{io, net::SocketAddr, path::PathBuf, str::FromStr, time::Duration}; use url::Url; #[derive(Parser)] @@ -36,19 +35,23 @@ pub struct GitlabConfig { pub uri: Url, /// If absent personal access tokens must be provided. pub admin_token: Option, + // TODO use humantime-serde? #[serde(default = "GitlabConfig::default_token_expiry")] - pub token_expiry: Duration, + pub token_expiry: time::Duration, #[serde(default)] pub ssl_cert: Option, /// Metadata format for fetching. #[serde(default)] pub metadata_format: MetadataFormat, + /// Cache file checksum fetches for all release older than this value. + #[serde(default, with = "humantime_serde")] + pub cache_releases_older_than: Option, } impl GitlabConfig { #[must_use] - const fn default_token_expiry() -> Duration { - Duration::days(30) + const fn default_token_expiry() -> time::Duration { + time::Duration::days(30) } } @@ -95,3 +98,32 @@ pub fn from_toml_path(path: &str) -> Result, + checksums: ChecksumCache, + cache_checksums_older_than: Option, } impl Gitlab { @@ -45,8 +50,49 @@ impl Gitlab { token_expiry: config.token_expiry, metadata_format: config.metadata_format, admin_token: config.admin_token.clone(), + checksums: <_>::default(), + cache_checksums_older_than: config.cache_releases_older_than, }) } + + async fn fetch_checksum( + &self, + key: checksums::Key, + do_as: &User, + ) -> anyhow::Result>> { + if let Some(chksum) = self.checksums.get(&key) { + return Ok(Some(chksum)); + } + + let package_files: Vec = handle_error( + self.client + .get(key.fetch_url()) + .user_or_admin_token(do_as, &self.admin_token) + .send_retry_429() + .await?, + ) + .await? + .json() + .await?; + + let Some(file) = package_files + .into_iter() + .find(|package_file| package_file.file_name == key.file_name) + else { + return Ok(None); + }; + + // if `cache_checksums_older_than` is configured and this file is old enough + // cache the checksum to avoid having to fetch again + if let Some(cache_older_than) = self.cache_checksums_older_than { + let cache_max_created = OffsetDateTime::now_utc() - cache_older_than; + if file.created_at < cache_max_created { + self.checksums.set(key, Arc::clone(&file.file_sha256)); + } + } + + Ok(Some(file.file_sha256)) + } } #[async_trait] @@ -219,7 +265,7 @@ impl super::PackageProvider for Gitlab { let mut splitter = release.links.web_path.splitn(2, "/-/packages/"); match (splitter.next(), splitter.next()) { (Some(project), Some(package)) => (&project[1..], package), - _ => return Ok(None), + _ => return anyhow::Ok(None), } }; @@ -229,40 +275,29 @@ impl super::PackageProvider for Gitlab { .to_string(), }); - let package_files: Vec = handle_error( - this.client - .get(format!( - "{}/projects/{}/packages/{}/package_files", - this.base_url, - utf8_percent_encode(project, NON_ALPHANUMERIC), - utf8_percent_encode(package, NON_ALPHANUMERIC), - )) - .user_or_admin_token(&do_as, &this.admin_token) - .send_retry_429() - .await?, - ) - .await? - .json() - .await?; - - let expected_file_name = - format!("{}-{}.crate", release.name, release.version); - - Ok::<_, anyhow::Error>( - package_files - .into_iter() - .find(|package_file| package_file.file_name == expected_file_name) - .map(move |package_file| { - ( - Arc::clone(&package_path), - Release { - name: Arc::from(release.name), - version: release.version, - checksum: package_file.file_sha256, - }, - ) - }), - ) + let key = checksums::Key { + base_url: this.base_url.as_str().into(), + project: project.into(), + package: package.into(), + file_name: format_smolstr!( + "{}-{}.crate", + release.name, + release.version + ), + }; + + let checksum = this.fetch_checksum(key, &do_as).await?; + + Ok(checksum.map(|checksum| { + ( + Arc::clone(&package_path), + Release { + name: Arc::from(release.name), + version: release.version, + checksum, + }, + ) + })) } .instrument(info_span!("fetch_package_files")), ); @@ -357,8 +392,10 @@ pub struct GitlabImpersonationTokenResponse { #[derive(Deserialize)] pub struct GitlabPackageFilesResponse { - pub file_name: String, - pub file_sha256: String, + pub file_name: SmolStr, + #[serde(with = "time::serde::rfc3339")] + pub created_at: time::OffsetDateTime, + pub file_sha256: Arc, } #[derive(Deserialize)] diff --git a/src/providers/gitlab/checksums.rs b/src/providers/gitlab/checksums.rs new file mode 100644 index 0000000..06f2ef7 --- /dev/null +++ b/src/providers/gitlab/checksums.rs @@ -0,0 +1,40 @@ +use parking_lot::RwLock; +use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC}; +use smol_str::SmolStr; +use std::{collections::HashMap, sync::Arc}; + +/// Cache of fetched `/package_files` checksums fetched from +/// +#[derive(Debug, Default)] +pub struct ChecksumCache { + checksums: RwLock>>, +} + +impl ChecksumCache { + pub fn get(&self, key: &Key) -> Option> { + self.checksums.read().get(key).cloned() + } + + pub fn set(&self, key: Key, checksum: Arc) { + self.checksums.write().insert(key, checksum); + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Key { + pub base_url: SmolStr, + pub project: SmolStr, + pub package: SmolStr, + pub file_name: SmolStr, +} + +impl Key { + pub fn fetch_url(&self) -> String { + format!( + "{}/projects/{}/packages/{}/package_files", + self.base_url, + utf8_percent_encode(self.project.as_str(), NON_ALPHANUMERIC), + utf8_percent_encode(self.package.as_str(), NON_ALPHANUMERIC), + ) + } +} diff --git a/src/providers/mod.rs b/src/providers/mod.rs index 6c5a33c..0b22d99 100644 --- a/src/providers/mod.rs +++ b/src/providers/mod.rs @@ -50,5 +50,5 @@ pub type ReleaseName = Arc; pub struct Release { pub name: ReleaseName, pub version: String, - pub checksum: String, + pub checksum: Arc, } -- libgit2 1.7.2