From 3b453237f1611ce0e85a794747ebf555f6b19c4f Mon Sep 17 00:00:00 2001 From: Alex Butler Date: Fri, 23 Feb 2024 13:10:28 +0000 Subject: [PATCH] Add file checksum fetch caching --- CHANGELOG.md | 1 + Cargo.lock | 27 +++++++++++++++++++++++++++ Cargo.toml | 10 +++++----- config.toml | 9 +++++++++ src/config.rs | 42 ++++++++++++++++++++++++++++++++++++++++++ src/providers/gitlab.rs | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------ src/providers/mod.rs | 2 +- src/providers/gitlab/checksums.rs | 40 ++++++++++++++++++++++++++++++++++++++++ 8 files changed, 197 insertions(+), 51 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a7990e5..ba672bd 100644 --- a/CHANGELOG.md +++ a/CHANGELOG.md @@ -6,6 +6,7 @@ - Add info logs for release & metadata fetch latency. - When fetching all releases handle 429 by backing off. - Improve fetch error logging. +- Add file checksum fetch caching controlled by `cache-releases-older-than` config. # v0.1.4 diff --git a/Cargo.lock b/Cargo.lock index 385a938..0ec7183 100644 --- a/Cargo.lock +++ a/Cargo.lock @@ -681,6 +681,7 @@ "clap", "futures", "hex", + "humantime-serde", "indexmap", "indoc", "itoa", @@ -693,6 +694,7 @@ "serde", "serde_json", "shlex", + "smol_str", "thrussh", "thrussh-keys", "thrussh-libsodium", @@ -797,6 +799,22 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "humantime-serde" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57a3db5ea5923d99402c94e9feb261dc5ee9b4efa158b0315f788cf549cc200c" +dependencies = [ + "humantime", + "serde", +] + +[[package]] name = "hyper" version = "0.14.28" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -1588,6 +1606,15 @@ version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" + +[[package]] +name = "smol_str" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6845563ada680337a52d43bb0b29f396f2d911616f6573012645b9e3d048a49" +dependencies = [ + "serde", +] [[package]] name = "socket2" diff --git a/Cargo.toml b/Cargo.toml index 51c248c..7f411ce 100644 --- a/Cargo.toml +++ a/Cargo.toml @@ -1,10 +1,8 @@ [package] name = "gitlab-cargo-shim" version = "0.1.4" edition = "2021" -authors = [ - "Jordan Doyle " -] +authors = ["Jordan Doyle "] [dependencies] anyhow = "1" @@ -17,6 +15,7 @@ clap = { version = "4", features = ["derive", "cargo", "wrap_help"] } futures = "0.3" hex = "0.4" +humantime-serde = "1.1.1" indexmap = "2" indoc = "2.0" itoa = "1.0" @@ -26,13 +25,14 @@ percent-encoding = "2.3" reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] } semver = "1.0" -serde = { version = "1.0", features = ["derive"] } +serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1" shlex = "1.1" +smol_str = { version = "0.2.1", features = ["serde"] } thrussh = "0.34" thrussh-keys = "0.22" thrussh-libsodium = "=0.2.1" # 0.2.2 causes dynamic linking by enabling use-pkg-config -time = { version = "0.3", features = ["serde"] } +time = { version = "0.3", features = ["serde", "parsing"] } tokio = { version = "1.17", features = ["full"] } tokio-util = { version = "0.7", features = ["codec"] } toml = "0.5" diff --git a/config.toml b/config.toml index a3299a0..e4d15fe 100644 --- a/config.toml +++ a/config.toml @@ -23,3 +23,12 @@ ## The correct format must be available in the package registry for all ## packages. # metadata-format = "json" + +## Cache file checksum fetches for all release older than this value. +## +## If omitted no caching will occur. +## +## Note: Caching shouldn't be used if published releases are expected to be mutated. +## However, a grace period can allow the majority of crates to benefit from caching +## but handle mutation of recently published crates. +# cache-releases-older-than = "2 days" diff --git a/src/config.rs b/src/config.rs index 0121075..301bff4 100644 --- a/src/config.rs +++ a/src/config.rs @@ -1,10 +1,9 @@ #![allow(clippy::module_name_repetitions)] use crate::providers::gitlab::handle_error; use clap::Parser; use serde::{de::DeserializeOwned, Deserialize}; -use std::{io, net::SocketAddr, path::PathBuf, str::FromStr}; -use time::Duration; +use std::{io, net::SocketAddr, path::PathBuf, str::FromStr, time::Duration}; use url::Url; #[derive(Parser)] @@ -36,19 +35,23 @@ pub uri: Url, /// If absent personal access tokens must be provided. pub admin_token: Option, + // TODO use humantime-serde? #[serde(default = "GitlabConfig::default_token_expiry")] - pub token_expiry: Duration, + pub token_expiry: time::Duration, #[serde(default)] pub ssl_cert: Option, /// Metadata format for fetching. #[serde(default)] pub metadata_format: MetadataFormat, + /// Cache file checksum fetches for all release older than this value. + #[serde(default, with = "humantime_serde")] + pub cache_releases_older_than: Option, } impl GitlabConfig { #[must_use] - const fn default_token_expiry() -> Duration { - Duration::days(30) + const fn default_token_expiry() -> time::Duration { + time::Duration::days(30) } } @@ -94,4 +97,33 @@ pub fn from_toml_path(path: &str) -> Result { let contents = std::fs::read(path)?; toml::from_slice(&contents).map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e)) +} + +#[test] +fn deser_config() { + let conf = r#" + listen-address = "[::]:2222" + state-directory = "/var/lib/gitlab-cargo-shim" + [gitlab] + uri = "http://127.0.0.1:3000" + metadata-format = "json.zst" + cache-releases-older-than = "2 days""#; + + let conf: Config = toml::from_str(conf).unwrap(); + assert_eq!( + conf.state_directory.to_string_lossy(), + "/var/lib/gitlab-cargo-shim" + ); + assert_eq!(conf.listen_address.to_string(), "[::]:2222"); + + let gitlab = conf.gitlab; + assert_eq!(gitlab.uri.as_str(), "http://127.0.0.1:3000/"); + assert_eq!(gitlab.admin_token, None); + assert_eq!(gitlab.token_expiry, GitlabConfig::default_token_expiry()); + assert_eq!(gitlab.ssl_cert, None); + assert_eq!(gitlab.metadata_format, MetadataFormat::JsonZst); + assert_eq!( + gitlab.cache_releases_older_than, + Some(Duration::from_secs(2 * 24 * 60 * 60)) + ); } diff --git a/src/providers/gitlab.rs b/src/providers/gitlab.rs index ae3a14d..2c9e863 100644 --- a/src/providers/gitlab.rs +++ a/src/providers/gitlab.rs @@ -1,5 +1,6 @@ // blocks_in_conditions: didn't work with `#[instrument...`` usage #![allow(clippy::module_name_repetitions, clippy::blocks_in_conditions)] +mod checksums; use crate::{ config::{GitlabConfig, MetadataFormat}, @@ -8,12 +9,14 @@ use anyhow::Context; use async_trait::async_trait; use backoff::backoff::Backoff; +use checksums::ChecksumCache; use futures::{stream::FuturesUnordered, StreamExt, TryStreamExt}; use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest::{header, Certificate}; use serde::{Deserialize, Serialize}; -use std::sync::Arc; -use time::{Duration, OffsetDateTime}; +use smol_str::{format_smolstr, SmolStr}; +use std::{sync::Arc, time::Duration}; +use time::OffsetDateTime; use tokio::sync::Semaphore; use tracing::{debug, info_span, instrument, Instrument}; use url::Url; @@ -24,9 +27,11 @@ pub struct Gitlab { client: reqwest::Client, base_url: Url, - token_expiry: Duration, + token_expiry: time::Duration, metadata_format: MetadataFormat, admin_token: Option, + checksums: ChecksumCache, + cache_checksums_older_than: Option, } impl Gitlab { @@ -45,8 +50,49 @@ token_expiry: config.token_expiry, metadata_format: config.metadata_format, admin_token: config.admin_token.clone(), + checksums: <_>::default(), + cache_checksums_older_than: config.cache_releases_older_than, }) } + + async fn fetch_checksum( + &self, + key: checksums::Key, + do_as: &User, + ) -> anyhow::Result>> { + if let Some(chksum) = self.checksums.get(&key) { + return Ok(Some(chksum)); + } + + let package_files: Vec = handle_error( + self.client + .get(key.fetch_url()) + .user_or_admin_token(do_as, &self.admin_token) + .send_retry_429() + .await?, + ) + .await? + .json() + .await?; + + let Some(file) = package_files + .into_iter() + .find(|package_file| package_file.file_name == key.file_name) + else { + return Ok(None); + }; + + // if `cache_checksums_older_than` is configured and this file is old enough + // cache the checksum to avoid having to fetch again + if let Some(cache_older_than) = self.cache_checksums_older_than { + let cache_max_created = OffsetDateTime::now_utc() - cache_older_than; + if file.created_at < cache_max_created { + self.checksums.set(key, Arc::clone(&file.file_sha256)); + } + } + + Ok(Some(file.file_sha256)) + } } #[async_trait] @@ -219,7 +265,7 @@ let mut splitter = release.links.web_path.splitn(2, "/-/packages/"); match (splitter.next(), splitter.next()) { (Some(project), Some(package)) => (&project[1..], package), - _ => return Ok(None), + _ => return anyhow::Ok(None), } }; @@ -228,41 +274,30 @@ package_name: utf8_percent_encode(&release.name, NON_ALPHANUMERIC) .to_string(), }); + + let key = checksums::Key { + base_url: this.base_url.as_str().into(), + project: project.into(), + package: package.into(), + file_name: format_smolstr!( + "{}-{}.crate", + release.name, + release.version + ), + }; - let package_files: Vec = handle_error( - this.client - .get(format!( - "{}/projects/{}/packages/{}/package_files", - this.base_url, - utf8_percent_encode(project, NON_ALPHANUMERIC), - utf8_percent_encode(package, NON_ALPHANUMERIC), - )) - .user_or_admin_token(&do_as, &this.admin_token) - .send_retry_429() - .await?, - ) - .await? - .json() - .await?; - - let expected_file_name = - format!("{}-{}.crate", release.name, release.version); - - Ok::<_, anyhow::Error>( - package_files - .into_iter() - .find(|package_file| package_file.file_name == expected_file_name) - .map(move |package_file| { - ( - Arc::clone(&package_path), - Release { - name: Arc::from(release.name), - version: release.version, - checksum: package_file.file_sha256, - }, - ) - }), - ) + let checksum = this.fetch_checksum(key, &do_as).await?; + + Ok(checksum.map(|checksum| { + ( + Arc::clone(&package_path), + Release { + name: Arc::from(release.name), + version: release.version, + checksum, + }, + ) + })) } .instrument(info_span!("fetch_package_files")), ); @@ -357,8 +392,10 @@ #[derive(Deserialize)] pub struct GitlabPackageFilesResponse { - pub file_name: String, - pub file_sha256: String, + pub file_name: SmolStr, + #[serde(with = "time::serde::rfc3339")] + pub created_at: time::OffsetDateTime, + pub file_sha256: Arc, } #[derive(Deserialize)] diff --git a/src/providers/mod.rs b/src/providers/mod.rs index 6c5a33c..0b22d99 100644 --- a/src/providers/mod.rs +++ a/src/providers/mod.rs @@ -50,5 +50,5 @@ pub struct Release { pub name: ReleaseName, pub version: String, - pub checksum: String, + pub checksum: Arc, } diff --git a/src/providers/gitlab/checksums.rs b/src/providers/gitlab/checksums.rs new file mode 100644 index 0000000..06f2ef7 100644 --- /dev/null +++ a/src/providers/gitlab/checksums.rs @@ -1,0 +1,40 @@ +use parking_lot::RwLock; +use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC}; +use smol_str::SmolStr; +use std::{collections::HashMap, sync::Arc}; + +/// Cache of fetched `/package_files` checksums fetched from +/// +#[derive(Debug, Default)] +pub struct ChecksumCache { + checksums: RwLock>>, +} + +impl ChecksumCache { + pub fn get(&self, key: &Key) -> Option> { + self.checksums.read().get(key).cloned() + } + + pub fn set(&self, key: Key, checksum: Arc) { + self.checksums.write().insert(key, checksum); + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Key { + pub base_url: SmolStr, + pub project: SmolStr, + pub package: SmolStr, + pub file_name: SmolStr, +} + +impl Key { + pub fn fetch_url(&self) -> String { + format!( + "{}/projects/{}/packages/{}/package_files", + self.base_url, + utf8_percent_encode(self.project.as_str(), NON_ALPHANUMERIC), + utf8_percent_encode(self.package.as_str(), NON_ALPHANUMERIC), + ) + } +} -- rgit 0.1.3