From 9ad3b95a4ebcf11de1f46b3061bd6d818c835f5d Mon Sep 17 00:00:00 2001 From: Jordan Doyle Date: Sat, 16 Jul 2022 14:36:36 +0100 Subject: [PATCH] Start reading repository metadata from sled --- Cargo.lock | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- Cargo.toml | 5 +++-- src/git.rs | 98 ++------------------------------------------------------------------------------ src/main.rs | 15 ++++++++++++++- src/database/indexer.rs | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/database/mod.rs | 2 ++ src/methods/index.rs | 22 +++++++++++++++++----- src/database/schema/commit.rs | 20 ++++++++++++++++++++ src/database/schema/mod.rs | 3 +++ src/database/schema/prefixes.rs | 35 +++++++++++++++++++++++++++++++++++ src/database/schema/repository.rs | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 11 files changed, 378 insertions(+), 118 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 03badbd..22eda4e 100644 --- a/Cargo.lock +++ a/Cargo.lock @@ -42,12 +42,6 @@ checksum = "bb07d2053ccdbe10e2af2995a2f116c1330396493dc1269f6a91d0ae82e19704" [[package]] -name = "arc-swap" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5d78ce20460b82d3fa150275ed9d55e21064fc7951177baacf86a145c4a4b1f" - -[[package]] name = "askama" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -280,6 +274,12 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c53dfa917ec274df8ed3c572698f381a24eef2efba9492d797301b72b6db408a" + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" @@ -473,7 +473,21 @@ "crossbeam-utils 0.7.2", "lazy_static", "maybe-uninit", - "memoffset", + "memoffset 0.5.6", + "scopeguard", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07db9d94cbd326813772c968ccd25999e5f8ae22f4f8d1b11effa37ef6ce281d" +dependencies = [ + "autocfg 1.1.0", + "cfg-if 1.0.0", + "crossbeam-utils 0.8.10", + "memoffset 0.6.5", + "once_cell", "scopeguard", ] @@ -619,6 +633,16 @@ ] [[package]] +name = "fs2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +dependencies = [ + "libc", + "winapi", +] + +[[package]] name = "fuchsia-cprng" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -726,6 +750,15 @@ "pin-project-lite", "pin-utils", "slab", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", ] [[package]] @@ -1063,6 +1096,15 @@ version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa" +dependencies = [ + "autocfg 1.1.0", +] + +[[package]] +name = "memoffset" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" dependencies = [ "autocfg 1.1.0", ] @@ -1119,12 +1161,12 @@ "async-io", "async-lock", "crossbeam-channel", - "crossbeam-epoch", + "crossbeam-epoch 0.8.2", "crossbeam-utils 0.8.10", "futures-util", "num_cpus", "once_cell", - "parking_lot", + "parking_lot 0.12.1", "quanta", "scheduled-thread-pool", "skeptic", @@ -1274,6 +1316,17 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "427c3892f9e783d91cc128285287e70a59e206ca452770ece88a76f7a3eddd72" + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core 0.8.5", +] [[package]] name = "parking_lot" @@ -1282,7 +1335,21 @@ checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" dependencies = [ "lock_api", - "parking_lot_core", + "parking_lot_core 0.9.3", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" +dependencies = [ + "cfg-if 1.0.0", + "instant", + "libc", + "redox_syscall", + "smallvec", + "winapi", ] [[package]] @@ -1640,10 +1707,10 @@ version = "0.1.0" dependencies = [ "anyhow", - "arc-swap", "askama", "axum", "bat", + "bincode", "clap", "futures", "git2", @@ -1651,10 +1718,11 @@ "humantime", "md5", "moka", - "parking_lot", + "parking_lot 0.12.1", "path-clean", "rsass", "serde", + "sled", "syntect", "time 0.3.11", "timeago", @@ -1712,7 +1780,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "977a7519bff143a44f842fd07e80ad1329295bd71686457f18e496736f4bf9bf" dependencies = [ - "parking_lot", + "parking_lot 0.12.1", ] [[package]] @@ -1829,6 +1897,22 @@ version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32" + +[[package]] +name = "sled" +version = "0.34.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f96b4737c2ce5987354855aed3797279def4ebf734436c6aa4552cf8e169935" +dependencies = [ + "crc32fast", + "crossbeam-epoch 0.9.9", + "crossbeam-utils 0.8.10", + "fs2", + "fxhash", + "libc", + "log", + "parking_lot 0.11.2", +] [[package]] name = "smallvec" @@ -1991,6 +2075,7 @@ "itoa", "libc", "num_threads", + "serde", ] [[package]] @@ -2030,7 +2115,7 @@ "mio", "num_cpus", "once_cell", - "parking_lot", + "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", diff --git a/Cargo.toml b/Cargo.toml index 90a1901..2409754 100644 --- a/Cargo.toml +++ a/Cargo.toml @@ -7,9 +7,9 @@ [dependencies] askama = "0.11" -arc-swap = "1.5" axum = "0.5" bat = { version = "0.21", default-features = false, features = ["build-assets"] } +bincode = "1.3" clap = { version = "3.2", features = ["cargo"] } futures = "0.3" git2 = "0.14" @@ -21,7 +21,8 @@ parking_lot = "0.12" serde = { version = "1.0", features = ["derive"] } syntect = "5" -time = "0.3" +sled = "0.34" +time = { version = "0.3", features = ["serde"] } timeago = "0.3" tokio = { version = "1.19", features = ["full"] } tower = "0.4" diff --git a/src/git.rs b/src/git.rs index 38aa78b..11e3811 100644 --- a/src/git.rs +++ a/src/git.rs @@ -1,16 +1,7 @@ -use std::{ - borrow::Cow, - collections::BTreeMap, - fmt::Write, - path::{Path, PathBuf}, - sync::Arc, - time::Duration, -}; +use std::{borrow::Cow, fmt::Write, path::PathBuf, sync::Arc, time::Duration}; -use arc_swap::ArcSwapOption; use git2::{ - BranchType, DiffFormat, DiffLineType, DiffOptions, DiffStatsFormat, ObjectType, Oid, - Repository, Signature, + BranchType, DiffFormat, DiffLineType, DiffOptions, DiffStatsFormat, ObjectType, Oid, Signature, }; use moka::future::Cache; use parking_lot::Mutex; @@ -20,13 +11,10 @@ use time::OffsetDateTime; use tracing::instrument; -pub type RepositoryMetadataList = BTreeMap, Vec>; - pub struct Git { commits: Cache>, readme_cache: Cache>>, refs: Cache>, - repository_metadata: ArcSwapOption, syntax_set: SyntaxSet, } @@ -46,7 +34,6 @@ .time_to_live(Duration::from_secs(10)) .max_capacity(100) .build(), - repository_metadata: ArcSwapOption::default(), syntax_set, } } @@ -66,29 +53,7 @@ git: self, cache_key: repo_path, repo: Mutex::new(repo), - }) - } - - #[instrument(skip(self))] - pub async fn fetch_repository_metadata(&self) -> Arc { - if let Some(metadata) = self.repository_metadata.load().as_ref() { - return Arc::clone(metadata); - } - - let start = Path::new("../test-git").canonicalize().unwrap(); - - let repos = tokio::task::spawn_blocking(move || { - let mut repos: RepositoryMetadataList = RepositoryMetadataList::new(); - fetch_repository_metadata_impl(&start, &start, &mut repos); - repos }) - .await - .unwrap(); - - let repos = Arc::new(repos); - self.repository_metadata.store(Some(repos.clone())); - - repos } } @@ -445,14 +410,6 @@ pub struct Tag { pub name: String, pub tagger: Option, -} - -#[derive(Debug)] -pub struct RepositoryMetadata { - pub name: String, - pub description: Option>, - pub owner: Option, - pub last_modified: OffsetDateTime, } #[derive(Debug)] @@ -648,55 +605,4 @@ .unwrap(); diff_output -} - -#[instrument(skip(repos))] -fn fetch_repository_metadata_impl( - start: &Path, - current: &Path, - repos: &mut RepositoryMetadataList, -) { - let dirs = std::fs::read_dir(current) - .unwrap() - .map(|v| v.unwrap().path()) - .filter(|path| path.is_dir()); - - for dir in dirs { - let repository = match Repository::open_bare(&dir) { - Ok(v) => v, - Err(_e) => { - fetch_repository_metadata_impl(start, &dir, repos); - continue; - } - }; - - let repo_path = Some( - current - .strip_prefix(start) - .unwrap() - .to_string_lossy() - .into_owned(), - ) - .filter(|v| !v.is_empty()); - let repos = repos.entry(repo_path).or_default(); - - let description = std::fs::read_to_string(dir.join("description")) - .map(Cow::Owned) - .ok(); - let last_modified = std::fs::metadata(&dir).unwrap().modified().unwrap(); - let owner = repository.config().unwrap().get_string("gitweb.owner").ok(); - - repos.push(RepositoryMetadata { - name: dir - .components() - .last() - .unwrap() - .as_os_str() - .to_string_lossy() - .into_owned(), - description, - owner, - last_modified: OffsetDateTime::from(last_modified), - }); - } } diff --git a/src/main.rs b/src/main.rs index 2e7d5f5..3fc07d3 100644 --- a/src/main.rs +++ a/src/main.rs @@ -14,6 +14,7 @@ use crate::{git::Git, layers::logger::LoggingMiddleware}; +mod database; mod git; mod layers; mod methods; @@ -27,6 +28,17 @@ let subscriber = subscriber.pretty(); subscriber.init(); + let db = sled::open("/tmp/some-sled.db").unwrap(); + + std::thread::spawn({ + let db = db.clone(); + + move || { + crate::database::indexer::run_indexer(&db); + eprintln!("finished indexer"); + } + }); + let bat_assets = HighlightingAssets::from_binary(); let syntax_set = bat_assets.get_syntax_set().unwrap().clone(); let theme = bat_assets.get_theme("GitHub"); @@ -49,7 +61,8 @@ .route("/highlight.css", get(static_css(css))) .fallback(methods::repo::service.into_service()) .layer(layer_fn(LoggingMiddleware)) - .layer(Extension(Arc::new(Git::new(syntax_set)))); + .layer(Extension(Arc::new(Git::new(syntax_set)))) + .layer(Extension(db)); axum::Server::bind(&"127.0.0.1:3333".parse().unwrap()) .serve(app.into_make_service_with_connect_info::()) diff --git a/src/database/indexer.rs b/src/database/indexer.rs new file mode 100644 index 0000000..56bf671 100644 --- /dev/null +++ a/src/database/indexer.rs @@ -1,0 +1,101 @@ +use std::path::{Path, PathBuf}; +use time::OffsetDateTime; + +use crate::database::schema::repository::{Repository, RepositoryId}; + +pub fn run_indexer(db: &sled::Db) { + let scan_path = Path::new("/Users/jordan/Code/test-git"); + update_repository_metadata(scan_path, &db); + + for (relative_path, _repository) in Repository::fetch_all(&db) { + let git_repository = git2::Repository::open(scan_path.join(relative_path)).unwrap(); + + for reference in git_repository.references().unwrap() { + let _reference = if let Some(reference) = reference.as_ref().ok().and_then(|v| v.name()) + { + reference + } else { + continue; + }; + + // let mut revwalk = git_repository.revwalk().unwrap(); + // revwalk.set_sorting(Sort::REVERSE).unwrap(); + // revwalk.push_ref(reference).unwrap(); + // + // for rev in revwalk { + // let rev = rev.unwrap(); + // let commit = git_repository.find_commit(rev).unwrap(); + // } + } + } +} + +fn update_repository_metadata(scan_path: &Path, db: &sled::Db) { + let mut discovered = Vec::new(); + discover_repositories(scan_path, &mut discovered); + + for repository in discovered { + let relative = get_relative_path(scan_path, &repository); + + let id = Repository::open(db, relative) + .map(|v| v.id) + .unwrap_or_else(|| RepositoryId::new(db)); + let name = relative.file_name().unwrap().to_string_lossy().to_string(); + let description = Some( + String::from_utf8_lossy( + &std::fs::read(repository.join("description")).unwrap_or_default(), + ) + .to_string(), + ) + .filter(|v| !v.is_empty()); + + Repository { + id, + name, + description, + owner: None, // TODO read this from config + last_modified: OffsetDateTime::now_utc(), + } + .insert(db, relative); + } +} + +// util + +fn get_relative_path<'a>(relative_to: &Path, full_path: &'a Path) -> &'a Path { + full_path.strip_prefix(relative_to).unwrap() +} + +fn discover_repositories(current: &Path, discovered_repos: &mut Vec) { + let dirs = std::fs::read_dir(current) + .unwrap() + .map(|v| v.unwrap().path()) + .filter(|path| path.is_dir()); + + for dir in dirs { + if dir.join("packed-refs").is_file() { + // we've hit what looks like a bare git repo, lets take it + discovered_repos.push(dir); + } else { + // probably not a bare git repo, lets recurse deeper + discover_repositories(&dir, discovered_repos); + } + } +} + +#[cfg(test)] +mod test { + use crate::database::schema::repository::Repository; + use time::Instant; + + #[test] + fn test_discovery() { + let db = sled::open(std::env::temp_dir().join("sled-test.db")).unwrap(); + + let start = Instant::now(); + super::run_indexer(&db); + let repo = Repository::open(&db, "1p.git"); + + panic!("{} - {:#?}", start.elapsed(), repo); + } +} diff --git a/src/database/mod.rs b/src/database/mod.rs new file mode 100644 index 0000000..aa8039f 100644 --- /dev/null +++ a/src/database/mod.rs @@ -1,0 +1,2 @@ +pub mod indexer; +pub mod schema; diff --git a/src/methods/index.rs b/src/methods/index.rs index 9497930..a10ed46 100644 --- a/src/methods/index.rs +++ a/src/methods/index.rs @@ -1,19 +1,31 @@ +use std::collections::BTreeMap; + use askama::Template; use axum::response::Response; use axum::Extension; -use std::sync::Arc; use super::filters; -use crate::{git::RepositoryMetadataList, into_response, Git}; +use crate::database::schema::repository::Repository; +use crate::into_response; #[derive(Template)] #[template(path = "index.html")] pub struct View { - pub repositories: Arc, + pub repositories: BTreeMap, Vec>, } + +pub async fn handle(Extension(db): Extension) -> Response { + let mut repositories: BTreeMap, Vec> = BTreeMap::new(); + + for (k, v) in Repository::fetch_all(&db) { + // TODO: fixme + let mut split: Vec<_> = k.split('/').collect(); + split.pop(); + let key = Some(split.join("/")).filter(|v| !v.is_empty()); -pub async fn handle(Extension(git): Extension>) -> Response { - let repositories = git.fetch_repository_metadata().await; + let k = repositories.entry(key).or_default(); + k.push(v); + } into_response(&View { repositories }) } diff --git a/src/database/schema/commit.rs b/src/database/schema/commit.rs new file mode 100644 index 0000000..eca799a 100644 --- /dev/null +++ a/src/database/schema/commit.rs @@ -1,0 +1,20 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize, Debug)] +pub struct Commit { + age: String, + message: String, + author: String, +} + +impl Commit {} + +pub struct CommitVault { + _tree: sled::Tree, +} + +impl CommitVault { + pub(super) fn new(tree: sled::Tree) -> Self { + Self { _tree: tree } + } +} diff --git a/src/database/schema/mod.rs b/src/database/schema/mod.rs new file mode 100644 index 0000000..ae2930f 100644 --- /dev/null +++ a/src/database/schema/mod.rs @@ -1,0 +1,3 @@ +pub mod commit; +pub mod prefixes; +pub mod repository; diff --git a/src/database/schema/prefixes.rs b/src/database/schema/prefixes.rs new file mode 100644 index 0000000..e088bba 100644 --- /dev/null +++ a/src/database/schema/prefixes.rs @@ -1,0 +1,35 @@ +use crate::database::schema::repository::RepositoryId; +use std::path::Path; + +#[repr(u8)] +pub enum TreePrefix { + Repository = 0, + Commit = 100, + _Tag = 101, +} + +impl TreePrefix { + pub fn repository_id>(path: T) -> Vec { + let path = path.as_ref().to_string_lossy(); + let path_bytes = path.as_bytes(); + + let mut prefixed = Vec::with_capacity(path_bytes.len() + std::mem::size_of::()); + prefixed.push(Self::Repository as u8); + prefixed.extend_from_slice(path_bytes); + + prefixed + } + + pub fn commit_id>(repository: RepositoryId, commit: T) -> Vec { + let commit = commit.as_ref(); + + let mut prefixed = Vec::with_capacity( + commit.len() + std::mem::size_of::() + std::mem::size_of::(), + ); + prefixed.push(TreePrefix::Commit as u8); + prefixed.extend_from_slice(&repository.to_ne_bytes()); + prefixed.extend_from_slice(&commit); + + prefixed + } +} diff --git a/src/database/schema/repository.rs b/src/database/schema/repository.rs new file mode 100644 index 0000000..da272e6 100644 --- /dev/null +++ a/src/database/schema/repository.rs @@ -1,0 +1,82 @@ +use crate::database::schema::commit::CommitVault; +use crate::database::schema::prefixes::TreePrefix; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::ops::Deref; +use std::path::Path; +use time::OffsetDateTime; + +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Hash)] +pub struct Repository { + /// The ID of the repository, as stored in `sled` + pub id: RepositoryId, + /// The "clean name" of the repository (ie. `hello-world.git`) + pub name: String, + /// The description of the repository, as it is stored in the `description` file in the + /// bare repo root + pub description: Option, + /// The owner of the repository (`gitweb.owner` in the repository configuration) + pub owner: Option, + /// The last time this repository was updated, currently read from the directory mtime + pub last_modified: OffsetDateTime, +} + +impl Repository { + pub fn fetch_all(database: &sled::Db) -> HashMap { + database + .scan_prefix([TreePrefix::Repository as u8]) + .filter_map(Result::ok) + .map(|(k, v)| { + let key = String::from_utf8_lossy(&k[1..]).to_string(); + let value = bincode::deserialize(&v).unwrap(); + + (key, value) + }) + .collect() + } + + pub fn insert>(&self, database: &sled::Db, path: P) { + database + .insert( + TreePrefix::repository_id(path), + bincode::serialize(self).unwrap(), + ) + .unwrap(); + } + + pub fn open>(database: &sled::Db, path: P) -> Option { + database + .get(TreePrefix::repository_id(path)) + .unwrap() + .map(|v| bincode::deserialize(&v)) + .transpose() + .unwrap() + } + + #[allow(dead_code)] + pub fn commit_vault(&self, database: &sled::Db, commit: &str) -> CommitVault { + let commit = hex::decode(commit).unwrap(); + let tree = database + .open_tree(TreePrefix::commit_id(self.id, commit)) + .unwrap(); + + CommitVault::new(tree) + } +} + +#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub struct RepositoryId(pub(super) u64); + +impl RepositoryId { + pub fn new(db: &sled::Db) -> Self { + Self(db.generate_id().unwrap()) + } +} + +impl Deref for RepositoryId { + type Target = u64; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} -- rgit 0.1.3