From 1a476f691146854c0afb059d625fd3faac19b924 Mon Sep 17 00:00:00 2001 From: Jordan Doyle Date: Mon, 28 Apr 2025 01:16:54 +0700 Subject: [PATCH] Index trees into RocksDB --- Cargo.toml | 2 +- src/git.rs | 271 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------- src/main.rs | 12 +++++++++++- src/database/indexer.rs | 193 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- src/methods/filters.rs | 4 ++-- templates/repo/tree.html | 59 ++++++++++++++++++++++++++++------------------------------- src/database/schema/commit.rs | 3 +++ src/database/schema/mod.rs | 3 ++- src/database/schema/prefixes.rs | 2 ++ src/database/schema/tag.rs | 9 +++++++-- src/database/schema/tree.rs | 191 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/methods/repo/mod.rs | 6 ++++++ src/methods/repo/tree.rs | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 13 files changed, 608 insertions(+), 261 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fac855b..777cf65 100644 --- a/Cargo.toml +++ a/Cargo.toml @@ -84,7 +84,7 @@ unix_mode = "0.1" uuid = { version = "1.11", features = ["v4"] } v_htmlescape = { version = "0.15", features = ["bytes-buf"] } -xxhash-rust = { version = "0.8.15", features = ["const_xxh3"] } +xxhash-rust = { version = "0.8.15", features = ["const_xxh3", "xxh3"] } yoke = { version = "0.7.5", features = ["derive"] } [features] diff --git a/src/git.rs b/src/git.rs index f682517..f3ed9c4 100644 --- a/src/git.rs +++ a/src/git.rs @@ -7,18 +7,17 @@ actor::SignatureRef, bstr::{BStr, BString, ByteSlice, ByteVec}, diff::blob::{platform::prepare_diff::Operation, Sink}, - object::{tree::EntryKind, Kind}, + object::Kind, objs::{tree::EntryRef, CommitRef, TagRef}, prelude::TreeEntryRefExt, traverse::tree::visit::Action, - url::Scheme, - ObjectId, ThreadSafeRepository, Url, + ObjectId, ThreadSafeRepository, }; -use itertools::{Either, Itertools}; +use itertools::Either; use moka::future::Cache; use std::{ borrow::Cow, - collections::{BTreeMap, VecDeque}, + collections::VecDeque, ffi::OsStr, fmt::{self, Arguments, Write}, io::ErrorKind, @@ -121,7 +120,7 @@ path: Option, tree_id: Option<&str>, formatted: bool, - ) -> Result { + ) -> Result { let tree_id = tree_id .map(ObjectId::from_str) .transpose() @@ -154,8 +153,6 @@ Kind::Blob => { let mut blob = object.into_blob(); - let size = blob.data.len(); - let content = match (formatted, simdutf8::basic::from_utf8(&blob.data)) { (true, Err(_)) => Content::Binary(vec![]), (true, Ok(data)) => Content::Text(Cow::Owned(format_file( @@ -168,107 +165,13 @@ })), }; - return Ok(PathDestination::File(FileWithContent { - metadata: File { - mode: item.mode().0, - size, - path: path.clone(), - name: item.filename().to_string(), - }, - content, - })); + return Ok(FileWithContent { content }); } - Kind::Tree => { - tree = object.into_tree(); - } _ => anyhow::bail!("bad object of type {:?}", object.kind), - } - } - - let mut tree_items = Vec::new(); - let submodules = repo - .submodules()? - .into_iter() - .flatten() - .filter_map(|v| Some((v.name().to_path_lossy().to_path_buf(), v.url().ok()?))) - .collect::>(); - - for item in tree.iter() { - let item = item?; - - let path = path - .clone() - .unwrap_or_default() - .join(item.filename().to_path_lossy()); - - match item.mode().kind() { - EntryKind::Tree - | EntryKind::Blob - | EntryKind::BlobExecutable - | EntryKind::Link => { - let mut object = item - .object() - .context("Expected item in tree to be object but it wasn't")?; - - tree_items.push(match object.kind { - Kind::Blob => TreeItem::File(File { - mode: item.mode().0, - size: object.into_blob().data.len(), - path, - name: item.filename().to_string(), - }), - Kind::Tree => { - let mut children = PathBuf::new(); - - // if the tree only has one child, flatten it down - while let Ok(Some(Ok(item))) = object - .try_into_tree() - .iter() - .flat_map(gix::Tree::iter) - .at_most_one() - { - let nested_object = item.object().context( - "Expected item in tree to be object but it wasn't", - )?; - - if nested_object.kind != Kind::Tree { - break; - } - - object = nested_object; - children.push(item.filename().to_path_lossy()); - } - - TreeItem::Tree(Tree { - mode: item.mode().0, - path, - children, - name: item.filename().to_string(), - }) - } - _ => continue, - }); - } - EntryKind::Commit => { - if let Some(mut url) = submodules.get(path.as_path()).cloned() { - if matches!(url.scheme, Scheme::Git | Scheme::Ssh) { - url.scheme = Scheme::Https; - } - - tree_items.push(TreeItem::Submodule(Submodule { - mode: item.mode().0, - name: item.filename().to_string(), - url, - oid: item.object_id(), - })); - - continue; - } - } } } - Ok(PathDestination::Tree(tree_items)) + anyhow::bail!("bad object"); }) .await .context("Failed to join Tokio task")? @@ -442,16 +345,16 @@ } let buffer = BytesMut::with_capacity(BUFFER_CAP + 1024); - let mut visitor = ArchivalVisitor { + let mut visitor = PathVisitor::new(ArchivalVisitor { repository: &repo, res, archive: Builder::new(GzEncoder::new(buffer.writer(), flate2::Compression::fast())), - path_deque: VecDeque::new(), - path: BString::default(), - }; + }); tree.traverse().breadthfirst(&mut visitor)?; + let visitor = visitor.into_inner(); + visitor.res.blocking_send(Ok(visitor .archive .into_inner()? @@ -514,16 +417,77 @@ } const BUFFER_CAP: usize = 512 * 1024; + +pub trait PathVisitorHandler { + fn visit(&mut self, entry: &EntryRef<'_>, path: &BStr) -> Action; +} -pub struct ArchivalVisitor<'a> { +struct ArchivalVisitor<'a> { repository: &'a gix::Repository, res: tokio::sync::mpsc::Sender>, archive: Builder>>, +} + +impl PathVisitorHandler for ArchivalVisitor<'_> { + fn visit(&mut self, entry: &EntryRef<'_>, path: &BStr) -> Action { + let entry = entry.attach(self.repository); + + let Ok(object) = entry.object() else { + return Action::Continue; + }; + + if object.kind != Kind::Blob { + return Action::Continue; + } + + let blob = object.into_blob(); + + let mut header = tar::Header::new_gnu(); + if let Err(error) = header.set_path(path.to_path_lossy()) { + warn!(%error, "Attempted to write invalid path to archive"); + return Action::Continue; + } + header.set_size(blob.data.len() as u64); + #[allow(clippy::cast_sign_loss)] + header.set_mode(entry.mode().0.into()); + header.set_cksum(); + + if let Err(error) = self.archive.append(&header, blob.data.as_slice()) { + warn!(%error, "Failed to append to archive"); + return Action::Cancel; + } + + if self.archive.get_ref().get_ref().get_ref().len() >= BUFFER_CAP { + let b = self.archive.get_mut().get_mut().get_mut().split().freeze(); + + if self.res.blocking_send(Ok(b)).is_err() { + return Action::Cancel; + } + } + + Action::Continue + } +} + +pub struct PathVisitor { path_deque: VecDeque, path: BString, + inner: T, } + +impl PathVisitor { + pub fn new(inner: T) -> Self { + Self { + path_deque: VecDeque::new(), + path: BString::default(), + inner, + } + } + + pub fn into_inner(self) -> T { + self.inner + } -impl ArchivalVisitor<'_> { fn pop_element(&mut self) { if let Some(pos) = memchr::memrchr(b'/', &self.path) { self.path.resize(pos, 0); @@ -533,6 +497,9 @@ } fn push_element(&mut self, name: &BStr) { + if name.is_empty() { + return; + } if !self.path.is_empty() { self.path.push(b'/'); } @@ -540,7 +507,7 @@ } } -impl gix::traverse::tree::Visit for ArchivalVisitor<'_> { +impl gix::traverse::tree::Visit for PathVisitor { fn pop_front_tracked_path_and_set_current(&mut self) { self.path = self .path_deque @@ -549,10 +516,7 @@ } fn pop_back_tracked_path_and_set_current(&mut self) { - self.path = self - .path_deque - .pop_back() - .expect("every call is matched with push_tracked_path_component"); + self.path = self.path_deque.pop_back().unwrap_or_default(); } fn push_back_tracked_path_component(&mut self, component: &BStr) { @@ -568,47 +532,12 @@ self.pop_element(); } - fn visit_tree(&mut self, _entry: &EntryRef<'_>) -> Action { - Action::Continue + fn visit_tree(&mut self, entry: &EntryRef<'_>) -> Action { + self.inner.visit(entry, self.path.as_ref()) } fn visit_nontree(&mut self, entry: &EntryRef<'_>) -> Action { - let entry = entry.attach(self.repository); - - let Ok(object) = entry.object() else { - return Action::Continue; - }; - - if object.kind != Kind::Blob { - return Action::Continue; - } - - let blob = object.into_blob(); - - let mut header = tar::Header::new_gnu(); - if let Err(error) = header.set_path(self.path.to_path_lossy()) { - warn!(%error, "Attempted to write invalid path to archive"); - return Action::Continue; - } - header.set_size(blob.data.len() as u64); - #[allow(clippy::cast_sign_loss)] - header.set_mode(entry.mode().0.into()); - header.set_cksum(); - - if let Err(error) = self.archive.append(&header, blob.data.as_slice()) { - warn!(%error, "Failed to append to archive"); - return Action::Cancel; - } - - if self.archive.get_ref().get_ref().get_ref().len() >= BUFFER_CAP { - let b = self.archive.get_mut().get_mut().get_mut().split().freeze(); - - if self.res.blocking_send(Ok(b)).is_err() { - return Action::Cancel; - } - } - - Action::Continue + self.inner.visit(entry, self.path.as_ref()) } } @@ -634,47 +563,11 @@ pub enum ReadmeFormat { Markdown, Plaintext, -} - -pub enum PathDestination { - Tree(Vec), - File(FileWithContent), -} - -pub enum TreeItem { - Tree(Tree), - File(File), - Submodule(Submodule), -} - -#[derive(Debug)] -pub struct Submodule { - pub mode: u16, - pub name: String, - pub url: Url, - pub oid: ObjectId, -} - -#[derive(Debug)] -pub struct Tree { - pub mode: u16, - pub name: String, - pub children: PathBuf, - pub path: PathBuf, -} - -#[derive(Debug)] -pub struct File { - pub mode: u16, - pub size: usize, - pub name: String, - pub path: PathBuf, } #[derive(Debug)] #[allow(unused)] pub struct FileWithContent { - pub metadata: File, pub content: Content, } diff --git a/src/main.rs b/src/main.rs index 6e8a4b1..9105436 100644 --- a/src/main.rs +++ a/src/main.rs @@ -23,7 +23,10 @@ }; use clap::Parser; use const_format::formatcp; -use database::schema::SCHEMA_VERSION; +use database::schema::{ + prefixes::{TREE_FAMILY, TREE_ITEM_FAMILY}, + SCHEMA_VERSION, +}; use rocksdb::{Options, SliceTransform}; use tokio::{ net::TcpListener, @@ -257,6 +260,11 @@ tag_family_options.set_prefix_extractor(SliceTransform::create_fixed_prefix( std::mem::size_of::(), )); // repository id prefix + + let mut tree_item_family_options = Options::default(); + tree_item_family_options.set_prefix_extractor(SliceTransform::create_fixed_prefix( + std::mem::size_of::() + std::mem::size_of::(), + )); let db = rocksdb::DB::open_cf_with_opts( &db_options, @@ -267,6 +275,8 @@ (TAG_FAMILY, tag_family_options), (REFERENCE_FAMILY, Options::default()), (COMMIT_COUNT_FAMILY, Options::default()), + (TREE_FAMILY, Options::default()), + (TREE_ITEM_FAMILY, tree_item_family_options), ], )?; diff --git a/src/database/indexer.rs b/src/database/indexer.rs index 7311c32..9f9b007 100644 --- a/src/database/indexer.rs +++ a/src/database/indexer.rs @@ -1,5 +1,5 @@ use std::{ - collections::HashSet, + collections::{BTreeMap, HashSet}, ffi::OsStr, fmt::Debug, io::{BufRead, BufReader}, @@ -8,18 +8,30 @@ }; use anyhow::Context; -use gix::{bstr::ByteSlice, refs::Category, Reference}; +use gix::{ + bstr::{BStr, ByteSlice}, + objs::tree::EntryKind, + refs::Category, + url::Scheme, + ObjectId, Reference, Url, +}; use itertools::{Either, Itertools}; use rocksdb::WriteBatch; use time::{OffsetDateTime, UtcOffset}; use tracing::{error, info, info_span, instrument, warn}; - -use crate::database::schema::{ - commit::Commit, - repository::{ArchivedRepository, Repository, RepositoryId}, - tag::{Tag, TagTree}, +use xxhash_rust::xxh3::Xxh3; + +use crate::{ + database::schema::{ + commit::Commit, + repository::{ArchivedRepository, Repository, RepositoryId}, + tag::{Tag, TagTree}, + }, + git::{PathVisitor, PathVisitorHandler}, }; +use super::schema::tree::{Tree, TreeItem, TreeItemKind}; + pub fn run(scan_path: &Path, repository_list: Option<&Path>, db: &Arc) { let span = info_span!("index_update"); let _entered = span.enter(); @@ -157,6 +169,18 @@ Ok(v) => v, Err(error) => { error!(%error, "Failed to read references for {relative_path}"); + continue; + } + }; + + let submodules = match git_repository.submodules() { + Ok(submodules) => submodules + .into_iter() + .flatten() + .filter_map(|v| Some((v.name().to_path_lossy().to_path_buf(), v.url().ok()?))) + .collect::>(), + Err(error) => { + error!(%error, "Failed to read submodules for {relative_path}"); continue; } }; @@ -189,6 +213,7 @@ db.clone(), &git_repository, false, + &submodules, ) { error!(%error, "Failed to update reflog for {relative_path}@{:?}", valid_references.last()); } @@ -208,6 +233,7 @@ db: Arc, git_repository: &gix::Repository, force_reindex: bool, + submodules: &BTreeMap, ) -> Result<(), anyhow::Error> { info!("Refreshing indexes"); @@ -238,6 +264,8 @@ .into_iter() .rev(); + let mut hasher = Xxh3::new(); + let tree_len = commit_tree.len()?; let mut seen = false; let mut i = 0; @@ -266,12 +294,16 @@ let commit = commit.decode()?; let author = commit.author(); let committer = commit.committer(); + + let tree = git_repository.find_tree(commit.tree())?; + let tree_id = index_tree(&db, &mut batch, &tree, &mut hasher, submodules)?; - Commit::new(oid, &commit, author, committer)?.insert( + Commit::new(oid, &commit, author, committer, tree_id)?.insert( &commit_tree, tree_len + i, &mut batch, )?; + i += 1; } @@ -289,10 +321,117 @@ db, git_repository, true, + submodules, ); } Ok(()) +} + +fn index_tree( + database: &rocksdb::DB, + batch: &mut WriteBatch, + tree: &gix::Tree<'_>, + hasher: &mut Xxh3, + submodules: &BTreeMap, +) -> Result { + hasher.reset(); + tree.traverse() + .breadthfirst(&mut PathVisitor::new(TreeHasherVisitor { hasher }))?; + let digest = hasher.digest(); + + if !TreeItem::contains(database, digest)? { + tree.traverse() + .breadthfirst(&mut PathVisitor::new(TreeItemIndexerVisitor { + buffer: Vec::new(), + digest, + database, + batch, + submodules, + }))?; + } + + Tree { + indexed_tree_id: digest, + } + .insert(database, batch, tree.id)?; + + Ok(digest) +} + +/// Walks the entire tree and hashes all the (path, mode)s so trees can be deduplicated. +/// +/// Note: unlike git's tree oid, this does not take into account blob contents. +struct TreeHasherVisitor<'a> { + hasher: &'a mut Xxh3, +} + +impl PathVisitorHandler for TreeHasherVisitor<'_> { + fn visit( + &mut self, + entry: &gix::objs::tree::EntryRef<'_>, + path: &BStr, + ) -> gix::traverse::tree::visit::Action { + self.hasher.update(path); + self.hasher.update(&entry.mode.to_ne_bytes()); + gix::traverse::tree::visit::Action::Continue + } +} + +struct TreeItemIndexerVisitor<'a> { + digest: u64, + buffer: Vec, + database: &'a rocksdb::DB, + batch: &'a mut WriteBatch, + submodules: &'a BTreeMap, +} + +impl PathVisitorHandler for TreeItemIndexerVisitor<'_> { + fn visit( + &mut self, + entry: &gix::objs::tree::EntryRef<'_>, + path: &BStr, + ) -> gix::traverse::tree::visit::Action { + let kind = match entry.mode.kind() { + EntryKind::Blob | EntryKind::BlobExecutable | EntryKind::Link => TreeItemKind::File, + EntryKind::Commit => { + let Some(mut url) = self + .submodules + .get(&path.to_path_lossy().into_owned()) + .cloned() + else { + return gix::traverse::tree::visit::Action::Continue; + }; + + if matches!(url.scheme, Scheme::Git | Scheme::Ssh) { + url.scheme = Scheme::Https; + } + + TreeItemKind::Submodule(match entry.oid.to_owned() { + ObjectId::Sha1(oid) => super::schema::tree::Submodule { + url: url.to_string(), + oid, + }, + }) + } + EntryKind::Tree => TreeItemKind::Tree, + }; + + TreeItem { + mode: entry.mode.0, + kind, + } + .insert( + &mut self.buffer, + self.digest, + path, + self.database, + self.batch, + ) + .expect("failed to insert TreeItem"); + + gix::traverse::tree::visit::Action::Continue + } } #[instrument(skip(db))] @@ -309,6 +448,18 @@ let Some(git_repository) = open_repo(scan_path, &relative_path, db_repository.get(), &db) else { continue; + }; + + let submodules = match git_repository.submodules() { + Ok(submodules) => submodules + .into_iter() + .flatten() + .filter_map(|v| Some((v.name().to_path_lossy().to_path_buf(), v.url().ok()?))) + .collect::>(), + Err(error) => { + error!(%error, "Failed to read submodules for {relative_path}"); + continue; + } }; if let Err(error) = tag_index_scan( @@ -316,6 +467,7 @@ db_repository.get(), db.clone(), &git_repository, + &submodules, ) { error!(%error, "Failed to update tags for {relative_path}"); } @@ -328,6 +480,7 @@ db_repository: &ArchivedRepository, db: Arc, git_repository: &gix::Repository, + submodules: &BTreeMap, ) -> Result<(), anyhow::Error> { let tag_tree = db_repository.tag_tree(db); @@ -343,7 +496,7 @@ // insert any git tags that are missing from the index for tag_name in git_tags.difference(&indexed_tags) { - tag_index_update(tag_name, git_repository, &tag_tree)?; + tag_index_update(tag_name, git_repository, &tag_tree, submodules)?; } // remove any extra tags that the index has @@ -360,15 +513,31 @@ tag_name: &str, git_repository: &gix::Repository, tag_tree: &TagTree, + submodules: &BTreeMap, ) -> Result<(), anyhow::Error> { let mut reference = git_repository .find_reference(tag_name) .context("Failed to read newly discovered tag")?; + + let tree_id = if let Ok(tree) = reference.peel_to_tree() { + let mut batch = WriteBatch::default(); + let tree_id = index_tree( + &tag_tree.db, + &mut batch, + &tree, + &mut Xxh3::new(), + submodules, + )?; + tag_tree.db.write_without_wal(batch)?; + Some(tree_id) + } else { + None + }; if let Ok(tag) = reference.peel_to_tag() { info!("Inserting newly discovered tag to index"); - Tag::new(tag.tagger()?)?.insert(tag_tree, tag_name)?; + Tag::new(tag.tagger()?, tree_id)?.insert(tag_tree, tag_name)?; } Ok(()) @@ -420,7 +589,7 @@ discovered_repos: &mut Vec<(PathBuf, gix::Repository)>, ) { let dirs = if let Some(repo_list) = repository_list { - let mut repo_list = match std::fs::File::open(&repo_list) { + let repo_list = match std::fs::File::open(repo_list) { Ok(v) => BufReader::new(v).lines(), Err(error) => { error!(%error, "Failed to open repository list file"); @@ -430,7 +599,7 @@ let mut out = Vec::new(); - while let Some(line) = repo_list.next() { + for line in repo_list { let line = match line { Ok(v) => v, Err(error) => { diff --git a/src/methods/filters.rs b/src/methods/filters.rs index a0e15f9..e39b3d2 100644 --- a/src/methods/filters.rs +++ a/src/methods/filters.rs @@ -37,8 +37,8 @@ .convert((OffsetDateTime::now_utc() - s.into().0).try_into().unwrap())) } -pub fn file_perms(s: &u16) -> Result { - Ok(unix_mode::to_string(u32::from(*s))) +pub fn file_perms(s: u16) -> Result { + Ok(unix_mode::to_string(u32::from(s))) } pub struct DisplayHexBuffer(pub const_hex::Buffer); diff --git a/templates/repo/tree.html b/templates/repo/tree.html index 1d87346..4f69b3c 100644 --- a/templates/repo/tree.html +++ a/templates/repo/tree.html @@ -1,52 +1,47 @@ {% import "macros/breadcrumbs.html" as breadcrumbs %} {% extends "repo/base.html" %} {% block tree_nav_class %}active{% endblock %} {% block subnav %} - {% call breadcrumbs::breadcrumbs(repo_path, query) %} +{% call breadcrumbs::breadcrumbs(repo_path, query) %} {% endblock %} {% block content %}
- +
- + - - + - {% for item in items -%} - - {% match item -%} - {%- when crate::git::TreeItem::Tree with (tree) -%} - - - - - - {%- when crate::git::TreeItem::File with (file) -%} - - - - - - {%- when crate::git::TreeItem::Submodule with (submodule) -%} - - - - + {% for (name, name_split, item) in items -%} + + + {% set local_name = name.get()[*name_split..] -%} + {% set local_name = local_name.strip_prefix('/').unwrap_or(local_name) -%} + {% match item.get().kind -%} + {%- when ArchivedTreeItemKind::Tree -%} + + {%- when ArchivedTreeItemKind::File -%} + + {%- when ArchivedTreeItemKind::Submodule with (submodule) -%} + {%- endmatch %} - - {% endfor -%} + + {% endfor -%} -
Mode NameSize
{{ tree.mode|file_perms }}
{{ tree.name }}
-            {%- for child in tree.children.ancestors().collect_vec().into_iter().rev() -%}
-                {%- if let Some(file_name) = child.file_name() %} / {{ file_name.to_string_lossy() }}{%- endif -%}
-            {%- endfor -%}
-        
{{ file.mode|file_perms }}
{{ file.name }}
{{ file.size }}
{{ submodule.mode|file_perms }}
🔗 {{ submodule.name }} @ {{ submodule.oid.to_hex_with_len(7) }}
+
{{ item.get().mode.to_native()|file_perms }}
+
+
{{ local_name }}
+
+
{{ local_name }}
+
+
🔗 {{ local_name }} @ {{ submodule.oid|hex }}
+
+
{% endblock %} diff --git a/src/database/schema/commit.rs b/src/database/schema/commit.rs index 6ebbb4a..12307b4 100644 --- a/src/database/schema/commit.rs +++ a/src/database/schema/commit.rs @@ -21,6 +21,7 @@ pub author: Author, pub committer: Author, pub hash: [u8; 20], + pub tree: u64, } impl Commit { @@ -29,6 +30,7 @@ commit: &CommitRef<'_>, author: SignatureRef<'_>, committer: SignatureRef<'_>, + tree: u64, ) -> Result { let message = commit.message(); @@ -40,6 +42,7 @@ hash: match oid { ObjectId::Sha1(d) => d, }, + tree, }) } diff --git a/src/database/schema/mod.rs b/src/database/schema/mod.rs index 3e6f177..2e8a91b 100644 --- a/src/database/schema/mod.rs +++ a/src/database/schema/mod.rs @@ -6,7 +6,8 @@ pub mod prefixes; pub mod repository; pub mod tag; +pub mod tree; pub type Yoked = Yoke>; -pub const SCHEMA_VERSION: &str = "3"; +pub const SCHEMA_VERSION: &str = "4"; diff --git a/src/database/schema/prefixes.rs b/src/database/schema/prefixes.rs index 299364b..d6cd311 100644 --- a/src/database/schema/prefixes.rs +++ a/src/database/schema/prefixes.rs @@ -1,5 +1,7 @@ pub const COMMIT_FAMILY: &str = "commit"; pub const COMMIT_COUNT_FAMILY: &str = "commit_count"; pub const REPOSITORY_FAMILY: &str = "repository"; pub const TAG_FAMILY: &str = "tag"; pub const REFERENCE_FAMILY: &str = "repository_refs"; +pub const TREE_FAMILY: &str = "tree"; +pub const TREE_ITEM_FAMILY: &str = "tree_item"; diff --git a/src/database/schema/tag.rs b/src/database/schema/tag.rs index e57dfa9..b2db248 100644 --- a/src/database/schema/tag.rs +++ a/src/database/schema/tag.rs @@ -15,12 +15,17 @@ #[derive(Serialize, Archive, Debug, Yokeable)] pub struct Tag { pub tagger: Option, + pub tree_id: Option, } impl Tag { - pub fn new(tagger: Option>) -> Result { + pub fn new( + tagger: Option>, + tree_id: Option, + ) -> Result { Ok(Self { tagger: tagger.map(TryFrom::try_from).transpose()?, + tree_id, }) } @@ -30,7 +35,7 @@ } pub struct TagTree { - db: Arc, + pub db: Arc, prefix: RepositoryId, } diff --git a/src/database/schema/tree.rs b/src/database/schema/tree.rs new file mode 100644 index 0000000..2cb67c9 100644 --- /dev/null +++ a/src/database/schema/tree.rs @@ -1,0 +1,191 @@ +use anyhow::Context; +use gix::{bstr::BStr, ObjectId}; +use itertools::{Either, Itertools}; +use rkyv::{Archive, Serialize}; +use rocksdb::{WriteBatch, DB}; +use yoke::{Yoke, Yokeable}; + +use super::{ + prefixes::{TREE_FAMILY, TREE_ITEM_FAMILY}, + Yoked, +}; + +#[derive(Serialize, Archive, Debug, PartialEq, Eq, Hash)] +pub struct Tree { + pub indexed_tree_id: u64, +} + +impl Tree { + pub fn insert( + &self, + database: &DB, + batch: &mut WriteBatch, + tree_oid: ObjectId, + ) -> Result<(), anyhow::Error> { + let cf = database + .cf_handle(TREE_FAMILY) + .context("tree column family missing")?; + + batch.put_cf( + cf, + tree_oid.as_slice(), + rkyv::to_bytes::(self)?, + ); + + Ok(()) + } + + pub fn find(database: &DB, tree_oid: ObjectId) -> Result, anyhow::Error> { + let cf = database + .cf_handle(TREE_FAMILY) + .context("tree column family missing")?; + + let Some(data) = database.get_pinned_cf(cf, tree_oid.as_slice())? else { + return Ok(None); + }; + + let data = rkyv::access::<::Archived, rkyv::rancor::Error>(data.as_ref())?; + + Ok(Some(data.indexed_tree_id.to_native())) + } +} + +#[derive(Serialize, Archive, Debug, PartialEq, Eq, Hash)] +pub struct Submodule { + pub url: String, + pub oid: [u8; 20], +} + +#[derive(Serialize, Archive, Debug, PartialEq, Eq, Hash)] +pub enum TreeItemKind { + Submodule(Submodule), + Tree, + File, +} + +#[derive(Serialize, Archive, Debug, PartialEq, Eq, Hash, Yokeable)] +pub struct TreeItem { + pub mode: u16, + pub kind: TreeItemKind, +} + +pub type YokedTreeItem = Yoked<&'static ::Archived>; +pub type YokedTreeItemKey = Yoked<&'static [u8]>; +pub type YokedTreeItemKeyUtf8 = Yoked<&'static str>; + +impl TreeItem { + pub fn insert( + &self, + buffer: &mut Vec, + digest: u64, + path: &BStr, + database: &DB, + batch: &mut WriteBatch, + ) -> Result<(), anyhow::Error> { + let cf = database + .cf_handle(TREE_ITEM_FAMILY) + .context("tree column family missing")?; + + buffer.clear(); + buffer.reserve(std::mem::size_of::() + path.len() + std::mem::size_of::()); + buffer.extend_from_slice(&digest.to_ne_bytes()); + buffer.extend_from_slice(&memchr::memchr_iter(b'/', path).count().to_be_bytes()); + buffer.extend_from_slice(path.as_ref()); + + batch.put_cf(cf, &buffer, rkyv::to_bytes::(self)?); + + Ok(()) + } + + pub fn find_exact( + database: &DB, + digest: u64, + path: &[u8], + ) -> Result, anyhow::Error> { + let cf = database + .cf_handle(TREE_ITEM_FAMILY) + .expect("tree column family missing"); + + let mut buffer = Vec::with_capacity(std::mem::size_of::() + path.len()); + buffer.extend_from_slice(&digest.to_ne_bytes()); + buffer.extend_from_slice(&memchr::memchr_iter(b'/', path).count().to_be_bytes()); + buffer.extend_from_slice(path); + + database + .get_cf(cf, buffer)? + .map(|data| { + Yoke::try_attach_to_cart(data.into_boxed_slice(), |data| { + rkyv::access::<_, rkyv::rancor::Error>(data) + }) + }) + .transpose() + .context("failed to parse tree item") + } + + pub fn find_prefix<'a>( + database: &'a DB, + digest: u64, + prefix: &[u8], + ) -> impl Iterator> + use<'a> + { + let cf = database + .cf_handle(TREE_ITEM_FAMILY) + .expect("tree column family missing"); + + let (iterator, key) = if prefix.is_empty() { + let mut buffer = [0_u8; std::mem::size_of::() + std::mem::size_of::()]; + buffer[..std::mem::size_of::()].copy_from_slice(&digest.to_ne_bytes()); + buffer[std::mem::size_of::()..].copy_from_slice(&0_usize.to_be_bytes()); + + let iterator = database.prefix_iterator_cf(cf, buffer); + + (iterator, Either::Left(buffer)) + } else { + let mut buffer = Vec::with_capacity( + std::mem::size_of::() + prefix.len() + std::mem::size_of::(), + ); + buffer.extend_from_slice(&digest.to_ne_bytes()); + buffer + .extend_from_slice(&(memchr::memchr_iter(b'/', prefix).count() + 1).to_be_bytes()); + buffer.extend_from_slice(prefix); + buffer.push(b'/'); + + let iterator = database.prefix_iterator_cf(cf, &buffer); + + (iterator, Either::Right(buffer)) + }; + + iterator + .take_while(move |v| { + v.as_ref().is_ok_and(|(k, _)| { + k.starts_with(match key.as_ref() { + Either::Left(v) => v.as_ref(), + Either::Right(v) => v.as_ref(), + }) + }) + }) + .map_ok(|(key, value)| { + let key = Yoke::attach_to_cart(key, |data| { + &data[std::mem::size_of::() + std::mem::size_of::()..] + }); + let value = Yoke::try_attach_to_cart(value, |data| { + rkyv::access::<_, rkyv::rancor::Error>(data) + }) + .context("Failed to open repository")?; + Ok((key, value)) + }) + .flatten() + } + + pub fn contains(database: &DB, digest: u64) -> Result { + let cf = database + .cf_handle(TREE_ITEM_FAMILY) + .context("tree column family missing")?; + + Ok(database + .prefix_iterator_cf(cf, digest.to_ne_bytes()) + .next() + .transpose()? + .is_some()) + } +} diff --git a/src/methods/repo/mod.rs b/src/methods/repo/mod.rs index 34d8c9e..c108ba3 100644 --- a/src/methods/repo/mod.rs +++ a/src/methods/repo/mod.rs @@ -275,6 +275,12 @@ } } +impl From for anyhow::Error { + fn from(value: Error) -> Self { + value.0 + } +} + impl IntoResponse for Error { fn into_response(self) -> Response { (StatusCode::INTERNAL_SERVER_ERROR, format!("{:?}", self.0)).into_response() diff --git a/src/methods/repo/tree.rs b/src/methods/repo/tree.rs index 2a90019..67c1f56 100644 --- a/src/methods/repo/tree.rs +++ a/src/methods/repo/tree.rs @@ -1,5 +1,7 @@ +use anyhow::{bail, Context}; use askama::Template; use axum::{extract::Query, response::IntoResponse, Extension}; +use gix::ObjectId; use itertools::Itertools; use serde::Deserialize; use std::path::PathBuf; @@ -8,8 +10,11 @@ sync::Arc, }; +use crate::database::schema::tree::{ + ArchivedTreeItemKind, Tree, TreeItem, YokedTreeItem, YokedTreeItemKeyUtf8, +}; use crate::{ - git::{FileWithContent, PathDestination, TreeItem}, + git::FileWithContent, into_response, methods::{ filters, @@ -17,6 +22,8 @@ }, Git, ResponseEither, }; + +use super::log::get_branch_commits; #[derive(Deserialize)] pub struct UriQuery { @@ -49,7 +56,7 @@ #[allow(clippy::module_name_repetitions)] pub struct TreeView { pub repo: Repository, - pub items: Vec, + pub items: Vec<(YokedTreeItemKeyUtf8, usize, YokedTreeItem)>, pub query: UriQuery, pub repo_path: PathBuf, pub branch: Option>, @@ -62,6 +69,11 @@ pub repo_path: PathBuf, pub file: FileWithContent, pub branch: Option>, +} + +enum LookupResult { + RealPath, + Children(Vec<(YokedTreeItemKeyUtf8, usize, YokedTreeItem)>), } pub async fn handle( @@ -69,26 +81,77 @@ Extension(RepositoryPath(repository_path)): Extension, Extension(ChildPath(child_path)): Extension, Extension(git): Extension>, + Extension(db): Extension>, Query(query): Query, ) -> Result { - let open_repo = git.repo(repository_path, query.branch.clone()).await?; - - Ok( - match open_repo - .path(child_path.clone(), query.id.as_deref(), !query.raw) - .await? - { - PathDestination::Tree(items) => { - ResponseEither::Left(ResponseEither::Left(into_response(TreeView { - repo, - items, - branch: query.branch.clone(), - query, - repo_path: child_path.unwrap_or_default(), - }))) + // TODO: bit messy + let (repo, query, child_path, lookup_result) = tokio::task::spawn_blocking(move || { + let tree_id = if let Some(id) = query.id.as_deref() { + let hex = const_hex::decode_to_array(id).context("Failed to parse tree hash")?; + Tree::find(&db, ObjectId::Sha1(hex)) + .context("Failed to lookup tree")? + .context("Couldn't find tree with given id")? + } else { + let repository = crate::database::schema::repository::Repository::open(&db, &*repo)? + .context("Repository does not exist")?; + let commit = get_branch_commits(&repository, &db, query.branch.as_deref(), 1, 0)? + .into_iter() + .next() + .context("Branch not found")?; + commit.get().tree.to_native() + }; + + if let Some(path) = &child_path { + if let Some(item) = + TreeItem::find_exact(&db, tree_id, path.as_os_str().as_encoded_bytes())? + { + if let ArchivedTreeItemKind::File = item.get().kind { + return Ok((repo, query, child_path, LookupResult::RealPath)); + } } - PathDestination::File(file) if query.raw => ResponseEither::Right(file.content), - PathDestination::File(file) => { + } + + let path = child_path + .as_ref() + .map(|v| v.as_os_str().as_encoded_bytes()) + .unwrap_or_default(); + + let tree_items = TreeItem::find_prefix(&db, tree_id, path) + // don't take the current path the user is on + .filter_ok(|(k, _)| !k.get()[path.len()..].is_empty()) + // only take direct descendents + .filter_ok(|(k, _)| { + memchr::memrchr(b'/', &k.get()[path.len()..]).is_none_or(|v| v == 0) + }) + .map_ok(|(k, v)| { + ( + k.try_map_project(|v, _| simdutf8::basic::from_utf8(v)) + .expect("invalid utf8"), + path.len(), + v, + ) + }) + .try_collect::<_, Vec<_>, _>()?; + + if tree_items.is_empty() { + bail!("Path doesn't exist in tree"); + } + + Ok::<_, anyhow::Error>((repo, query, child_path, LookupResult::Children(tree_items))) + }) + .await + .context("Failed to join on task")??; + + Ok(match lookup_result { + LookupResult::RealPath => { + let open_repo = git.repo(repository_path, query.branch.clone()).await?; + let file = open_repo + .path(child_path.clone(), query.id.as_deref(), !query.raw) + .await?; + + if query.raw { + ResponseEither::Right(file.content) + } else { ResponseEither::Left(ResponseEither::Right(into_response(FileView { repo, file, @@ -96,6 +159,15 @@ repo_path: child_path.unwrap_or_default(), }))) } - }, - ) + } + LookupResult::Children(items) => { + ResponseEither::Left(ResponseEither::Left(into_response(TreeView { + repo, + items, + branch: query.branch.clone(), + query, + repo_path: child_path.unwrap_or_default(), + }))) + } + }) } -- rgit 0.1.5