From f6046cf6372e0d8ab845f6dec1602c303a66ee91 Mon Sep 17 00:00:00 2001 From: Jordan Doyle Date: Sun, 29 Aug 2021 22:22:56 +0100 Subject: [PATCH] initial packfile index generation --- Cargo.lock | 23 +++++++++++++++++++++++ Cargo.toml | 4 +++- src/main.rs | 32 +++++++++++++++++++++++++------- src/git/packfile.rs | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------- 4 files changed, 172 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9723d11..d019dc2 100644 --- a/Cargo.lock +++ a/Cargo.lock @@ -190,6 +190,8 @@ "async-trait", "axum", "bytes", + "const-sha1", + "crc", "env_logger", "flate2", "futures", @@ -212,13 +214,34 @@ ] [[package]] +name = "const-sha1" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb58b6451e8c2a812ad979ed1d83378caa5e927eef2622017a45f251457c2c9d" + +[[package]] name = "cpufeatures" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" dependencies = [ "libc", +] + +[[package]] +name = "crc" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10c2722795460108a7872e1cd933a85d6ec38abc4baecad51028f702da28889f" +dependencies = [ + "crc-catalog", ] + +[[package]] +name = "crc-catalog" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccaeedb56da03b09f598226e25e80088cb4cd25f316e6e4df7d695f0feeb1403" [[package]] name = "crc32fast" diff --git a/Cargo.toml b/Cargo.toml index 4c6566a..d85c62a 100644 --- a/Cargo.toml +++ a/Cargo.toml @@ -20,4 +20,6 @@ tokio-util = { version = "0.6", features = ["codec"] } bytes = "1" flate2 = "1.0" -sha-1 = "0.9"+sha-1 = "0.9" +const-sha1 = "0.2" +crc = "2" diff --git a/src/main.rs b/src/main.rs index 9929608..019fc61 100644 --- a/src/main.rs +++ a/src/main.rs @@ -1,11 +1,11 @@ pub mod git; use crate::git::PktLine; -use git::codec::GitCodec; use bytes::BytesMut; use futures::future::Future; use git::codec::Encoder; +use git::codec::GitCodec; use std::{fmt::Write, pin::Pin, sync::Arc}; use thrussh::server::{Auth, Session}; use thrussh::*; @@ -165,15 +165,27 @@ } if fetch { - git::packfile::PackFile { - entries: vec![ - git::packfile::PackFileEntry { - entry_type: git::packfile::PackFileEntryType::Commit, - data: vec![0, 1, 2, 3, 4], - sha1: [0; 20], - } - ] - }.encode_to(&mut self.output_bytes).unwrap(); + let packfile = git::packfile::PackFile::new(vec![git::packfile::PackFileEntry::new( + git::packfile::PackFileEntryType::Blob, + b"testing this is a test cool test", + )?]); + + { + let mut buf = BytesMut::new(); + let packfile_index = git::packfile::PackFileIndex { + packfile: &packfile, + }; + packfile_index.encode_to(&mut buf)?; + self.write(PktLine::Data(buf.as_ref()))?; + } + + { + let mut buf = BytesMut::new(); + packfile.encode_to(&mut buf)?; + self.write(PktLine::Data(buf.as_ref()))?; + } + + self.write(PktLine::Flush)?; self.flush(&mut session, channel); } diff --git a/src/git/packfile.rs b/src/git/packfile.rs index 6665bab..668a2ad 100644 --- a/src/git/packfile.rs +++ a/src/git/packfile.rs @@ -1,7 +1,7 @@ use bytes::{BufMut, BytesMut}; +use const_sha1::{sha1, ConstBuffer}; use flate2::{write::ZlibEncoder, Compression}; -use sha1::{Digest, Sha1}; -use std::fmt::Write; +use std::convert::TryInto; use std::io::Write as IoWrite; // The offset/sha1[] tables are sorted by sha1[] values (this is to @@ -9,24 +9,78 @@ // the offset/sha1[] table in a specific way (so that part of the // latter table that covers all hashes that start with a given byte // can be found to avoid 8 iterations of the binary search). -// -// packfile indexes are not neccesary to extract objects from a packfile -pub struct PackFileIndex { - pub fanout: [[u8; 4]; 255], - pub size: u16, // fanout[256] => size == S - pub sha1: [[u8; 20]; S], // sha listing - pub crc: [[u8; 4]; S], // checksum - pub offset: [[u8; 4]; S], // packfile offsets - // 64b_offset: [[u8; 8]; N], // for packfiles over 2gb - pub packfile_checksum: [u8; 20], // sha1 - pub idxfiel_checksum: [u8; 20], // sha1 +pub struct PackFileIndex<'a> { + pub packfile: &'a PackFile, } -impl PackFileIndex { - pub fn encode_to(self, buf: &mut BytesMut) -> Result<(), anyhow::Error> { - buf.extend_from_slice(b"\xFFtOc"); // magic header - buf.put_u8(2); // version +impl<'a> PackFileIndex<'a> { + pub fn encode_to(self, original_buf: &mut BytesMut) -> Result<(), anyhow::Error> { + use sha1::{Sha1, Digest}; + + // split the buffer so we can hash only what we're currently generating at the + // end of this function + let mut buf = original_buf.split(); + + buf.extend_from_slice(&[255u8, 116u8, 79u8, 99u8]); // magic header + buf.put_u32(2); // version + + // calculate total `PackFileEntry` hashes beginning with the same first byte + let mut totals_by_first_byte = [0u32; 256]; + for entry in &self.packfile.entries { + totals_by_first_byte[entry.uncompressed_sha1[0] as usize] += 1; + } + + // calculate fanout value by taking cumulative totals of first byte counts + let mut cumulative = 0; + for i in 0..256usize { + cumulative += totals_by_first_byte[i]; + buf.put_u32(cumulative); + } + + // write all the sha hashes out, this needs to be sorted by the hash which should've + // been done by `PackFile::new()` + for entry in &self.packfile.entries { + buf.extend_from_slice(&entry.uncompressed_sha1); + } + + for entry in &self.packfile.entries { + buf.put_u32(entry.compressed_crc32); + } + + let mut offset = PackFile::header_size(); + + // encode offsets into the packfile + for entry in &self.packfile.entries { + offset += entry.compressed_data.len(); + + let mut offset_be = offset.to_be(); + + while offset_be != 0 { + // read 7 LSBs from the `offset_be` and push them off for the next iteration + let mut val = (offset_be & 0b1111111) as u8; + offset_be >>= 7; + + if offset_be != 0 { + // MSB set to 1 implies there's more offset_be bytes to come, otherwise + // the data starts after this byte + val |= 1 << 7; + } + buf.put_u8(val); + } + } + + // push a copy of the hash that appears at the end of the packfile + buf.extend_from_slice(&self.packfile.hash); + + // hash of the whole buffer we've just generated for the index + let mut hasher = Sha1::new(); + hasher.update(&buf); + let result = hasher.finalize(); + buf.extend_from_slice(result.as_ref()); + + // put the buffer we've just generated back into the mutable buffer we were passed + original_buf.unsplit(buf); Ok(()) } @@ -39,38 +93,50 @@ // packfile correctly. This is followed by a 4-byte packfile version // number and then a 4-byte number of entries in that file. pub struct PackFile { - pub entries: Vec, + entries: Vec, + hash: [u8; 20], } impl PackFile { + pub fn new(mut entries: Vec) -> Self { + entries.sort_unstable_by_key(|v| v.uncompressed_sha1[0]); + let hash_buffer = entries.iter().fold(ConstBuffer::new(), |acc, curr| { + acc.push_slice(&curr.uncompressed_sha1) + }); + + Self { + entries, + hash: sha1(&hash_buffer).bytes(), + } + } + + pub const fn header_size() -> usize { + 4 + std::mem::size_of::() + std::mem::size_of::() + } + pub fn encode_to(self, buf: &mut BytesMut) -> Result<(), anyhow::Error> { buf.extend_from_slice(b"PACK"); // magic header - buf.extend_from_slice(b"0002"); // version - write!(buf, "{:04x}", self.entries.len())?; // number of entries in the packfile + buf.put_u32(2); // version + buf.put_u32(self.entries.len().try_into().unwrap()); // number of entries in the packfile for entry in &self.entries { entry.encode_to(buf)?; } - let mut hasher = Sha1::new(); - for entry in &self.entries { - hasher.update(entry.sha1); - } - let hash = hasher.finalize(); - buf.extend_from_slice(&hash.as_slice()); + buf.extend_from_slice(&self.hash); Ok(()) } } pub enum PackFileEntryType { - // jordan@Jordans-MacBook-Pro-2 0d % printf "\x1f\x8b\x08\x00\x00\x00\x00\x00" | cat - f5/473259d9674ed66239766a013f96a3550374e3-test | gzip -dc + // jordan@Jordans-MacBook-Pro-2 0d % printf "\x1f\x8b\x08\x00\x00\x00\x00\x00" | cat - f5/473259d9674ed66239766a013f96a3550374e3 | gzip -dc // commit 1068tree 0d586b48bc42e8591773d3d8a7223551c39d453c // parent c2a862612a14346ae95234f26efae1ee69b5b7a9 // author Jordan Doyle 1630244577 +0100 // committer Jordan Doyle 1630244577 +0100 // gpgsig -----BEGIN PGP SIGNATURE----- - // + // // iQIzBAABCAAdFiEEMn1zof7yzaURQBGDHqa65vZtxJoFAmErjuEACgkQHqa65vZt // xJqhvhAAieKXnGRjT926qzozcvarC8D3TlA+Z1wVXueTAWqfusNIP0zCun/crOb2 // tOULO+/DXVBmwu5eInAf+t/wvlnIsrzJonhVr1ZT0f0vDX6fs2vflWg4UCVEuTsZ @@ -85,7 +151,7 @@ // bDqWN0Q2qcKX3k4ggtucmkkA6gP+K3+F5ANQj3AsGMQeddowC0Y= // =fXoH // -----END PGP SIGNATURE----- - // + // // test Commit, // jordan@Jordans-MacBook-Pro-2 0d % printf "\x1f\x8b\x08\x00\x00\x00\x00\x00" | cat - 0d/586b48bc42e8591773d3d8a7223551c39d453c | gzip -dc @@ -101,16 +167,34 @@ } pub struct PackFileEntry { - pub entry_type: PackFileEntryType, - pub data: Vec, - pub sha1: [u8; 20], + entry_type: PackFileEntryType, + compressed_data: Vec, + compressed_crc32: u32, + uncompressed_sha1: [u8; 20], + uncompressed_size: usize, } impl PackFileEntry { - fn size_of_data(&self) -> usize { - self.data.len() as usize + pub fn new(entry_type: PackFileEntryType, data: &[u8]) -> Result { + let mut e = ZlibEncoder::new(Vec::new(), Compression::default()); + e.write_all(&data)?; + let compressed_data = e.finish()?; + + let compressed_crc32 = crc::Crc::::new(&crc::CRC_32_CKSUM).checksum(&compressed_data); + + Ok(Self { + entry_type, + compressed_data, + compressed_crc32, + uncompressed_sha1: sha1(&ConstBuffer::new().push_slice(data)).bytes(), + uncompressed_size: data.len(), + }) } + fn size_of_data_be(&self) -> usize { + self.uncompressed_size.to_be() + } + // The object header is a series of one or more 1 byte (8 bit) hunks // that specify the type of object the following data is, and the size // of the data when expanded. Each byte is really 7 bits of data, with @@ -119,7 +203,7 @@ // byte, otherwise the data starts next. The first 3 bits in the first // byte specifies the type of data, according to the table below. fn write_header(&self, buf: &mut BytesMut) { - let mut size = self.size_of_data(); + let mut size = self.size_of_data_be(); // write header { @@ -134,7 +218,7 @@ // PackFileEntryType::RefDelta => 0b111, } << 4; - // pack the last 4 bits of the size into the header + // pack the 4 LSBs of the size into the header val |= (size & 0b1111) as u8; size >>= 4; @@ -143,13 +227,13 @@ // write size bytes while size != 0 { - // read 7 bits from the `size` and push them off for the next iteration + // read 7 LSBs from the `size` and push them off for the next iteration let mut val = (size & 0b1111111) as u8; size >>= 7; if size != 0 { - // first bit implies there's more size bytes to come, otherwise the - // data starts after this byte + // MSB set to 1 implies there's more size bytes to come, otherwise + // the data starts after this byte val |= 1 << 7; } @@ -159,10 +243,7 @@ pub fn encode_to(&self, buf: &mut BytesMut) -> Result<(), anyhow::Error> { self.write_header(buf); - - let mut e = ZlibEncoder::new(buf.as_mut(), Compression::default()); - e.write_all(self.data.as_ref())?; - e.finish()?; + buf.extend_from_slice(&self.compressed_data); Ok(()) } -- rgit 0.1.3