🏡 index : ~doyle/chartered.git

author Jordan Doyle <jordan@doyle.la> 2021-08-29 22:22:56.0 +01:00:00
committer Jordan Doyle <jordan@doyle.la> 2021-08-29 22:22:56.0 +01:00:00
commit
f6046cf6372e0d8ab845f6dec1602c303a66ee91 [patch]
tree
9215cd13b120a3bb60b01b0916fd5cfdb9e1c464
parent
f09b9d182e2331820debbd7eb1636148137efa2c
download
f6046cf6372e0d8ab845f6dec1602c303a66ee91.tar.gz

initial packfile index generation



Diff

 Cargo.lock          |  23 +++++++++++++++++++++++
 Cargo.toml          |   4 +++-
 src/main.rs         |  32 +++++++++++++++++++++++++-------
 src/git/packfile.rs | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
 4 files changed, 172 insertions(+), 54 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 9723d11..d019dc2 100644
--- a/Cargo.lock
+++ a/Cargo.lock
@@ -190,6 +190,8 @@
 "async-trait",
 "axum",
 "bytes",
 "const-sha1",
 "crc",
 "env_logger",
 "flate2",
 "futures",
@@ -212,13 +214,34 @@
]

[[package]]
name = "const-sha1"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb58b6451e8c2a812ad979ed1d83378caa5e927eef2622017a45f251457c2c9d"

[[package]]
name = "cpufeatures"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469"
dependencies = [
 "libc",
]

[[package]]
name = "crc"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10c2722795460108a7872e1cd933a85d6ec38abc4baecad51028f702da28889f"
dependencies = [
 "crc-catalog",
]

[[package]]
name = "crc-catalog"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccaeedb56da03b09f598226e25e80088cb4cd25f316e6e4df7d695f0feeb1403"

[[package]]
name = "crc32fast"
diff --git a/Cargo.toml b/Cargo.toml
index 4c6566a..d85c62a 100644
--- a/Cargo.toml
+++ a/Cargo.toml
@@ -20,4 +20,6 @@
tokio-util = { version = "0.6", features = ["codec"] }
bytes = "1"
flate2 = "1.0"
sha-1 = "0.9"
sha-1 = "0.9"
const-sha1 = "0.2"
crc = "2"
diff --git a/src/main.rs b/src/main.rs
index 9929608..019fc61 100644
--- a/src/main.rs
+++ a/src/main.rs
@@ -1,11 +1,11 @@
pub mod git;

use crate::git::PktLine;

use git::codec::GitCodec;
use bytes::BytesMut;
use futures::future::Future;
use git::codec::Encoder;
use git::codec::GitCodec;
use std::{fmt::Write, pin::Pin, sync::Arc};
use thrussh::server::{Auth, Session};
use thrussh::*;
@@ -165,15 +165,27 @@
            }

            if fetch {
                git::packfile::PackFile {
                    entries: vec![
                        git::packfile::PackFileEntry {
                            entry_type: git::packfile::PackFileEntryType::Commit,
                            data: vec![0, 1, 2, 3, 4],
                            sha1: [0; 20],
                        }
                    ]
                }.encode_to(&mut self.output_bytes).unwrap();
                let packfile = git::packfile::PackFile::new(vec![git::packfile::PackFileEntry::new(
                    git::packfile::PackFileEntryType::Blob,
                    b"testing this is a test cool test",
                )?]);

                {
                    let mut buf = BytesMut::new();
                    let packfile_index = git::packfile::PackFileIndex {
                        packfile: &packfile,
                    };
                    packfile_index.encode_to(&mut buf)?;
                    self.write(PktLine::Data(buf.as_ref()))?;
                }

                {
                    let mut buf = BytesMut::new();
                    packfile.encode_to(&mut buf)?;
                    self.write(PktLine::Data(buf.as_ref()))?;
                }

                self.write(PktLine::Flush)?;

                self.flush(&mut session, channel);
            }
diff --git a/src/git/packfile.rs b/src/git/packfile.rs
index 6665bab..668a2ad 100644
--- a/src/git/packfile.rs
+++ a/src/git/packfile.rs
@@ -1,7 +1,7 @@
use bytes::{BufMut, BytesMut};
use const_sha1::{sha1, ConstBuffer};
use flate2::{write::ZlibEncoder, Compression};
use sha1::{Digest, Sha1};
use std::fmt::Write;
use std::convert::TryInto;
use std::io::Write as IoWrite;

// The offset/sha1[] tables are sorted by sha1[] values (this is to
@@ -9,24 +9,78 @@
// the offset/sha1[] table in a specific way (so that part of the
// latter table that covers all hashes that start with a given byte
// can be found to avoid 8 iterations of the binary search).
//
// packfile indexes are not neccesary to extract objects from a packfile
pub struct PackFileIndex<const S: usize> {
    pub fanout: [[u8; 4]; 255],
    pub size: u16,            // fanout[256] => size == S
    pub sha1: [[u8; 20]; S],  // sha listing
    pub crc: [[u8; 4]; S],    // checksum
    pub offset: [[u8; 4]; S], // packfile offsets
    // 64b_offset: [[u8; 8]; N], // for packfiles over 2gb
    pub packfile_checksum: [u8; 20], // sha1
    pub idxfiel_checksum: [u8; 20],  // sha1
pub struct PackFileIndex<'a> {
    pub packfile: &'a PackFile,
}

impl<const S: usize> PackFileIndex<S> {
    pub fn encode_to(self, buf: &mut BytesMut) -> Result<(), anyhow::Error> {
        buf.extend_from_slice(b"\xFFtOc"); // magic header
        buf.put_u8(2); // version
impl<'a> PackFileIndex<'a> {
    pub fn encode_to(self, original_buf: &mut BytesMut) -> Result<(), anyhow::Error> {
        use sha1::{Sha1, Digest};

        // split the buffer so we can hash only what we're currently generating at the
        // end of this function
        let mut buf = original_buf.split();

        buf.extend_from_slice(&[255u8, 116u8, 79u8, 99u8]); // magic header
        buf.put_u32(2); // version

        // calculate total `PackFileEntry` hashes beginning with the same first byte
        let mut totals_by_first_byte = [0u32; 256];
        for entry in &self.packfile.entries {
            totals_by_first_byte[entry.uncompressed_sha1[0] as usize] += 1;
        }

        // calculate fanout value by taking cumulative totals of first byte counts
        let mut cumulative = 0;
        for i in 0..256usize {
            cumulative += totals_by_first_byte[i];
            buf.put_u32(cumulative);
        }

        // write all the sha hashes out, this needs to be sorted by the hash which should've
        // been done by `PackFile::new()`
        for entry in &self.packfile.entries {
            buf.extend_from_slice(&entry.uncompressed_sha1);
        }

        for entry in &self.packfile.entries {
            buf.put_u32(entry.compressed_crc32);
        }

        let mut offset = PackFile::header_size();

        // encode offsets into the packfile
        for entry in &self.packfile.entries {
            offset += entry.compressed_data.len();

            let mut offset_be = offset.to_be();

            while offset_be != 0 {
                // read 7 LSBs from the `offset_be` and push them off for the next iteration
                let mut val = (offset_be & 0b1111111) as u8;
                offset_be >>= 7;

                if offset_be != 0 {
                    // MSB set to 1 implies there's more offset_be bytes to come, otherwise
                    // the data starts after this byte
                    val |= 1 << 7;
                }

                buf.put_u8(val);
            }
        }

        // push a copy of the hash that appears at the end of the packfile
        buf.extend_from_slice(&self.packfile.hash);

        // hash of the whole buffer we've just generated for the index
        let mut hasher = Sha1::new();
        hasher.update(&buf);
        let result = hasher.finalize();
        buf.extend_from_slice(result.as_ref());

        // put the buffer we've just generated back into the mutable buffer we were passed
        original_buf.unsplit(buf);

        Ok(())
    }
@@ -39,38 +93,50 @@
// packfile correctly. This is followed by a 4-byte packfile version
// number and then a 4-byte number of entries in that file.
pub struct PackFile {
    pub entries: Vec<PackFileEntry>,
    entries: Vec<PackFileEntry>,
    hash: [u8; 20],
}

impl PackFile {
    pub fn new(mut entries: Vec<PackFileEntry>) -> Self {
        entries.sort_unstable_by_key(|v| v.uncompressed_sha1[0]);
        let hash_buffer = entries.iter().fold(ConstBuffer::new(), |acc, curr| {
            acc.push_slice(&curr.uncompressed_sha1)
        });

        Self {
            entries,
            hash: sha1(&hash_buffer).bytes(),
        }
    }

    pub const fn header_size() -> usize {
        4 + std::mem::size_of::<u32>() + std::mem::size_of::<u32>()
    }

    pub fn encode_to(self, buf: &mut BytesMut) -> Result<(), anyhow::Error> {
        buf.extend_from_slice(b"PACK"); // magic header
        buf.extend_from_slice(b"0002"); // version
        write!(buf, "{:04x}", self.entries.len())?; // number of entries in the packfile
        buf.put_u32(2); // version
        buf.put_u32(self.entries.len().try_into().unwrap()); // number of entries in the packfile

        for entry in &self.entries {
            entry.encode_to(buf)?;
        }

        let mut hasher = Sha1::new();
        for entry in &self.entries {
            hasher.update(entry.sha1);
        }
        let hash = hasher.finalize();
        buf.extend_from_slice(&hash.as_slice());
        buf.extend_from_slice(&self.hash);

        Ok(())
    }
}

pub enum PackFileEntryType {
    // jordan@Jordans-MacBook-Pro-2 0d % printf "\x1f\x8b\x08\x00\x00\x00\x00\x00" | cat - f5/473259d9674ed66239766a013f96a3550374e3-test | gzip -dc
    // jordan@Jordans-MacBook-Pro-2 0d % printf "\x1f\x8b\x08\x00\x00\x00\x00\x00" | cat - f5/473259d9674ed66239766a013f96a3550374e3 | gzip -dc
    // commit 1068tree 0d586b48bc42e8591773d3d8a7223551c39d453c
    // parent c2a862612a14346ae95234f26efae1ee69b5b7a9
    // author Jordan Doyle <jordan@doyle.la> 1630244577 +0100
    // committer Jordan Doyle <jordan@doyle.la> 1630244577 +0100
    // gpgsig -----BEGIN PGP SIGNATURE-----
    // 
    //
    // iQIzBAABCAAdFiEEMn1zof7yzaURQBGDHqa65vZtxJoFAmErjuEACgkQHqa65vZt
    // xJqhvhAAieKXnGRjT926qzozcvarC8D3TlA+Z1wVXueTAWqfusNIP0zCun/crOb2
    // tOULO+/DXVBmwu5eInAf+t/wvlnIsrzJonhVr1ZT0f0vDX6fs2vflWg4UCVEuTsZ
@@ -85,7 +151,7 @@
    // bDqWN0Q2qcKX3k4ggtucmkkA6gP+K3+F5ANQj3AsGMQeddowC0Y=
    // =fXoH
    // -----END PGP SIGNATURE-----
    //  
    //
    // test
    Commit,
    // jordan@Jordans-MacBook-Pro-2 0d % printf "\x1f\x8b\x08\x00\x00\x00\x00\x00" | cat - 0d/586b48bc42e8591773d3d8a7223551c39d453c | gzip -dc
@@ -101,16 +167,34 @@
}

pub struct PackFileEntry {
    pub entry_type: PackFileEntryType,
    pub data: Vec<u8>,
    pub sha1: [u8; 20],
    entry_type: PackFileEntryType,
    compressed_data: Vec<u8>,
    compressed_crc32: u32,
    uncompressed_sha1: [u8; 20],
    uncompressed_size: usize,
}

impl PackFileEntry {
    fn size_of_data(&self) -> usize {
        self.data.len() as usize
    pub fn new(entry_type: PackFileEntryType, data: &[u8]) -> Result<Self, anyhow::Error> {
        let mut e = ZlibEncoder::new(Vec::new(), Compression::default());
        e.write_all(&data)?;
        let compressed_data = e.finish()?;

        let compressed_crc32 = crc::Crc::<u32>::new(&crc::CRC_32_CKSUM).checksum(&compressed_data);

        Ok(Self {
            entry_type,
            compressed_data,
            compressed_crc32,
            uncompressed_sha1: sha1(&ConstBuffer::new().push_slice(data)).bytes(),
            uncompressed_size: data.len(),
        })
    }

    fn size_of_data_be(&self) -> usize {
        self.uncompressed_size.to_be()
    }

    // The object header is a series of one or more 1 byte (8 bit) hunks
    // that specify the type of object the following data is, and the size
    // of the data when expanded. Each byte is really 7 bits of data, with
@@ -119,7 +203,7 @@
    // byte, otherwise the data starts next. The first 3 bits in the first
    // byte specifies the type of data, according to the table below.
    fn write_header(&self, buf: &mut BytesMut) {
        let mut size = self.size_of_data();
        let mut size = self.size_of_data_be();

        // write header
        {
@@ -134,7 +218,7 @@
                // PackFileEntryType::RefDelta => 0b111,
            } << 4;

            // pack the last 4 bits of the size into the header
            // pack the 4 LSBs of the size into the header
            val |= (size & 0b1111) as u8;
            size >>= 4;

@@ -143,13 +227,13 @@

        // write size bytes
        while size != 0 {
            // read 7 bits from the `size` and push them off for the next iteration
            // read 7 LSBs from the `size` and push them off for the next iteration
            let mut val = (size & 0b1111111) as u8;
            size >>= 7;

            if size != 0 {
                // first bit implies there's more size bytes to come, otherwise the
                // data starts after this byte
                // MSB set to 1 implies there's more size bytes to come, otherwise
                // the data starts after this byte
                val |= 1 << 7;
            }

@@ -159,10 +243,7 @@

    pub fn encode_to(&self, buf: &mut BytesMut) -> Result<(), anyhow::Error> {
        self.write_header(buf);

        let mut e = ZlibEncoder::new(buf.as_mut(), Compression::default());
        e.write_all(self.data.as_ref())?;
        e.finish()?;
        buf.extend_from_slice(&self.compressed_data);

        Ok(())
    }