🏡 index : ~doyle/serde_bson.git

author Jordan Doyle <jordan@doyle.la> 2021-07-23 2:50:01.0 +01:00:00
committer Jordan Doyle <jordan@doyle.la> 2021-07-23 3:04:37.0 +01:00:00
commit
fd15ebb44b65ce7a8aa81250edbef294c79e3592 [patch]
tree
4b9206230fde3eb97b84a75962b32bbe311eacfb
download
fd15ebb44b65ce7a8aa81250edbef294c79e3592.tar.gz

initial commit



Diff

 .gitignore          |   3 +++
 Cargo.toml          |  30 ++++++++++++++++++++++++++++++
 LICENSE             |  12 ++++++++++++
 README.md           |  23 +++++++++++++++++++++++
 benches/borrowed.rs |  58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 benches/owned.rs    |  60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/error.rs        |  31 +++++++++++++++++++++++++++++++
 src/lib.rs          |  71 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/ser.rs          | 256 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 9 files changed, 544 insertions(+)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..77147e2 100644
--- /dev/null
+++ a/.gitignore
@@ -1,0 +1,3 @@
/target
Cargo.lock
.idea/
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..f43bd78 100644
--- /dev/null
+++ a/Cargo.toml
@@ -1,0 +1,30 @@
[package]
name = "serde_bson"
authors = ["Jordan D. <jordan@doyle.la>"]
description = "Fast bson serde implementation"
repository = "https://github.com/w4/serde_bson"
version = "0.0.1"
edition = "2018"
license = "0BSD"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
serde = "1"
bytes = "1"
take_mut = "0.2"

[dev-dependencies]
serde = { version = "1", features = ["derive"] }
serde_bytes = "0.11"
bson = "1.2"
criterion = "0.3"
rand = "0.8"

[[bench]]
name = "borrowed"
harness = false

[[bench]]
name = "owned"
harness = false
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..e9d84ae 100644
--- /dev/null
+++ a/LICENSE
@@ -1,0 +1,12 @@
Copyright (C) 2006 by Rob Landley <rob@landley.net>

Permission to use, copy, modify, and/or distribute this software for any purpose
with or without fee is hereby granted.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..1016dda 100644
--- /dev/null
+++ a/README.md
@@ -1,0 +1,23 @@
## serde_bson

Originally implemented as a workaround for the `bson` crate cloning every value it
comes across, this crate appears to show a significant improvement across the board
for serialisation (~80% improvement).

```

mongodb's bson  time:   [1.1160 us 1.1171 us 1.1183 us]
Found 2 outliers among 100 measurements (2.00%)
  2 (2.00%) high mild

serde_bson      time:   [201.99 ns 202.17 ns 202.38 ns]                                 
Found 10 outliers among 100 measurements (10.00%)
  4 (4.00%) low mild
  4 (4.00%) high mild
  2 (2.00%) high severe
```


There are a few pieces missing, such as arrays and nested documents, but they're not
too difficult to add; it's just that it's 2:38am and I've smashed this out in an
hour.

Pull requests welcome as always.
diff --git a/benches/borrowed.rs b/benches/borrowed.rs
new file mode 100644
index 0000000..44f382a 100644
--- /dev/null
+++ a/benches/borrowed.rs
@@ -1,0 +1,58 @@
use bytes::BufMut;
use criterion::{black_box, criterion_group, criterion_main, Criterion};

/// Benchmark payload made entirely of borrowed data: three string slices, an
/// integer, a float and a byte slice.
#[derive(serde::Serialize)]
pub struct A<'a> {
    a: &'a str,
    b: &'a str,
    c: &'a str,
    d: i64,
    e: f64,
    // `serde_bytes` makes serde hand the slice over via `serialize_bytes`
    // rather than as a sequence of individual `u8`s.
    #[serde(with = "serde_bytes")]
    f: &'a [u8],
}

/// Benchmarks serialising one borrowed `A` value with mongodb's `bson` crate
/// and with `serde_bson`, each into a buffer that is reused between iterations.
fn benchmark(c: &mut Criterion) {
    let sample = A {
        a: "Now this is a story all about how
            My life got flipped turned upside down
            And I'd like to take a minute, just sit right there
            I'll tell you how I became the prince of a town called Bel-Air",
        b: "In West Philadelphia born and raised
            On the playground is where I spent most of my days
            Chillin' out, maxin', relaxin' all cool
            And all shootin' some b-ball outside of the school
            When a couple of guys who were up to no good
            Started makin' trouble in my neighborhood",
        c: "I got in one little fight and my mom got scared
            And said 'You're movin' with your auntie and uncle in Bel-Air'",
        d: 420,
        e: 420.69696969696969,
        f: b"Above are some popular 'pop culture' references for your perusal and enjoyment",
    };

    // Reference implementation: build a `Document`, then write it out.
    c.bench_function("borrowed: mongodb's bson", |bencher| {
        let mut buffer = Vec::new();

        bencher.iter(|| {
            bson::ser::to_document(black_box(&sample))
                .unwrap()
                .to_writer(&mut buffer)
                .unwrap();
            // empty the buffer so every iteration starts from the same state
            buffer.clear();
        })
    });

    // This crate: serialise straight into the `BytesMut`.
    c.bench_function("borrowed: serde_bson", |bencher| {
        let mut buffer = bytes::BytesMut::new();

        bencher.iter(|| {
            serde_bson::to_string(black_box(&sample), &mut buffer).unwrap();
            // split off and discard what was just written, leaving `buffer` empty
            drop(buffer.split());
        });
    });
}

// Register `benchmark` in the `benches` group and hand that group to criterion's
// generated entry point (the bench target is declared with `harness = false`).
criterion_group!(benches, benchmark);
criterion_main!(benches);
diff --git a/benches/owned.rs b/benches/owned.rs
new file mode 100644
index 0000000..773567e 100644
--- /dev/null
+++ a/benches/owned.rs
@@ -1,0 +1,60 @@
use bytes::BufMut;
use criterion::{black_box, criterion_group, criterion_main, Criterion};

/// Benchmark payload made entirely of owned data: three `String`s, an integer,
/// a float and a byte vector.
#[derive(serde::Serialize)]
pub struct A {
    a: String,
    b: String,
    c: String,
    d: i64,
    e: f64,
    // `serde_bytes` makes serde hand the buffer over via `serialize_bytes`
    // rather than as a sequence of individual `u8`s.
    #[serde(with = "serde_bytes")]
    f: Vec<u8>,
}

/// Benchmarks serialising one owned `A` value with mongodb's `bson` crate and
/// with `serde_bson`, each into a buffer that is reused between iterations.
fn benchmark(c: &mut Criterion) {
    let sample = A {
        a: "Now this is a story all about how
            My life got flipped turned upside down
            And I'd like to take a minute, just sit right there
            I'll tell you how I became the prince of a town called Bel-Air"
            .to_owned(),
        b: "In West Philadelphia born and raised
            On the playground is where I spent most of my days
            Chillin' out, maxin', relaxin' all cool
            And all shootin' some b-ball outside of the school
            When a couple of guys who were up to no good
            Started makin' trouble in my neighborhood"
            .to_owned(),
        c: "I got in one little fight and my mom got scared
            And said 'You're movin' with your auntie and uncle in Bel-Air'"
            .to_owned(),
        d: 420,
        e: 420.69696969696969,
        f: "Above are some popular 'pop culture' references for your perusal and enjoyment".into(),
    };

    // Reference implementation: build a `Document`, then write it out.
    c.bench_function("owned: mongodb's bson", |bencher| {
        let mut buffer = Vec::new();

        bencher.iter(|| {
            bson::ser::to_document(black_box(&sample))
                .unwrap()
                .to_writer(&mut buffer)
                .unwrap();
            // empty the buffer so every iteration starts from the same state
            buffer.clear();
        })
    });

    // This crate: serialise straight into the `BytesMut`.
    c.bench_function("owned: serde_bson", |bencher| {
        let mut buffer = bytes::BytesMut::new();

        bencher.iter(|| {
            serde_bson::to_string(black_box(&sample), &mut buffer).unwrap();
            // split off and discard what was just written, leaving `buffer` empty
            drop(buffer.split());
        });
    });
}

// Register `benchmark` in the `benches` group and hand that group to criterion's
// generated entry point (the bench target is declared with `harness = false`).
criterion_group!(benches, benchmark);
criterion_main!(benches);
diff --git a/src/error.rs b/src/error.rs
new file mode 100644
index 0000000..8bf71b9 100644
--- /dev/null
+++ a/src/error.rs
@@ -1,0 +1,31 @@
use std::fmt::{Display, Formatter};

/// Errors that can be produced while serialising a value to BSON.
#[derive(Debug)]
pub enum Error {
    /// A value was serialised without a field key, i.e. outside of a struct.
    NotSerializingStruct,
    /// A custom error message raised through `serde::ser::Error::custom`.
    Serde(String),
    /// The value has a type that the serialiser rejects as not representable
    /// in BSON (reported for unsigned integers).
    UnsignedIntNotInSpec,
}

/// Human-readable descriptions for each [`Error`] variant.
impl Display for Error {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Only `Serde` carries data; the other variants map to fixed messages.
        let message = match self {
            Self::Serde(context) => {
                return write!(f, "error from value serialiser: {}", context);
            }
            Self::NotSerializingStruct => {
                "individual values cannot be serialised, try serialising a struct instead"
            }
            Self::UnsignedIntNotInSpec => "unsigned ints are not supported in the bson spec",
        };

        f.write_str(message)
    }
}

// No overrides needed: the trait's default methods suffice, and `Debug` + `Display`
// are provided above.
impl std::error::Error for Error {}

/// Lets `Serialize` implementations report their own failures through this
/// crate's error type.
impl serde::ser::Error for Error {
    /// Wraps the rendered message in [`Error::Serde`].
    fn custom<T>(msg: T) -> Self
    where
        T: Display,
    {
        Self::Serde(msg.to_string())
    }
}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..f42503f 100644
--- /dev/null
+++ a/src/lib.rs
@@ -1,0 +1,71 @@
mod error;
pub mod ser;

pub use error::Error;

use bytes::{BufMut, BytesMut};
use serde::Serialize;

pub fn to_string<T: Serialize>(val: &T, output: &mut BytesMut) -> Result<(), Error> {
    const SIZE_OF_SIZE: usize = std::mem::size_of::<i32>();

    // essentially reserves a i32 we can prepend back onto the BytesMut later
    // at the cost of an atomic increment
    output.put_i32(0);
    let mut size = output.split_to(SIZE_OF_SIZE);

    val.serialize(ser::Serializer { key: None, output })?;

    // writes the total length of the output to the i32 we split off before
    for (i, byte) in ((output.len() + SIZE_OF_SIZE) as i32)
        .to_le_bytes()
        .iter()
        .enumerate()
    {
        size[i] = *byte;
    }

    // this is safe because `unsplit` can't panic
    take_mut::take(output, move |output| {
        // O(1) prepend since `size` originally came from `output`.
        size.unsplit(output);
        size
    });

    Ok(())
}

#[cfg(test)]
mod test {
    use super::to_string;
    use bytes::{BufMut, BytesMut};
    use serde::Serialize;

    /// Serialises the same struct with this crate and with mongodb's `bson`
    /// crate and checks that both produce byte-identical output.
    #[test]
    pub fn test_basic() {
        #[derive(Serialize)]
        pub struct A<'a> {
            cool: i32,
            #[serde(with = "serde_bytes")]
            beans: &'a [u8],
            bro: &'a str,
        }

        let test = &A {
            cool: 999,
            beans: "so there was this one time at bandcamp".as_bytes(),
            bro: "the craziest thing happened",
        };

        let mut ours = BytesMut::new();
        // fail loudly here instead of comparing a half-written buffer below
        to_string(&test, &mut ours).unwrap();

        let mut theirs = BytesMut::new().writer();
        bson::ser::to_document(&test)
            .unwrap()
            .to_writer(&mut theirs)
            .unwrap();

        assert_eq!(ours, theirs.into_inner());
    }
}
diff --git a/src/ser.rs b/src/ser.rs
new file mode 100644
index 0000000..8fdb2e0 100644
--- /dev/null
+++ a/src/ser.rs
@@ -1,0 +1,256 @@
use crate::Error;
use bytes::BufMut;
use serde::Serialize;
use std::convert::TryFrom;

/// Serialiser for a single value, writing its BSON encoding into `output`.
pub struct Serializer<'a, B: BufMut> {
    /// Name of the field being written, emitted ahead of the value. `None` when
    /// the serialiser is not positioned at a struct field (e.g. at the top level).
    pub key: Option<&'static str>,
    /// Buffer the encoded bytes are appended to.
    pub output: &'a mut B,
}

/// Writes an element header to `$output`: the type id `$id`, the bytes of the
/// key and a terminating NUL byte.
///
/// When `$key` is `None` nothing is written and the *enclosing function*
/// returns `Err(Error::NotSerializingStruct)`.
macro_rules! write_key_or_error {
    ($id:literal, $key:expr, $output:expr) => {
        match $key {
            Some(key) => {
                $output.put_u8($id);
                $output.put_slice(key.as_bytes());
                $output.put_u8(0x00);
            }
            None => return Err(Error::NotSerializingStruct),
        }
    };
}

/// Encodes a single value as a BSON element (`type id`, key, NUL, payload).
///
/// Every scalar needs a key, so serialising a bare value (with `key == None`)
/// fails with [`Error::NotSerializingStruct`]. A top-level struct is the only
/// value accepted without a key; its fields are written by [`StructSerializer`].
impl<'a, B: BufMut> serde::Serializer for Serializer<'a, B> {
    type Ok = ();
    type Error = Error;

    // Only plain structs are supported so far; every other compound type is
    // declared as `Impossible` and its entry point below is still a `todo!`.
    type SerializeSeq = serde::ser::Impossible<Self::Ok, Self::Error>;
    type SerializeTuple = serde::ser::Impossible<Self::Ok, Self::Error>;
    type SerializeTupleStruct = serde::ser::Impossible<Self::Ok, Self::Error>;
    type SerializeTupleVariant = serde::ser::Impossible<Self::Ok, Self::Error>;
    type SerializeMap = serde::ser::Impossible<Self::Ok, Self::Error>;
    type SerializeStruct = StructSerializer<'a, B>;
    type SerializeStructVariant = serde::ser::Impossible<Self::Ok, Self::Error>;

    /// Writes a BSON boolean (element type `0x08`): one byte, `0x00` or `0x01`.
    fn serialize_bool(self, v: bool) -> Result<Self::Ok, Self::Error> {
        write_key_or_error!(0x08, self.key, self.output);
        self.output.put_u8(v as u8);
        Ok(())
    }

    /// Widened to a 32-bit integer, the smallest integer type BSON offers.
    fn serialize_i8(self, v: i8) -> Result<Self::Ok, Self::Error> {
        self.serialize_i32(v as i32)
    }

    /// Widened to a 32-bit integer, the smallest integer type BSON offers.
    fn serialize_i16(self, v: i16) -> Result<Self::Ok, Self::Error> {
        self.serialize_i32(v as i32)
    }

    /// Writes a BSON 32-bit integer (element type `0x10`), little-endian.
    fn serialize_i32(self, v: i32) -> Result<Self::Ok, Self::Error> {
        write_key_or_error!(0x10, self.key, self.output);
        self.output.put_i32_le(v);
        Ok(())
    }

    /// Writes a BSON 64-bit integer (element type `0x12`), little-endian.
    fn serialize_i64(self, v: i64) -> Result<Self::Ok, Self::Error> {
        write_key_or_error!(0x12, self.key, self.output);
        self.output.put_i64_le(v);
        Ok(())
    }

    /// Always fails: unsigned integers are rejected.
    fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
        Err(Error::UnsignedIntNotInSpec)
    }

    /// Always fails: unsigned integers are rejected.
    fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
        Err(Error::UnsignedIntNotInSpec)
    }

    /// Always fails: unsigned integers are rejected.
    fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
        Err(Error::UnsignedIntNotInSpec)
    }

    /// Always fails: unsigned integers are rejected.
    fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
        Err(Error::UnsignedIntNotInSpec)
    }

    /// Widened to a 64-bit float, the only binary float type written here.
    fn serialize_f32(self, v: f32) -> Result<Self::Ok, Self::Error> {
        self.serialize_f64(v as f64)
    }

    /// Writes a BSON double (element type `0x01`), little-endian.
    fn serialize_f64(self, v: f64) -> Result<Self::Ok, Self::Error> {
        write_key_or_error!(0x01, self.key, self.output);
        self.output.put_f64_le(v);
        Ok(())
    }

    /// Writes the character as a one-character BSON string.
    fn serialize_char(self, v: char) -> Result<Self::Ok, Self::Error> {
        // a `char` never needs more than four bytes in UTF-8
        let mut utf8 = [0u8; 4];
        self.serialize_str(v.encode_utf8(&mut utf8))
    }

    /// Writes a BSON string (element type `0x02`): `i32` byte length (including
    /// the trailing NUL), the UTF-8 bytes, then a NUL terminator.
    ///
    /// # Panics
    ///
    /// Panics if the string plus its terminator does not fit in an `i32`.
    fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
        write_key_or_error!(0x02, self.key, self.output);

        let v = v.as_bytes();
        let len = i32::try_from(v.len() + 1) // `+ 1` for the null byte at the end of the str
            .unwrap_or_else(|_| panic!(
                "encoded string exceeds max size: {}",
                i32::MAX - 1
            ));

        self.output.put_i32_le(len);
        self.output.put_slice(v);
        self.output.put_u8(0x00);

        Ok(())
    }

    /// Writes BSON binary data (element type `0x05`): `i32` byte length, a
    /// subtype byte (always `0x00` here), then the raw bytes.
    ///
    /// # Panics
    ///
    /// Panics if the slice length does not fit in an `i32`.
    fn serialize_bytes(self, v: &[u8]) -> Result<Self::Ok, Self::Error> {
        write_key_or_error!(0x05, self.key, self.output);

        // we don't need the + 1 here since there's no null terminator
        let len = i32::try_from(v.len())
            .unwrap_or_else(|_| panic!("bytes exceeds max size: {}", i32::MAX));

        self.output.put_i32_le(len);
        self.output.put_u8(0x00); // subtype, we'll just assume 0x00
        self.output.put_slice(v);

        Ok(())
    }

    /// Writes a BSON null (element type `0x0A`), which has no payload.
    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
        write_key_or_error!(0x0A, self.key, self.output);
        Ok(())
    }

    /// `Some(value)` is transparent: the inner value is written under the same key.
    fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
    where
        T: Serialize,
    {
        value.serialize(self)
    }

    /// `()` is written the same way as `None`.
    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
        self.serialize_none()
    }

    /// Not implemented yet; panics when called.
    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
        unimplemented!("unit struct")
    }

    /// Not implemented yet; panics when called.
    fn serialize_unit_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
    ) -> Result<Self::Ok, Self::Error> {
        unimplemented!("unit variant")
    }

    /// Newtype structs are transparent: the wrapped value is written under the
    /// same key.
    fn serialize_newtype_struct<T: ?Sized>(
        self,
        _name: &'static str,
        value: &T,
    ) -> Result<Self::Ok, Self::Error>
    where
        T: Serialize,
    {
        value.serialize(self)
    }

    /// Not implemented yet; panics when called.
    fn serialize_newtype_variant<T: ?Sized>(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _value: &T,
    ) -> Result<Self::Ok, Self::Error>
    where
        T: Serialize,
    {
        todo!("newtype variant")
    }

    /// Not implemented yet; panics when called.
    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
        todo!("seq")
    }

    /// Not implemented yet; panics when called.
    fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
        todo!("tuple")
    }

    /// Not implemented yet; panics when called.
    fn serialize_tuple_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
        todo!("tuple struct")
    }

    /// Not implemented yet; panics when called.
    fn serialize_tuple_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
        todo!("tuple variant")
    }

    /// Not implemented yet; panics when called.
    fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
        todo!("map")
    }

    /// Starts a struct. Only a key-less (top-level) struct is supported; a
    /// struct appearing as a field of another struct is not implemented yet
    /// and panics.
    fn serialize_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStruct, Self::Error> {
        if self.key.is_some() {
            todo!("nested struct: {:?}", self.key);
        }

        Ok(StructSerializer {
            output: self.output,
        })
    }

    /// Not implemented yet; panics when called.
    fn serialize_struct_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStructVariant, Self::Error> {
        todo!("struct variant")
    }
}

/// Serialiser for the fields of a struct; each field is written as one keyed
/// element into `output`.
pub struct StructSerializer<'a, B: BufMut> {
    /// Buffer the encoded fields are appended to.
    output: &'a mut B,
}

/// Writes struct fields one after another, then the document terminator.
impl<'a, B: BufMut> serde::ser::SerializeStruct for StructSerializer<'a, B> {
    type Ok = ();
    type Error = <Serializer<'a, B> as serde::Serializer>::Error;

    /// Serialises `value` as an element named `key`.
    fn serialize_field<T: ?Sized>(
        &mut self,
        key: &'static str,
        value: &T,
    ) -> Result<(), Self::Error>
    where
        T: Serialize,
    {
        value.serialize(Serializer {
            key: Some(key),
            // Reborrow the buffer rather than borrowing the reference itself:
            // this keeps the field serialiser at `Serializer<B>` instead of
            // `Serializer<&mut B>`, avoiding a second level of indirection.
            output: &mut *self.output,
        })
    }

    /// Finishes the struct by writing the trailing `0x00` byte.
    fn end(self) -> Result<Self::Ok, Self::Error> {
        self.output.put_u8(0x00); // doc terminator
        Ok(())
    }
}