blob: 64d363beb28bf8cfd56a5c9be9057163bd3164df [file] [log] [blame] [edit]
///
#[allow(clippy::empty_docs)]
pub mod decode {
/// The error returned by [`decode()`][super::decode()].
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
#[error("{}", message)]
Corrupt { message: &'static str },
}
}
/// Decode `data` as EWAH bitmap.
pub fn decode(data: &[u8]) -> Result<(Vec, &[u8]), decode::Error> {
use self::decode::Error;
use crate::decode;
let (num_bits, data) = decode::u32(data).ok_or(Error::Corrupt {
message: "eof reading amount of bits",
})?;
let (len, data) = decode::u32(data).ok_or(Error::Corrupt {
message: "eof reading chunk length",
})?;
let len = len as usize;
// NOTE: git does this by copying all bytes first, and then it will change the endianness in a separate loop.
// Maybe it's faster, but we can't do it without unsafe. Let's leave it to the optimizer and maybe
// one day somebody will find out that it's worth it to use unsafe here.
let (mut bits, data) = decode::split_at_pos(data, len * std::mem::size_of::<u64>()).ok_or(Error::Corrupt {
message: "eof while reading bit data",
})?;
let mut buf = std::vec::Vec::<u64>::with_capacity(len);
for _ in 0..len {
let (bit_num, rest) = bits.split_at(std::mem::size_of::<u64>());
bits = rest;
buf.push(u64::from_be_bytes(bit_num.try_into().unwrap()))
}
let (rlw, data) = decode::u32(data).ok_or(Error::Corrupt {
message: "eof while reading run length width",
})?;
Ok((
Vec {
num_bits,
bits: buf,
rlw: rlw.into(),
},
data,
))
}
mod access {
use super::Vec;
impl Vec {
/// Call `f(index)` for each bit that is true, given the index of the bit that identifies it uniquely within the bit array.
/// If `f` returns `None` the iteration will be stopped and `None` is returned.
///
/// The index is sequential like in any other vector.
pub fn for_each_set_bit(&self, mut f: impl FnMut(usize) -> Option<()>) -> Option<()> {
let mut index = 0usize;
let mut iter = self.bits.iter();
while let Some(word) = iter.next() {
if rlw_runbit_is_set(word) {
let len = rlw_running_len_bits(word);
for _ in 0..len {
f(index)?;
index += 1;
}
} else {
index += usize::try_from(rlw_running_len_bits(word)).ok()?;
}
for _ in 0..rlw_literal_words(word) {
let word = iter
.next()
.expect("BUG: ran out of words while going through uncompressed portion");
for bit_index in 0..64 {
if word & (1 << bit_index) != 0 {
f(index)?;
}
index += 1;
}
}
}
Some(())
}
/// The amount of bits we are currently holding.
pub fn num_bits(&self) -> usize {
self.num_bits.try_into().expect("we are not on 16 bit systems")
}
}
#[inline]
fn rlw_running_len_bits(w: &u64) -> u64 {
rlw_running_len(w) * 64
}
#[inline]
fn rlw_running_len(w: &u64) -> u64 {
(w >> 1) & RLW_LARGEST_RUNNING_COUNT
}
#[inline]
fn rlw_literal_words(w: &u64) -> u64 {
w >> (1 + RLW_RUNNING_BITS)
}
#[inline]
fn rlw_runbit_is_set(w: &u64) -> bool {
w & 1 == 1
}
const RLW_RUNNING_BITS: u64 = 4 * 8;
const RLW_LARGEST_RUNNING_COUNT: u64 = (1 << RLW_RUNNING_BITS) - 1;
}
/// A growable collection of u64 that are seen as stream of individual bits.
#[allow(dead_code)]
#[derive(Clone)]
pub struct Vec {
num_bits: u32,
bits: std::vec::Vec<u64>,
/// RLW is an offset into the `bits` buffer, so `1` translates into &bits\[1] essentially.
rlw: u64,
}