vendor/decodeme-12.0.0/src/stringtable.rs - toolchain/rustc - Git at Google

 //! See module-level documentation `measureme::stringtable`.

 use measureme::stringtable::{METADATA_STRING_ID, TERMINATOR};
 use measureme::{
     file_header::{
         strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA,
         FILE_MAGIC_STRINGTABLE_INDEX,
     },
     stringtable::STRING_REF_ENCODED_SIZE,
     stringtable::STRING_REF_TAG,
 };
 use measureme::{Addr, StringId};
 use memchr::{memchr, memchr2};
 use rustc_hash::FxHashMap;
 use std::borrow::Cow;
 use std::convert::TryInto;
 use std::error::Error;
 use std::path::Path;

 const INDEX_ENTRY_SIZE: usize = std::mem::size_of::<StringId>() + std::mem::size_of::<Addr>();

 fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) {
     (
         StringId::new(u64::from_le_bytes(bytes[0..8].try_into().unwrap())),
         Addr(u64::from_le_bytes(bytes[8..16].try_into().unwrap())),
     )
 }

 #[derive(Copy, Clone)]
 pub struct StringRef<'st> {
     id: StringId,
     table: &'st StringTable,
 }

 // This is the text we emit when encountering a virtual string ID that cannot
 // be resolved.
 const UNKNOWN_STRING: &str = "<unknown>";

 // This is the text we emit when we encounter string data that does not have a
 // proper terminator.
 const INVALID_STRING: &str = "<invalid>";

 impl<'st> StringRef<'st> {
     /// Expands the StringRef into an actual string. This method will
     /// avoid allocating a `String` if it can instead return a `&str` pointing
     /// into the raw string table data.
     pub fn to_string(&self) -> Cow<'st, str> {
         let addr = match self.get_addr() {
             Ok(addr) => addr,
             Err(_) => return Cow::from(UNKNOWN_STRING),
         };

         // Try to avoid the allocation, which we can do if this is
         //
         //  - a string with a single value component (`[value, 0xFF]`) or
         //  - a string with a single reference component (`[string_id, 0xFF]`)

         let pos = addr.as_usize();
         let slice_to_search = &self.table.string_data[pos..];

         // Find the first 0xFF byte which which is either the sequence
         // terminator or a byte in the middle of string id. Use `memchr` which
         // is super fast.
         let terminator_pos = memchr(TERMINATOR, slice_to_search).unwrap();

         // Check if this is a string containing a single StringId component
         let first_byte = self.table.string_data[pos];
         if first_byte == STRING_REF_TAG && terminator_pos == pos + STRING_REF_ENCODED_SIZE {
             let id = decode_string_ref_from_data(&self.table.string_data[pos..]);
             return StringRef {
                 id,
                 table: self.table,
             }
             .to_string();
         }

         // Decode the bytes until the terminator. If there is a string id in
         // between somewhere this will fail, and we fall back to the allocating
         // path.
         if let Ok(s) = std::str::from_utf8(&slice_to_search[..terminator_pos]) {
             Cow::from(s)
         } else {
             // This is the slow path where we actually allocate a `String` on
             // the heap and expand into that. If you suspect that there is a
             // bug in the fast path above, you can easily check if always taking
             // the slow path fixes the issue.
             let mut output = String::new();
             self.write_to_string(&mut output);
             Cow::from(output)
         }
     }

     pub fn write_to_string(&self, output: &mut String) {
         let addr = match self.get_addr() {
             Ok(addr) => addr,
             Err(_) => {
                 output.push_str(UNKNOWN_STRING);
                 return;
             }
         };

         let mut pos = addr.as_usize();

         loop {
             let byte = self.table.string_data[pos];

             if byte == TERMINATOR {
                 return;
             } else if byte == STRING_REF_TAG {
                 let string_ref = StringRef {
                     id: decode_string_ref_from_data(&self.table.string_data[pos..]),
                     table: self.table,
                 };

                 string_ref.write_to_string(output);

                 pos += STRING_REF_ENCODED_SIZE;
             } else {
                 // This is a literal UTF-8 string value. Find its end by looking
                 // for either of the two possible terminator bytes.
                 let remaining_data = &self.table.string_data[pos..];
                 if let Some(len) = memchr2(0xFF, 0xFE, remaining_data) {
                     let value = String::from_utf8_lossy(&remaining_data[..len]);
                     output.push_str(&value);
                     pos += len;
                 } else {
                     // The grammar does not allow unterminated raw strings. We
                     // have to stop decoding.
                     output.push_str(INVALID_STRING);
                     return;
                 }
             }
         }
     }

     fn get_addr(&self) -> Result<Addr, ()> {
         if self.id.is_virtual() {
             match self.table.index.get(&self.id) {
                 Some(&addr) => Ok(addr),
                 None => Err(()),
             }
         } else if self.id == StringId::INVALID {
             Err(())
         } else {
             Ok(self.id.to_addr())
         }
     }
 }

 // String IDs in the table data are encoded in big endian format, while string
 // IDs in the index are encoded in little endian format. Don't mix the two up.
 fn decode_string_ref_from_data(bytes: &[u8]) -> StringId {
     // The code below assumes we use a 5-byte encoding for string
     // refs, where the first byte is STRING_REF_TAG and the
     // following 4 bytes are a little-endian u32 string ID value.
     assert!(bytes[0] == STRING_REF_TAG);
     assert!(STRING_REF_ENCODED_SIZE == 9);

     let id = u64::from_le_bytes(bytes[1..9].try_into().unwrap());
     StringId::new(id)
 }

 /// Read-only version of the string table
 #[derive(Debug)]
 pub struct StringTable {
     // TODO: Replace with something lazy
     string_data: Vec<u8>,
     index: FxHashMap<StringId, Addr>,
 }

 impl StringTable {
     pub fn new(
         string_data: Vec<u8>,
         index_data: Vec<u8>,
         diagnostic_file_path: Option<&Path>,
     ) -> Result<StringTable, Box<dyn Error + Send + Sync>> {
         verify_file_header(
             &string_data,
             FILE_MAGIC_STRINGTABLE_DATA,
             diagnostic_file_path,
             "StringTable Data",
         )?;
         verify_file_header(
             &index_data,
             FILE_MAGIC_STRINGTABLE_INDEX,
             diagnostic_file_path,
             "StringTable Index",
         )?;

         // The non-header data should be divisible into index entries.
         assert!(
             (index_data.len() - measureme::file_header::FILE_HEADER_SIZE) % INDEX_ENTRY_SIZE == 0,
             "StringTable index size appears malformed",
         );
         assert_eq!(INDEX_ENTRY_SIZE, 16);

         let index: FxHashMap<_, _> = strip_file_header(&index_data)
             .chunks(INDEX_ENTRY_SIZE)
             .map(deserialize_index_entry)
             .collect();

         Ok(StringTable { string_data, index })
     }

     #[inline]
     pub fn get<'a>(&'a self, id: StringId) -> StringRef<'a> {
         StringRef { id, table: self }
     }

     pub fn get_metadata<'a>(&'a self) -> StringRef<'a> {
         let id = StringId::new(METADATA_STRING_ID);
         self.get(id)
     }
 }

 #[cfg(test)]
 mod tests {
     use super::*;
     use measureme::{PageTag, SerializationSinkBuilder, StringComponent, StringTableBuilder};
     use std::sync::Arc;

     #[test]
     fn simple_strings() {
         let sink_builder = SerializationSinkBuilder::new_in_memory();
         let data_sink = Arc::new(sink_builder.new_sink(PageTag::StringData));
         let index_sink = Arc::new(sink_builder.new_sink(PageTag::StringIndex));

         let expected_strings = &[
             "abc",
             "",
             "xyz",
             "g2h9284hgjv282y32983849&(*^&YIJ#R)(F83 f 23 2g4 35g5y",
             "",
             "",
             "g2h9284hgjv282y32983849&35g5y",
         ];

         let mut string_ids = vec![];

         {
             let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone()).unwrap();

             for &s in expected_strings {
                 string_ids.push(builder.alloc(s));
             }
         }

         let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes();
         let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes();

         let string_table = StringTable::new(data_bytes, index_bytes, None).unwrap();

         for (&id, &expected_string) in string_ids.iter().zip(expected_strings.iter()) {
             let str_ref = string_table.get(id);

             assert_eq!(str_ref.to_string(), expected_string);

             let mut write_to = String::new();
             str_ref.write_to_string(&mut write_to);
             assert_eq!(str_ref.to_string(), write_to);
         }
     }

     #[test]
     fn composite_string() {
         let sink_builder = SerializationSinkBuilder::new_in_memory();
         let data_sink = Arc::new(sink_builder.new_sink(PageTag::StringData));
         let index_sink = Arc::new(sink_builder.new_sink(PageTag::StringIndex));

         let expected_strings = &[
             "abc",                  // 0
             "abcabc",               // 1
             "abcabcabc",            // 2
             "abcabcabc",            // 3
             "abcabcabc",            // 4
             "abcabcabcabc",         // 5
             "xxabcabcuuuabcabcqqq", // 6
             "xxxxxx",               // 7
         ];

         let mut string_ids = vec![];

         {
             let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone()).unwrap();

             let r = |id| StringComponent::Ref(id);
             let v = |s| StringComponent::Value(s);

             string_ids.push(builder.alloc("abc")); // 0
             string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0])])); // 1
             string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0]), r(string_ids[0])])); // 2
             string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[0])])); // 3
             string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[1])])); // 4
             string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[1])])); // 5
             string_ids.push(builder.alloc(&[
                 v("xx"),
                 r(string_ids[1]),
                 v("uuu"),
                 r(string_ids[1]),
                 v("qqq"),
             ])); // 6
         }

         let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes();
         let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes();

         let string_table = StringTable::new(data_bytes, index_bytes, None).unwrap();

         for (&id, &expected_string) in string_ids.iter().zip(expected_strings.iter()) {
             let str_ref = string_table.get(id);

             assert_eq!(str_ref.to_string(), expected_string);

             let mut write_to = String::new();
             str_ref.write_to_string(&mut write_to);
             assert_eq!(str_ref.to_string(), write_to);
         }
     }
 }
	//! See module-level documentation `measureme::stringtable`.

	use measureme::stringtable::{METADATA_STRING_ID, TERMINATOR};
	use measureme::{
	file_header::{
	strip_file_header, verify_file_header, FILE_MAGIC_STRINGTABLE_DATA,
	FILE_MAGIC_STRINGTABLE_INDEX,
	},
	stringtable::STRING_REF_ENCODED_SIZE,
	stringtable::STRING_REF_TAG,
	};
	use measureme::{Addr, StringId};
	use memchr::{memchr, memchr2};
	use rustc_hash::FxHashMap;
	use std::borrow::Cow;
	use std::convert::TryInto;
	use std::error::Error;
	use std::path::Path;

	const INDEX_ENTRY_SIZE: usize = std::mem::size_of::<StringId>() + std::mem::size_of::<Addr>();

	fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) {
	(
	StringId::new(u64::from_le_bytes(bytes[0..8].try_into().unwrap())),
	Addr(u64::from_le_bytes(bytes[8..16].try_into().unwrap())),
	)
	}

	#[derive(Copy, Clone)]
	pub struct StringRef<'st> {
	id: StringId,
	table: &'st StringTable,
	}

	// This is the text we emit when encountering a virtual string ID that cannot
	// be resolved.
	const UNKNOWN_STRING: &str = "<unknown>";

	// This is the text we emit when we encounter string data that does not have a
	// proper terminator.
	const INVALID_STRING: &str = "<invalid>";

	impl<'st> StringRef<'st> {
	/// Expands the StringRef into an actual string. This method will
	/// avoid allocating a `String` if it can instead return a `&str` pointing
	/// into the raw string table data.
	pub fn to_string(&self) -> Cow<'st, str> {
	let addr = match self.get_addr() {
	Ok(addr) => addr,
	Err(_) => return Cow::from(UNKNOWN_STRING),
	};

	// Try to avoid the allocation, which we can do if this is
	//
	// - a string with a single value component (`[value, 0xFF]`) or
	// - a string with a single reference component (`[string_id, 0xFF]`)

	let pos = addr.as_usize();
	let slice_to_search = &self.table.string_data[pos..];

	// Find the first 0xFF byte which which is either the sequence
	// terminator or a byte in the middle of string id. Use `memchr` which
	// is super fast.
	let terminator_pos = memchr(TERMINATOR, slice_to_search).unwrap();

	// Check if this is a string containing a single StringId component
	let first_byte = self.table.string_data[pos];
	if first_byte == STRING_REF_TAG && terminator_pos == pos + STRING_REF_ENCODED_SIZE {
	let id = decode_string_ref_from_data(&self.table.string_data[pos..]);
	return StringRef {
	id,
	table: self.table,
	}
	.to_string();
	}

	// Decode the bytes until the terminator. If there is a string id in
	// between somewhere this will fail, and we fall back to the allocating
	// path.
	if let Ok(s) = std::str::from_utf8(&slice_to_search[..terminator_pos]) {
	Cow::from(s)
	} else {
	// This is the slow path where we actually allocate a `String` on
	// the heap and expand into that. If you suspect that there is a
	// bug in the fast path above, you can easily check if always taking
	// the slow path fixes the issue.
	let mut output = String::new();
	self.write_to_string(&mut output);
	Cow::from(output)
	}
	}

	pub fn write_to_string(&self, output: &mut String) {
	let addr = match self.get_addr() {
	Ok(addr) => addr,
	Err(_) => {
	output.push_str(UNKNOWN_STRING);
	return;
	}
	};

	let mut pos = addr.as_usize();

	loop {
	let byte = self.table.string_data[pos];

	if byte == TERMINATOR {
	return;
	} else if byte == STRING_REF_TAG {
	let string_ref = StringRef {
	id: decode_string_ref_from_data(&self.table.string_data[pos..]),
	table: self.table,
	};

	string_ref.write_to_string(output);

	pos += STRING_REF_ENCODED_SIZE;
	} else {
	// This is a literal UTF-8 string value. Find its end by looking
	// for either of the two possible terminator bytes.
	let remaining_data = &self.table.string_data[pos..];
	if let Some(len) = memchr2(0xFF, 0xFE, remaining_data) {
	let value = String::from_utf8_lossy(&remaining_data[..len]);
	output.push_str(&value);
	pos += len;
	} else {
	// The grammar does not allow unterminated raw strings. We
	// have to stop decoding.
	output.push_str(INVALID_STRING);
	return;
	}
	}
	}
	}

	fn get_addr(&self) -> Result<Addr, ()> {
	if self.id.is_virtual() {
	match self.table.index.get(&self.id) {
	Some(&addr) => Ok(addr),
	None => Err(()),
	}
	} else if self.id == StringId::INVALID {
	Err(())
	} else {
	Ok(self.id.to_addr())
	}
	}
	}

	// String IDs in the table data are encoded in big endian format, while string
	// IDs in the index are encoded in little endian format. Don't mix the two up.
	fn decode_string_ref_from_data(bytes: &[u8]) -> StringId {
	// The code below assumes we use a 5-byte encoding for string
	// refs, where the first byte is STRING_REF_TAG and the
	// following 4 bytes are a little-endian u32 string ID value.
	assert!(bytes[0] == STRING_REF_TAG);
	assert!(STRING_REF_ENCODED_SIZE == 9);

	let id = u64::from_le_bytes(bytes[1..9].try_into().unwrap());
	StringId::new(id)
	}

	/// Read-only version of the string table
	#[derive(Debug)]
	pub struct StringTable {
	// TODO: Replace with something lazy
	string_data: Vec<u8>,
	index: FxHashMap<StringId, Addr>,
	}

	impl StringTable {
	pub fn new(
	string_data: Vec<u8>,
	index_data: Vec<u8>,
	diagnostic_file_path: Option<&Path>,
	) -> Result<StringTable, Box<dyn Error + Send + Sync>> {
	verify_file_header(
	&string_data,
	FILE_MAGIC_STRINGTABLE_DATA,
	diagnostic_file_path,
	"StringTable Data",
	)?;
	verify_file_header(
	&index_data,
	FILE_MAGIC_STRINGTABLE_INDEX,
	diagnostic_file_path,
	"StringTable Index",
	)?;

	// The non-header data should be divisible into index entries.
	assert!(
	(index_data.len() - measureme::file_header::FILE_HEADER_SIZE) % INDEX_ENTRY_SIZE == 0,
	"StringTable index size appears malformed",
	);
	assert_eq!(INDEX_ENTRY_SIZE, 16);

	let index: FxHashMap<_, _> = strip_file_header(&index_data)
	.chunks(INDEX_ENTRY_SIZE)
	.map(deserialize_index_entry)
	.collect();

	Ok(StringTable { string_data, index })
	}

	#[inline]
	pub fn get<'a>(&'a self, id: StringId) -> StringRef<'a> {
	StringRef { id, table: self }
	}

	pub fn get_metadata<'a>(&'a self) -> StringRef<'a> {
	let id = StringId::new(METADATA_STRING_ID);
	self.get(id)
	}
	}

	#[cfg(test)]
	mod tests {
	use super::*;
	use measureme::{PageTag, SerializationSinkBuilder, StringComponent, StringTableBuilder};
	use std::sync::Arc;

	#[test]
	fn simple_strings() {
	let sink_builder = SerializationSinkBuilder::new_in_memory();
	let data_sink = Arc::new(sink_builder.new_sink(PageTag::StringData));
	let index_sink = Arc::new(sink_builder.new_sink(PageTag::StringIndex));

	let expected_strings = &[
	"abc",
	"",
	"xyz",
	"g2h9284hgjv282y32983849&(*^&YIJ#R)(F83 f 23 2g4 35g5y",
	"",
	"",
	"g2h9284hgjv282y32983849&35g5y",
	];

	let mut string_ids = vec![];

	{
	let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone()).unwrap();

	for &s in expected_strings {
	string_ids.push(builder.alloc(s));
	}
	}

	let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes();
	let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes();

	let string_table = StringTable::new(data_bytes, index_bytes, None).unwrap();

	for (&id, &expected_string) in string_ids.iter().zip(expected_strings.iter()) {
	let str_ref = string_table.get(id);

	assert_eq!(str_ref.to_string(), expected_string);

	let mut write_to = String::new();
	str_ref.write_to_string(&mut write_to);
	assert_eq!(str_ref.to_string(), write_to);
	}
	}

	#[test]
	fn composite_string() {
	let sink_builder = SerializationSinkBuilder::new_in_memory();
	let data_sink = Arc::new(sink_builder.new_sink(PageTag::StringData));
	let index_sink = Arc::new(sink_builder.new_sink(PageTag::StringIndex));

	let expected_strings = &[
	"abc", // 0
	"abcabc", // 1
	"abcabcabc", // 2
	"abcabcabc", // 3
	"abcabcabc", // 4
	"abcabcabcabc", // 5
	"xxabcabcuuuabcabcqqq", // 6
	"xxxxxx", // 7
	];

	let mut string_ids = vec![];

	{
	let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone()).unwrap();

	let r = \|id\| StringComponent::Ref(id);
	let v = \|s\| StringComponent::Value(s);

	string_ids.push(builder.alloc("abc")); // 0
	string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0])])); // 1
	string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0]), r(string_ids[0])])); // 2
	string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[0])])); // 3
	string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[1])])); // 4
	string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[1])])); // 5
	string_ids.push(builder.alloc(&[
	v("xx"),
	r(string_ids[1]),
	v("uuu"),
	r(string_ids[1]),
	v("qqq"),
	])); // 6
	}

	let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes();
	let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes();

	let string_table = StringTable::new(data_bytes, index_bytes, None).unwrap();

	for (&id, &expected_string) in string_ids.iter().zip(expected_strings.iter()) {
	let str_ref = string_table.get(id);

	assert_eq!(str_ref.to_string(), expected_string);

	let mut write_to = String::new();
	str_ref.write_to_string(&mut write_to);
	assert_eq!(str_ref.to_string(), write_to);
	}
	}
	}