| use crate::automaton::AlwaysMatch; |
| use crate::error::Error; |
| use crate::raw::{self, Bound, Builder, Fst, Output, Stream, VERSION}; |
| use crate::stream::Streamer; |
| |
| const TEXT: &'static str = include_str!("./../../data/words-100000"); |
| |
| pub fn fst_set<I, S>(ss: I) -> Fst<Vec<u8>> |
| where |
| I: IntoIterator<Item = S>, |
| S: AsRef<[u8]>, |
| { |
| let mut bfst = Builder::memory(); |
| let mut ss: Vec<Vec<u8>> = |
| ss.into_iter().map(|s| s.as_ref().to_vec()).collect(); |
| ss.sort(); |
| ss.dedup(); |
| for s in ss.iter().into_iter() { |
| bfst.add(s).unwrap(); |
| } |
| let fst = bfst.into_fst(); |
| assert_eq!(fst.len(), ss.len()); |
| fst |
| } |
| |
| pub fn fst_map<I, S>(ss: I) -> Fst<Vec<u8>> |
| where |
| I: IntoIterator<Item = (S, u64)>, |
| S: AsRef<[u8]>, |
| { |
| let mut bfst = Builder::memory(); |
| let mut ss: Vec<(Vec<u8>, u64)> = |
| ss.into_iter().map(|(s, o)| (s.as_ref().to_vec(), o)).collect(); |
| ss.sort(); |
| ss.dedup(); |
| for (s, o) in ss.into_iter() { |
| bfst.insert(s, o).unwrap(); |
| } |
| bfst.into_fst() |
| } |
| |
| pub fn fst_inputs<D: AsRef<[u8]>>(fst: &Fst<D>) -> Vec<Vec<u8>> { |
| let mut words = vec![]; |
| let mut rdr = fst.stream(); |
| while let Some((word, _)) = rdr.next() { |
| words.push(word.to_vec()); |
| } |
| words |
| } |
| |
| pub fn fst_inputs_outputs<D: AsRef<[u8]>>( |
| fst: &Fst<D>, |
| ) -> Vec<(Vec<u8>, u64)> { |
| let mut words = vec![]; |
| let mut rdr = fst.stream(); |
| while let Some((word, out)) = rdr.next() { |
| words.push((word.to_vec(), out.value())); |
| } |
| words |
| } |
| |
| macro_rules! test_set { |
| ($name:ident, $($s:expr),+) => { |
| #[test] |
| fn $name() { |
| let mut items = vec![$($s),*]; |
| let fst = fst_set(&items); |
| let mut rdr = fst.stream(); |
| items.sort(); |
| items.dedup(); |
| for item in &items { |
| assert_eq!(rdr.next().unwrap().0, item.as_bytes()); |
| } |
| assert_eq!(rdr.next(), None); |
| for item in &items { |
| assert!(fst.get(item).is_some()); |
| } |
| } |
| } |
| } |
| |
| macro_rules! test_set_fail { |
| ($name:ident, $($s:expr),+) => { |
| #[test] |
| #[should_panic] |
| fn $name() { |
| let mut bfst = Builder::memory(); |
| $(bfst.add($s).unwrap();)* |
| } |
| } |
| } |
| |
| test_set!(fst_set_only_empty, ""); |
| test_set!(fst_set_one, "a"); |
| test_set!(fst_set_dupe_empty, "", ""); |
| test_set!(fst_set_dupe1, "a", "a"); |
| test_set!(fst_set_dupe2, "a", "b", "b"); |
| test_set!(fst_set_two1, "a", "b"); |
| test_set!(fst_set_two2, "a", "ab"); |
| test_set!(fst_set_jan, "jam", "jbm", "jcm", "jdm", "jem", "jfm", "jgm"); |
| |
| test_set_fail!(fst_set_order1, "b", "a"); |
| test_set_fail!(fst_set_order2, "a", "b", "c", "a"); |
| |
| #[test] |
| fn fst_set_100000() { |
| let words: Vec<Vec<u8>> = |
| TEXT.lines().map(|s| s.as_bytes().to_vec()).collect(); |
| let fst = fst_set(words.clone()); |
| assert_eq!(words, fst_inputs(&fst)); |
| for word in &words { |
| assert!( |
| fst.get(word).is_some(), |
| "failed to find word: {}", |
| std::str::from_utf8(word).unwrap() |
| ); |
| } |
| } |
| |
| macro_rules! test_map { |
| ($name:ident, $($s:expr, $o:expr),+) => { |
| #[test] |
| fn $name() { |
| let fst = fst_map(vec![$(($s, $o)),*]); |
| let mut rdr = fst.stream(); |
| $({ |
| let (s, o) = rdr.next().unwrap(); |
| assert_eq!((s, o.value()), ($s.as_bytes(), $o)); |
| })* |
| assert_eq!(rdr.next(), None); |
| $({ |
| assert_eq!(fst.get($s.as_bytes()), Some(Output::new($o))); |
| })* |
| } |
| } |
| } |
| |
| macro_rules! test_map_fail { |
| ($name:ident, $($s:expr, $o:expr),+) => { |
| #[test] |
| #[should_panic] |
| fn $name() { |
| let mut bfst = Builder::memory(); |
| $(bfst.insert($s, $o).unwrap();)* |
| } |
| } |
| } |
| |
| test_map!(fst_map_only_empty1, "", 0); |
| test_map!(fst_map_only_empty2, "", 100); |
| test_map!(fst_map_only_empty3, "", 9999999999); |
| test_map!(fst_map_one1, "a", 0); |
| test_map!(fst_map_one2, "a", 100); |
| test_map!(fst_map_one3, "a", 999999999); |
| test_map!(fst_map_two, "a", 1, "b", 2); |
| test_map!(fst_map_many1, "a", 34786, "ab", 26); |
| test_map!( |
| fst_map_many2, |
| "a", |
| 34786, |
| "ab", |
| 26, |
| "abc", |
| 58976, |
| "abcd", |
| 25, |
| "z", |
| 58, |
| "zabc", |
| 6798 |
| ); |
| test_map!(fst_map_many3, "a", 1, "ab", 0, "abc", 0); |
| |
| test_map_fail!(fst_map_dupe_empty, "", 0, "", 0); |
| test_map_fail!(fst_map_dupe1, "a", 0, "a", 0); |
| test_map_fail!(fst_map_dupe2, "a", 0, "b", 0, "b", 0); |
| test_map_fail!(fst_map_order1, "b", 0, "a", 0); |
| test_map_fail!(fst_map_order2, "a", 0, "b", 0, "c", 0, "a", 0); |
| |
| #[test] |
| fn fst_map_100000_increments() { |
| let words: Vec<(Vec<u8>, u64)> = TEXT |
| .lines() |
| .enumerate() |
| .map(|(i, s)| (s.as_bytes().to_vec(), i as u64)) |
| .collect(); |
| let fst = fst_map(words.clone()); |
| assert_eq!(words, fst_inputs_outputs(&fst)); |
| for &(ref word, out) in &words { |
| assert_eq!(fst.get(word), Some(Output::new(out))); |
| } |
| } |
| |
| #[test] |
| fn fst_map_100000_lengths() { |
| let words: Vec<(Vec<u8>, u64)> = TEXT |
| .lines() |
| .map(|s| (s.as_bytes().to_vec(), s.len() as u64)) |
| .collect(); |
| let fst = fst_map(words.clone()); |
| assert_eq!(words, fst_inputs_outputs(&fst)); |
| for &(ref word, out) in &words { |
| assert_eq!(fst.get(word), Some(Output::new(out))); |
| } |
| } |
| |
| #[test] |
| fn invalid_version() { |
| match Fst::new(vec![0; 36]) { |
| Err(Error::Fst(raw::Error::Version { got, .. })) => assert_eq!(got, 0), |
| Err(err) => panic!("expected version error, got {:?}", err), |
| Ok(_) => panic!("expected version error, got FST"), |
| } |
| } |
| |
| #[test] |
| fn invalid_version_crate_too_old() { |
| let mut buf = vec![0; 36]; |
| crate::bytes::write_u64_le(VERSION + 1, &mut buf); |
| match Fst::new(buf) { |
| Err(Error::Fst(raw::Error::Version { got, .. })) => { |
| assert_eq!(got, VERSION + 1); |
| } |
| Err(err) => panic!("expected version error, got {:?}", err), |
| Ok(_) => panic!("expected version error, got FST"), |
| } |
| } |
| |
| #[test] |
| fn invalid_format() { |
| match Fst::new(vec![0; 0]) { |
| Err(Error::Fst(raw::Error::Format { .. })) => {} |
| Err(err) => panic!("expected format error, got {:?}", err), |
| Ok(_) => panic!("expected format error, got FST"), |
| } |
| } |
| |
| #[test] |
| fn fst_set_zero() { |
| let fst = fst_set::<_, String>(vec![]); |
| let mut rdr = fst.stream(); |
| assert_eq!(rdr.next(), None); |
| } |
| |
| macro_rules! test_range { |
| ( |
| $name:ident, |
| min: $min:expr, |
| max: $max:expr, |
| imin: $imin:expr, |
| imax: $imax:expr, |
| $($s:expr),* |
| ) => { |
| #[test] |
| fn $name() { |
| let items: Vec<&'static str> = vec![$($s),*]; |
| let items: Vec<_> = items |
| .into_iter() |
| .enumerate() |
| .map(|(i, k)| (k, i as u64)) |
| .collect(); |
| let fst = fst_map(items.clone()); |
| let mut rdr = Stream::new(fst.as_ref(), AlwaysMatch, $min, $max); |
| for i in $imin..$imax { |
| assert_eq!( |
| rdr.next().unwrap(), |
| (items[i].0.as_bytes(), Output::new(items[i].1)), |
| ); |
| } |
| assert_eq!(rdr.next(), None); |
| } |
| } |
| } |
| |
| test_range! { |
| fst_range_empty_1, |
| min: Bound::Unbounded, max: Bound::Unbounded, |
| imin: 0, imax: 0, |
| } |
| |
| test_range! { |
| fst_range_empty_2, |
| min: Bound::Unbounded, max: Bound::Unbounded, |
| imin: 0, imax: 1, |
| "" |
| } |
| |
| test_range! { |
| fst_range_empty_3, |
| min: Bound::Included(vec![]), max: Bound::Unbounded, |
| imin: 0, imax: 1, |
| "" |
| } |
| |
| test_range! { |
| fst_range_empty_4, |
| min: Bound::Excluded(vec![]), max: Bound::Unbounded, |
| imin: 0, imax: 0, |
| "" |
| } |
| |
| test_range! { |
| fst_range_empty_5, |
| min: Bound::Included(vec![]), max: Bound::Unbounded, |
| imin: 0, imax: 2, |
| "", "a" |
| } |
| |
| test_range! { |
| fst_range_empty_6, |
| min: Bound::Excluded(vec![]), max: Bound::Unbounded, |
| imin: 1, imax: 2, |
| "", "a" |
| } |
| |
| test_range! { |
| fst_range_empty_7, |
| min: Bound::Unbounded, max: Bound::Unbounded, |
| imin: 0, imax: 2, |
| "", "a" |
| } |
| |
| test_range! { |
| fst_range_empty_8, |
| min: Bound::Unbounded, max: Bound::Included(vec![]), |
| imin: 0, imax: 1, |
| "" |
| } |
| |
| test_range! { |
| fst_range_empty_9, |
| min: Bound::Unbounded, max: Bound::Excluded(vec![]), |
| imin: 0, imax: 0, |
| "" |
| } |
| |
| test_range! { |
| fst_range_empty_10, |
| min: Bound::Unbounded, max: Bound::Included(vec![]), |
| imin: 0, imax: 1, |
| "", "a" |
| } |
| |
| test_range! { |
| fst_range_empty_11, |
| min: Bound::Included(vec![]), max: Bound::Included(vec![]), |
| imin: 0, imax: 1, |
| "" |
| } |
| |
| test_range! { |
| fst_range_1, |
| min: Bound::Included(vec![b'a']), max: Bound::Included(vec![b'z']), |
| imin: 0, imax: 4, |
| "a", "b", "y", "z" |
| } |
| |
| test_range! { |
| fst_range_2, |
| min: Bound::Excluded(vec![b'a']), max: Bound::Included(vec![b'y']), |
| imin: 1, imax: 3, |
| "a", "b", "y", "z" |
| } |
| |
| test_range! { |
| fst_range_3, |
| min: Bound::Excluded(vec![b'a']), max: Bound::Excluded(vec![b'y']), |
| imin: 1, imax: 2, |
| "a", "b", "y", "z" |
| } |
| |
| test_range! { |
| fst_range_4, |
| min: Bound::Unbounded, max: Bound::Unbounded, |
| imin: 0, imax: 4, |
| "a", "b", "y", "z" |
| } |
| |
| test_range! { |
| fst_range_5, |
| min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded, |
| imin: 0, imax: 0, |
| "a", "ab", "abc", "abcd", "abcde" |
| } |
| |
| test_range! { |
| fst_range_6, |
| min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded, |
| imin: 5, imax: 6, |
| "a", "ab", "abc", "abcd", "abcde", "abe" |
| } |
| |
| test_range! { |
| fst_range_7, |
| min: Bound::Excluded(b"abd".to_vec()), max: Bound::Unbounded, |
| imin: 5, imax: 6, |
| "a", "ab", "abc", "abcd", "abcde", "abe" |
| } |
| |
| test_range! { |
| fst_range_8, |
| min: Bound::Included(b"abd".to_vec()), max: Bound::Unbounded, |
| imin: 5, imax: 6, |
| "a", "ab", "abc", "abcd", "abcde", "xyz" |
| } |
| |
| test_range! { |
| fst_range_9, |
| min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()), |
| imin: 0, imax: 5, |
| "a", "ab", "abc", "abcd", "abcde", "abe" |
| } |
| |
| test_range! { |
| fst_range_10, |
| min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()), |
| imin: 0, imax: 6, |
| "a", "ab", "abc", "abcd", "abcde", "abd" |
| } |
| |
| test_range! { |
| fst_range_11, |
| min: Bound::Unbounded, max: Bound::Included(b"abd".to_vec()), |
| imin: 0, imax: 6, |
| "a", "ab", "abc", "abcd", "abcde", "abd", "abdx" |
| } |
| |
| test_range! { |
| fst_range_12, |
| min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()), |
| imin: 0, imax: 5, |
| "a", "ab", "abc", "abcd", "abcde", "abe" |
| } |
| |
| test_range! { |
| fst_range_13, |
| min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()), |
| imin: 0, imax: 5, |
| "a", "ab", "abc", "abcd", "abcde", "abd" |
| } |
| |
| test_range! { |
| fst_range_14, |
| min: Bound::Unbounded, max: Bound::Excluded(b"abd".to_vec()), |
| imin: 0, imax: 5, |
| "a", "ab", "abc", "abcd", "abcde", "abd", "abdx" |
| } |
| |
| test_range! { |
| fst_range_15, |
| min: Bound::Included(vec![b'd']), max: Bound::Included(vec![b'c']), |
| imin: 0, imax: 0, |
| "a", "b", "c", "d", "e", "f" |
| } |
| |
| test_range! { |
| fst_range_16, |
| min: Bound::Included(vec![b'c']), max: Bound::Included(vec![b'c']), |
| imin: 2, imax: 3, |
| "a", "b", "c", "d", "e", "f" |
| } |
| |
| test_range! { |
| fst_range_17, |
| min: Bound::Excluded(vec![b'c']), max: Bound::Excluded(vec![b'c']), |
| imin: 0, imax: 0, |
| "a", "b", "c", "d", "e", "f" |
| } |
| |
| test_range! { |
| fst_range_18, |
| min: Bound::Included(vec![b'c']), max: Bound::Excluded(vec![b'c']), |
| imin: 0, imax: 0, |
| "a", "b", "c", "d", "e", "f" |
| } |
| |
| test_range! { |
| fst_range_19, |
| min: Bound::Included(vec![b'c']), max: Bound::Excluded(vec![b'd']), |
| imin: 2, imax: 3, |
| "a", "b", "c", "d", "e", "f" |
| } |
| |
| #[test] |
| fn one_vec_multiple_fsts() { |
| let mut bfst1 = Builder::memory(); |
| bfst1.add(b"bar").unwrap(); |
| bfst1.add(b"baz").unwrap(); |
| let bytes = bfst1.into_inner().unwrap(); |
| let fst1_len = bytes.len(); |
| |
| let mut bfst2 = Builder::new(bytes).unwrap(); |
| bfst2.add(b"bar").unwrap(); |
| bfst2.add(b"foo").unwrap(); |
| |
| let bytes = bfst2.into_inner().unwrap(); |
| let slice1 = &bytes[0..fst1_len]; |
| let slice2 = &bytes[fst1_len..bytes.len()]; |
| |
| let fst1 = Fst::new(slice1).unwrap(); |
| let fst2 = Fst::new(slice2).unwrap(); |
| |
| assert_eq!(fst_inputs(&fst1), vec![b"bar".to_vec(), b"baz".to_vec()]); |
| assert_eq!(fst_inputs(&fst2), vec![b"bar".to_vec(), b"foo".to_vec()]); |
| } |
| |
| #[test] |
| fn bytes_written() { |
| let mut bfst1 = Builder::memory(); |
| bfst1.add(b"bar").unwrap(); |
| bfst1.add(b"baz").unwrap(); |
| let counted_len = bfst1.bytes_written(); |
| let bytes = bfst1.into_inner().unwrap(); |
| let fst1_len = bytes.len() as u64; |
| let footer_size = 28; |
| assert_eq!(counted_len + footer_size, fst1_len); |
| } |
| |
| #[test] |
| fn get_key_simple() { |
| let map = fst_map(vec![("abc", 2), ("xyz", 3)]); |
| assert_eq!(map.get_key(0), None); |
| assert_eq!(map.get_key(1), None); |
| assert_eq!(map.get_key(2), Some(b"abc".to_vec())); |
| assert_eq!(map.get_key(3), Some(b"xyz".to_vec())); |
| assert_eq!(map.get_key(4), None); |
| } |
| |
| #[test] |
| fn get_key_words() { |
| let words: Vec<(Vec<u8>, u64)> = TEXT |
| .lines() |
| .enumerate() |
| .map(|(i, line)| (line.as_bytes().to_vec(), i as u64)) |
| .collect(); |
| let map = fst_map(words.clone()); |
| for (key, value) in words { |
| assert_eq!(map.get_key(value), Some(key)); |
| } |
| } |
| |
| #[test] |
| fn get_key_words_discontiguous() { |
| let words: Vec<(Vec<u8>, u64)> = TEXT |
| .lines() |
| .enumerate() |
| .map(|(i, line)| (line.as_bytes().to_vec(), i as u64 * 2)) |
| .collect(); |
| let map = fst_map(words.clone()); |
| for (key, value) in words { |
| assert_eq!(map.get_key(value), Some(key)); |
| } |
| } |
| |
| #[test] |
| fn verify_ok_nonempty() { |
| let words: Vec<(Vec<u8>, u64)> = TEXT |
| .lines() |
| .enumerate() |
| .map(|(i, line)| (line.as_bytes().to_vec(), i as u64 * 2)) |
| .collect(); |
| let map = fst_map(words.clone()); |
| assert!(map.verify().is_ok()); |
| } |
| |
| #[test] |
| fn verify_ok_empty() { |
| let map = fst_map(Vec::<(&str, u64)>::new()); |
| assert!(map.verify().is_ok()); |
| } |
| |
| #[test] |
| fn verify_err() { |
| let mut b = Builder::memory(); |
| b.add(b"bar").unwrap(); |
| b.add(b"baz").unwrap(); |
| let mut bytes = b.into_inner().unwrap(); |
| |
| { |
| let fst = Fst::new(&bytes).unwrap(); |
| assert!(fst.verify().is_ok()); |
| } |
| |
| bytes[17] = b'\xFF'; |
| { |
| let fst = Fst::new(&bytes).unwrap(); |
| assert!(fst.verify().is_err()); |
| } |
| } |