| //- |
| // Copyright 2017 Jason Lingle |
| // |
| // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| // option. This file may not be copied, modified, or distributed |
| // except according to those terms. |
| |
| //! Strategies for generating `char` values. |
| //! |
| //! Unlike most strategies in Proptest, character generation is by default |
| //! biased to particular values known to be difficult to handle in various |
| //! circumstances. |
| //! |
| //! The main things of interest are `any()` to generate truly arbitrary |
| //! characters, and `range()` and `ranges()` to select characters from |
| //! inclusive ranges. |
| |
| use crate::std_facade::Cow; |
| use core::ops::RangeInclusive; |
| |
| use rand::Rng; |
| |
| use crate::num; |
| use crate::strategy::*; |
| use crate::test_runner::*; |
| |
| /// An inclusive char range from fst to snd. |
| type CharRange = RangeInclusive<char>; |
| |
| /// A default set of characters to consider as "special" during character |
| /// generation. |
| /// |
| /// Most of the characters here were chosen specifically because they are |
| /// difficult to handle in particular contexts. |
| pub const DEFAULT_SPECIAL_CHARS: &[char] = &[ |
| // Things to give shell scripts and filesystem logic difficulties |
| '/', '\\', '$', '.', '*', '{', '\'', '"', '`', ':', |
| // Characters with special significance in URLs and elsewhere |
| '?', '%', '=', '&', '<', |
| // Interesting ASCII control characters |
| // NUL, HT, CR, LF, VT ESC DEL |
| '\x00', '\t', '\r', '\n', '\x0B', '\x1B', '\x7F', |
| // ¥ both to test simple Unicode handling and because it has interesting |
| // properties on MS Shift-JIS systems. |
| '¥', // No non-Unicode encoding has both ¥ and Ѩ |
| 'Ѩ', |
| // In UTF-8, Ⱥ increases in length from 2 to 3 bytes when lowercased |
| 'Ⱥ', |
| // More Unicode edge-cases: BOM, replacement character, RTL override, and non-BMP |
| '\u{FEFF}', '\u{FFFD}', '\u{202E}', '🕴', |
| ]; |
| |
| /// A default sequence of ranges used preferentially when generating random |
| /// characters. |
| pub const DEFAULT_PREFERRED_RANGES: &[CharRange] = &[ |
| // ASCII printable |
| ' '..='~', |
| ' '..='~', |
| ' '..='~', |
| ' '..='~', |
| ' '..='~', |
| // Latin-1 |
| '\u{0040}'..='\u{00ff}', |
| ]; |
| |
| /// Selects a random character the way `CharStrategy` does. |
| /// |
| /// If `special` is non-empty, there is a 50% chance that a character from this |
| /// array is chosen randomly, and will be returned if that character falls |
| /// within `ranges`. |
| /// |
| /// If `preferred` is non-empty, there is a 50% chance that any generation |
| /// which gets past the `special` step picks a random element from this list, |
| /// then a random character from within that range (both endpoints inclusive). |
| /// That character will be returned if it falls within `ranges`. |
| /// |
| /// In all other cases, an element is picked randomly from `ranges` and a |
| /// random character within the range (both endpoints inclusive) is chosen and |
| /// returned. |
| /// |
| /// Notice that in all cases, `ranges` completely defines the set of characters |
| /// that can possibly be defined. |
| /// |
| /// It is legal for ranges in all cases to contain non-characters. |
| /// |
| /// Both `preferred` and `ranges` bias selection towards characters in smaller |
| /// ranges. This is deliberate. `preferred` is usually tuned to select |
| /// particular characters anyway. `ranges` is usually derived from some |
| /// external property, and the fact that a range is small often means it is |
| /// more interesting. |
| pub fn select_char( |
| rnd: &mut impl Rng, |
| special: &[char], |
| preferred: &[CharRange], |
| ranges: &[CharRange], |
| ) -> char { |
| let (base, offset) = select_range_index(rnd, special, preferred, ranges); |
| ::core::char::from_u32(base + offset).expect("bad character selected") |
| } |
| |
| fn select_range_index( |
| rnd: &mut impl Rng, |
| special: &[char], |
| preferred: &[CharRange], |
| ranges: &[CharRange], |
| ) -> (u32, u32) { |
| fn in_range(ranges: &[CharRange], ch: char) -> Option<(u32, u32)> { |
| ranges |
| .iter() |
| .find(|r| ch >= *r.start() && ch <= *r.end()) |
| .map(|r| (*r.start() as u32, ch as u32 - *r.start() as u32)) |
| } |
| |
| if !special.is_empty() && rnd.gen() { |
| let s = special[rnd.gen_range(0..special.len())]; |
| if let Some(ret) = in_range(ranges, s) { |
| return ret; |
| } |
| } |
| |
| if !preferred.is_empty() && rnd.gen() { |
| let range = preferred[rnd.gen_range(0..preferred.len())].clone(); |
| if let Some(ch) = ::core::char::from_u32( |
| rnd.gen_range(*range.start() as u32..*range.end() as u32 + 1), |
| ) { |
| if let Some(ret) = in_range(ranges, ch) { |
| return ret; |
| } |
| } |
| } |
| |
| for _ in 0..65_536 { |
| let range = ranges[rnd.gen_range(0..ranges.len())].clone(); |
| if let Some(ch) = ::core::char::from_u32( |
| rnd.gen_range(*range.start() as u32..*range.end() as u32 + 1), |
| ) { |
| return (*range.start() as u32, ch as u32 - *range.start() as u32); |
| } |
| } |
| |
| // Give up and return a character we at least know is valid. |
| (*ranges[0].start() as u32, 0) |
| } |
| |
| /// Strategy for generating `char`s. |
| /// |
| /// Character selection is more sophisticated than integer selection. Naïve |
| /// selection (particularly in the larger context of generating strings) would |
| /// result in starting inputs like `ꂡ螧轎ቶᢹ糦狥芹ᘆ㶏曊ᒀ踔虙ჲ` and "simplified" |
| /// inputs consisting mostly of control characters. It also has difficulty |
| /// locating edge cases, since the vast majority of code points (such as the |
| /// enormous CJK regions) don't cause problems for anything with even basic |
| /// Unicode support. |
| /// |
| /// Instead, character selection is always based on explicit ranges, and is |
| /// designed to bias to specifically chosen characters and character ranges to |
| /// produce inputs that are both more useful and easier for humans to |
| /// understand. There are also hard-wired simplification targets based on ASCII |
| /// instead of simply simplifying towards NUL to avoid problematic inputs being |
| /// reduced to a bunch of NUL characters. |
| /// |
| /// Shrinking never crosses ranges. If you have a complex range like `[A-Za-z]` |
| /// and the starting point `x` is chosen, it will not shrink to the first `A-Z` |
| /// group, but rather simply to `a`. |
| /// |
| /// The usual way to get instances of this class is with the module-level `ANY` |
| /// constant or `range` function. Directly constructing a `CharStrategy` is |
| /// only necessary for complex ranges or to override the default biases. |
| #[derive(Debug, Clone)] |
| #[must_use = "strategies do nothing unless used"] |
| pub struct CharStrategy<'a> { |
| special: Cow<'a, [char]>, |
| preferred: Cow<'a, [CharRange]>, |
| ranges: Cow<'a, [CharRange]>, |
| } |
| |
| impl<'a> CharStrategy<'a> { |
| /// Construct a new `CharStrategy` with the parameters it will pass to the |
| /// function underlying `select_char()`. |
| /// |
| /// All arguments as per `select_char()`. |
| pub fn new( |
| special: Cow<'a, [char]>, |
| preferred: Cow<'a, [CharRange]>, |
| ranges: Cow<'a, [CharRange]>, |
| ) -> Self { |
| CharStrategy { |
| special, |
| preferred, |
| ranges, |
| } |
| } |
| |
| /// Same as `CharStrategy::new()` but using `Cow::Borrowed` for all parts. |
| pub fn new_borrowed( |
| special: &'a [char], |
| preferred: &'a [CharRange], |
| ranges: &'a [CharRange], |
| ) -> Self { |
| CharStrategy::new( |
| Cow::Borrowed(special), |
| Cow::Borrowed(preferred), |
| Cow::Borrowed(ranges), |
| ) |
| } |
| } |
| |
| const WHOLE_RANGE: &[CharRange] = &['\x00'..=::core::char::MAX]; |
| |
| /// Creates a `CharStrategy` which picks from literally any character, with the |
| /// default biases. |
| pub fn any() -> CharStrategy<'static> { |
| CharStrategy { |
| special: Cow::Borrowed(DEFAULT_SPECIAL_CHARS), |
| preferred: Cow::Borrowed(DEFAULT_PREFERRED_RANGES), |
| ranges: Cow::Borrowed(WHOLE_RANGE), |
| } |
| } |
| |
| /// Creates a `CharStrategy` which selects characters within the given |
| /// endpoints, inclusive, using the default biases. |
| pub fn range(start: char, end: char) -> CharStrategy<'static> { |
| CharStrategy { |
| special: Cow::Borrowed(DEFAULT_SPECIAL_CHARS), |
| preferred: Cow::Borrowed(DEFAULT_PREFERRED_RANGES), |
| ranges: Cow::Owned(vec![start..=end]), |
| } |
| } |
| |
| /// Creates a `CharStrategy` which selects characters within the given ranges, |
| /// all inclusive, using the default biases. |
| pub fn ranges(ranges: Cow<[CharRange]>) -> CharStrategy { |
| CharStrategy { |
| special: Cow::Borrowed(DEFAULT_SPECIAL_CHARS), |
| preferred: Cow::Borrowed(DEFAULT_PREFERRED_RANGES), |
| ranges, |
| } |
| } |
| |
| /// The `ValueTree` corresponding to `CharStrategy`. |
| #[derive(Debug, Clone, Copy)] |
| pub struct CharValueTree { |
| value: num::u32::BinarySearch, |
| } |
| |
| impl<'a> Strategy for CharStrategy<'a> { |
| type Tree = CharValueTree; |
| type Value = char; |
| |
| fn new_tree(&self, runner: &mut TestRunner) -> NewTree<Self> { |
| let (base, offset) = select_range_index( |
| runner.rng(), |
| &self.special, |
| &self.preferred, |
| &self.ranges, |
| ); |
| |
| // Select a minimum point more convenient than 0 |
| let start = base + offset; |
| let bottom = if start >= '¡' as u32 && base < '¡' as u32 { |
| '¡' as u32 |
| } else if start >= 'a' as u32 && base < 'a' as u32 { |
| 'a' as u32 |
| } else if start >= 'A' as u32 && base < 'A' as u32 { |
| 'A' as u32 |
| } else if start >= '0' as u32 && base < '0' as u32 { |
| '0' as u32 |
| } else if start >= ' ' as u32 && base < ' ' as u32 { |
| ' ' as u32 |
| } else { |
| base |
| }; |
| |
| Ok(CharValueTree { |
| value: num::u32::BinarySearch::new_above(bottom, start), |
| }) |
| } |
| } |
| |
| impl CharValueTree { |
| fn reposition(&mut self) { |
| while ::core::char::from_u32(self.value.current()).is_none() { |
| if !self.value.complicate() { |
| panic!("Converged to non-char value"); |
| } |
| } |
| } |
| } |
| |
| impl ValueTree for CharValueTree { |
| type Value = char; |
| |
| fn current(&self) -> char { |
| ::core::char::from_u32(self.value.current()) |
| .expect("Generated non-char value") |
| } |
| |
| fn simplify(&mut self) -> bool { |
| if self.value.simplify() { |
| self.reposition(); |
| true |
| } else { |
| false |
| } |
| } |
| |
| fn complicate(&mut self) -> bool { |
| if self.value.complicate() { |
| self.reposition(); |
| true |
| } else { |
| false |
| } |
| } |
| } |
| |
| #[cfg(test)] |
| mod test { |
| use std::cmp::{max, min}; |
| use std::vec::Vec; |
| |
| use super::*; |
| use crate::collection; |
| |
| proptest! { |
| #[test] |
| fn stays_in_range(input_ranges in collection::vec( |
| (0..::std::char::MAX as u32, |
| 0..::std::char::MAX as u32), |
| 1..5)) |
| { |
| let input = ranges(Cow::Owned(input_ranges.iter().map( |
| |&(lo, hi)| ::std::char::from_u32(lo).and_then( |
| |lo| ::std::char::from_u32(hi).map( |
| |hi| min(lo, hi) ..= max(lo, hi))) |
| .ok_or_else(|| TestCaseError::reject("non-char"))) |
| .collect::<Result<Vec<CharRange>,_>>()?)); |
| |
| let mut runner = TestRunner::default(); |
| for _ in 0..256 { |
| let mut value = input.new_tree(&mut runner).unwrap(); |
| loop { |
| let ch = value.current() as u32; |
| assert!(input_ranges.iter().any( |
| |&(lo, hi)| ch >= min(lo, hi) && |
| ch <= max(lo, hi))); |
| |
| if !value.simplify() { break; } |
| } |
| } |
| } |
| } |
| |
| #[test] |
| fn applies_desired_bias() { |
| let mut men_in_business_suits_levitating = 0; |
| let mut ascii_printable = 0; |
| let mut runner = TestRunner::deterministic(); |
| |
| for _ in 0..1024 { |
| let ch = any().new_tree(&mut runner).unwrap().current(); |
| if '🕴' == ch { |
| men_in_business_suits_levitating += 1; |
| } else if ch >= ' ' && ch <= '~' { |
| ascii_printable += 1; |
| } |
| } |
| |
| assert!(ascii_printable >= 256); |
| assert!(men_in_business_suits_levitating >= 1); |
| } |
| |
| #[test] |
| fn doesnt_shrink_to_ascii_control() { |
| let mut accepted = 0; |
| let mut runner = TestRunner::deterministic(); |
| |
| for _ in 0..256 { |
| let mut value = any().new_tree(&mut runner).unwrap(); |
| |
| if value.current() <= ' ' { |
| continue; |
| } |
| |
| while value.simplify() {} |
| |
| assert!(value.current() >= ' '); |
| accepted += 1; |
| } |
| |
| assert!(accepted >= 200); |
| } |
| |
| #[test] |
| fn test_sanity() { |
| check_strategy_sanity( |
| any(), |
| Some(CheckStrategySanityOptions { |
| // `simplify()` can itself `complicate()` back to the starting |
| // position, so the overly strict complicate-after-simplify check |
| // must be disabled. |
| strict_complicate_after_simplify: false, |
| ..CheckStrategySanityOptions::default() |
| }), |
| ); |
| } |
| } |