| use core::fmt; |
| |
| use crate::Terminator; |
| |
| // BE ADVISED |
| // |
| // This may just be one of the more complicated CSV parsers you'll come across. |
| // The implementation never allocates and consists of both a functional NFA |
| // parser and a DFA parser. The DFA parser is the work horse and we could elide |
| // much of the work involved in making the NFA parser work, but the NFA parser |
| // is much easier to debug. The NFA parser is tested alongside the DFA parser, |
| // so they should never be out of sync. |
| // |
| // The basic structure of the implementation is to encode the NFA parser as |
| // an explicit state machine in code. The DFA is then generated by populating |
| // a transition table on the stack by exhaustively enumerating all possible |
| // states on all possible inputs (this is possible because the number of states |
| // and the number of inputs is very small). |
| // |
| // Note that some pieces of the NFA parser (such as the NFA state machine) are |
| // required. In particular, the translation from the NFA to the DFA depends on |
| // the configuration of the CSV parser as given by the caller, and indeed, this |
| // is one of the key performance benefits of the DFA: it doesn't have any |
| // overhead (other than a bigger transition table) associated with the number |
| // of configuration options. |
| // |
| // ADVICE FOR HACKERS |
| // |
| // This code is too clever for its own good. As such, changes to some parts of |
| // the code may have a non-obvious impact on other parts. This is mostly |
| // motivated by trying to keep the DFA transition table as small as possible, |
| // since it is stored on the stack. Here are some tips that may save you some |
| // time: |
| // |
| // * If you add a new NFA state, then you also need to consider how it impacts |
| // the DFA. If all of the incoming transitions into an NFA state are |
| // epsilon transitions, then it probably isn't materialized in the DFA. |
| // If the NFA state indicates that a field or a record has been parsed, then |
| // it should be considered final. Let the comments in `NfaState` be your |
| // guide. |
| // * If you add a new configuration knob to the parser, then you may need to |
| // modify the `TRANS_CLASSES` constant below. The `TRANS_CLASSES` constant |
| // indicates the total number of discriminating bytes in the DFA. And if you |
| // modify `TRANS_CLASSES`, you probably also need to modify `build_dfa` to |
| // add a new class. For example, in order to add parsing support for |
| // comments, I bumped `TRANS_CLASSES` from `6` to `7` and added the comment |
| // byte (if one exists) to the list of classes in `build_dfa`. |
| // * The special DFA start state doubles as the final state once all input |
| // from the caller has been exhausted. We must be careful to guard this |
| // case analysis on whether the input is actually exhausted, since the start |
| // state is an otherwise valid state. |
| |
/// A pull based CSV reader.
///
/// This reader parses CSV data using a finite state machine. Callers can
/// extract parsed data incrementally using one of the `read` methods.
///
/// Note that this CSV reader is somewhat encoding agnostic. The source data
/// needs to be at least ASCII compatible. There is no support for specifying
/// the full gamut of Unicode delimiters/terminators/quotes/escapes. Instead,
/// any byte can be used, although callers probably want to stick to the ASCII
/// subset (`<= 0x7F`).
///
/// # Usage
///
/// A reader has two different ways to read CSV data, each with their own
/// trade offs.
///
/// * `read_field` - Copies a single CSV field into an output buffer while
///   unescaping quotes. This is simple to use and doesn't require storing an
///   entire record contiguously in memory, but it is slower.
/// * `read_record` - Copies an entire CSV record into an output buffer while
///   unescaping quotes. The ending positions of each field are copied into
///   an additional buffer. This is harder to use and requires larger output
///   buffers, but it is faster than `read_field` since it amortizes more
///   costs.
///
/// # RFC 4180
///
/// [RFC 4180](https://tools.ietf.org/html/rfc4180)
/// is the closest thing to a specification for CSV data. Unfortunately,
/// CSV data that is seen in the wild can vary significantly. Often, the CSV
/// data is outright invalid. Instead of fixing the producers of bad CSV data,
/// we have seen fit to make consumers much more flexible in what they accept.
/// This reader continues that tradition, and therefore, isn't technically
/// compliant with RFC 4180. In particular, this reader will never return an
/// error and will always find *a* parse.
///
/// Here are some detailed differences from RFC 4180:
///
/// * CRLF, LF and CR are each treated as a single record terminator by
///   default.
/// * Records are permitted to be of varying length.
/// * Empty lines (that do not include other whitespace) are ignored.
#[derive(Clone, Debug)]
pub struct Reader {
    /// A table-based DFA for parsing CSV.
    dfa: Dfa,
    /// The current DFA state, if the DFA is used.
    dfa_state: DfaState,
    /// The current NFA state, if the NFA is used.
    nfa_state: NfaState,
    /// The delimiter that separates fields.
    delimiter: u8,
    /// The terminator that separates records.
    term: Terminator,
    /// The quotation byte.
    quote: u8,
    /// Whether to recognize escaped quotes.
    escape: Option<u8>,
    /// Whether to recognize doubled quotes.
    double_quote: bool,
    /// If enabled, lines beginning with this byte are ignored.
    comment: Option<u8>,
    /// If enabled (the default), then quotes are respected. When disabled,
    /// quotes are not treated specially.
    quoting: bool,
    /// Whether to use the NFA for parsing.
    ///
    /// Generally this is for debugging. There's otherwise no good reason
    /// to avoid the DFA.
    use_nfa: bool,
    /// The current line number.
    line: u64,
    /// Whether this parser has ever read anything.
    has_read: bool,
    /// The current position in the output buffer when reading a record.
    ///
    /// This carries across calls so that when a single record spans
    /// multiple `read_record` calls (e.g. after `OutputFull`), the end
    /// positions written to `ends` remain relative to the whole record.
    output_pos: usize,
}
| |
| impl Default for Reader { |
| fn default() -> Reader { |
| Reader { |
| dfa: Dfa::new(), |
| dfa_state: DfaState::start(), |
| nfa_state: NfaState::StartRecord, |
| delimiter: b',', |
| term: Terminator::default(), |
| quote: b'"', |
| escape: None, |
| double_quote: true, |
| comment: None, |
| quoting: true, |
| use_nfa: false, |
| line: 1, |
| has_read: false, |
| output_pos: 0, |
| } |
| } |
| } |
| |
/// Builds a CSV reader with various configuration knobs.
///
/// This builder can be used to tweak the field delimiter, record terminator
/// and more for parsing CSV. Once a CSV `Reader` is built, its configuration
/// cannot be changed.
#[derive(Debug, Default)]
pub struct ReaderBuilder {
    /// The partially-configured reader. `build` clones it and compiles the
    /// DFA transition table from the accumulated settings.
    rdr: Reader,
}
| |
| impl ReaderBuilder { |
| /// Create a new builder. |
| pub fn new() -> ReaderBuilder { |
| ReaderBuilder::default() |
| } |
| |
| /// Build a CSV parser from this configuration. |
| pub fn build(&self) -> Reader { |
| let mut rdr = self.rdr.clone(); |
| rdr.build_dfa(); |
| rdr |
| } |
| |
| /// The field delimiter to use when parsing CSV. |
| /// |
| /// The default is `b','`. |
| pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder { |
| self.rdr.delimiter = delimiter; |
| self |
| } |
| |
| /// The record terminator to use when parsing CSV. |
| /// |
| /// A record terminator can be any single byte. The default is a special |
| /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n` |
| /// or `\r\n` as a single record terminator. |
| pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder { |
| self.rdr.term = term; |
| self |
| } |
| |
| /// The quote character to use when parsing CSV. |
| /// |
| /// The default is `b'"'`. |
| pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder { |
| self.rdr.quote = quote; |
| self |
| } |
| |
| /// The escape character to use when parsing CSV. |
| /// |
| /// In some variants of CSV, quotes are escaped using a special escape |
| /// character like `\` (instead of escaping quotes by doubling them). |
| /// |
| /// By default, recognizing these idiosyncratic escapes is disabled. |
| pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder { |
| self.rdr.escape = escape; |
| self |
| } |
| |
| /// Enable double quote escapes. |
| /// |
| /// This is enabled by default, but it may be disabled. When disabled, |
| /// doubled quotes are not interpreted as escapes. |
| pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder { |
| self.rdr.double_quote = yes; |
| self |
| } |
| |
| /// Enable or disable quoting. |
| /// |
| /// This is enabled by default, but it may be disabled. When disabled, |
| /// quotes are not treated specially. |
| pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder { |
| self.rdr.quoting = yes; |
| self |
| } |
| |
| /// The comment character to use when parsing CSV. |
| /// |
| /// If the start of a record begins with the byte given here, then that |
| /// line is ignored by the CSV parser. |
| /// |
| /// This is disabled by default. |
| pub fn comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder { |
| self.rdr.comment = comment; |
| self |
| } |
| |
| /// A convenience method for specifying a configuration to read ASCII |
| /// delimited text. |
| /// |
| /// This sets the delimiter and record terminator to the ASCII unit |
| /// separator (`\x1F`) and record separator (`\x1E`), respectively. |
| pub fn ascii(&mut self) -> &mut ReaderBuilder { |
| self.delimiter(b'\x1F').terminator(Terminator::Any(b'\x1E')) |
| } |
| |
| /// Enable or disable the NFA for parsing CSV. |
| /// |
| /// This is intended to be a debug option useful for debugging. The NFA |
| /// is always slower than the DFA. |
| #[doc(hidden)] |
| pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder { |
| self.rdr.use_nfa = yes; |
| self |
| } |
| } |
| |
/// The result of parsing at most one field from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    ///
    /// The caller should call `read_field` again, either with more input or
    /// with an empty buffer to signal the end of the data.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
| |
| impl ReadFieldResult { |
| fn from_nfa( |
| state: NfaState, |
| inpdone: bool, |
| outdone: bool, |
| ) -> ReadFieldResult { |
| match state { |
| NfaState::End => ReadFieldResult::End, |
| NfaState::EndRecord | NfaState::CRLF => { |
| ReadFieldResult::Field { record_end: true } |
| } |
| NfaState::EndFieldDelim => { |
| ReadFieldResult::Field { record_end: false } |
| } |
| _ => { |
| assert!(!state.is_field_final()); |
| if !inpdone && outdone { |
| ReadFieldResult::OutputFull |
| } else { |
| ReadFieldResult::InputEmpty |
| } |
| } |
| } |
| } |
| } |
| |
/// The result of parsing at most one field from CSV data while ignoring the
/// output.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldNoCopyResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    InputEmpty,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
| |
/// The result of parsing at most one record from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordResult {
    /// The caller provided input was exhausted before the end of a record was
    /// found.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The caller provided output buffer of field end positions was filled
    /// before the next field could be parsed.
    OutputEndsFull,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
| |
| impl ReadRecordResult { |
| fn is_record(&self) -> bool { |
| *self == ReadRecordResult::Record |
| } |
| |
| fn from_nfa( |
| state: NfaState, |
| inpdone: bool, |
| outdone: bool, |
| endsdone: bool, |
| ) -> ReadRecordResult { |
| match state { |
| NfaState::End => ReadRecordResult::End, |
| NfaState::EndRecord | NfaState::CRLF => ReadRecordResult::Record, |
| _ => { |
| assert!(!state.is_record_final()); |
| if !inpdone && outdone { |
| ReadRecordResult::OutputFull |
| } else if !inpdone && endsdone { |
| ReadRecordResult::OutputEndsFull |
| } else { |
| ReadRecordResult::InputEmpty |
| } |
| } |
| } |
| } |
| } |
| |
/// The result of parsing at most one record from CSV data while ignoring
/// output.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordNoCopyResult {
    /// The caller provided input was exhausted before the end of a record was
    /// found.
    InputEmpty,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is provided
    /// by the caller.
    End,
}
| |
/// What should be done with input bytes during an NFA transition.
#[derive(Clone, Debug, Eq, PartialEq)]
enum NfaInputAction {
    // Do not consume an input byte.
    Epsilon,
    // Copy input byte to a caller-provided output buffer.
    CopyToOutput,
    // Consume but do not copy the input byte (for example, seeing a field
    // delimiter will consume an input byte but should not copy it to the
    // output buffer).
    Discard,
}
| |
/// An NFA state is a state that can be visited in the NFA parser.
///
/// Given the simplicity of the machine, a subset of NFA states double as DFA
/// states. NFA states that only have incoming epsilon transitions are
/// optimized out when converting the machine to a DFA.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum NfaState {
    // These states aren't used in the DFA, so we
    // assign them meaningless numbers.
    EndFieldTerm = 200,
    InRecordTerm = 201,
    End = 202,

    // All states below are DFA states. Note that the relative order of the
    // discriminants appears to be significant: the read loops detect
    // field/record boundaries with `>=` comparisons against the DFA's
    // `final_field`/`final_record` markers, so the "final" states must map
    // to the largest values (confirm against `Dfa::new_state`).
    StartRecord = 0,
    StartField = 1,
    InField = 2,
    InQuotedField = 3,
    InEscapedQuote = 4,
    InDoubleEscapedQuote = 5,
    InComment = 6,
    // All states below are "final field" states.
    // Namely, they indicate that a field has been parsed.
    EndFieldDelim = 7,
    // All states below are "final record" states.
    // Namely, they indicate that a record has been parsed.
    EndRecord = 8,
    CRLF = 9,
}
| |
/// A list of NFA states that have an explicit representation in the DFA.
///
/// `build_dfa` enumerates exactly these states (against all 256 input bytes)
/// when populating the DFA transition table, so every state that can be
/// materialized in the DFA must appear here.
const NFA_STATES: &'static [NfaState] = &[
    NfaState::StartRecord,
    NfaState::StartField,
    NfaState::EndFieldDelim,
    NfaState::InField,
    NfaState::InQuotedField,
    NfaState::InEscapedQuote,
    NfaState::InDoubleEscapedQuote,
    NfaState::InComment,
    NfaState::EndRecord,
    NfaState::CRLF,
];
| |
| impl NfaState { |
| /// Returns true if this state indicates that a field has been parsed. |
| fn is_field_final(&self) -> bool { |
| match *self { |
| NfaState::End |
| | NfaState::EndRecord |
| | NfaState::CRLF |
| | NfaState::EndFieldDelim => true, |
| _ => false, |
| } |
| } |
| |
| /// Returns true if this state indicates that a record has been parsed. |
| fn is_record_final(&self) -> bool { |
| match *self { |
| NfaState::End | NfaState::EndRecord | NfaState::CRLF => true, |
| _ => false, |
| } |
| } |
| } |
| |
| impl Reader { |
| /// Create a new CSV reader with a default parser configuration. |
| pub fn new() -> Reader { |
| ReaderBuilder::new().build() |
| } |
| |
| /// Reset the parser such that it behaves as if it had never been used. |
| /// |
| /// This may be useful when reading CSV data in a random access pattern. |
| pub fn reset(&mut self) { |
| self.dfa_state = self.dfa.new_state(NfaState::StartRecord); |
| self.nfa_state = NfaState::StartRecord; |
| self.line = 1; |
| self.has_read = false; |
| } |
| |
    /// Return the current line number as measured by the number of occurrences
    /// of `\n`.
    ///
    /// Line numbers start at `1` and are reset when `reset` is called.
    pub fn line(&self) -> u64 {
        self.line
    }
| |
    /// Set the line number.
    ///
    /// This is useful after a call to `reset` where the caller knows the
    /// line number from some additional context.
    pub fn set_line(&mut self, line: u64) {
        self.line = line;
    }
| |
| /// Parse a single CSV field in `input` and copy field data to `output`. |
| /// |
| /// This routine requires a caller provided buffer of CSV data as the |
| /// `input` and a caller provided buffer, `output`, in which to store field |
| /// data extracted from `input`. The field data copied to `output` will |
| /// have its quotes unescaped. |
| /// |
| /// Calling this routine parses at most a single field and returns |
| /// three values indicating the state of the parser. The first value, a |
| /// `ReadFieldResult`, tells the caller what to do next. For example, if |
| /// the entire input was read or if the output buffer was filled before |
| /// a full field had been read, then `ReadFieldResult::InputEmpty` or |
| /// `ReadFieldResult::OutputFull` is returned, respectively. See the |
| /// documentation for `ReadFieldResult` for more details. |
| /// |
| /// The other two values returned correspond to the number of bytes |
| /// read from `input` and written to `output`, respectively. |
| /// |
| /// # Termination |
| /// |
| /// This reader interprets an empty `input` buffer as an indication that |
| /// there is no CSV data left to read. Namely, when the caller has |
| /// exhausted all CSV data, the caller should continue to call `read` with |
| /// an empty input buffer until `ReadFieldResult::End` is returned. |
| /// |
| /// # Errors |
| /// |
| /// This CSV reader can never return an error. Instead, it prefers *a* |
| /// parse over *no* parse. |
| pub fn read_field( |
| &mut self, |
| input: &[u8], |
| output: &mut [u8], |
| ) -> (ReadFieldResult, usize, usize) { |
| let (input, bom_nin) = self.strip_utf8_bom(input); |
| let (res, nin, nout) = if self.use_nfa { |
| self.read_field_nfa(input, output) |
| } else { |
| self.read_field_dfa(input, output) |
| }; |
| self.has_read = true; |
| (res, nin + bom_nin, nout) |
| } |
| |
| /// Parse a single CSV record in `input` and copy each field contiguously |
| /// to `output`, with the end position of each field written to `ends`. |
| /// |
| /// **NOTE**: This method is more cumbersome to use than `read_field`, but |
| /// it can be faster since it amortizes more work. |
| /// |
| /// This routine requires a caller provided buffer of CSV data as the |
| /// `input` and two caller provided buffers to store the unescaped field |
| /// data (`output`) and the end position of each field in the record |
| /// (`fields`). |
| /// |
| /// Calling this routine parses at most a single record and returns four |
| /// values indicating the state of the parser. The first value, a |
| /// `ReadRecordResult`, tells the caller what to do next. For example, if |
| /// the entire input was read or if the output buffer was filled before a |
| /// full field had been read, then `ReadRecordResult::InputEmpty` or |
| /// `ReadRecordResult::OutputFull` is returned, respectively. Similarly, if |
| /// the `ends` buffer is full, then `ReadRecordResult::OutputEndsFull` is |
| /// returned. See the documentation for `ReadRecordResult` for more |
| /// details. |
| /// |
| /// The other three values correspond to the number of bytes read from |
| /// `input`, the number of bytes written to `output` and the number of |
| /// end positions written to `ends`, respectively. |
| /// |
| /// The end positions written to `ends` are constructed as if there was |
| /// a single contiguous buffer in memory containing the entire row, even |
| /// if `ReadRecordResult::OutputFull` was returned in the middle of reading |
| /// a row. |
| /// |
| /// # Termination |
| /// |
| /// This reader interprets an empty `input` buffer as an indication that |
| /// there is no CSV data left to read. Namely, when the caller has |
| /// exhausted all CSV data, the caller should continue to call `read` with |
| /// an empty input buffer until `ReadRecordResult::End` is returned. |
| /// |
| /// # Errors |
| /// |
| /// This CSV reader can never return an error. Instead, it prefers *a* |
| /// parse over *no* parse. |
| pub fn read_record( |
| &mut self, |
| input: &[u8], |
| output: &mut [u8], |
| ends: &mut [usize], |
| ) -> (ReadRecordResult, usize, usize, usize) { |
| let (input, bom_nin) = self.strip_utf8_bom(input); |
| let (res, nin, nout, nend) = if self.use_nfa { |
| self.read_record_nfa(input, output, ends) |
| } else { |
| self.read_record_dfa(input, output, ends) |
| }; |
| self.has_read = true; |
| (res, nin + bom_nin, nout, nend) |
| } |
| |
| /// Strip off a possible UTF-8 BOM at the start of a file. Quick note that |
| /// this method will fail to strip off the BOM if only part of the BOM is |
| /// buffered. Hopefully that won't happen very often. |
| fn strip_utf8_bom<'a>(&self, input: &'a [u8]) -> (&'a [u8], usize) { |
| let (input, nin) = if { |
| !self.has_read |
| && input.len() >= 3 |
| && &input[0..3] == b"\xef\xbb\xbf" |
| } { |
| (&input[3..], 3) |
| } else { |
| (input, 0) |
| }; |
| (input, nin) |
| } |
| |
    /// DFA-based implementation of `read_record`.
    ///
    /// Returns `(result, bytes consumed from input, bytes written to output,
    /// end positions written to ends)`.
    #[inline(always)]
    fn read_record_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        if input.is_empty() {
            // An empty input buffer signals the end of all CSV data; run the
            // final transition, which may emit one last (pending) record.
            let s = self.transition_final_dfa(self.dfa_state);
            let res =
                self.dfa.new_read_record_result(s, true, false, false, false);
            // This part is a little tricky. When reading the final record,
            // the last result the caller will get is an InputEmpty, and while
            // they'll have everything they need in `output`, they'll be
            // missing the final end position of the final field in `ends`.
            // We insert that here, but we must take care to handle the case
            // where `ends` doesn't have enough space. If it doesn't have
            // enough space, then we also can't transition to the next state.
            return match res {
                ReadRecordResult::Record => {
                    if ends.is_empty() {
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.dfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.dfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        // With no room in either output buffer, no progress is possible.
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        let (mut nin, mut nout, mut nend) = (0, 0, 0);
        let mut state = self.dfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, has_out) = self.dfa.get_output(state, input[nin]);
            self.line += (input[nin] == b'\n') as u64;
            state = s;
            if has_out {
                // `has_out` means the byte is field data (not a delimiter,
                // quote or terminator) and should be copied to the output.
                output[nout] = input[nin];
                nout += 1;
            }
            nin += 1;
            if state >= self.dfa.final_field {
                // A field boundary was reached. `output_pos` accounts for
                // output written by previous calls for this same record.
                ends[nend] = self.output_pos + nout;
                nend += 1;
                if state > self.dfa.final_field {
                    // States beyond `final_field` are record-final, so the
                    // record is done.
                    break;
                }
            }
            if state == self.dfa.in_field || state == self.dfa.in_quoted {
                // Fast path for in-field states: `scan_and_copy` advances
                // `nin`/`nout` in bulk, copying input to output without a
                // per-byte transition-table lookup.
                self.dfa
                    .classes
                    .scan_and_copy(input, &mut nin, output, &mut nout);
            }
        }
        let res = self.dfa.new_read_record_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.dfa_state = state;
        if res.is_record() {
            self.output_pos = 0;
        } else {
            // The record continues into the next call, so carry the output
            // position forward for computing future end positions.
            self.output_pos += nout;
        }
        (res, nin, nout, nend)
    }
| |
    /// DFA-based implementation of `read_field`.
    ///
    /// Returns the result along with the number of bytes consumed from
    /// `input` and the number of bytes written to `output`.
    #[inline(always)]
    fn read_field_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        if input.is_empty() {
            // An empty input buffer signals the end of all CSV data; run the
            // final transition, which may emit one last field/record.
            self.dfa_state = self.transition_final_dfa(self.dfa_state);
            let res = self.dfa.new_read_field_result(
                self.dfa_state,
                true,
                false,
                false,
            );
            return (res, 0, 0);
        }
        if output.is_empty() {
            // With no output space, no progress can ever be made.
            return (ReadFieldResult::OutputFull, 0, 0);
        }
        let (mut nin, mut nout) = (0, 0);
        let mut state = self.dfa_state;
        while nin < input.len() && nout < output.len() {
            let b = input[nin];
            self.line += (b == b'\n') as u64;
            let (s, has_out) = self.dfa.get_output(state, b);
            state = s;
            if has_out {
                // `has_out` means the byte is field data (not a delimiter,
                // quote or terminator) and should be copied to the output.
                output[nout] = b;
                nout += 1;
            }
            nin += 1;
            if state >= self.dfa.final_field {
                // Stop as soon as a field (or record) boundary is reached.
                break;
            }
        }
        let res = self.dfa.new_read_field_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
        );
        self.dfa_state = state;
        (res, nin, nout)
    }
| |
| /// Perform the final state transition, i.e., when the caller indicates |
| /// that the input has been exhausted. |
| fn transition_final_dfa(&self, state: DfaState) -> DfaState { |
| // If we''ve already emitted a record or think we're ready to start |
| // parsing a new record, then we should sink into the final state |
| // and never move from there. (pro-tip: the start state doubles as |
| // the final state!) |
| if state >= self.dfa.final_record || state.is_start() { |
| self.dfa.new_state_final_end() |
| } else { |
| self.dfa.new_state_final_record() |
| } |
| } |
| |
    /// Write the transition tables for the DFA based on this parser's
    /// configuration.
    ///
    /// This must be called before the DFA parsing methods are used; the
    /// builder invokes it in `build`.
    fn build_dfa(&mut self) {
        // A naive DFA transition table has
        // `cells = (# number of states) * (# size of alphabet)`. While we
        // could get away with that, the table would have `10 * 256 = 2560`
        // entries. Even worse, in order to avoid a multiplication instruction
        // when computing the next transition, we store the starting index of
        // each state's row, which would not be representible in a single byte.
        // So we'd need a `u16`, which doubles our transition table size to
        // ~5KB. This is a lot to put on the stack, even though it probably
        // fits in the L1 cache of most modern CPUs.
        //
        // To avoid this, we note that while our "true" alphabet
        // has 256 distinct possibilities, the DFA itself is only
        // discriminatory on a very small subset of that alphabet. For
        // example, assuming neither `a` nor `b` are set as special
        // quote/comment/escape/delimiter/terminator bytes, they are otherwise
        // indistinguishable to the DFA, so it would be OK to treat them as
        // if they were equivalent. That is, they are in the same equivalence
        // class.
        //
        // As it turns out, using this logic, we can shrink our effective
        // alphabet down to 7 equivalence classes:
        //
        //   1. The field delimiter.
        //   2. The record terminator.
        //   3. If the record terminator is CRLF, then CR and LF are
        //      distinct equivalence classes.
        //   4. The quote byte.
        //   5. The escape byte.
        //   6. The comment byte.
        //   7. Everything else.
        //
        // We add those equivalence classes here. If more configuration knobs
        // are added to the parser with more discriminating bytes, then this
        // logic will need to be adjusted further.
        //
        // Even though this requires an extra bit of indirection when computing
        // the next transition, microbenchmarks say that it doesn't make much
        // of a difference. Perhaps because everything fits into the L1 cache.
        self.dfa.classes.add(self.delimiter);
        if self.quoting {
            // The quote/escape bytes only discriminate when quoting is on.
            self.dfa.classes.add(self.quote);
            if let Some(escape) = self.escape {
                self.dfa.classes.add(escape);
            }
        }
        if let Some(comment) = self.comment {
            self.dfa.classes.add(comment);
        }
        match self.term {
            Terminator::Any(b) => self.dfa.classes.add(b),
            Terminator::CRLF => {
                self.dfa.classes.add(b'\r');
                self.dfa.classes.add(b'\n');
            }
            _ => unreachable!(),
        }
        // Build the DFA transition table by computing the DFA state for all
        // possible combinations of state and input byte.
        for &state in NFA_STATES {
            for c in (0..256).map(|c| c as u8) {
                let mut nfa_result = (state, NfaInputAction::Epsilon);
                // Consume NFA states until we hit a non-epsilon transition.
                while nfa_result.0 != NfaState::End
                    && nfa_result.1 == NfaInputAction::Epsilon
                {
                    nfa_result = self.transition_nfa(nfa_result.0, c);
                }
                let from = self.dfa.new_state(state);
                let to = self.dfa.new_state(nfa_result.0);
                // Record whether following this edge copies the input byte
                // to the caller's output buffer.
                self.dfa.set(
                    from,
                    c,
                    to,
                    nfa_result.1 == NfaInputAction::CopyToOutput,
                );
            }
        }
        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
        self.dfa.finish();
    }
| |
| // The NFA implementation follows. The transition_final_nfa and |
| // transition_nfa methods are required for the DFA to operate. The |
| // rest are included for completeness (and debugging). Note that this |
| // NFA implementation is included in most of the CSV parser tests below. |
| |
    /// NFA-based implementation of `read_record`, used for debugging and as
    /// the reference against which the DFA is tested.
    ///
    /// Returns `(result, bytes consumed from input, bytes written to output,
    /// end positions written to ends)`.
    #[inline(always)]
    fn read_record_nfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        if input.is_empty() {
            // An empty input buffer signals the end of all CSV data; run the
            // final transition, which may emit one last (pending) record.
            let s = self.transition_final_nfa(self.nfa_state);
            let res = ReadRecordResult::from_nfa(s, false, false, false);
            return match res {
                ReadRecordResult::Record => {
                    // The final field's end position still needs to be
                    // written; if `ends` has no room, stay in this state.
                    if ends.is_empty() {
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.nfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.nfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        // With no room in either output buffer, no progress is possible.
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        // NOTE: unlike the DFA path, `nout` starts at `output_pos`, so when
        // a record spans multiple calls this resumes writing at the record's
        // absolute offset within the caller's output buffer.
        let (mut nin, mut nout, mut nend) = (0, self.output_pos, 0);
        let mut state = self.nfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, io) = self.transition_nfa(state, input[nin]);
            match io {
                NfaInputAction::CopyToOutput => {
                    output[nout] = input[nin];
                    nout += 1;
                    nin += 1;
                }
                NfaInputAction::Discard => {
                    // Structural byte (delimiter/quote/terminator): consume
                    // it without copying.
                    nin += 1;
                }
                // Epsilon transitions consume no input.
                NfaInputAction::Epsilon => {}
            }
            state = s;
            if state.is_field_final() {
                ends[nend] = nout;
                nend += 1;
                if state != NfaState::EndFieldDelim {
                    // Any other field-final state is also record-final.
                    break;
                }
            }
        }
        let res = ReadRecordResult::from_nfa(
            state,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.nfa_state = state;
        // Carry the output position across calls unless a full record was
        // just emitted.
        self.output_pos = if res.is_record() { 0 } else { nout };
        (res, nin, nout, nend)
    }
| |
    /// NFA-based implementation of `read_field`, used for debugging and as
    /// the reference against which the DFA is tested.
    ///
    /// Returns the result along with the number of bytes consumed from
    /// `input` and the number of bytes written to `output`.
    #[inline(always)]
    fn read_field_nfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        if input.is_empty() {
            // An empty input buffer signals the end of all CSV data; run the
            // final transition, which may emit one last field/record.
            self.nfa_state = self.transition_final_nfa(self.nfa_state);
            let res = ReadFieldResult::from_nfa(self.nfa_state, false, false);
            return (res, 0, 0);
        }
        if output.is_empty() {
            // If the output buffer is empty, then we can never make progress,
            // so just quit now.
            return (ReadFieldResult::OutputFull, 0, 0);
        }
        let (mut nin, mut nout) = (0, 0);
        let mut state = self.nfa_state;
        while nin < input.len() && nout < output.len() {
            let (s, io) = self.transition_nfa(state, input[nin]);
            match io {
                NfaInputAction::CopyToOutput => {
                    output[nout] = input[nin];
                    nout += 1;
                    nin += 1;
                }
                NfaInputAction::Discard => {
                    // Structural byte (delimiter/quote/terminator): consume
                    // it without copying.
                    nin += 1;
                }
                // Epsilon transitions consume no input.
                NfaInputAction::Epsilon => (),
            }
            state = s;
            if state.is_field_final() {
                // Stop as soon as a field (or record) boundary is reached.
                break;
            }
        }
        let res = ReadFieldResult::from_nfa(
            state,
            nin >= input.len(),
            nout >= output.len(),
        );
        self.nfa_state = state;
        (res, nin, nout)
    }
| |
| /// Compute the final NFA transition after all caller-provided input has |
| /// been exhausted. |
| #[inline(always)] |
| fn transition_final_nfa(&self, state: NfaState) -> NfaState { |
| use self::NfaState::*; |
| match state { |
| End | StartRecord | EndRecord | InComment | CRLF => End, |
| StartField | EndFieldDelim | EndFieldTerm | InField |
| | InQuotedField | InEscapedQuote | InDoubleEscapedQuote |
| | InRecordTerm => EndRecord, |
| } |
| } |
| |
/// Compute the next NFA state given the current NFA state and the current
/// input byte.
///
/// This returns the next NFA state along with an NfaInputAction that
/// indicates what should be done with the input byte (nothing for an epsilon
/// transition, copied to a caller provided output buffer, or discarded).
#[inline(always)]
fn transition_nfa(
    &self,
    state: NfaState,
    c: u8,
) -> (NfaState, NfaInputAction) {
    use self::NfaState::*;
    match state {
        // `End` is a trap state: once parsing has ended, it stays ended.
        End => (End, NfaInputAction::Epsilon),
        StartRecord => {
            if self.term.equals(c) {
                // Skip empty lines between records.
                (StartRecord, NfaInputAction::Discard)
            } else if self.comment == Some(c) {
                (InComment, NfaInputAction::Discard)
            } else {
                // Re-examine this byte as the start of the first field.
                (StartField, NfaInputAction::Epsilon)
            }
        }
        EndRecord => (StartRecord, NfaInputAction::Epsilon),
        StartField => {
            if self.quoting && self.quote == c {
                (InQuotedField, NfaInputAction::Discard)
            } else if self.delimiter == c {
                // An immediate delimiter yields an empty field.
                (EndFieldDelim, NfaInputAction::Discard)
            } else if self.term.equals(c) {
                // Leave the terminator byte for `InRecordTerm` to consume,
                // since it may be the first half of a CRLF.
                (EndFieldTerm, NfaInputAction::Epsilon)
            } else {
                (InField, NfaInputAction::CopyToOutput)
            }
        }
        EndFieldDelim => (StartField, NfaInputAction::Epsilon),
        EndFieldTerm => (InRecordTerm, NfaInputAction::Epsilon),
        InField => {
            if self.delimiter == c {
                (EndFieldDelim, NfaInputAction::Discard)
            } else if self.term.equals(c) {
                (EndFieldTerm, NfaInputAction::Epsilon)
            } else {
                (InField, NfaInputAction::CopyToOutput)
            }
        }
        InQuotedField => {
            if self.quoting && self.quote == c {
                // Could be the closing quote or the first half of a doubled
                // ("") escape; the next byte decides.
                (InDoubleEscapedQuote, NfaInputAction::Discard)
            } else if self.quoting && self.escape == Some(c) {
                (InEscapedQuote, NfaInputAction::Discard)
            } else {
                (InQuotedField, NfaInputAction::CopyToOutput)
            }
        }
        // The byte after an escape is copied verbatim.
        InEscapedQuote => (InQuotedField, NfaInputAction::CopyToOutput),
        InDoubleEscapedQuote => {
            if self.quoting && self.double_quote && self.quote == c {
                // A doubled quote: emit a single quote, stay quoted.
                (InQuotedField, NfaInputAction::CopyToOutput)
            } else if self.delimiter == c {
                // The previous quote closed the field.
                (EndFieldDelim, NfaInputAction::Discard)
            } else if self.term.equals(c) {
                (EndFieldTerm, NfaInputAction::Epsilon)
            } else {
                // Data after a closing quote: recover by treating it as
                // unquoted field data.
                (InField, NfaInputAction::CopyToOutput)
            }
        }
        InComment => {
            // Comments always run to the end of the line.
            if b'\n' == c {
                (StartRecord, NfaInputAction::Discard)
            } else {
                (InComment, NfaInputAction::Discard)
            }
        }
        InRecordTerm => {
            if self.term.is_crlf() && b'\r' == c {
                // Consume the `\r` and look for an optional `\n`.
                (CRLF, NfaInputAction::Discard)
            } else {
                (EndRecord, NfaInputAction::Discard)
            }
        }
        CRLF => {
            if b'\n' == c {
                (StartRecord, NfaInputAction::Discard)
            } else {
                // Lone `\r`: this byte belongs to the next record, so it is
                // not consumed here.
                (StartRecord, NfaInputAction::Epsilon)
            }
        }
    }
}
| } |
| |
/// The number of slots in the DFA transition table.
///
/// This number is computed by multiplying the maximum number of transition
/// classes (7) by the total number of NFA states that are used in the DFA
/// (10).
///
/// The number of transition classes is determined by an equivalence class of
/// bytes, where every byte in the same equivalence classes is
/// indistinguishable from any other byte with respect to the DFA. For example,
/// if neither `a` nor `b` are specified as a delimiter/quote/terminator/escape,
/// then the DFA will never discriminate between `a` or `b`, so they can
/// effectively be treated as identical. This reduces storage space
/// substantially.
///
/// The total number of NFA states (13) is greater than the total number of
/// NFA states that are in the DFA. In particular, any NFA state that can only
/// be reached by epsilon transitions will never have explicit usage in the
/// DFA.
const TRANS_CLASSES: usize = 7;
const DFA_STATES: usize = 10;
const TRANS_SIZE: usize = TRANS_CLASSES * DFA_STATES;

/// The size of the byte-to-equivalence-class map: one entry for each of the
/// 256 possible byte values. (See the comment on `TRANS_SIZE` for more
/// details on equivalence classes.)
const CLASS_SIZE: usize = 256;
| |
/// A representation of a DFA.
///
/// For the most part, this is a transition table, but various optimizations
/// have been applied to reduce its memory footprint.
struct Dfa {
    /// The core transition table. Each row corresponds to the transitions for
    /// each input equivalence class. (Input bytes are mapped to their
    /// corresponding equivalence class with the `classes` map.)
    ///
    /// DFA states are represented as an index corresponding to the start of
    /// its row in this table.
    trans: [DfaState; TRANS_SIZE],
    /// A table with the same layout as `trans`, except its values indicate
    /// whether a particular `(state, equivalence class)` pair should emit an
    /// output byte.
    has_output: [bool; TRANS_SIZE],
    /// A map from input byte to equivalence class.
    ///
    /// This is responsible for reducing the effective alphabet size from
    /// 256 to `TRANS_CLASSES`.
    classes: DfaClasses,
    /// The DFA state corresponding to being inside an unquoted field.
    in_field: DfaState,
    /// The DFA state corresponding to being inside a quoted field.
    in_quoted: DfaState,
    /// The minimum DFA state that indicates a field has been parsed. All DFA
    /// states greater than this are also final-field states.
    final_field: DfaState,
    /// The minimum DFA state that indicates a record has been parsed. All DFA
    /// states greater than this are also final-record states.
    final_record: DfaState,
}
| |
impl Dfa {
    /// Creates a DFA whose transitions all point at state `0` and whose
    /// marker states are unset. The table is filled in via `set` and the
    /// marker states via `finish`.
    fn new() -> Dfa {
        Dfa {
            trans: [DfaState(0); TRANS_SIZE],
            has_output: [false; TRANS_SIZE],
            classes: DfaClasses::new(),
            in_field: DfaState(0),
            in_quoted: DfaState(0),
            final_field: DfaState(0),
            final_record: DfaState(0),
        }
    }

    /// Returns the DFA state corresponding to `nfa_state`.
    ///
    /// A DFA state is the index of the start of its row in the transition
    /// table, i.e., the NFA state's discriminant multiplied by the number of
    /// equivalence classes. `checked_mul` guards against `u8` overflow.
    fn new_state(&self, nfa_state: NfaState) -> DfaState {
        let nclasses = self.classes.num_classes() as u8;
        let idx = (nfa_state as u8).checked_mul(nclasses).unwrap();
        DfaState(idx)
    }

    /// Returns the DFA state reported once parsing has finished entirely.
    /// It shares its row with `StartRecord`.
    fn new_state_final_end(&self) -> DfaState {
        self.new_state(NfaState::StartRecord)
    }

    /// Returns the DFA state indicating that a record has just ended.
    fn new_state_final_record(&self) -> DfaState {
        self.new_state(NfaState::EndRecord)
    }

    /// Looks up the transition for `state` on input byte `c`, returning the
    /// next state and whether `c` should be written to the output buffer.
    fn get_output(&self, state: DfaState, c: u8) -> (DfaState, bool) {
        let cls = self.classes.classes[c as usize];
        let idx = state.0 as usize + cls as usize;
        (self.trans[idx], self.has_output[idx])
    }

    /// Records the transition `from --c--> to`, along with whether taking
    /// the transition should emit `c` as output.
    fn set(&mut self, from: DfaState, c: u8, to: DfaState, output: bool) {
        let cls = self.classes.classes[c as usize];
        let idx = from.0 as usize + cls as usize;
        self.trans[idx] = to;
        self.has_output[idx] = output;
    }

    /// Computes the marker states. This must run after all byte classes have
    /// been added, since `new_state` depends on the final class count.
    fn finish(&mut self) {
        self.in_field = self.new_state(NfaState::InField);
        self.in_quoted = self.new_state(NfaState::InQuotedField);
        self.final_field = self.new_state(NfaState::EndFieldDelim);
        self.final_record = self.new_state(NfaState::EndRecord);
    }

    /// Translates the DFA state reached (plus buffer status) into the result
    /// reported to callers of `read_field`.
    fn new_read_field_result(
        &self,
        state: DfaState,
        is_final_trans: bool,
        inpdone: bool,
        outdone: bool,
    ) -> ReadFieldResult {
        if state >= self.final_record {
            ReadFieldResult::Field { record_end: true }
        } else if state == self.final_field {
            ReadFieldResult::Field { record_end: false }
        } else if is_final_trans && state.is_start() {
            // The final transition landed on the start state: nothing is
            // buffered, so parsing is done.
            ReadFieldResult::End
        } else {
            debug_assert!(state < self.final_field);
            // Parsing stalled mid-field; report which buffer ran dry.
            if !inpdone && outdone {
                ReadFieldResult::OutputFull
            } else {
                ReadFieldResult::InputEmpty
            }
        }
    }

    /// Translates the DFA state reached (plus buffer status) into the result
    /// reported to callers of `read_record`.
    fn new_read_record_result(
        &self,
        state: DfaState,
        is_final_trans: bool,
        inpdone: bool,
        outdone: bool,
        endsdone: bool,
    ) -> ReadRecordResult {
        if state >= self.final_record {
            ReadRecordResult::Record
        } else if is_final_trans && state.is_start() {
            ReadRecordResult::End
        } else {
            debug_assert!(state < self.final_record);
            // Parsing stalled mid-record; report which buffer ran dry.
            if !inpdone && outdone {
                ReadRecordResult::OutputFull
            } else if !inpdone && endsdone {
                ReadRecordResult::OutputEndsFull
            } else {
                ReadRecordResult::InputEmpty
            }
        }
    }
}
| |
/// A map from input byte to equivalence class.
///
/// Class `0` is reserved for "all other bytes," i.e., bytes with no special
/// meaning to the parser, so `next_class` starts at `1`.
struct DfaClasses {
    /// The equivalence class of each possible byte, indexed by byte value.
    classes: [u8; CLASS_SIZE],
    /// The next class index to hand out via `add`.
    next_class: usize,
}
| |
| impl DfaClasses { |
| fn new() -> DfaClasses { |
| DfaClasses { classes: [0; CLASS_SIZE], next_class: 1 } |
| } |
| |
| fn add(&mut self, b: u8) { |
| if self.next_class > CLASS_SIZE { |
| panic!("added too many classes") |
| } |
| self.classes[b as usize] = self.next_class as u8; |
| self.next_class = self.next_class + 1; |
| } |
| |
| fn num_classes(&self) -> usize { |
| self.next_class as usize |
| } |
| |
| /// Scan and copy the input bytes to the output buffer quickly. |
| /// |
| /// This assumes that the current state of the DFA is either `InField` or |
| /// `InQuotedField`. In this case, all bytes corresponding to the first |
| /// equivalence class (i.e., not a delimiter/quote/escape/etc.) are |
| /// guaranteed to never result in a state transition out of the current |
| /// state. This function takes advantage of that copies every byte from |
| /// `input` in the first equivalence class to `output`. Once a byte is seen |
| /// outside the first equivalence class, we quit and should fall back to |
| /// the main DFA loop. |
| #[inline(always)] |
| fn scan_and_copy( |
| &self, |
| input: &[u8], |
| nin: &mut usize, |
| output: &mut [u8], |
| nout: &mut usize, |
| ) { |
| while *nin < input.len() |
| && *nout < output.len() |
| && self.classes[input[*nin] as usize] == 0 |
| { |
| output[*nout] = input[*nin]; |
| *nin += 1; |
| *nout += 1; |
| } |
| } |
| } |
| |
/// A single DFA state.
///
/// A DFA state is represented by the starting index of its corresponding row
/// in the DFA transition table. This representation allows us to elide a
/// single multiplication instruction when computing the next transition for
/// a particular input byte.
///
/// States derive `Ord` so they can be compared against the
/// `final_field`/`final_record` marker states in `Dfa`.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
struct DfaState(u8);
| |
| impl DfaState { |
| fn start() -> DfaState { |
| DfaState(0) |
| } |
| |
| fn is_start(&self) -> bool { |
| self.0 == 0 |
| } |
| } |
| |
| impl fmt::Debug for Dfa { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| write!(f, "Dfa(N/A)") |
| } |
| } |
| |
impl fmt::Debug for DfaClasses {
    // The 256-entry class map is elided from the output since it is mostly
    // noise; only the class counter is shown.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "DfaClasses {{ classes: N/A, next_class: {:?} }}",
            self.next_class
        )
    }
}
| |
| impl Clone for Dfa { |
| fn clone(&self) -> Dfa { |
| let mut dfa = Dfa::new(); |
| dfa.trans.copy_from_slice(&self.trans); |
| dfa |
| } |
| } |
| |
| impl Clone for DfaClasses { |
| fn clone(&self) -> DfaClasses { |
| let mut x = DfaClasses::new(); |
| x.classes.copy_from_slice(&self.classes); |
| x |
| } |
| } |
| |
#[cfg(test)]
mod tests {
    use core::str;

    use arrayvec::{ArrayString, ArrayVec};

    use super::{ReadFieldResult, Reader, ReaderBuilder, Terminator};

    // Fixed-capacity containers: the parser never allocates, and neither do
    // the tests.
    type Csv = ArrayVec<[Row; 10]>;
    type Row = ArrayVec<[Field; 10]>;
    type Field = ArrayString<[u8; 10]>;

    // OMG I HATE BYTE STRING LITERALS SO MUCH.
    fn b(s: &str) -> &[u8] {
        s.as_bytes()
    }

    // Builds a `Csv` (a vec of rows of string fields) from nested array
    // literals, e.g. `csv![["a", "b"], ["c"]]`.
    macro_rules! csv {
        ($([$($field:expr),*]),*) => {{
            #[allow(unused_mut)]
            fn x() -> Csv {
                let mut csv = Csv::new();
                $(
                    let mut row = Row::new();
                    $(
                        row.push(Field::from($field).unwrap());
                    )*
                    csv.push(row);
                )*
                csv
            }
            x()
        }}
    }

    // Generates a test asserting that `$data` parses to `$expected` under
    // four configurations: the NFA and DFA engines, each driven both
    // field-at-a-time and record-at-a-time. The optional `$config` closure
    // tweaks the `ReaderBuilder` before building.
    macro_rules! parses_to {
        ($name:ident, $data:expr, $expected:expr) => {
            parses_to!($name, $data, $expected, |builder| builder);
        };
        ($name:ident, $data:expr, $expected:expr, $config:expr) => {
            #[test]
            fn $name() {
                // NFA engine, field at a time.
                let mut builder = ReaderBuilder::new();
                builder.nfa(true);
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_field(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "nfa by field");

                // NFA engine, record at a time.
                let mut builder = ReaderBuilder::new();
                builder.nfa(true);
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_record(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "nfa by record");

                // DFA engine, field at a time.
                let mut builder = ReaderBuilder::new();
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_field(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "dfa by field");

                // DFA engine, record at a time.
                let mut builder = ReaderBuilder::new();
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_record(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "dfa by record");
            }
        };
    }

    // Drives the reader one field at a time and collects the rows.
    fn parse_by_field(rdr: &mut Reader, data: &str) -> Csv {
        let mut data = data.as_bytes();
        let mut field = [0u8; 10];
        let mut csv = Csv::new();
        let mut row = Row::new();
        let mut outpos = 0;
        loop {
            let (res, nin, nout) = rdr.read_field(data, &mut field[outpos..]);
            data = &data[nin..];
            outpos += nout;

            match res {
                ReadFieldResult::InputEmpty => {
                    if !data.is_empty() {
                        panic!("missing input data")
                    }
                }
                ReadFieldResult::OutputFull => panic!("field too large"),
                ReadFieldResult::Field { record_end } => {
                    let s = str::from_utf8(&field[..outpos]).unwrap();
                    row.push(Field::from(s).unwrap());
                    outpos = 0;
                    if record_end {
                        csv.push(row);
                        row = Row::new();
                    }
                }
                ReadFieldResult::End => {
                    return csv;
                }
            }
        }
    }

    // Drives the reader one record at a time and collects the rows, using
    // the `ends` offsets to split the record buffer back into fields.
    fn parse_by_record(rdr: &mut Reader, data: &str) -> Csv {
        use crate::ReadRecordResult::*;

        let mut data = data.as_bytes();
        let mut record = [0; 1024];
        let mut ends = [0; 10];

        let mut csv = Csv::new();
        let (mut outpos, mut endpos) = (0, 0);
        loop {
            let (res, nin, nout, nend) = rdr.read_record(
                data,
                &mut record[outpos..],
                &mut ends[endpos..],
            );
            data = &data[nin..];
            outpos += nout;
            endpos += nend;

            match res {
                InputEmpty => {
                    if !data.is_empty() {
                        panic!("missing input data")
                    }
                }
                OutputFull => panic!("record too large (out buffer)"),
                OutputEndsFull => panic!("record too large (end buffer)"),
                Record => {
                    let s = str::from_utf8(&record[..outpos]).unwrap();
                    let mut start = 0;
                    let mut row = Row::new();
                    // `ends` holds record-relative end offsets; each field
                    // runs from the previous end to its own.
                    for &end in &ends[..endpos] {
                        row.push(Field::from(&s[start..end]).unwrap());
                        start = end;
                    }
                    csv.push(row);
                    outpos = 0;
                    endpos = 0;
                }
                End => return csv,
            }
        }
    }

    // Single-record inputs with every supported line terminator (none, LF,
    // CRLF, and lone CR).
    parses_to!(one_row_one_field, "a", csv![["a"]]);
    parses_to!(one_row_many_fields, "a,b,c", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma, "a,b,", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_lf, "a\n", csv![["a"]]);
    parses_to!(one_row_many_fields_lf, "a,b,c\n", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_lf, "a,b,\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_crlf, "a\r\n", csv![["a"]]);
    parses_to!(one_row_many_fields_crlf, "a,b,c\r\n", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_crlf, "a,b,\r\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_cr, "a\r", csv![["a"]]);
    parses_to!(one_row_many_fields_cr, "a,b,c\r", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_cr, "a,b,\r", csv![["a", "b", ""]]);

    // Multi-record inputs with every supported line terminator.
    parses_to!(many_rows_one_field, "a\nb", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields,
        "a,b,c\nx,y,z",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma,
        "a,b,\nx,y,",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_lf, "a\nb\n", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_lf,
        "a,b,c\nx,y,z\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_lf,
        "a,b,\nx,y,\n",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_crlf, "a\r\nb\r\n", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_crlf,
        "a,b,c\r\nx,y,z\r\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_crlf,
        "a,b,\r\nx,y,\r\n",
        csv![["a", "b", ""], ["x", "y", ""]]
    );
    parses_to!(many_rows_one_field_cr, "a\rb\r", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields_cr,
        "a,b,c\rx,y,z\r",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        many_rows_trailing_comma_cr,
        "a,b,\rx,y,\r",
        csv![["a", "b", ""], ["x", "y", ""]]
    );

    // Leading and trailing empty lines do not produce records.
    parses_to!(
        trailing_lines_no_record,
        "\n\n\na,b,c\nx,y,z\n\n\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        trailing_lines_no_record_cr,
        "\r\r\ra,b,c\rx,y,z\r\r\r",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );
    parses_to!(
        trailing_lines_no_record_crlf,
        "\r\n\r\n\r\na,b,c\r\nx,y,z\r\n\r\n\r\n",
        csv![["a", "b", "c"], ["x", "y", "z"]]
    );

    // Empty input and blank lines (including mixed terminators).
    parses_to!(empty, "", csv![]);
    parses_to!(empty_lines, "\n\n\n\n", csv![]);
    parses_to!(
        empty_lines_interspersed,
        "\n\na,b\n\n\nx,y\n\n\nm,n\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_crlf, "\r\n\r\n\r\n\r\n", csv![]);
    parses_to!(
        empty_lines_interspersed_crlf,
        "\r\n\r\na,b\r\n\r\n\r\nx,y\r\n\r\n\r\nm,n\r\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_mixed, "\r\n\n\r\n\n", csv![]);
    parses_to!(
        empty_lines_interspersed_mixed,
        "\n\r\na,b\r\n\n\r\nx,y\r\n\n\r\nm,n\r\n",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );
    parses_to!(empty_lines_cr, "\r\r\r\r", csv![]);
    parses_to!(
        empty_lines_interspersed_cr,
        "\r\ra,b\r\r\rx,y\r\r\rm,n\r",
        csv![["a", "b"], ["x", "y"], ["m", "n"]]
    );

    // Non-default configuration: custom terminator byte.
    parses_to!(
        term_weird,
        "zza,bzc,dzz",
        csv![["a", "b"], ["c", "d"]],
        |b: &mut ReaderBuilder| {
            b.terminator(Terminator::Any(b'z'));
        }
    );

    // ASCII-delimited mode (unit/record separator control characters).
    parses_to!(
        ascii_delimited,
        "a\x1fb\x1ec\x1fd",
        csv![["a", "b"], ["c", "d"]],
        |b: &mut ReaderBuilder| {
            b.ascii();
        }
    );

    // A UTF-8 BOM is not stripped: it is ordinary field data.
    parses_to!(bom_at_start, "\u{feff}a", csv![["a"]]);
    parses_to!(bom_in_field, "a\u{feff}", csv![["a\u{feff}"]]);
    parses_to!(bom_at_field_start, "a,\u{feff}b", csv![["a", "\u{feff}b"]]);

    // Quoting behavior.
    parses_to!(quote_empty, "\"\"", csv![[""]]);
    parses_to!(quote_lf, "\"\"\n", csv![[""]]);
    parses_to!(quote_space, "\" \"", csv![[" "]]);
    parses_to!(quote_inner_space, "\" a \"", csv![[" a "]]);
    parses_to!(quote_outer_space, " \"a\" ", csv![[" \"a\" "]]);

    parses_to!(quote_change, "zaz", csv![["a"]], |b: &mut ReaderBuilder| {
        b.quote(b'z');
    });

    // This one is pretty hokey.
    // I don't really know what the "right" behavior is.
    parses_to!(
        quote_delimiter,
        ",a,,b",
        csv![["a,b"]],
        |b: &mut ReaderBuilder| {
            b.quote(b',');
        }
    );

    // Escape handling: no escapes, doubled quotes, custom escape bytes.
    parses_to!(quote_no_escapes, r#""a\"b""#, csv![[r#"a\b""#]]);
    parses_to!(
        quote_escapes_no_double,
        r#""a""b""#,
        csv![[r#"a"b""#]],
        |b: &mut ReaderBuilder| {
            b.double_quote(false);
        }
    );
    parses_to!(
        quote_escapes,
        r#""a\"b""#,
        csv![[r#"a"b"#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'\\'));
        }
    );
    parses_to!(
        quote_escapes_change,
        r#""az"b""#,
        csv![[r#"a"b"#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'z'));
        }
    );

    parses_to!(
        quote_escapes_with_comma,
        r#""\"A,B\"""#,
        csv![[r#""A,B""#]],
        |b: &mut ReaderBuilder| {
            b.escape(Some(b'\\')).double_quote(false);
        }
    );

    // With quoting disabled, quote bytes are plain field data.
    parses_to!(
        quoting_disabled,
        r#""abc,foo""#,
        csv![[r#""abc"#, r#"foo""#]],
        |b: &mut ReaderBuilder| {
            b.quoting(false);
        }
    );

    // Custom delimiter bytes.
    parses_to!(
        delimiter_tabs,
        "a\tb",
        csv![["a", "b"]],
        |b: &mut ReaderBuilder| {
            b.delimiter(b'\t');
        }
    );
    parses_to!(
        delimiter_weird,
        "azb",
        csv![["a", "b"]],
        |b: &mut ReaderBuilder| {
            b.delimiter(b'z');
        }
    );

    parses_to!(extra_record_crlf_1, "foo\n1\n", csv![["foo"], ["1"]]);
    parses_to!(extra_record_crlf_2, "foo\r\n1\r\n", csv![["foo"], ["1"]]);

    // Comment lines are skipped only when the comment byte starts the line.
    parses_to!(
        comment_1,
        "foo\n# hi\nbar\n",
        csv![["foo"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_2,
        "foo\n # hi\nbar\n",
        csv![["foo"], [" # hi"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_3,
        "foo\n# hi\nbar\n",
        csv![["foo"], ["# hi"], ["bar"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'\n'));
        }
    );
    parses_to!(
        comment_4,
        "foo,b#ar,baz",
        csv![["foo", "b#ar", "baz"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );
    parses_to!(
        comment_5,
        "foo,#bar,baz",
        csv![["foo", "#bar", "baz"]],
        |b: &mut ReaderBuilder| {
            b.comment(Some(b'#'));
        }
    );

    // Calls `read_field` once and checks the bytes consumed/written and the
    // result variant.
    macro_rules! assert_read {
        (
            $rdr:expr, $input:expr, $output:expr,
            $expect_in:expr, $expect_out:expr, $expect_res:expr
        ) => {{
            let (res, nin, nout) = $rdr.read_field($input, $output);
            assert_eq!($expect_in, nin);
            assert_eq!($expect_out, nout);
            assert_eq!($expect_res, res);
        }};
    }

    // This tests that feeding a new reader with an empty buffer sends us
    // straight to End.
    #[test]
    fn stream_empty() {
        use crate::ReadFieldResult::*;

        let mut rdr = Reader::new();
        assert_read!(rdr, &[], &mut [], 0, 0, End);
    }

    // Test that a single space is treated as a single field.
    #[test]
    fn stream_space() {
        use crate::ReadFieldResult::*;

        let mut rdr = Reader::new();
        assert_read!(rdr, b(" "), &mut [0], 1, 1, InputEmpty);
        assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], &mut [0], 0, 0, End);
    }

    // Test that a single comma ...
    #[test]
    fn stream_comma() {
        use crate::ReadFieldResult::*;

        let mut rdr = Reader::new();
        assert_read!(rdr, b(","), &mut [0], 1, 0, Field { record_end: false });
        assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], &mut [0], 0, 0, End);
    }

    // Test that we can read a single large field in multiple output
    // buffers.
    #[test]
    fn stream_output_chunks() {
        use crate::ReadFieldResult::*;

        let mut inp = b("fooquux");
        let out = &mut [0; 2];
        let mut rdr = Reader::new();

        assert_read!(rdr, inp, out, 2, 2, OutputFull);
        assert_eq!(out, b("fo"));
        inp = &inp[2..];

        assert_read!(rdr, inp, out, 2, 2, OutputFull);
        assert_eq!(out, b("oq"));
        inp = &inp[2..];

        assert_read!(rdr, inp, out, 2, 2, OutputFull);
        assert_eq!(out, b("uu"));
        inp = &inp[2..];

        assert_read!(rdr, inp, out, 1, 1, InputEmpty);
        assert_eq!(&out[..1], b("x"));
        inp = &inp[1..];
        assert!(inp.is_empty());

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, inp, out, 0, 0, End);
    }

    // Test that we can read a single large field across multiple input
    // buffers.
    #[test]
    fn stream_input_chunks() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(rdr, b("fo"), out, 2, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("oq"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fooq"));

        assert_read!(rdr, b("uu"), &mut out[4..], 2, 2, InputEmpty);
        assert_eq!(&out[..6], b("fooquu"));

        assert_read!(rdr, b("x"), &mut out[6..], 1, 1, InputEmpty);
        assert_eq!(&out[..7], b("fooquux"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }

    // Test we can read doubled quotes correctly in a stream.
    #[test]
    fn stream_doubled_quotes() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(rdr, b("\"fo\""), out, 4, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fo\"o"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }

    // Test we can read escaped quotes correctly in a stream.
    #[test]
    fn stream_escaped_quotes() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut builder = ReaderBuilder::new();
        let mut rdr = builder.escape(Some(b'\\')).build();

        assert_read!(rdr, b("\"fo\\"), out, 4, 2, InputEmpty);
        assert_eq!(&out[..2], b("fo"));

        assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty);
        assert_eq!(&out[..4], b("fo\"o"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }

    // Test that empty output buffers don't wreak havoc.
    #[test]
    fn stream_empty_output() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(
            rdr,
            b("foo,bar"),
            out,
            4,
            3,
            Field { record_end: false }
        );
        assert_eq!(&out[..3], b("foo"));

        assert_read!(rdr, b("bar"), &mut [], 0, 0, OutputFull);

        assert_read!(rdr, b("bar"), out, 3, 3, InputEmpty);
        assert_eq!(&out[..3], b("bar"));

        assert_read!(rdr, &[], out, 0, 0, Field { record_end: true });
        assert_read!(rdr, &[], out, 0, 0, End);
    }

    // Test that we can reset the parser mid-stream and count on it to do
    // the right thing.
    #[test]
    fn reset_works() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read!(rdr, b("\"foo"), out, 4, 3, InputEmpty);
        assert_eq!(&out[..3], b("foo"));

        // Without reseting the parser state, the reader will remember that
        // we're in a quoted field, and therefore interpret the leading double
        // quotes below as a single quote and the trailing quote as a matching
        // terminator. With the reset, however, the parser forgets the quoted
        // field and treats the leading double quotes as a syntax quirk and
        // drops them, in addition to hanging on to the trailing unmatched
        // quote. (Matches Python's behavior.)
        rdr.reset();

        assert_read!(rdr, b("\"\"bar\""), out, 6, 4, InputEmpty);
        assert_eq!(&out[..4], b("bar\""));
    }

    // Test the line number reporting is correct.
    #[test]
    fn line_numbers() {
        use crate::ReadFieldResult::*;

        let out = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_eq!(1, rdr.line());

        assert_read!(rdr, b("\n\n\n\n"), out, 4, 0, InputEmpty);
        assert_eq!(5, rdr.line());

        assert_read!(rdr, b("foo,"), out, 4, 3, Field { record_end: false });
        assert_eq!(5, rdr.line());

        assert_read!(rdr, b("bar\n"), out, 4, 3, Field { record_end: true });
        assert_eq!(6, rdr.line());

        assert_read!(rdr, &[], &mut [0], 0, 0, End);
        assert_eq!(6, rdr.line());
    }

    // Calls `read_record` once and checks the bytes consumed/written, the
    // number of field ends recorded, and the result variant.
    macro_rules! assert_read_record {
        (
            $rdr:expr, $input:expr, $output:expr, $ends:expr,
            $expect_in:expr, $expect_out:expr,
            $expect_end:expr, $expect_res:expr
        ) => {{
            let (res, nin, nout, nend) =
                $rdr.read_record($input, $output, $ends);
            assert_eq!($expect_res, res, "result");
            assert_eq!($expect_in, nin, "input");
            assert_eq!($expect_out, nout, "output");
            assert_eq!($expect_end, nend, "ends");
        }};
    }

    // Test that we can incrementally read a record.
    #[test]
    fn stream_record() {
        use crate::ReadRecordResult::*;

        let mut inp = b("foo,bar\nbaz");
        let out = &mut [0; 1024];
        let ends = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
        assert_eq!(ends[0], 3);
        assert_eq!(ends[1], 6);
        inp = &inp[8..];

        assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
        inp = &inp[3..];

        assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
        assert_eq!(ends[0], 3);

        assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
    }

    // Test that if our output ends are full during the last read that
    // we get an appropriate state returned.
    #[test]
    fn stream_record_last_end_output_full() {
        use crate::ReadRecordResult::*;

        let mut inp = b("foo,bar\nbaz");
        let out = &mut [0; 1024];
        let ends = &mut [0; 10];
        let mut rdr = Reader::new();

        assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record);
        assert_eq!(ends[0], 3);
        assert_eq!(ends[1], 6);
        inp = &inp[8..];

        assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty);
        inp = &inp[3..];

        assert_read_record!(rdr, &inp, out, &mut [], 0, 0, 0, OutputEndsFull);
        assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record);
        assert_eq!(ends[0], 3);

        assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End);
    }
}