| use std::{ |
| fs::File, |
| io::{self, BufRead, Seek}, |
| marker::PhantomData, |
| path::Path, |
| result, |
| }; |
| |
| use { |
| csv_core::{Reader as CoreReader, ReaderBuilder as CoreReaderBuilder}, |
| serde::de::DeserializeOwned, |
| }; |
| |
| use crate::{ |
| byte_record::{ByteRecord, Position}, |
| error::{Error, ErrorKind, Result, Utf8Error}, |
| string_record::StringRecord, |
| {Terminator, Trim}, |
| }; |
| |
| /// Builds a CSV reader with various configuration knobs. |
| /// |
| /// This builder can be used to tweak the field delimiter, record terminator |
| /// and more. Once a CSV `Reader` is built, its configuration cannot be |
| /// changed. |
#[derive(Debug)]
pub struct ReaderBuilder {
    /// The capacity (in bytes) of the buffered reader wrapped around the
    /// underlying source of bytes.
    capacity: usize,
    /// When enabled, records are allowed to have differing numbers of fields.
    flexible: bool,
    /// When enabled, the first record is treated as a special header row and
    /// is skipped by record iterators.
    has_headers: bool,
    /// Whitespace trimming behavior for header and/or field values.
    trim: Trim,
    /// The underlying CSV parser builder.
    ///
    /// We explicitly put this on the heap because CoreReaderBuilder embeds an
    /// entire DFA transition table, which along with other things, tallies up
    /// to almost 500 bytes on the stack.
    builder: Box<CoreReaderBuilder>,
}
| |
| impl Default for ReaderBuilder { |
| fn default() -> ReaderBuilder { |
| ReaderBuilder { |
| capacity: 8 * (1 << 10), |
| flexible: false, |
| has_headers: true, |
| trim: Trim::default(), |
| builder: Box::new(CoreReaderBuilder::default()), |
| } |
| } |
| } |
| |
| impl ReaderBuilder { |
| /// Create a new builder for configuring CSV parsing. |
| /// |
| /// To convert a builder into a reader, call one of the methods starting |
| /// with `from_`. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::{ReaderBuilder, StringRecord}; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// Concord,United States,42695 |
| /// "; |
| /// let mut rdr = ReaderBuilder::new().from_reader(data.as_bytes()); |
| /// |
| /// let records = rdr |
| /// .records() |
| /// .collect::<Result<Vec<StringRecord>, csv::Error>>()?; |
| /// assert_eq!(records, vec![ |
| /// vec!["Boston", "United States", "4628910"], |
| /// vec!["Concord", "United States", "42695"], |
| /// ]); |
| /// Ok(()) |
| /// } |
| /// ``` |
    pub fn new() -> ReaderBuilder {
        // Equivalent to `ReaderBuilder::default()`; exists as the
        // conventional, discoverable constructor name.
        ReaderBuilder::default()
    }
| |
| /// Build a CSV parser from this configuration that reads data from the |
| /// given file path. |
| /// |
| /// If there was a problem opening the file at the given path, then this |
| /// returns the corresponding error. |
| /// |
| /// # Example |
| /// |
| /// ```no_run |
| /// use std::error::Error; |
| /// use csv::ReaderBuilder; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let mut rdr = ReaderBuilder::new().from_path("foo.csv")?; |
| /// for result in rdr.records() { |
| /// let record = result?; |
| /// println!("{:?}", record); |
| /// } |
| /// Ok(()) |
| /// } |
| /// ``` |
| pub fn from_path<P: AsRef<Path>>(&self, path: P) -> Result<Reader<File>> { |
| Ok(Reader::new(self, File::open(path)?)) |
| } |
| |
| /// Build a CSV parser from this configuration that reads data from `rdr`. |
| /// |
| /// Note that the CSV reader is buffered automatically, so you should not |
| /// wrap `rdr` in a buffered reader like `io::BufReader`. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::ReaderBuilder; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// Concord,United States,42695 |
| /// "; |
| /// let mut rdr = ReaderBuilder::new().from_reader(data.as_bytes()); |
| /// for result in rdr.records() { |
| /// let record = result?; |
| /// println!("{:?}", record); |
| /// } |
| /// Ok(()) |
| /// } |
| /// ``` |
    pub fn from_reader<R: io::Read>(&self, rdr: R) -> Reader<R> {
        // `Reader::new` wraps `rdr` in a `BufReader` itself, which is why
        // callers are told not to pre-buffer.
        Reader::new(self, rdr)
    }
| |
| /// The field delimiter to use when parsing CSV. |
| /// |
| /// The default is `b','`. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::ReaderBuilder; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city;country;pop |
| /// Boston;United States;4628910 |
| /// "; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .delimiter(b';') |
| /// .from_reader(data.as_bytes()); |
| /// |
| /// if let Some(result) = rdr.records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
    pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder {
        // Delimiter handling lives in the csv-core parser, so delegate.
        self.builder.delimiter(delimiter);
        self
    }
| |
| /// Whether to treat the first row as a special header row. |
| /// |
| /// By default, the first row is treated as a special header row, which |
| /// means the header is never returned by any of the record reading methods |
| /// or iterators. When this is disabled (`yes` set to `false`), the first |
| /// row is not treated specially. |
| /// |
| /// Note that the `headers` and `byte_headers` methods are unaffected by |
| /// whether this is set. Those methods always return the first record. |
| /// |
| /// # Example |
| /// |
| /// This example shows what happens when `has_headers` is disabled. |
| /// Namely, the first row is treated just like any other row. |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::ReaderBuilder; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// "; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .has_headers(false) |
| /// .from_reader(data.as_bytes()); |
| /// let mut iter = rdr.records(); |
| /// |
| /// // Read the first record. |
| /// if let Some(result) = iter.next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["city", "country", "pop"]); |
| /// } else { |
| /// return Err(From::from( |
| /// "expected at least two records but got none")); |
| /// } |
| /// |
| /// // Read the second record. |
| /// if let Some(result) = iter.next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// } else { |
| /// return Err(From::from( |
| /// "expected at least two records but got one")) |
| /// } |
| /// Ok(()) |
| /// } |
| /// ``` |
    pub fn has_headers(&mut self, yes: bool) -> &mut ReaderBuilder {
        // Header skipping is implemented in this crate, not csv-core, so the
        // flag is stored on the builder itself.
        self.has_headers = yes;
        self
    }
| |
| /// Whether the number of fields in records is allowed to change or not. |
| /// |
| /// When disabled (which is the default), parsing CSV data will return an |
| /// error if a record is found with a number of fields different from the |
| /// number of fields in a previous record. |
| /// |
| /// When enabled, this error checking is turned off. |
| /// |
| /// # Example: flexible records enabled |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::ReaderBuilder; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// // Notice that the first row is missing the population count. |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States |
| /// "; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .flexible(true) |
| /// .from_reader(data.as_bytes()); |
| /// |
| /// if let Some(result) = rdr.records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
| /// |
| /// # Example: flexible records disabled |
| /// |
| /// This shows the error that appears when records of unequal length |
| /// are found and flexible records have been disabled (which is the |
| /// default). |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::{ErrorKind, ReaderBuilder}; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// // Notice that the first row is missing the population count. |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States |
| /// "; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .flexible(false) |
| /// .from_reader(data.as_bytes()); |
| /// |
| /// if let Some(Err(err)) = rdr.records().next() { |
| /// match *err.kind() { |
| /// ErrorKind::UnequalLengths { expected_len, len, .. } => { |
| /// // The header row has 3 fields... |
| /// assert_eq!(expected_len, 3); |
| /// // ... but the first row has only 2 fields. |
| /// assert_eq!(len, 2); |
| /// Ok(()) |
| /// } |
| /// ref wrong => { |
| /// Err(From::from(format!( |
| /// "expected UnequalLengths error but got {:?}", |
| /// wrong))) |
| /// } |
| /// } |
| /// } else { |
| /// Err(From::from( |
| /// "expected at least one errored record but got none")) |
| /// } |
| /// } |
| /// ``` |
    pub fn flexible(&mut self, yes: bool) -> &mut ReaderBuilder {
        // Record-length checking is implemented in this crate, not csv-core,
        // so the flag is stored on the builder itself.
        self.flexible = yes;
        self
    }
| |
| /// Whether fields are trimmed of leading and trailing whitespace or not. |
| /// |
| /// By default, no trimming is performed. This method permits one to |
| /// override that behavior and choose one of the following options: |
| /// |
| /// 1. `Trim::Headers` trims only header values. |
| /// 2. `Trim::Fields` trims only non-header or "field" values. |
| /// 3. `Trim::All` trims both header and non-header values. |
| /// |
| /// A value is only interpreted as a header value if this CSV reader is |
| /// configured to read a header record (which is the default). |
| /// |
| /// When reading string records, characters meeting the definition of |
| /// Unicode whitespace are trimmed. When reading byte records, characters |
| /// meeting the definition of ASCII whitespace are trimmed. ASCII |
| /// whitespace characters correspond to the set `[\t\n\v\f\r ]`. |
| /// |
| /// # Example |
| /// |
| /// This example shows what happens when all values are trimmed. |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::{ReaderBuilder, StringRecord, Trim}; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city , country , pop |
| /// Boston,\" |
| /// United States\",4628910 |
| /// Concord, United States ,42695 |
| /// "; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .trim(Trim::All) |
| /// .from_reader(data.as_bytes()); |
| /// let records = rdr |
| /// .records() |
| /// .collect::<Result<Vec<StringRecord>, csv::Error>>()?; |
| /// assert_eq!(records, vec![ |
| /// vec!["Boston", "United States", "4628910"], |
| /// vec!["Concord", "United States", "42695"], |
| /// ]); |
| /// Ok(()) |
| /// } |
| /// ``` |
    pub fn trim(&mut self, trim: Trim) -> &mut ReaderBuilder {
        // Trimming is applied post-parse in this crate, not in csv-core, so
        // the setting is stored on the builder itself.
        self.trim = trim;
        self
    }
| |
| /// The record terminator to use when parsing CSV. |
| /// |
| /// A record terminator can be any single byte. The default is a special |
| /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n` |
| /// or `\r\n` as a single record terminator. |
| /// |
| /// # Example: `$` as a record terminator |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::{ReaderBuilder, Terminator}; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "city,country,pop$Boston,United States,4628910"; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .terminator(Terminator::Any(b'$')) |
| /// .from_reader(data.as_bytes()); |
| /// |
| /// if let Some(result) = rdr.records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
    pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder {
        // Convert this crate's public `Terminator` into its csv-core
        // equivalent before delegating.
        self.builder.terminator(term.to_core());
        self
    }
| |
| /// The quote character to use when parsing CSV. |
| /// |
| /// The default is `b'"'`. |
| /// |
| /// # Example: single quotes instead of double quotes |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::ReaderBuilder; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,'United States',4628910 |
| /// "; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .quote(b'\'') |
| /// .from_reader(data.as_bytes()); |
| /// |
| /// if let Some(result) = rdr.records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
    pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder {
        // Quote handling lives in the csv-core parser, so delegate.
        self.builder.quote(quote);
        self
    }
| |
| /// The escape character to use when parsing CSV. |
| /// |
| /// In some variants of CSV, quotes are escaped using a special escape |
| /// character like `\` (instead of escaping quotes by doubling them). |
| /// |
| /// By default, recognizing these idiosyncratic escapes is disabled. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::ReaderBuilder; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,\"The \\\"United\\\" States\",4628910 |
| /// "; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .escape(Some(b'\\')) |
| /// .from_reader(data.as_bytes()); |
| /// |
| /// if let Some(result) = rdr.records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec![ |
| /// "Boston", "The \"United\" States", "4628910", |
| /// ]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
    pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
        // `None` (the default) disables escape-character recognition.
        self.builder.escape(escape);
        self
    }
| |
| /// Enable double quote escapes. |
| /// |
| /// This is enabled by default, but it may be disabled. When disabled, |
| /// doubled quotes are not interpreted as escapes. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::ReaderBuilder; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,\"The \"\"United\"\" States\",4628910 |
| /// "; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .double_quote(false) |
| /// .from_reader(data.as_bytes()); |
| /// |
| /// if let Some(result) = rdr.records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec![ |
| /// "Boston", "The \"United\"\" States\"", "4628910", |
| /// ]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
    pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder {
        // Doubled-quote interpretation lives in the csv-core parser.
        self.builder.double_quote(yes);
        self
    }
| |
| /// Enable or disable quoting. |
| /// |
| /// This is enabled by default, but it may be disabled. When disabled, |
| /// quotes are not treated specially. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::ReaderBuilder; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,\"The United States,4628910 |
| /// "; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .quoting(false) |
| /// .from_reader(data.as_bytes()); |
| /// |
| /// if let Some(result) = rdr.records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec![ |
| /// "Boston", "\"The United States", "4628910", |
| /// ]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
    pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder {
        // When disabled, csv-core treats quote bytes as ordinary field data.
        self.builder.quoting(yes);
        self
    }
| |
| /// The comment character to use when parsing CSV. |
| /// |
| /// If the start of a record begins with the byte given here, then that |
| /// line is ignored by the CSV parser. |
| /// |
| /// This is disabled by default. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::ReaderBuilder; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// #Concord,United States,42695 |
| /// Boston,United States,4628910 |
| /// "; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .comment(Some(b'#')) |
| /// .from_reader(data.as_bytes()); |
| /// |
| /// if let Some(result) = rdr.records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
    pub fn comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder {
        // `None` (the default) disables comment-line skipping.
        self.builder.comment(comment);
        self
    }
| |
| /// A convenience method for specifying a configuration to read ASCII |
| /// delimited text. |
| /// |
| /// This sets the delimiter and record terminator to the ASCII unit |
| /// separator (`\x1F`) and record separator (`\x1E`), respectively. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::ReaderBuilder; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city\x1Fcountry\x1Fpop\x1EBoston\x1FUnited States\x1F4628910"; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .ascii() |
| /// .from_reader(data.as_bytes()); |
| /// |
| /// if let Some(result) = rdr.records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
    pub fn ascii(&mut self) -> &mut ReaderBuilder {
        // Sets both the delimiter (\x1F) and terminator (\x1E) in csv-core.
        self.builder.ascii();
        self
    }
| |
| /// Set the capacity (in bytes) of the buffer used in the CSV reader. |
| /// This defaults to a reasonable setting. |
    pub fn buffer_capacity(&mut self, capacity: usize) -> &mut ReaderBuilder {
        // Used as the `BufReader` capacity when the reader is built.
        self.capacity = capacity;
        self
    }
| |
| /// Enable or disable the NFA for parsing CSV. |
| /// |
| /// This is intended to be a debug option. The NFA is always slower than |
| /// the DFA. |
    #[doc(hidden)]
    pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder {
        // Debug-only knob: forces csv-core's slower NFA engine over the DFA.
        self.builder.nfa(yes);
        self
    }
| } |
| |
/// An already configured CSV reader.
| /// |
| /// A CSV reader takes as input CSV data and transforms that into standard Rust |
| /// values. The most flexible way to read CSV data is as a sequence of records, |
| /// where a record is a sequence of fields and each field is a string. However, |
| /// a reader can also deserialize CSV data into Rust types like `i64` or |
| /// `(String, f64, f64, f64)` or even a custom struct automatically using |
| /// Serde. |
| /// |
| /// # Configuration |
| /// |
| /// A CSV reader has a couple convenient constructor methods like `from_path` |
| /// and `from_reader`. However, if you want to configure the CSV reader to use |
| /// a different delimiter or quote character (among many other things), then |
| /// you should use a [`ReaderBuilder`](struct.ReaderBuilder.html) to construct |
| /// a `Reader`. For example, to change the field delimiter: |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::ReaderBuilder; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city;country;pop |
| /// Boston;United States;4628910 |
| /// "; |
| /// let mut rdr = ReaderBuilder::new() |
| /// .delimiter(b';') |
| /// .from_reader(data.as_bytes()); |
| /// |
| /// if let Some(result) = rdr.records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
| /// |
| /// # Error handling |
| /// |
| /// In general, CSV *parsing* does not ever return an error. That is, there is |
| /// no such thing as malformed CSV data. Instead, this reader will prioritize |
| /// finding a parse over rejecting CSV data that it does not understand. This |
| /// choice was inspired by other popular CSV parsers, but also because it is |
| /// pragmatic. CSV data varies wildly, so even if the CSV data is malformed, |
| /// it might still be possible to work with the data. In the land of CSV, there |
| /// is no "right" or "wrong," only "right" and "less right." |
| /// |
| /// With that said, a number of errors can occur while reading CSV data: |
| /// |
| /// * By default, all records in CSV data must have the same number of fields. |
| /// If a record is found with a different number of fields than a prior |
| /// record, then an error is returned. This behavior can be disabled by |
| /// enabling flexible parsing via the `flexible` method on |
| /// [`ReaderBuilder`](struct.ReaderBuilder.html). |
| /// * When reading CSV data from a resource (like a file), it is possible for |
| /// reading from the underlying resource to fail. This will return an error. |
| /// For subsequent calls to the `Reader` after encountering a such error |
| /// (unless `seek` is used), it will behave as if end of file had been |
| /// reached, in order to avoid running into infinite loops when still |
| /// attempting to read the next record when one has errored. |
| /// * When reading CSV data into `String` or `&str` fields (e.g., via a |
| /// [`StringRecord`](struct.StringRecord.html)), UTF-8 is strictly |
| /// enforced. If CSV data is invalid UTF-8, then an error is returned. If |
| /// you want to read invalid UTF-8, then you should use the byte oriented |
| /// APIs such as [`ByteRecord`](struct.ByteRecord.html). If you need explicit |
| /// support for another encoding entirely, then you'll need to use another |
| /// crate to transcode your CSV data to UTF-8 before parsing it. |
| /// * When using Serde to deserialize CSV data into Rust types, it is possible |
| /// for a number of additional errors to occur. For example, deserializing |
| /// a field `xyz` into an `i32` field will result in an error. |
| /// |
| /// For more details on the precise semantics of errors, see the |
| /// [`Error`](enum.Error.html) type. |
#[derive(Debug)]
pub struct Reader<R> {
    /// The underlying CSV parser.
    ///
    /// We explicitly put this on the heap because CoreReader embeds an entire
    /// DFA transition table, which along with other things, tallies up to
    /// almost 500 bytes on the stack.
    core: Box<CoreReader>,
    /// The underlying reader, wrapped in a buffer sized by the builder's
    /// `buffer_capacity` setting.
    rdr: io::BufReader<R>,
    /// Various state tracking.
    ///
    /// There is more state embedded in the `CoreReader`.
    state: ReaderState,
}
| |
#[derive(Debug)]
struct ReaderState {
    /// When set, this contains the first row of any parsed CSV data.
    ///
    /// This is always populated, regardless of whether `has_headers` is set.
    headers: Option<Headers>,
    /// When set, the first row of parsed CSV data is excluded from things
    /// that read records, like iterators and `read_record`.
    has_headers: bool,
    /// When set, there is no restriction on the length of records. When not
    /// set, every record must have the same number of fields, or else an error
    /// is reported.
    flexible: bool,
    /// Whitespace trimming behavior for header and/or field values, copied
    /// from the builder.
    trim: Trim,
    /// The number of fields in the first record parsed.
    first_field_count: Option<u64>,
    /// The current position of the parser.
    ///
    /// Note that this position is only observable by callers at the start
    /// of a record. More granular positions are not supported.
    cur_pos: Position,
    /// Whether the first record has been read or not.
    first: bool,
    /// Whether the reader has been seeked or not.
    seeked: bool,
    /// Whether EOF of the underlying reader has been reached or not.
    ///
    /// IO errors on the underlying reader will be considered as an EOF for
    /// subsequent read attempts, as it would be incorrect to keep on trying
    /// to read when the underlying reader has broken.
    ///
    /// For clarity, having the best `Debug` impl and in case they need to be
    /// treated differently at some point, we store whether the `EOF` is
    /// considered because an actual EOF happened, or because we encountered
    /// an IO error.
    /// This has no additional runtime cost.
    eof: ReaderEofState,
}
| |
/// Whether EOF of the underlying reader has been reached or not.
///
/// IO errors on the underlying reader will be considered as an EOF for
/// subsequent read attempts, as it would be incorrect to keep on trying
/// to read when the underlying reader has broken.
///
/// For clarity, having the best `Debug` impl and in case they need to be
/// treated differently at some point, we store whether the `EOF` is
/// considered because an actual EOF happened, or because we encountered
/// an IO error.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReaderEofState {
    /// The underlying reader may still yield more data.
    NotEof,
    /// The underlying reader reported an actual end of file.
    Eof,
    /// The underlying reader returned an IO error; treated as EOF thereafter.
    IOError,
}
| |
/// Headers encapsulates any data associated with the headers of CSV data.
///
/// The headers always correspond to the first row.
#[derive(Debug)]
struct Headers {
    /// The header, as raw bytes.
    byte_record: ByteRecord,
    /// The header, as valid UTF-8 (or a UTF-8 error). The UTF-8 check is
    /// deferred to the `Err` value so byte-oriented callers are unaffected.
    string_record: result::Result<StringRecord, Utf8Error>,
}
| |
| impl Reader<Reader<File>> { |
| /// Create a new CSV parser with a default configuration for the given |
| /// file path. |
| /// |
| /// To customize CSV parsing, use a `ReaderBuilder`. |
| /// |
| /// # Example |
| /// |
| /// ```no_run |
| /// use std::error::Error; |
| /// use csv::Reader; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let mut rdr = Reader::from_path("foo.csv")?; |
| /// for result in rdr.records() { |
| /// let record = result?; |
| /// println!("{:?}", record); |
| /// } |
| /// Ok(()) |
| /// } |
| /// ``` |
| pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Reader<File>> { |
| ReaderBuilder::new().from_path(path) |
| } |
| } |
| |
| impl<R: io::Read> Reader<R> { |
| /// Create a new CSV reader given a builder and a source of underlying |
| /// bytes. |
| fn new(builder: &ReaderBuilder, rdr: R) -> Reader<R> { |
| Reader { |
| core: Box::new(builder.builder.build()), |
| rdr: io::BufReader::with_capacity(builder.capacity, rdr), |
| state: ReaderState { |
| headers: None, |
| has_headers: builder.has_headers, |
| flexible: builder.flexible, |
| trim: builder.trim, |
| first_field_count: None, |
| cur_pos: Position::new(), |
| first: false, |
| seeked: false, |
| eof: ReaderEofState::NotEof, |
| }, |
| } |
| } |
| |
| /// Create a new CSV parser with a default configuration for the given |
| /// reader. |
| /// |
| /// To customize CSV parsing, use a `ReaderBuilder`. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::Reader; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// Concord,United States,42695 |
| /// "; |
| /// let mut rdr = Reader::from_reader(data.as_bytes()); |
| /// for result in rdr.records() { |
| /// let record = result?; |
| /// println!("{:?}", record); |
| /// } |
| /// Ok(()) |
| /// } |
| /// ``` |
    pub fn from_reader(rdr: R) -> Reader<R> {
        // Uses the default configuration; construct via `ReaderBuilder` to
        // customize delimiter, headers, quoting, etc.
        ReaderBuilder::new().from_reader(rdr)
    }
| |
| /// Returns a borrowed iterator over deserialized records. |
| /// |
| /// Each item yielded by this iterator is a `Result<D, Error>`. |
| /// Therefore, in order to access the record, callers must handle the |
| /// possibility of error (typically with `try!` or `?`). |
| /// |
| /// If `has_headers` was enabled via a `ReaderBuilder` (which is the |
| /// default), then this does not include the first record. Additionally, |
| /// if `has_headers` is enabled, then deserializing into a struct will |
| /// automatically align the values in each row to the fields of a struct |
| /// based on the header row. |
| /// |
| /// # Example |
| /// |
| /// This shows how to deserialize CSV data into normal Rust structs. The |
| /// fields of the header row are used to match up the values in each row |
| /// to the fields of the struct. |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// |
| /// #[derive(Debug, serde::Deserialize, Eq, PartialEq)] |
| /// struct Row { |
| /// city: String, |
| /// country: String, |
| /// #[serde(rename = "popcount")] |
| /// population: u64, |
| /// } |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,popcount |
| /// Boston,United States,4628910 |
| /// "; |
| /// let mut rdr = csv::Reader::from_reader(data.as_bytes()); |
| /// let mut iter = rdr.deserialize(); |
| /// |
| /// if let Some(result) = iter.next() { |
| /// let record: Row = result?; |
| /// assert_eq!(record, Row { |
| /// city: "Boston".to_string(), |
| /// country: "United States".to_string(), |
| /// population: 4628910, |
| /// }); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
| /// |
| /// # Rules |
| /// |
| /// For the most part, any Rust type that maps straight-forwardly to a CSV |
| /// record is supported. This includes maps, structs, tuples and tuple |
| /// structs. Other Rust types, such as `Vec`s, arrays, and enums have |
| /// a more complicated story. In general, when working with CSV data, one |
| /// should avoid *nested sequences* as much as possible. |
| /// |
| /// Maps, structs, tuples and tuple structs map to CSV records in a simple |
| /// way. Tuples and tuple structs decode their fields in the order that |
| /// they are defined. Structs will do the same only if `has_headers` has |
| /// been disabled using [`ReaderBuilder`](struct.ReaderBuilder.html), |
| /// otherwise, structs and maps are deserialized based on the fields |
| /// defined in the header row. (If there is no header row, then |
| /// deserializing into a map will result in an error.) |
| /// |
| /// Nested sequences are supported in a limited capacity. Namely, they |
| /// are flattened. As a result, it's often useful to use a `Vec` to capture |
| /// a "tail" of fields in a record: |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// |
| /// #[derive(Debug, serde::Deserialize, Eq, PartialEq)] |
| /// struct Row { |
| /// label: String, |
| /// values: Vec<i32>, |
| /// } |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "foo,1,2,3"; |
| /// let mut rdr = csv::ReaderBuilder::new() |
| /// .has_headers(false) |
| /// .from_reader(data.as_bytes()); |
| /// let mut iter = rdr.deserialize(); |
| /// |
| /// if let Some(result) = iter.next() { |
| /// let record: Row = result?; |
| /// assert_eq!(record, Row { |
| /// label: "foo".to_string(), |
| /// values: vec![1, 2, 3], |
| /// }); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
| /// |
| /// In the above example, adding another field to the `Row` struct after |
| /// the `values` field will result in a deserialization error. This is |
| /// because the deserializer doesn't know when to stop reading fields |
| /// into the `values` vector, so it will consume the rest of the fields in |
| /// the record leaving none left over for the additional field. |
| /// |
| /// Finally, simple enums in Rust can be deserialized as well. Namely, |
| /// enums must either be variants with no arguments or variants with a |
| /// single argument. Variants with no arguments are deserialized based on |
| /// which variant name the field matches. Variants with a single argument |
| /// are deserialized based on which variant can store the data. The latter |
| /// is only supported when using "untagged" enum deserialization. The |
| /// following example shows both forms in action: |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// |
| /// #[derive(Debug, serde::Deserialize, PartialEq)] |
| /// struct Row { |
| /// label: Label, |
| /// value: Number, |
| /// } |
| /// |
| /// #[derive(Debug, serde::Deserialize, PartialEq)] |
| /// #[serde(rename_all = "lowercase")] |
| /// enum Label { |
| /// Celsius, |
| /// Fahrenheit, |
| /// } |
| /// |
| /// #[derive(Debug, serde::Deserialize, PartialEq)] |
| /// #[serde(untagged)] |
| /// enum Number { |
| /// Integer(i64), |
| /// Float(f64), |
| /// } |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// label,value |
| /// celsius,22.2222 |
| /// fahrenheit,72 |
| /// "; |
| /// let mut rdr = csv::Reader::from_reader(data.as_bytes()); |
| /// let mut iter = rdr.deserialize(); |
| /// |
| /// // Read the first record. |
| /// if let Some(result) = iter.next() { |
| /// let record: Row = result?; |
| /// assert_eq!(record, Row { |
| /// label: Label::Celsius, |
| /// value: Number::Float(22.2222), |
| /// }); |
| /// } else { |
| /// return Err(From::from( |
| /// "expected at least two records but got none")); |
| /// } |
| /// |
| /// // Read the second record. |
| /// if let Some(result) = iter.next() { |
| /// let record: Row = result?; |
| /// assert_eq!(record, Row { |
| /// label: Label::Fahrenheit, |
| /// value: Number::Integer(72), |
| /// }); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from( |
| /// "expected at least two records but got only one")) |
| /// } |
| /// } |
| /// ``` |
pub fn deserialize<D>(&mut self) -> DeserializeRecordsIter<R, D>
where
    D: DeserializeOwned,
{
    // Borrows this reader mutably for the iterator's lifetime; use
    // `into_deserialize` when an owned iterator is needed.
    DeserializeRecordsIter::new(self)
}
| |
| /// Returns an owned iterator over deserialized records. |
| /// |
| /// Each item yielded by this iterator is a `Result<D, Error>`. |
| /// Therefore, in order to access the record, callers must handle the |
| /// possibility of error (typically with `try!` or `?`). |
| /// |
| /// This is mostly useful when you want to return a CSV iterator or store |
| /// it somewhere. |
| /// |
| /// If `has_headers` was enabled via a `ReaderBuilder` (which is the |
| /// default), then this does not include the first record. Additionally, |
| /// if `has_headers` is enabled, then deserializing into a struct will |
| /// automatically align the values in each row to the fields of a struct |
| /// based on the header row. |
| /// |
| /// For more detailed deserialization rules, see the documentation on the |
| /// `deserialize` method. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// |
| /// #[derive(Debug, serde::Deserialize, Eq, PartialEq)] |
| /// struct Row { |
| /// city: String, |
| /// country: String, |
| /// #[serde(rename = "popcount")] |
| /// population: u64, |
| /// } |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,popcount |
| /// Boston,United States,4628910 |
| /// "; |
| /// let rdr = csv::Reader::from_reader(data.as_bytes()); |
| /// let mut iter = rdr.into_deserialize(); |
| /// |
| /// if let Some(result) = iter.next() { |
| /// let record: Row = result?; |
| /// assert_eq!(record, Row { |
| /// city: "Boston".to_string(), |
| /// country: "United States".to_string(), |
| /// population: 4628910, |
| /// }); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
pub fn into_deserialize<D>(self) -> DeserializeRecordsIntoIter<R, D>
where
    D: DeserializeOwned,
{
    // Consumes the reader so the iterator can be returned or stored
    // without borrowing.
    DeserializeRecordsIntoIter::new(self)
}
| |
| /// Returns a borrowed iterator over all records as strings. |
| /// |
| /// Each item yielded by this iterator is a `Result<StringRecord, Error>`. |
| /// Therefore, in order to access the record, callers must handle the |
| /// possibility of error (typically with `try!` or `?`). |
| /// |
| /// If `has_headers` was enabled via a `ReaderBuilder` (which is the |
| /// default), then this does not include the first record. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::Reader; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// "; |
| /// let mut rdr = Reader::from_reader(data.as_bytes()); |
| /// let mut iter = rdr.records(); |
| /// |
| /// if let Some(result) = iter.next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
pub fn records(&mut self) -> StringRecordsIter<R> {
    // Each yielded record is validated as UTF-8; use `byte_records` to
    // avoid that check.
    StringRecordsIter::new(self)
}
| |
| /// Returns an owned iterator over all records as strings. |
| /// |
| /// Each item yielded by this iterator is a `Result<StringRecord, Error>`. |
| /// Therefore, in order to access the record, callers must handle the |
| /// possibility of error (typically with `try!` or `?`). |
| /// |
| /// This is mostly useful when you want to return a CSV iterator or store |
| /// it somewhere. |
| /// |
| /// If `has_headers` was enabled via a `ReaderBuilder` (which is the |
| /// default), then this does not include the first record. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::Reader; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// "; |
| /// let rdr = Reader::from_reader(data.as_bytes()); |
| /// let mut iter = rdr.into_records(); |
| /// |
| /// if let Some(result) = iter.next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
pub fn into_records(self) -> StringRecordsIntoIter<R> {
    // Consumes the reader; see `records` for the borrowed equivalent.
    StringRecordsIntoIter::new(self)
}
| |
| /// Returns a borrowed iterator over all records as raw bytes. |
| /// |
| /// Each item yielded by this iterator is a `Result<ByteRecord, Error>`. |
| /// Therefore, in order to access the record, callers must handle the |
| /// possibility of error (typically with `try!` or `?`). |
| /// |
| /// If `has_headers` was enabled via a `ReaderBuilder` (which is the |
| /// default), then this does not include the first record. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::Reader; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// "; |
| /// let mut rdr = Reader::from_reader(data.as_bytes()); |
| /// let mut iter = rdr.byte_records(); |
| /// |
| /// if let Some(result) = iter.next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
pub fn byte_records(&mut self) -> ByteRecordsIter<R> {
    // Yields raw bytes without UTF-8 validation.
    ByteRecordsIter::new(self)
}
| |
| /// Returns an owned iterator over all records as raw bytes. |
| /// |
| /// Each item yielded by this iterator is a `Result<ByteRecord, Error>`. |
| /// Therefore, in order to access the record, callers must handle the |
| /// possibility of error (typically with `try!` or `?`). |
| /// |
| /// This is mostly useful when you want to return a CSV iterator or store |
| /// it somewhere. |
| /// |
| /// If `has_headers` was enabled via a `ReaderBuilder` (which is the |
| /// default), then this does not include the first record. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::Reader; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// "; |
| /// let rdr = Reader::from_reader(data.as_bytes()); |
| /// let mut iter = rdr.into_byte_records(); |
| /// |
| /// if let Some(result) = iter.next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
pub fn into_byte_records(self) -> ByteRecordsIntoIter<R> {
    // Consumes the reader; see `byte_records` for the borrowed
    // equivalent.
    ByteRecordsIntoIter::new(self)
}
| |
| /// Returns a reference to the first row read by this parser. |
| /// |
| /// If no row has been read yet, then this will force parsing of the first |
| /// row. |
| /// |
| /// If there was a problem parsing the row or if it wasn't valid UTF-8, |
| /// then this returns an error. |
| /// |
| /// If the underlying reader emits EOF before any data, then this returns |
| /// an empty record. |
| /// |
| /// Note that this method may be used regardless of whether `has_headers` |
| /// was enabled (but it is enabled by default). |
| /// |
| /// # Example |
| /// |
| /// This example shows how to get the header row of CSV data. Notice that |
| /// the header row does not appear as a record in the iterator! |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::Reader; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// "; |
| /// let mut rdr = Reader::from_reader(data.as_bytes()); |
| /// |
| /// // We can read the headers before iterating. |
| /// { |
| /// // `headers` borrows from the reader, so we put this in its |
| /// // own scope. That way, the borrow ends before we try iterating |
| /// // below. Alternatively, we could clone the headers. |
| /// let headers = rdr.headers()?; |
| /// assert_eq!(headers, vec!["city", "country", "pop"]); |
| /// } |
| /// |
| /// if let Some(result) = rdr.records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// } else { |
| /// return Err(From::from( |
| /// "expected at least one record but got none")) |
| /// } |
| /// |
| /// // We can also read the headers after iterating. |
| /// let headers = rdr.headers()?; |
| /// assert_eq!(headers, vec!["city", "country", "pop"]); |
| /// Ok(()) |
| /// } |
| /// ``` |
| pub fn headers(&mut self) -> Result<&StringRecord> { |
| if self.state.headers.is_none() { |
| let mut record = ByteRecord::new(); |
| self.read_byte_record_impl(&mut record)?; |
| self.set_headers_impl(Err(record)); |
| } |
| let headers = self.state.headers.as_ref().unwrap(); |
| match headers.string_record { |
| Ok(ref record) => Ok(record), |
| Err(ref err) => Err(Error::new(ErrorKind::Utf8 { |
| pos: headers.byte_record.position().map(Clone::clone), |
| err: err.clone(), |
| })), |
| } |
| } |
| |
| /// Returns a reference to the first row read by this parser as raw bytes. |
| /// |
| /// If no row has been read yet, then this will force parsing of the first |
| /// row. |
| /// |
| /// If there was a problem parsing the row then this returns an error. |
| /// |
| /// If the underlying reader emits EOF before any data, then this returns |
| /// an empty record. |
| /// |
| /// Note that this method may be used regardless of whether `has_headers` |
| /// was enabled (but it is enabled by default). |
| /// |
| /// # Example |
| /// |
| /// This example shows how to get the header row of CSV data. Notice that |
| /// the header row does not appear as a record in the iterator! |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::Reader; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// "; |
| /// let mut rdr = Reader::from_reader(data.as_bytes()); |
| /// |
| /// // We can read the headers before iterating. |
| /// { |
| /// // `headers` borrows from the reader, so we put this in its |
| /// // own scope. That way, the borrow ends before we try iterating |
| /// // below. Alternatively, we could clone the headers. |
| /// let headers = rdr.byte_headers()?; |
| /// assert_eq!(headers, vec!["city", "country", "pop"]); |
| /// } |
| /// |
| /// if let Some(result) = rdr.byte_records().next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// } else { |
| /// return Err(From::from( |
| /// "expected at least one record but got none")) |
| /// } |
| /// |
| /// // We can also read the headers after iterating. |
| /// let headers = rdr.byte_headers()?; |
| /// assert_eq!(headers, vec!["city", "country", "pop"]); |
| /// Ok(()) |
| /// } |
| /// ``` |
| pub fn byte_headers(&mut self) -> Result<&ByteRecord> { |
| if self.state.headers.is_none() { |
| let mut record = ByteRecord::new(); |
| self.read_byte_record_impl(&mut record)?; |
| self.set_headers_impl(Err(record)); |
| } |
| Ok(&self.state.headers.as_ref().unwrap().byte_record) |
| } |
| |
| /// Set the headers of this CSV parser manually. |
| /// |
| /// This overrides any other setting (including `set_byte_headers`). Any |
| /// automatic detection of headers is disabled. This may be called at any |
| /// time. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::{Reader, StringRecord}; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// "; |
| /// let mut rdr = Reader::from_reader(data.as_bytes()); |
| /// |
| /// assert_eq!(rdr.headers()?, vec!["city", "country", "pop"]); |
| /// rdr.set_headers(StringRecord::from(vec!["a", "b", "c"])); |
| /// assert_eq!(rdr.headers()?, vec!["a", "b", "c"]); |
| /// |
| /// Ok(()) |
| /// } |
| /// ``` |
pub fn set_headers(&mut self, headers: StringRecord) {
    // `Ok` marks the headers as known-valid UTF-8.
    self.set_headers_impl(Ok(headers));
}
| |
| /// Set the headers of this CSV parser manually as raw bytes. |
| /// |
| /// This overrides any other setting (including `set_headers`). Any |
| /// automatic detection of headers is disabled. This may be called at any |
| /// time. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::{Reader, ByteRecord}; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// "; |
| /// let mut rdr = Reader::from_reader(data.as_bytes()); |
| /// |
| /// assert_eq!(rdr.byte_headers()?, vec!["city", "country", "pop"]); |
| /// rdr.set_byte_headers(ByteRecord::from(vec!["a", "b", "c"])); |
| /// assert_eq!(rdr.byte_headers()?, vec!["a", "b", "c"]); |
| /// |
| /// Ok(()) |
| /// } |
| /// ``` |
pub fn set_byte_headers(&mut self, headers: ByteRecord) {
    // `Err` marks the headers as raw bytes; UTF-8 validity is
    // determined inside `set_headers_impl`.
    self.set_headers_impl(Err(headers));
}
| |
/// Cache the given headers in both byte and string form.
///
/// `Ok` carries headers already known to be valid UTF-8, while `Err`
/// carries raw bytes whose UTF-8 validity is determined here. A UTF-8
/// failure is stored (not returned) so `headers()` can report it later.
fn set_headers_impl(
    &mut self,
    headers: result::Result<StringRecord, ByteRecord>,
) {
    // If we have string headers, then get byte headers. But if we have
    // byte headers, then get the string headers (or a UTF-8 error).
    let (mut str_headers, mut byte_headers) = match headers {
        Ok(string) => {
            let bytes = string.clone().into_byte_record();
            (Ok(string), bytes)
        }
        Err(bytes) => {
            match StringRecord::from_byte_record(bytes.clone()) {
                Ok(str_headers) => (Ok(str_headers), bytes),
                // Keep the raw bytes; remember only the UTF-8 error.
                Err(err) => (Err(err.utf8_error().clone()), bytes),
            }
        }
    };
    if self.state.trim.should_trim_headers() {
        // Trim both representations so they stay in sync.
        if let Ok(ref mut str_headers) = str_headers.as_mut() {
            str_headers.trim();
        }
        byte_headers.trim();
    }
    self.state.headers = Some(Headers {
        byte_record: byte_headers,
        string_record: str_headers,
    });
}
| |
| /// Read a single row into the given record. Returns false when no more |
| /// records could be read. |
| /// |
| /// If `has_headers` was enabled via a `ReaderBuilder` (which is the |
| /// default), then this will never read the first record. |
| /// |
/// This method is useful when you want to read records as fast as
/// possible.
| /// caller to reuse the `StringRecord` allocation, which usually results |
| /// in higher throughput. |
| /// |
| /// Records read via this method are guaranteed to have a position set |
| /// on them, even if the reader is at EOF or if an error is returned. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::{Reader, StringRecord}; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// "; |
| /// let mut rdr = Reader::from_reader(data.as_bytes()); |
| /// let mut record = StringRecord::new(); |
| /// |
| /// if rdr.read_record(&mut record)? { |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
| pub fn read_record(&mut self, record: &mut StringRecord) -> Result<bool> { |
| let result = record.read(self); |
| // We need to trim again because trimming string records includes |
| // Unicode whitespace. (ByteRecord trimming only includes ASCII |
| // whitespace.) |
| if self.state.trim.should_trim_fields() { |
| record.trim(); |
| } |
| result |
| } |
| |
| /// Read a single row into the given byte record. Returns false when no |
| /// more records could be read. |
| /// |
| /// If `has_headers` was enabled via a `ReaderBuilder` (which is the |
| /// default), then this will never read the first record. |
| /// |
/// This method is useful when you want to read records as fast as
/// possible.
| /// caller to reuse the `ByteRecord` allocation, which usually results |
| /// in higher throughput. |
| /// |
| /// Records read via this method are guaranteed to have a position set |
| /// on them, even if the reader is at EOF or if an error is returned. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::error::Error; |
| /// use csv::{ByteRecord, Reader}; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,pop |
| /// Boston,United States,4628910 |
| /// "; |
| /// let mut rdr = Reader::from_reader(data.as_bytes()); |
| /// let mut record = ByteRecord::new(); |
| /// |
| /// if rdr.read_byte_record(&mut record)? { |
| /// assert_eq!(record, vec!["Boston", "United States", "4628910"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
pub fn read_byte_record(
    &mut self,
    record: &mut ByteRecord,
) -> Result<bool> {
    // Header bookkeeping is skipped entirely after a `seek`, since the
    // stream position no longer corresponds to the first record.
    if !self.state.seeked && !self.state.has_headers && !self.state.first {
        // If the caller indicated "no headers" and we haven't yielded the
        // first record yet, then we should yield our header row if we have
        // one.
        if let Some(ref headers) = self.state.headers {
            self.state.first = true;
            record.clone_from(&headers.byte_record);
            if self.state.trim.should_trim_fields() {
                record.trim();
            }
            // An empty cached header row means there was no data at all.
            return Ok(!record.is_empty());
        }
    }
    let ok = self.read_byte_record_impl(record)?;
    self.state.first = true;
    if !self.state.seeked && self.state.headers.is_none() {
        // The first record read doubles as the header row, regardless
        // of the `has_headers` setting.
        self.set_headers_impl(Err(record.clone()));
        // If the end user indicated that we have headers, then we should
        // never return the first row. Instead, we should attempt to
        // read and return the next one.
        if self.state.has_headers {
            let result = self.read_byte_record_impl(record);
            if self.state.trim.should_trim_fields() {
                record.trim();
            }
            return result;
        }
    } else if self.state.trim.should_trim_fields() {
        record.trim();
    }
    Ok(ok)
}
| |
/// Read a byte record from the underlying CSV reader, without accounting
/// for headers.
///
/// Returns `Ok(false)` once the reader is exhausted (or a prior I/O
/// error has poisoned it). The record's position is always set, even at
/// EOF or on error.
#[inline(always)]
fn read_byte_record_impl(
    &mut self,
    record: &mut ByteRecord,
) -> Result<bool> {
    use csv_core::ReadRecordResult::*;

    record.clear();
    // Tag the record with the position at which it starts.
    record.set_position(Some(self.state.cur_pos.clone()));
    if self.state.eof != ReaderEofState::NotEof {
        return Ok(false);
    }
    // Running totals of field bytes and field-end offsets written into
    // `record` so far, accumulated across multiple csv-core calls.
    let (mut outlen, mut endlen) = (0, 0);
    loop {
        let (res, nin, nout, nend) = {
            let input_res = self.rdr.fill_buf();
            if input_res.is_err() {
                // Poison the reader before propagating the error, so
                // subsequent calls return `Ok(false)` rather than
                // retrying the failed read.
                self.state.eof = ReaderEofState::IOError;
            }
            let input = input_res?;
            let (fields, ends) = record.as_parts();
            self.core.read_record(
                input,
                &mut fields[outlen..],
                &mut ends[endlen..],
            )
        };
        // Report consumed input to the buffered reader and advance our
        // byte/line position accordingly.
        self.rdr.consume(nin);
        let byte = self.state.cur_pos.byte();
        self.state
            .cur_pos
            .set_byte(byte + nin as u64)
            .set_line(self.core.line());
        outlen += nout;
        endlen += nend;
        match res {
            // More input is needed; refill the buffer and resume.
            InputEmpty => continue,
            // The record's field buffer was too small; grow and resume.
            OutputFull => {
                record.expand_fields();
                continue;
            }
            // The record's ends buffer was too small; grow and resume.
            OutputEndsFull => {
                record.expand_ends();
                continue;
            }
            Record => {
                record.set_len(endlen);
                // Bump the record count and enforce field-count
                // consistency (unless `flexible` is enabled).
                self.state.add_record(record)?;
                return Ok(true);
            }
            End => {
                self.state.eof = ReaderEofState::Eof;
                return Ok(false);
            }
        }
    }
}
| |
| /// Return the current position of this CSV reader. |
| /// |
| /// The byte offset in the position returned can be used to `seek` this |
| /// reader. In particular, seeking to a position returned here on the same |
| /// data will result in parsing the same subsequent record. |
| /// |
| /// # Example: reading the position |
| /// |
| /// ``` |
| /// use std::{error::Error, io}; |
| /// use csv::{Reader, Position}; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,popcount |
| /// Boston,United States,4628910 |
| /// Concord,United States,42695 |
| /// "; |
| /// let rdr = Reader::from_reader(io::Cursor::new(data)); |
| /// let mut iter = rdr.into_records(); |
| /// let mut pos = Position::new(); |
| /// loop { |
| /// // Read the position immediately before each record. |
| /// let next_pos = iter.reader().position().clone(); |
| /// if iter.next().is_none() { |
| /// break; |
| /// } |
| /// pos = next_pos; |
| /// } |
| /// |
| /// // `pos` should now be the position immediately before the last |
| /// // record. |
| /// assert_eq!(pos.byte(), 51); |
| /// assert_eq!(pos.line(), 3); |
| /// assert_eq!(pos.record(), 2); |
| /// Ok(()) |
| /// } |
| /// ``` |
pub fn position(&self) -> &Position {
    // The position of the record that would be read next.
    &self.state.cur_pos
}
| |
| /// Returns true if and only if this reader has been exhausted. |
| /// |
| /// When this returns true, no more records can be read from this reader |
| /// (unless it has been seeked to another position). |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// use std::{error::Error, io}; |
| /// use csv::{Reader, Position}; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,popcount |
| /// Boston,United States,4628910 |
| /// Concord,United States,42695 |
| /// "; |
| /// let mut rdr = Reader::from_reader(io::Cursor::new(data)); |
| /// assert!(!rdr.is_done()); |
| /// for result in rdr.records() { |
| /// let _ = result?; |
| /// } |
| /// assert!(rdr.is_done()); |
| /// Ok(()) |
| /// } |
| /// ``` |
pub fn is_done(&self) -> bool {
    // Both `Eof` and `IOError` count as exhausted.
    self.state.eof != ReaderEofState::NotEof
}
| |
| /// Returns true if and only if this reader has been configured to |
| /// interpret the first record as a header record. |
pub fn has_headers(&self) -> bool {
    // Reflects the `ReaderBuilder` configuration (enabled by default).
    self.state.has_headers
}
| |
| /// Returns a reference to the underlying reader. |
pub fn get_ref(&self) -> &R {
    // `self.rdr` is the internal buffered wrapper around `R`.
    self.rdr.get_ref()
}
| |
| /// Returns a mutable reference to the underlying reader. |
pub fn get_mut(&mut self) -> &mut R {
    // NOTE(review): mutating the underlying reader directly may
    // desynchronize this parser's position tracking; use with care.
    self.rdr.get_mut()
}
| |
| /// Unwraps this CSV reader, returning the underlying reader. |
| /// |
| /// Note that any leftover data inside this reader's internal buffer is |
| /// lost. |
pub fn into_inner(self) -> R {
    // Discards the internal buffer; any unread buffered data is lost.
    self.rdr.into_inner()
}
| } |
| |
| impl<R: io::Read + io::Seek> Reader<R> { |
| /// Seeks the underlying reader to the position given. |
| /// |
| /// This comes with a few caveats: |
| /// |
| /// * Any internal buffer associated with this reader is cleared. |
| /// * If the given position does not correspond to a position immediately |
| /// before the start of a record, then the behavior of this reader is |
| /// unspecified. |
| /// * Any special logic that skips the first record in the CSV reader |
| /// when reading or iterating over records is disabled. |
| /// |
| /// If the given position has a byte offset equivalent to the current |
| /// position, then no seeking is performed. |
| /// |
| /// If the header row has not already been read, then this will attempt |
| /// to read the header row before seeking. Therefore, it is possible that |
| /// this returns an error associated with reading CSV data. |
| /// |
| /// Note that seeking is performed based only on the byte offset in the |
| /// given position. Namely, the record or line numbers in the position may |
| /// be incorrect, but this will cause any future position generated by |
| /// this CSV reader to be similarly incorrect. |
| /// |
| /// # Example: seek to parse a record twice |
| /// |
| /// ``` |
| /// use std::{error::Error, io}; |
| /// use csv::{Reader, Position}; |
| /// |
| /// # fn main() { example().unwrap(); } |
| /// fn example() -> Result<(), Box<dyn Error>> { |
| /// let data = "\ |
| /// city,country,popcount |
| /// Boston,United States,4628910 |
| /// Concord,United States,42695 |
| /// "; |
| /// let rdr = Reader::from_reader(io::Cursor::new(data)); |
| /// let mut iter = rdr.into_records(); |
| /// let mut pos = Position::new(); |
| /// loop { |
| /// // Read the position immediately before each record. |
| /// let next_pos = iter.reader().position().clone(); |
| /// if iter.next().is_none() { |
| /// break; |
| /// } |
| /// pos = next_pos; |
| /// } |
| /// |
| /// // Now seek the reader back to `pos`. This will let us read the |
| /// // last record again. |
| /// iter.reader_mut().seek(pos)?; |
| /// let mut iter = iter.into_reader().into_records(); |
| /// if let Some(result) = iter.next() { |
| /// let record = result?; |
| /// assert_eq!(record, vec!["Concord", "United States", "42695"]); |
| /// Ok(()) |
| /// } else { |
| /// Err(From::from("expected at least one record but got none")) |
| /// } |
| /// } |
| /// ``` |
pub fn seek(&mut self, pos: Position) -> Result<()> {
    // Ensure the header row has been read so header state is fixed
    // before the stream moves.
    self.byte_headers()?;
    // Disables the "skip first record" logic on subsequent reads.
    self.state.seeked = true;
    // Already at the requested byte offset; nothing to do.
    if pos.byte() == self.state.cur_pos.byte() {
        return Ok(());
    }
    self.rdr.seek(io::SeekFrom::Start(pos.byte()))?;
    // Reset parser state and adopt the caller-supplied position.
    self.core.reset();
    self.core.set_line(pos.line());
    self.state.cur_pos = pos;
    // Seeking can make more data readable again.
    self.state.eof = ReaderEofState::NotEof;
    Ok(())
}
| |
| /// This is like `seek`, but provides direct control over how the seeking |
| /// operation is performed via `io::SeekFrom`. |
| /// |
| /// The `pos` position given *should* correspond the position indicated |
| /// by `seek_from`, but there is no requirement. If the `pos` position |
| /// given is incorrect, then the position information returned by this |
| /// reader will be similarly incorrect. |
| /// |
| /// If the header row has not already been read, then this will attempt |
| /// to read the header row before seeking. Therefore, it is possible that |
| /// this returns an error associated with reading CSV data. |
| /// |
| /// Unlike `seek`, this will always cause an actual seek to be performed. |
pub fn seek_raw(
    &mut self,
    seek_from: io::SeekFrom,
    pos: Position,
) -> Result<()> {
    // Ensure the header row has been read before the stream moves.
    self.byte_headers()?;
    // Disables the "skip first record" logic on subsequent reads.
    self.state.seeked = true;
    // Unlike `seek`, always perform the underlying seek, even if `pos`
    // matches the current position.
    self.rdr.seek(seek_from)?;
    // Reset parser state and adopt the caller-supplied position as-is;
    // if it disagrees with `seek_from`, future positions are wrong too.
    self.core.reset();
    self.core.set_line(pos.line());
    self.state.cur_pos = pos;
    self.state.eof = ReaderEofState::NotEof;
    Ok(())
}
| } |
| |
| impl ReaderState { |
| #[inline(always)] |
| fn add_record(&mut self, record: &ByteRecord) -> Result<()> { |
| let i = self.cur_pos.record(); |
| self.cur_pos.set_record(i.checked_add(1).unwrap()); |
| if !self.flexible { |
| match self.first_field_count { |
| None => self.first_field_count = Some(record.len() as u64), |
| Some(expected) => { |
| if record.len() as u64 != expected { |
| return Err(Error::new(ErrorKind::UnequalLengths { |
| pos: record.position().map(Clone::clone), |
| expected_len: expected, |
| len: record.len() as u64, |
| })); |
| } |
| } |
| } |
| } |
| Ok(()) |
| } |
| } |
| |
/// An owned iterator over deserialized records.
///
/// The type parameter `R` refers to the underlying `io::Read` type, and `D`
/// refers to the type that this iterator will deserialize a record into.
pub struct DeserializeRecordsIntoIter<R, D> {
    /// The underlying CSV reader, owned by this iterator.
    rdr: Reader<R>,
    /// Scratch record reused across iterations to avoid reallocating.
    rec: StringRecord,
    /// A copy of the header row (if any) used to map fields by name.
    headers: Option<StringRecord>,
    /// Marks `D` as logically used without storing one.
    _priv: PhantomData<D>,
}
| |
| impl<R: io::Read, D: DeserializeOwned> DeserializeRecordsIntoIter<R, D> { |
| fn new(mut rdr: Reader<R>) -> DeserializeRecordsIntoIter<R, D> { |
| let headers = if !rdr.state.has_headers { |
| None |
| } else { |
| rdr.headers().ok().map(Clone::clone) |
| }; |
| DeserializeRecordsIntoIter { |
| rdr, |
| rec: StringRecord::new(), |
| headers, |
| _priv: PhantomData, |
| } |
| } |
| |
| /// Return a reference to the underlying CSV reader. |
| pub fn reader(&self) -> &Reader<R> { |
| &self.rdr |
| } |
| |
| /// Return a mutable reference to the underlying CSV reader. |
| pub fn reader_mut(&mut self) -> &mut Reader<R> { |
| &mut self.rdr |
| } |
| |
| /// Drop this iterator and return the underlying CSV reader. |
| pub fn into_reader(self) -> Reader<R> { |
| self.rdr |
| } |
| } |
| |
| impl<R: io::Read, D: DeserializeOwned> Iterator |
| for DeserializeRecordsIntoIter<R, D> |
| { |
| type Item = Result<D>; |
| |
| fn next(&mut self) -> Option<Result<D>> { |
| match self.rdr.read_record(&mut self.rec) { |
| Err(err) => Some(Err(err)), |
| Ok(false) => None, |
| Ok(true) => Some(self.rec.deserialize(self.headers.as_ref())), |
| } |
| } |
| } |
| |
/// A borrowed iterator over deserialized records.
///
/// The lifetime parameter `'r` refers to the lifetime of the underlying
/// CSV `Reader`. The type parameter `R` refers to the underlying `io::Read`
/// type, and `D` refers to the type that this iterator will deserialize a
/// record into.
pub struct DeserializeRecordsIter<'r, R: 'r, D> {
    /// The underlying CSV reader, borrowed for `'r`.
    rdr: &'r mut Reader<R>,
    /// Scratch record reused on every iteration to avoid reallocating.
    rec: StringRecord,
    /// The header row (if header parsing is enabled), used to map field
    /// names during deserialization.
    headers: Option<StringRecord>,
    /// Marks the deserialization target type without storing a `D`.
    _priv: PhantomData<D>,
}
| |
| impl<'r, R: io::Read, D: DeserializeOwned> DeserializeRecordsIter<'r, R, D> { |
| fn new(rdr: &'r mut Reader<R>) -> DeserializeRecordsIter<'r, R, D> { |
| let headers = if !rdr.state.has_headers { |
| None |
| } else { |
| rdr.headers().ok().map(Clone::clone) |
| }; |
| DeserializeRecordsIter { |
| rdr, |
| rec: StringRecord::new(), |
| headers, |
| _priv: PhantomData, |
| } |
| } |
| |
| /// Return a reference to the underlying CSV reader. |
| pub fn reader(&self) -> &Reader<R> { |
| &self.rdr |
| } |
| |
| /// Return a mutable reference to the underlying CSV reader. |
| pub fn reader_mut(&mut self) -> &mut Reader<R> { |
| &mut self.rdr |
| } |
| } |
| |
| impl<'r, R: io::Read, D: DeserializeOwned> Iterator |
| for DeserializeRecordsIter<'r, R, D> |
| { |
| type Item = Result<D>; |
| |
| fn next(&mut self) -> Option<Result<D>> { |
| match self.rdr.read_record(&mut self.rec) { |
| Err(err) => Some(Err(err)), |
| Ok(false) => None, |
| Ok(true) => Some(self.rec.deserialize(self.headers.as_ref())), |
| } |
| } |
| } |
| |
/// An owned iterator over records as strings.
pub struct StringRecordsIntoIter<R> {
    /// The underlying CSV reader, owned by this iterator.
    rdr: Reader<R>,
    /// Scratch record reused on every iteration to avoid reallocating.
    rec: StringRecord,
}
| |
| impl<R: io::Read> StringRecordsIntoIter<R> { |
| fn new(rdr: Reader<R>) -> StringRecordsIntoIter<R> { |
| StringRecordsIntoIter { rdr, rec: StringRecord::new() } |
| } |
| |
| /// Return a reference to the underlying CSV reader. |
| pub fn reader(&self) -> &Reader<R> { |
| &self.rdr |
| } |
| |
| /// Return a mutable reference to the underlying CSV reader. |
| pub fn reader_mut(&mut self) -> &mut Reader<R> { |
| &mut self.rdr |
| } |
| |
| /// Drop this iterator and return the underlying CSV reader. |
| pub fn into_reader(self) -> Reader<R> { |
| self.rdr |
| } |
| } |
| |
| impl<R: io::Read> Iterator for StringRecordsIntoIter<R> { |
| type Item = Result<StringRecord>; |
| |
| fn next(&mut self) -> Option<Result<StringRecord>> { |
| match self.rdr.read_record(&mut self.rec) { |
| Err(err) => Some(Err(err)), |
| Ok(true) => Some(Ok(self.rec.clone_truncated())), |
| Ok(false) => None, |
| } |
| } |
| } |
| |
/// A borrowed iterator over records as strings.
///
/// The lifetime parameter `'r` refers to the lifetime of the underlying
/// CSV `Reader`.
pub struct StringRecordsIter<'r, R: 'r> {
    /// The underlying CSV reader, borrowed for `'r`.
    rdr: &'r mut Reader<R>,
    /// Scratch record reused on every iteration to avoid reallocating.
    rec: StringRecord,
}
| |
| impl<'r, R: io::Read> StringRecordsIter<'r, R> { |
| fn new(rdr: &'r mut Reader<R>) -> StringRecordsIter<'r, R> { |
| StringRecordsIter { rdr, rec: StringRecord::new() } |
| } |
| |
| /// Return a reference to the underlying CSV reader. |
| pub fn reader(&self) -> &Reader<R> { |
| &self.rdr |
| } |
| |
| /// Return a mutable reference to the underlying CSV reader. |
| pub fn reader_mut(&mut self) -> &mut Reader<R> { |
| &mut self.rdr |
| } |
| } |
| |
| impl<'r, R: io::Read> Iterator for StringRecordsIter<'r, R> { |
| type Item = Result<StringRecord>; |
| |
| fn next(&mut self) -> Option<Result<StringRecord>> { |
| match self.rdr.read_record(&mut self.rec) { |
| Err(err) => Some(Err(err)), |
| Ok(true) => Some(Ok(self.rec.clone_truncated())), |
| Ok(false) => None, |
| } |
| } |
| } |
| |
/// An owned iterator over records as raw bytes.
pub struct ByteRecordsIntoIter<R> {
    /// The underlying CSV reader, owned by this iterator.
    rdr: Reader<R>,
    /// Scratch record reused on every iteration to avoid reallocating.
    rec: ByteRecord,
}
| |
| impl<R: io::Read> ByteRecordsIntoIter<R> { |
| fn new(rdr: Reader<R>) -> ByteRecordsIntoIter<R> { |
| ByteRecordsIntoIter { rdr, rec: ByteRecord::new() } |
| } |
| |
| /// Return a reference to the underlying CSV reader. |
| pub fn reader(&self) -> &Reader<R> { |
| &self.rdr |
| } |
| |
| /// Return a mutable reference to the underlying CSV reader. |
| pub fn reader_mut(&mut self) -> &mut Reader<R> { |
| &mut self.rdr |
| } |
| |
| /// Drop this iterator and return the underlying CSV reader. |
| pub fn into_reader(self) -> Reader<R> { |
| self.rdr |
| } |
| } |
| |
| impl<R: io::Read> Iterator for ByteRecordsIntoIter<R> { |
| type Item = Result<ByteRecord>; |
| |
| fn next(&mut self) -> Option<Result<ByteRecord>> { |
| match self.rdr.read_byte_record(&mut self.rec) { |
| Err(err) => Some(Err(err)), |
| Ok(true) => Some(Ok(self.rec.clone_truncated())), |
| Ok(false) => None, |
| } |
| } |
| } |
| |
/// A borrowed iterator over records as raw bytes.
///
/// The lifetime parameter `'r` refers to the lifetime of the underlying
/// CSV `Reader`.
pub struct ByteRecordsIter<'r, R: 'r> {
    /// The underlying CSV reader, borrowed for `'r`.
    rdr: &'r mut Reader<R>,
    /// Scratch record reused on every iteration to avoid reallocating.
    rec: ByteRecord,
}
| |
| impl<'r, R: io::Read> ByteRecordsIter<'r, R> { |
| fn new(rdr: &'r mut Reader<R>) -> ByteRecordsIter<'r, R> { |
| ByteRecordsIter { rdr, rec: ByteRecord::new() } |
| } |
| |
| /// Return a reference to the underlying CSV reader. |
| pub fn reader(&self) -> &Reader<R> { |
| &self.rdr |
| } |
| |
| /// Return a mutable reference to the underlying CSV reader. |
| pub fn reader_mut(&mut self) -> &mut Reader<R> { |
| &mut self.rdr |
| } |
| } |
| |
| impl<'r, R: io::Read> Iterator for ByteRecordsIter<'r, R> { |
| type Item = Result<ByteRecord>; |
| |
| fn next(&mut self) -> Option<Result<ByteRecord>> { |
| match self.rdr.read_byte_record(&mut self.rec) { |
| Err(err) => Some(Err(err)), |
| Ok(true) => Some(Ok(self.rec.clone_truncated())), |
| Ok(false) => None, |
| } |
| } |
| } |
| |
#[cfg(test)]
mod tests {
    use std::io;

    use crate::{
        byte_record::ByteRecord, error::ErrorKind, string_record::StringRecord,
    };

    use super::{Position, ReaderBuilder, Trim};

    /// Shorthand: view a `&str` as raw bytes.
    fn b(s: &str) -> &[u8] {
        s.as_bytes()
    }
    /// Shorthand: view raw bytes as a `&str`, panicking on invalid UTF-8.
    fn s(b: &[u8]) -> &str {
        ::std::str::from_utf8(b).unwrap()
    }

    /// Build a `Position` with the given byte offset, line and record index.
    fn newpos(byte: u64, line: u64, record: u64) -> Position {
        let mut p = Position::new();
        p.set_byte(byte).set_line(line).set_record(record);
        p
    }

    #[test]
    fn read_byte_record() {
        let data = b("foo,\"b,ar\",baz\nabc,mno,xyz");
        let mut rdr =
            ReaderBuilder::new().has_headers(false).from_reader(data);
        let mut rec = ByteRecord::new();

        assert!(rdr.read_byte_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("foo", s(&rec[0]));
        assert_eq!("b,ar", s(&rec[1]));
        assert_eq!("baz", s(&rec[2]));

        assert!(rdr.read_byte_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("abc", s(&rec[0]));
        assert_eq!("mno", s(&rec[1]));
        assert_eq!("xyz", s(&rec[2]));

        assert!(!rdr.read_byte_record(&mut rec).unwrap());
    }

    #[test]
    fn read_trimmed_records_and_headers() {
        let data = b("foo, bar,\tbaz\n  1,  2,  3\n1\t,\t,3\t\t");
        let mut rdr = ReaderBuilder::new()
            .has_headers(true)
            .trim(Trim::All)
            .from_reader(data);
        let mut rec = ByteRecord::new();
        assert!(rdr.read_byte_record(&mut rec).unwrap());
        assert_eq!("1", s(&rec[0]));
        assert_eq!("2", s(&rec[1]));
        assert_eq!("3", s(&rec[2]));
        let mut rec = StringRecord::new();
        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!("1", &rec[0]);
        assert_eq!("", &rec[1]);
        assert_eq!("3", &rec[2]);
        {
            let headers = rdr.headers().unwrap();
            assert_eq!(3, headers.len());
            assert_eq!("foo", &headers[0]);
            assert_eq!("bar", &headers[1]);
            assert_eq!("baz", &headers[2]);
        }
    }

    #[test]
    fn read_trimmed_header() {
        let data = b("foo, bar,\tbaz\n  1,  2,  3\n1\t,\t,3\t\t");
        let mut rdr = ReaderBuilder::new()
            .has_headers(true)
            .trim(Trim::Headers)
            .from_reader(data);
        let mut rec = ByteRecord::new();
        assert!(rdr.read_byte_record(&mut rec).unwrap());
        assert_eq!("  1", s(&rec[0]));
        assert_eq!("  2", s(&rec[1]));
        assert_eq!("  3", s(&rec[2]));
        {
            let headers = rdr.headers().unwrap();
            assert_eq!(3, headers.len());
            assert_eq!("foo", &headers[0]);
            assert_eq!("bar", &headers[1]);
            assert_eq!("baz", &headers[2]);
        }
    }

    #[test]
    fn read_trimmed_header_invalid_utf8() {
        let data = &b"foo,  b\xFFar,\tbaz\na,b,c\nd,e,f"[..];
        let mut rdr = ReaderBuilder::new()
            .has_headers(true)
            .trim(Trim::Headers)
            .from_reader(data);
        let mut rec = StringRecord::new();

        // Force the headers to be read.
        let _ = rdr.read_record(&mut rec);
        // Check the byte headers are trimmed.
        {
            let headers = rdr.byte_headers().unwrap();
            assert_eq!(3, headers.len());
            assert_eq!(b"foo", &headers[0]);
            assert_eq!(b"b\xFFar", &headers[1]);
            assert_eq!(b"baz", &headers[2]);
        }
        match *rdr.headers().unwrap_err().kind() {
            ErrorKind::Utf8 { pos: Some(ref pos), ref err } => {
                assert_eq!(pos, &newpos(0, 1, 0));
                assert_eq!(err.field(), 1);
                assert_eq!(err.valid_up_to(), 3);
            }
            ref err => panic!("match failed, got {:?}", err),
        }
    }

    #[test]
    fn read_trimmed_records() {
        let data = b("foo, bar,\tbaz\n  1,  2,  3\n1\t,\t,3\t\t");
        let mut rdr = ReaderBuilder::new()
            .has_headers(true)
            .trim(Trim::Fields)
            .from_reader(data);
        let mut rec = ByteRecord::new();
        assert!(rdr.read_byte_record(&mut rec).unwrap());
        assert_eq!("1", s(&rec[0]));
        assert_eq!("2", s(&rec[1]));
        assert_eq!("3", s(&rec[2]));
        {
            let headers = rdr.headers().unwrap();
            assert_eq!(3, headers.len());
            assert_eq!("foo", &headers[0]);
            assert_eq!(" bar", &headers[1]);
            assert_eq!("\tbaz", &headers[2]);
        }
    }

    #[test]
    fn read_record_unequal_fails() {
        let data = b("foo\nbar,baz");
        let mut rdr =
            ReaderBuilder::new().has_headers(false).from_reader(data);
        let mut rec = ByteRecord::new();

        assert!(rdr.read_byte_record(&mut rec).unwrap());
        assert_eq!(1, rec.len());
        assert_eq!("foo", s(&rec[0]));

        match rdr.read_byte_record(&mut rec) {
            Err(err) => match *err.kind() {
                ErrorKind::UnequalLengths {
                    expected_len: 1,
                    ref pos,
                    len: 2,
                } => {
                    assert_eq!(pos, &Some(newpos(4, 2, 1)));
                }
                ref wrong => panic!("match failed, got {:?}", wrong),
            },
            wrong => panic!("match failed, got {:?}", wrong),
        }
    }

    #[test]
    fn read_record_unequal_ok() {
        let data = b("foo\nbar,baz");
        let mut rdr = ReaderBuilder::new()
            .has_headers(false)
            .flexible(true)
            .from_reader(data);
        let mut rec = ByteRecord::new();

        assert!(rdr.read_byte_record(&mut rec).unwrap());
        assert_eq!(1, rec.len());
        assert_eq!("foo", s(&rec[0]));

        assert!(rdr.read_byte_record(&mut rec).unwrap());
        assert_eq!(2, rec.len());
        assert_eq!("bar", s(&rec[0]));
        assert_eq!("baz", s(&rec[1]));

        assert!(!rdr.read_byte_record(&mut rec).unwrap());
    }

    // This tests that even if we get a CSV error, we can continue reading
    // if we want.
    #[test]
    fn read_record_unequal_continue() {
        let data = b("foo\nbar,baz\nquux");
        let mut rdr =
            ReaderBuilder::new().has_headers(false).from_reader(data);
        let mut rec = ByteRecord::new();

        assert!(rdr.read_byte_record(&mut rec).unwrap());
        assert_eq!(1, rec.len());
        assert_eq!("foo", s(&rec[0]));

        match rdr.read_byte_record(&mut rec) {
            Err(err) => match err.kind() {
                &ErrorKind::UnequalLengths {
                    expected_len: 1,
                    ref pos,
                    len: 2,
                } => {
                    assert_eq!(pos, &Some(newpos(4, 2, 1)));
                }
                wrong => panic!("match failed, got {:?}", wrong),
            },
            wrong => panic!("match failed, got {:?}", wrong),
        }

        assert!(rdr.read_byte_record(&mut rec).unwrap());
        assert_eq!(1, rec.len());
        assert_eq!("quux", s(&rec[0]));

        assert!(!rdr.read_byte_record(&mut rec).unwrap());
    }

    #[test]
    fn read_record_headers() {
        let data = b("foo,bar,baz\na,b,c\nd,e,f");
        let mut rdr = ReaderBuilder::new().has_headers(true).from_reader(data);
        let mut rec = StringRecord::new();

        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("a", &rec[0]);

        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("d", &rec[0]);

        assert!(!rdr.read_record(&mut rec).unwrap());

        {
            let headers = rdr.byte_headers().unwrap();
            assert_eq!(3, headers.len());
            assert_eq!(b"foo", &headers[0]);
            assert_eq!(b"bar", &headers[1]);
            assert_eq!(b"baz", &headers[2]);
        }
        {
            let headers = rdr.headers().unwrap();
            assert_eq!(3, headers.len());
            assert_eq!("foo", &headers[0]);
            assert_eq!("bar", &headers[1]);
            assert_eq!("baz", &headers[2]);
        }
    }

    #[test]
    fn read_record_headers_invalid_utf8() {
        let data = &b"foo,b\xFFar,baz\na,b,c\nd,e,f"[..];
        let mut rdr = ReaderBuilder::new().has_headers(true).from_reader(data);
        let mut rec = StringRecord::new();

        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("a", &rec[0]);

        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("d", &rec[0]);

        assert!(!rdr.read_record(&mut rec).unwrap());

        // Check that we can read the headers as raw bytes, but that
        // if we read them as strings, we get an appropriate UTF-8 error.
        {
            let headers = rdr.byte_headers().unwrap();
            assert_eq!(3, headers.len());
            assert_eq!(b"foo", &headers[0]);
            assert_eq!(b"b\xFFar", &headers[1]);
            assert_eq!(b"baz", &headers[2]);
        }
        match *rdr.headers().unwrap_err().kind() {
            ErrorKind::Utf8 { pos: Some(ref pos), ref err } => {
                assert_eq!(pos, &newpos(0, 1, 0));
                assert_eq!(err.field(), 1);
                assert_eq!(err.valid_up_to(), 1);
            }
            ref err => panic!("match failed, got {:?}", err),
        }
    }

    #[test]
    fn read_record_no_headers_before() {
        let data = b("foo,bar,baz\na,b,c\nd,e,f");
        let mut rdr =
            ReaderBuilder::new().has_headers(false).from_reader(data);
        let mut rec = StringRecord::new();

        {
            let headers = rdr.headers().unwrap();
            assert_eq!(3, headers.len());
            assert_eq!("foo", &headers[0]);
            assert_eq!("bar", &headers[1]);
            assert_eq!("baz", &headers[2]);
        }

        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("foo", &rec[0]);

        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("a", &rec[0]);

        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("d", &rec[0]);

        assert!(!rdr.read_record(&mut rec).unwrap());
    }

    #[test]
    fn read_record_no_headers_after() {
        let data = b("foo,bar,baz\na,b,c\nd,e,f");
        let mut rdr =
            ReaderBuilder::new().has_headers(false).from_reader(data);
        let mut rec = StringRecord::new();

        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("foo", &rec[0]);

        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("a", &rec[0]);

        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("d", &rec[0]);

        assert!(!rdr.read_record(&mut rec).unwrap());

        let headers = rdr.headers().unwrap();
        assert_eq!(3, headers.len());
        assert_eq!("foo", &headers[0]);
        assert_eq!("bar", &headers[1]);
        assert_eq!("baz", &headers[2]);
    }

    #[test]
    fn seek() {
        let data = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
        let mut rdr = ReaderBuilder::new().from_reader(io::Cursor::new(data));
        rdr.seek(newpos(18, 3, 2)).unwrap();

        let mut rec = StringRecord::new();

        assert_eq!(18, rdr.position().byte());
        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("d", &rec[0]);

        assert_eq!(24, rdr.position().byte());
        assert_eq!(4, rdr.position().line());
        assert_eq!(3, rdr.position().record());
        assert!(rdr.read_record(&mut rec).unwrap());
        assert_eq!(3, rec.len());
        assert_eq!("g", &rec[0]);

        assert!(!rdr.read_record(&mut rec).unwrap());
    }

    // Test that we can read headers after seeking even if the headers weren't
    // explicitly read before seeking.
    #[test]
    fn seek_headers_after() {
        let data = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
        let mut rdr = ReaderBuilder::new().from_reader(io::Cursor::new(data));
        rdr.seek(newpos(18, 3, 2)).unwrap();
        assert_eq!(rdr.headers().unwrap(), vec!["foo", "bar", "baz"]);
    }

    // Test that we can read headers after seeking if the headers were read
    // before seeking.
    #[test]
    fn seek_headers_before_after() {
        let data = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
        let mut rdr = ReaderBuilder::new().from_reader(io::Cursor::new(data));
        let headers = rdr.headers().unwrap().clone();
        rdr.seek(newpos(18, 3, 2)).unwrap();
        assert_eq!(&headers, rdr.headers().unwrap());
    }

    // Test that even if we didn't read headers before seeking, if we seek to
    // the current byte offset, then no seeking is done and therefore we can
    // still read headers after seeking.
    #[test]
    fn seek_headers_no_actual_seek() {
        let data = b("foo,bar,baz\na,b,c\nd,e,f\ng,h,i");
        let mut rdr = ReaderBuilder::new().from_reader(io::Cursor::new(data));
        rdr.seek(Position::new()).unwrap();
        assert_eq!("foo", &rdr.headers().unwrap()[0]);
    }

    // Test that position info is reported correctly in absence of headers.
    #[test]
    fn positions_no_headers() {
        let mut rdr = ReaderBuilder::new()
            .has_headers(false)
            .from_reader("a,b,c\nx,y,z".as_bytes())
            .into_records();

        let pos = rdr.next().unwrap().unwrap().position().unwrap().clone();
        assert_eq!(pos.byte(), 0);
        assert_eq!(pos.line(), 1);
        assert_eq!(pos.record(), 0);

        let pos = rdr.next().unwrap().unwrap().position().unwrap().clone();
        assert_eq!(pos.byte(), 6);
        assert_eq!(pos.line(), 2);
        assert_eq!(pos.record(), 1);
    }

    // Test that position info is reported correctly with headers.
    #[test]
    fn positions_headers() {
        let mut rdr = ReaderBuilder::new()
            .has_headers(true)
            .from_reader("a,b,c\nx,y,z".as_bytes())
            .into_records();

        let pos = rdr.next().unwrap().unwrap().position().unwrap().clone();
        assert_eq!(pos.byte(), 6);
        assert_eq!(pos.line(), 2);
        assert_eq!(pos.record(), 1);
    }

    // Test that reading headers on empty data yields an empty record.
    #[test]
    fn headers_on_empty_data() {
        let mut rdr = ReaderBuilder::new().from_reader("".as_bytes());
        let r = rdr.byte_headers().unwrap();
        assert_eq!(r.len(), 0);
    }

    // Test that reading the first record on empty data works.
    #[test]
    fn no_headers_on_empty_data() {
        let mut rdr =
            ReaderBuilder::new().has_headers(false).from_reader("".as_bytes());
        assert_eq!(rdr.records().count(), 0);
    }

    // Test that reading the first record on empty data works, even if
    // we've tried to read headers beforehand.
    #[test]
    fn no_headers_on_empty_data_after_headers() {
        let mut rdr =
            ReaderBuilder::new().has_headers(false).from_reader("".as_bytes());
        assert_eq!(rdr.headers().unwrap().len(), 0);
        assert_eq!(rdr.records().count(), 0);
    }
}