| use std::io::{self, Write}; |
| use std::path::Path; |
| use std::time::Instant; |
| |
| use grep_matcher::{Match, Matcher}; |
| use grep_searcher::{ |
| Searcher, Sink, SinkContext, SinkContextKind, SinkFinish, SinkMatch, |
| }; |
| use serde_json as json; |
| |
| use crate::counter::CounterWriter; |
| use crate::jsont; |
| use crate::stats::Stats; |
| use crate::util::find_iter_at_in_context; |
| |
/// The configuration for the JSON printer.
///
/// This is manipulated by the `JSONBuilder` and then referenced by the actual
/// implementation. Once a printer is built, the configuration is frozen and
/// cannot be changed.
///
/// The derived `Default` leaves every option switched off: compact output,
/// no match limit, and no unconditional `begin`/`end` messages.
#[derive(Debug, Clone, Default)]
struct Config {
    /// Whether each message is pretty printed instead of written compactly.
    pretty: bool,
    /// The maximum number of matches to report. `None` means no limit.
    max_matches: Option<u64>,
    /// Whether the `begin` message is written as soon as a search starts,
    /// instead of only when the first match or context line is printed.
    always_begin_end: bool,
}
| |
| /// A builder for a JSON lines printer. |
| /// |
| /// The builder permits configuring how the printer behaves. The JSON printer |
| /// has fewer configuration options than the standard printer because it is |
| /// a structured format, and the printer always attempts to find the most |
| /// information possible. |
| /// |
| /// Some configuration options, such as whether line numbers are included or |
| /// whether contextual lines are shown, are drawn directly from the |
| /// `grep_searcher::Searcher`'s configuration. |
| /// |
| /// Once a `JSON` printer is built, its configuration cannot be changed. |
| #[derive(Clone, Debug)] |
| pub struct JSONBuilder { |
| config: Config, |
| } |
| |
| impl JSONBuilder { |
| /// Return a new builder for configuring the JSON printer. |
| pub fn new() -> JSONBuilder { |
| JSONBuilder { config: Config::default() } |
| } |
| |
| /// Create a JSON printer that writes results to the given writer. |
| pub fn build<W: io::Write>(&self, wtr: W) -> JSON<W> { |
| JSON { |
| config: self.config.clone(), |
| wtr: CounterWriter::new(wtr), |
| matches: vec![], |
| } |
| } |
| |
| /// Print JSON in a pretty printed format. |
| /// |
| /// Enabling this will no longer produce a "JSON lines" format, in that |
| /// each JSON object printed may span multiple lines. |
| /// |
| /// This is disabled by default. |
| pub fn pretty(&mut self, yes: bool) -> &mut JSONBuilder { |
| self.config.pretty = yes; |
| self |
| } |
| |
| /// Set the maximum amount of matches that are printed. |
| /// |
| /// If multi line search is enabled and a match spans multiple lines, then |
| /// that match is counted exactly once for the purposes of enforcing this |
| /// limit, regardless of how many lines it spans. |
| pub fn max_matches(&mut self, limit: Option<u64>) -> &mut JSONBuilder { |
| self.config.max_matches = limit; |
| self |
| } |
| |
| /// When enabled, the `begin` and `end` messages are always emitted, even |
| /// when no match is found. |
| /// |
| /// When disabled, the `begin` and `end` messages are only shown if there |
| /// is at least one `match` or `context` message. |
| /// |
| /// This is disabled by default. |
| pub fn always_begin_end(&mut self, yes: bool) -> &mut JSONBuilder { |
| self.config.always_begin_end = yes; |
| self |
| } |
| } |
| |
| /// The JSON printer, which emits results in a JSON lines format. |
| /// |
| /// This type is generic over `W`, which represents any implementation of |
| /// the standard library `io::Write` trait. |
| /// |
| /// # Format |
| /// |
| /// This section describes the JSON format used by this printer. |
| /// |
| /// To skip the rigamarole, take a look at the |
| /// [example](#example) |
| /// at the end. |
| /// |
| /// ## Overview |
| /// |
| /// The format of this printer is the [JSON Lines](https://jsonlines.org/) |
| /// format. Specifically, this printer emits a sequence of messages, where |
| /// each message is encoded as a single JSON value on a single line. There are |
| /// four different types of messages (and this number may expand over time): |
| /// |
| /// * **begin** - A message that indicates a file is being searched. |
| /// * **end** - A message that indicates a file is done being searched. This |
| /// message also includes summary statistics about the search. |
| /// * **match** - A message that indicates a match was found. This includes |
| /// the text and offsets of the match. |
| /// * **context** - A message that indicates a contextual line was found. |
| /// This includes the text of the line, along with any match information if |
| /// the search was inverted. |
| /// |
| /// Every message is encoded in the same envelope format, which includes a tag |
| /// indicating the message type along with an object for the payload: |
| /// |
| /// ```json |
| /// { |
| /// "type": "{begin|end|match|context}", |
| /// "data": { ... } |
| /// } |
| /// ``` |
| /// |
| /// The message itself is encoded in the envelope's `data` key. |
| /// |
| /// ## Text encoding |
| /// |
| /// Before describing each message format, we first must briefly discuss text |
| /// encoding, since it factors into every type of message. In particular, JSON |
| /// may only be encoded in UTF-8, UTF-16 or UTF-32. For the purposes of this |
| /// printer, we need only worry about UTF-8. The problem here is that searching |
| /// is not limited to UTF-8 exclusively, which in turn implies that matches |
| /// may be reported that contain invalid UTF-8. Moreover, this printer may |
| /// also print file paths, and the encoding of file paths is itself not |
| /// guaranteed to be valid UTF-8. Therefore, this printer must deal with the |
| /// presence of invalid UTF-8 somehow. The printer could silently ignore such |
| /// things completely, or even lossily transcode invalid UTF-8 to valid UTF-8 |
| /// by replacing all invalid sequences with the Unicode replacement character. |
| /// However, this would prevent consumers of this format from accessing the |
| /// original data in a non-lossy way. |
| /// |
| /// Therefore, this printer will emit valid UTF-8 encoded bytes as normal |
| /// JSON strings and otherwise base64 encode data that isn't valid UTF-8. To |
| /// communicate whether this process occurs or not, strings are keyed by the |
| /// name `text` whereas arbitrary bytes are keyed by `bytes`. |
| /// |
| /// For example, when a path is included in a message, it is formatted like so, |
| /// if and only if the path is valid UTF-8: |
| /// |
| /// ```json |
| /// { |
| /// "path": { |
| /// "text": "/home/ubuntu/lib.rs" |
| /// } |
| /// } |
| /// ``` |
| /// |
| /// If instead our path was `/home/ubuntu/lib\xFF.rs`, where the `\xFF` byte |
| /// makes it invalid UTF-8, the path would instead be encoded like so: |
| /// |
| /// ```json |
| /// { |
| /// "path": { |
| /// "bytes": "L2hvbWUvdWJ1bnR1L2xpYv8ucnM=" |
| /// } |
| /// } |
| /// ``` |
| /// |
| /// This same representation is used for reporting matches as well. |
| /// |
| /// The printer guarantees that the `text` field is used whenever the |
| /// underlying bytes are valid UTF-8. |
| /// |
| /// ## Wire format |
| /// |
| /// This section documents the wire format emitted by this printer, starting |
| /// with the four types of messages. |
| /// |
| /// Each message has its own format, and is contained inside an envelope that |
| /// indicates the type of message. The envelope has these fields: |
| /// |
| /// * **type** - A string indicating the type of this message. It may be one |
| /// of four possible strings: `begin`, `end`, `match` or `context`. This |
| /// list may expand over time. |
| /// * **data** - The actual message data. The format of this field depends on |
| /// the value of `type`. The possible message formats are |
| /// [`begin`](#message-begin), |
| /// [`end`](#message-end), |
| /// [`match`](#message-match), |
| /// [`context`](#message-context). |
| /// |
| /// #### Message: **begin** |
| /// |
| /// This message indicates that a search has begun. It has these fields: |
| /// |
| /// * **path** - An |
| /// [arbitrary data object](#object-arbitrary-data) |
| /// representing the file path corresponding to the search, if one is |
| /// present. If no file path is available, then this field is `null`. |
| /// |
| /// #### Message: **end** |
| /// |
| /// This message indicates that a search has finished. It has these fields: |
| /// |
| /// * **path** - An |
| /// [arbitrary data object](#object-arbitrary-data) |
| /// representing the file path corresponding to the search, if one is |
| /// present. If no file path is available, then this field is `null`. |
| /// * **binary_offset** - The absolute offset in the data searched |
| /// corresponding to the place at which binary data was detected. If no |
| /// binary data was detected (or if binary detection was disabled), then this |
| /// field is `null`. |
| /// * **stats** - A [`stats` object](#object-stats) that contains summary |
| /// statistics for the previous search. |
| /// |
| /// #### Message: **match** |
| /// |
| /// This message indicates that a match has been found. A match generally |
| /// corresponds to a single line of text, although it may correspond to |
| /// multiple lines if the search can emit matches over multiple lines. It |
| /// has these fields: |
| /// |
| /// * **path** - An |
| /// [arbitrary data object](#object-arbitrary-data) |
| /// representing the file path corresponding to the search, if one is |
| /// present. If no file path is available, then this field is `null`. |
| /// * **lines** - An |
| /// [arbitrary data object](#object-arbitrary-data) |
| /// representing one or more lines contained in this match. |
| /// * **line_number** - If the searcher has been configured to report line |
| /// numbers, then this corresponds to the line number of the first line |
| /// in `lines`. If no line numbers are available, then this is `null`. |
| /// * **absolute_offset** - The absolute byte offset corresponding to the start |
| /// of `lines` in the data being searched. |
| /// * **submatches** - An array of [`submatch` objects](#object-submatch) |
| /// corresponding to matches in `lines`. The offsets included in each |
| /// `submatch` correspond to byte offsets into `lines`. (If `lines` is base64 |
| /// encoded, then the byte offsets correspond to the data after base64 |
| /// decoding.) The `submatch` objects are guaranteed to be sorted by their |
| /// starting offsets. Note that it is possible for this array to be empty, |
| /// for example, when searching reports inverted matches. |
| /// |
| /// #### Message: **context** |
| /// |
| /// This message indicates that a contextual line has been found. A contextual |
| /// line is a line that doesn't contain a match, but is generally adjacent to |
| /// a line that does contain a match. The precise way in which contextual lines |
| /// are reported is determined by the searcher. It has these fields, which are |
| /// exactly the same fields found in a [`match`](#message-match): |
| /// |
| /// * **path** - An |
| /// [arbitrary data object](#object-arbitrary-data) |
| /// representing the file path corresponding to the search, if one is |
| /// present. If no file path is available, then this field is `null`. |
| /// * **lines** - An |
| /// [arbitrary data object](#object-arbitrary-data) |
| /// representing one or more lines contained in this context. This includes |
| /// line terminators, if they're present. |
| /// * **line_number** - If the searcher has been configured to report line |
| /// numbers, then this corresponds to the line number of the first line |
| /// in `lines`. If no line numbers are available, then this is `null`. |
| /// * **absolute_offset** - The absolute byte offset corresponding to the start |
| /// of `lines` in the data being searched. |
| /// * **submatches** - An array of [`submatch` objects](#object-submatch) |
| /// corresponding to matches in `lines`. The offsets included in each |
| /// `submatch` correspond to byte offsets into `lines`. (If `lines` is base64 |
| /// encoded, then the byte offsets correspond to the data after base64 |
| /// decoding.) The `submatch` objects are guaranteed to be sorted by |
| /// their starting offsets. Note that it is possible for this array to be |
| /// non-empty, for example, when searching reports inverted matches such that |
| /// the original matcher could match things in the contextual lines. |
| /// |
| /// #### Object: **submatch** |
| /// |
| /// This object describes submatches found within `match` or `context` |
| /// messages. The `start` and `end` fields indicate the half-open interval on |
| /// which the match occurs (`start` is included, but `end` is not). It is |
| /// guaranteed that `start <= end`. It has these fields: |
| /// |
| /// * **match** - An |
| /// [arbitrary data object](#object-arbitrary-data) |
| /// corresponding to the text in this submatch. |
| /// * **start** - A byte offset indicating the start of this match. This offset |
| /// is generally reported in terms of the parent object's data. For example, |
| /// the `lines` field in the |
| /// [`match`](#message-match) or [`context`](#message-context) |
| /// messages. |
| /// * **end** - A byte offset indicating the end of this match. This offset |
| /// is generally reported in terms of the parent object's data. For example, |
| /// the `lines` field in the |
| /// [`match`](#message-match) or [`context`](#message-context) |
| /// messages. |
| /// |
| /// #### Object: **stats** |
| /// |
| /// This object is included in messages and contains summary statistics about |
| /// a search. It has these fields: |
| /// |
| /// * **elapsed** - A [`duration` object](#object-duration) describing the |
| /// length of time that elapsed while performing the search. |
| /// * **searches** - The number of searches that have run. For this printer, |
| /// this value is always `1`. (Implementations may emit additional message |
| /// types that use this same `stats` object that represents summary |
| /// statistics over multiple searches.) |
| /// * **searches_with_match** - The number of searches that have run that have |
| /// found at least one match. This is never more than `searches`. |
| /// * **bytes_searched** - The total number of bytes that have been searched. |
| /// * **bytes_printed** - The total number of bytes that have been printed. |
| /// This includes everything emitted by this printer. |
| /// * **matched_lines** - The total number of lines that participated in a |
| /// match. When matches may contain multiple lines, then this includes every |
| /// line that is part of every match. |
| /// * **matches** - The total number of matches. There may be multiple matches |
| /// per line. When matches may contain multiple lines, each match is counted |
| /// only once, regardless of how many lines it spans. |
| /// |
| /// #### Object: **duration** |
| /// |
| /// This object includes a few fields for describing a duration. Two of its |
| /// fields, `secs` and `nanos`, can be combined to give nanosecond precision |
| /// on systems that support it. It has these fields: |
| /// |
| /// * **secs** - A whole number of seconds indicating the length of this |
| /// duration. |
| /// * **nanos** - A fractional part of this duration represented by nanoseconds. |
| /// If nanosecond precision isn't supported, then this is typically rounded |
| /// up to the nearest number of nanoseconds. |
| /// * **human** - A human readable string describing the length of the |
| /// duration. The format of the string is itself unspecified. |
| /// |
| /// #### Object: **arbitrary data** |
| /// |
| /// This object is used whenever arbitrary data needs to be represented as a |
| /// JSON value. This object contains two fields, where generally only one of |
| /// the fields is present: |
| /// |
| /// * **text** - A normal JSON string that is UTF-8 encoded. This field is |
| /// populated if and only if the underlying data is valid UTF-8. |
| /// * **bytes** - A normal JSON string that is a base64 encoding of the |
| /// underlying bytes. |
| /// |
| /// More information on the motivation for this representation can be seen in |
| /// the section [text encoding](#text-encoding) above. |
| /// |
| /// ## Example |
| /// |
| /// This section shows a small example that includes all message types. |
| /// |
| /// Here's the file we want to search, located at `/home/andrew/sherlock`: |
| /// |
| /// ```text |
| /// For the Doctor Watsons of this world, as opposed to the Sherlock |
| /// Holmeses, success in the province of detective work must always |
| /// be, to a very large extent, the result of luck. Sherlock Holmes |
| /// can extract a clew from a wisp of straw or a flake of cigar ash; |
| /// but Doctor Watson has to have it taken out for him and dusted, |
| /// and exhibited clearly, with a label attached. |
| /// ``` |
| /// |
| /// Searching for `Watson` with a `before_context` of `1` with line numbers |
| /// enabled shows something like this using the standard printer: |
| /// |
| /// ```text |
| /// sherlock:1:For the Doctor Watsons of this world, as opposed to the Sherlock |
| /// -- |
| /// sherlock-4-can extract a clew from a wisp of straw or a flake of cigar ash; |
| /// sherlock:5:but Doctor Watson has to have it taken out for him and dusted, |
| /// ``` |
| /// |
| /// Here's what the same search looks like using the JSON wire format described |
| /// above, wherein we show semi-prettified JSON (instead of a strict JSON |
| /// Lines format), for illustrative purposes: |
| /// |
| /// ```json |
| /// { |
| /// "type": "begin", |
| /// "data": { |
| /// "path": {"text": "/home/andrew/sherlock"} |
| /// } |
| /// } |
| /// { |
| /// "type": "match", |
| /// "data": { |
| /// "path": {"text": "/home/andrew/sherlock"}, |
| /// "lines": {"text": "For the Doctor Watsons of this world, as opposed to the Sherlock\n"}, |
| /// "line_number": 1, |
| /// "absolute_offset": 0, |
| /// "submatches": [ |
| /// {"match": {"text": "Watson"}, "start": 15, "end": 21} |
| /// ] |
| /// } |
| /// } |
| /// { |
| /// "type": "context", |
| /// "data": { |
| /// "path": {"text": "/home/andrew/sherlock"}, |
| /// "lines": {"text": "can extract a clew from a wisp of straw or a flake of cigar ash;\n"}, |
| /// "line_number": 4, |
| /// "absolute_offset": 193, |
| /// "submatches": [] |
| /// } |
| /// } |
| /// { |
| /// "type": "match", |
| /// "data": { |
| /// "path": {"text": "/home/andrew/sherlock"}, |
| /// "lines": {"text": "but Doctor Watson has to have it taken out for him and dusted,\n"}, |
| /// "line_number": 5, |
| /// "absolute_offset": 258, |
| /// "submatches": [ |
| /// {"match": {"text": "Watson"}, "start": 11, "end": 17} |
| /// ] |
| /// } |
| /// } |
| /// { |
| /// "type": "end", |
| /// "data": { |
| /// "path": {"text": "/home/andrew/sherlock"}, |
| /// "binary_offset": null, |
| /// "stats": { |
| /// "elapsed": {"secs": 0, "nanos": 36296, "human": "0.0000s"}, |
| /// "searches": 1, |
| /// "searches_with_match": 1, |
| /// "bytes_searched": 367, |
| /// "bytes_printed": 1151, |
| /// "matched_lines": 2, |
| /// "matches": 2 |
| /// } |
| /// } |
| /// } |
| /// ``` |
| #[derive(Debug)] |
| pub struct JSON<W> { |
| config: Config, |
| wtr: CounterWriter<W>, |
| matches: Vec<Match>, |
| } |
| |
| impl<W: io::Write> JSON<W> { |
| /// Return a JSON lines printer with a default configuration that writes |
| /// matches to the given writer. |
| pub fn new(wtr: W) -> JSON<W> { |
| JSONBuilder::new().build(wtr) |
| } |
| |
| /// Return an implementation of `Sink` for the JSON printer. |
| /// |
| /// This does not associate the printer with a file path, which means this |
| /// implementation will never print a file path along with the matches. |
| pub fn sink<'s, M: Matcher>( |
| &'s mut self, |
| matcher: M, |
| ) -> JSONSink<'static, 's, M, W> { |
| JSONSink { |
| matcher: matcher, |
| json: self, |
| path: None, |
| start_time: Instant::now(), |
| match_count: 0, |
| after_context_remaining: 0, |
| binary_byte_offset: None, |
| begin_printed: false, |
| stats: Stats::new(), |
| } |
| } |
| |
| /// Return an implementation of `Sink` associated with a file path. |
| /// |
| /// When the printer is associated with a path, then it may, depending on |
| /// its configuration, print the path along with the matches found. |
| pub fn sink_with_path<'p, 's, M, P>( |
| &'s mut self, |
| matcher: M, |
| path: &'p P, |
| ) -> JSONSink<'p, 's, M, W> |
| where |
| M: Matcher, |
| P: ?Sized + AsRef<Path>, |
| { |
| JSONSink { |
| matcher: matcher, |
| json: self, |
| path: Some(path.as_ref()), |
| start_time: Instant::now(), |
| match_count: 0, |
| after_context_remaining: 0, |
| binary_byte_offset: None, |
| begin_printed: false, |
| stats: Stats::new(), |
| } |
| } |
| |
| /// Write the given message followed by a new line. The new line is |
| /// determined from the configuration of the given searcher. |
| fn write_message( |
| &mut self, |
| message: &jsont::Message<'_>, |
| ) -> io::Result<()> { |
| if self.config.pretty { |
| json::to_writer_pretty(&mut self.wtr, message)?; |
| } else { |
| json::to_writer(&mut self.wtr, message)?; |
| } |
| self.wtr.write(&[b'\n'])?; |
| Ok(()) |
| } |
| } |
| |
| impl<W> JSON<W> { |
| /// Returns true if and only if this printer has written at least one byte |
| /// to the underlying writer during any of the previous searches. |
| pub fn has_written(&self) -> bool { |
| self.wtr.total_count() > 0 |
| } |
| |
| /// Return a mutable reference to the underlying writer. |
| pub fn get_mut(&mut self) -> &mut W { |
| self.wtr.get_mut() |
| } |
| |
| /// Consume this printer and return back ownership of the underlying |
| /// writer. |
| pub fn into_inner(self) -> W { |
| self.wtr.into_inner() |
| } |
| } |
| |
| /// An implementation of `Sink` associated with a matcher and an optional file |
| /// path for the JSON printer. |
| /// |
| /// This type is generic over a few type parameters: |
| /// |
| /// * `'p` refers to the lifetime of the file path, if one is provided. When |
| /// no file path is given, then this is `'static`. |
| /// * `'s` refers to the lifetime of the |
| /// [`JSON`](struct.JSON.html) |
| /// printer that this type borrows. |
| /// * `M` refers to the type of matcher used by |
| /// `grep_searcher::Searcher` that is reporting results to this sink. |
| /// * `W` refers to the underlying writer that this printer is writing its |
| /// output to. |
| #[derive(Debug)] |
| pub struct JSONSink<'p, 's, M: Matcher, W> { |
| matcher: M, |
| json: &'s mut JSON<W>, |
| path: Option<&'p Path>, |
| start_time: Instant, |
| match_count: u64, |
| after_context_remaining: u64, |
| binary_byte_offset: Option<u64>, |
| begin_printed: bool, |
| stats: Stats, |
| } |
| |
| impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> { |
| /// Returns true if and only if this printer received a match in the |
| /// previous search. |
| /// |
| /// This is unaffected by the result of searches before the previous |
| /// search. |
| pub fn has_match(&self) -> bool { |
| self.match_count > 0 |
| } |
| |
| /// Return the total number of matches reported to this sink. |
| /// |
| /// This corresponds to the number of times `Sink::matched` is called. |
| pub fn match_count(&self) -> u64 { |
| self.match_count |
| } |
| |
| /// If binary data was found in the previous search, this returns the |
| /// offset at which the binary data was first detected. |
| /// |
| /// The offset returned is an absolute offset relative to the entire |
| /// set of bytes searched. |
| /// |
| /// This is unaffected by the result of searches before the previous |
| /// search. e.g., If the search prior to the previous search found binary |
| /// data but the previous search found no binary data, then this will |
| /// return `None`. |
| pub fn binary_byte_offset(&self) -> Option<u64> { |
| self.binary_byte_offset |
| } |
| |
| /// Return a reference to the stats produced by the printer for all |
| /// searches executed on this sink. |
| pub fn stats(&self) -> &Stats { |
| &self.stats |
| } |
| |
| /// Execute the matcher over the given bytes and record the match |
| /// locations if the current configuration demands match granularity. |
| fn record_matches( |
| &mut self, |
| searcher: &Searcher, |
| bytes: &[u8], |
| range: std::ops::Range<usize>, |
| ) -> io::Result<()> { |
| self.json.matches.clear(); |
| // If printing requires knowing the location of each individual match, |
| // then compute and stored those right now for use later. While this |
| // adds an extra copy for storing the matches, we do amortize the |
| // allocation for it and this greatly simplifies the printing logic to |
| // the extent that it's easy to ensure that we never do more than |
| // one search to find the matches. |
| let matches = &mut self.json.matches; |
| find_iter_at_in_context( |
| searcher, |
| &self.matcher, |
| bytes, |
| range.clone(), |
| |m| { |
| let (s, e) = (m.start() - range.start, m.end() - range.start); |
| matches.push(Match::new(s, e)); |
| true |
| }, |
| )?; |
| // Don't report empty matches appearing at the end of the bytes. |
| if !matches.is_empty() |
| && matches.last().unwrap().is_empty() |
| && matches.last().unwrap().start() >= bytes.len() |
| { |
| matches.pop().unwrap(); |
| } |
| Ok(()) |
| } |
| |
| /// Returns true if this printer should quit. |
| /// |
| /// This implements the logic for handling quitting after seeing a certain |
| /// amount of matches. In most cases, the logic is simple, but we must |
| /// permit all "after" contextual lines to print after reaching the limit. |
| fn should_quit(&self) -> bool { |
| let limit = match self.json.config.max_matches { |
| None => return false, |
| Some(limit) => limit, |
| }; |
| if self.match_count < limit { |
| return false; |
| } |
| self.after_context_remaining == 0 |
| } |
| |
| /// Returns whether the current match count exceeds the configured limit. |
| /// If there is no limit, then this always returns false. |
| fn match_more_than_limit(&self) -> bool { |
| let limit = match self.json.config.max_matches { |
| None => return false, |
| Some(limit) => limit, |
| }; |
| self.match_count > limit |
| } |
| |
| /// Write the "begin" message. |
| fn write_begin_message(&mut self) -> io::Result<()> { |
| if self.begin_printed { |
| return Ok(()); |
| } |
| let msg = jsont::Message::Begin(jsont::Begin { path: self.path }); |
| self.json.write_message(&msg)?; |
| self.begin_printed = true; |
| Ok(()) |
| } |
| } |
| |
| impl<'p, 's, M: Matcher, W: io::Write> Sink for JSONSink<'p, 's, M, W> { |
| type Error = io::Error; |
| |
| fn matched( |
| &mut self, |
| searcher: &Searcher, |
| mat: &SinkMatch<'_>, |
| ) -> Result<bool, io::Error> { |
| self.write_begin_message()?; |
| |
| self.match_count += 1; |
| // When we've exceeded our match count, then the remaining context |
| // lines should not be reset, but instead, decremented. This avoids a |
| // bug where we display more matches than a configured limit. The main |
| // idea here is that 'matched' might be called again while printing |
| // an after-context line. In that case, we should treat this as a |
| // contextual line rather than a matching line for the purposes of |
| // termination. |
| if self.match_more_than_limit() { |
| self.after_context_remaining = |
| self.after_context_remaining.saturating_sub(1); |
| } else { |
| self.after_context_remaining = searcher.after_context() as u64; |
| } |
| |
| self.record_matches( |
| searcher, |
| mat.buffer(), |
| mat.bytes_range_in_buffer(), |
| )?; |
| self.stats.add_matches(self.json.matches.len() as u64); |
| self.stats.add_matched_lines(mat.lines().count() as u64); |
| |
| let submatches = SubMatches::new(mat.bytes(), &self.json.matches); |
| let msg = jsont::Message::Match(jsont::Match { |
| path: self.path, |
| lines: mat.bytes(), |
| line_number: mat.line_number(), |
| absolute_offset: mat.absolute_byte_offset(), |
| submatches: submatches.as_slice(), |
| }); |
| self.json.write_message(&msg)?; |
| Ok(!self.should_quit()) |
| } |
| |
| fn context( |
| &mut self, |
| searcher: &Searcher, |
| ctx: &SinkContext<'_>, |
| ) -> Result<bool, io::Error> { |
| self.write_begin_message()?; |
| self.json.matches.clear(); |
| |
| if ctx.kind() == &SinkContextKind::After { |
| self.after_context_remaining = |
| self.after_context_remaining.saturating_sub(1); |
| } |
| let submatches = if searcher.invert_match() { |
| self.record_matches(searcher, ctx.bytes(), 0..ctx.bytes().len())?; |
| SubMatches::new(ctx.bytes(), &self.json.matches) |
| } else { |
| SubMatches::empty() |
| }; |
| let msg = jsont::Message::Context(jsont::Context { |
| path: self.path, |
| lines: ctx.bytes(), |
| line_number: ctx.line_number(), |
| absolute_offset: ctx.absolute_byte_offset(), |
| submatches: submatches.as_slice(), |
| }); |
| self.json.write_message(&msg)?; |
| Ok(!self.should_quit()) |
| } |
| |
| fn begin(&mut self, _searcher: &Searcher) -> Result<bool, io::Error> { |
| self.json.wtr.reset_count(); |
| self.start_time = Instant::now(); |
| self.match_count = 0; |
| self.after_context_remaining = 0; |
| self.binary_byte_offset = None; |
| if self.json.config.max_matches == Some(0) { |
| return Ok(false); |
| } |
| |
| if !self.json.config.always_begin_end { |
| return Ok(true); |
| } |
| self.write_begin_message()?; |
| Ok(true) |
| } |
| |
| fn finish( |
| &mut self, |
| _searcher: &Searcher, |
| finish: &SinkFinish, |
| ) -> Result<(), io::Error> { |
| if !self.begin_printed { |
| return Ok(()); |
| } |
| |
| self.binary_byte_offset = finish.binary_byte_offset(); |
| self.stats.add_elapsed(self.start_time.elapsed()); |
| self.stats.add_searches(1); |
| if self.match_count > 0 { |
| self.stats.add_searches_with_match(1); |
| } |
| self.stats.add_bytes_searched(finish.byte_count()); |
| self.stats.add_bytes_printed(self.json.wtr.count()); |
| |
| let msg = jsont::Message::End(jsont::End { |
| path: self.path, |
| binary_offset: finish.binary_byte_offset(), |
| stats: self.stats.clone(), |
| }); |
| self.json.write_message(&msg)?; |
| Ok(()) |
| } |
| } |
| |
| /// SubMatches represents a set of matches in a contiguous range of bytes. |
| /// |
| /// A simpler representation for this would just simply be `Vec<SubMatch>`, |
| /// but the common case is exactly one match per range of bytes, which we |
| /// specialize here using a fixed size array without any allocation. |
| enum SubMatches<'a> { |
| Empty, |
| Small([jsont::SubMatch<'a>; 1]), |
| Big(Vec<jsont::SubMatch<'a>>), |
| } |
| |
| impl<'a> SubMatches<'a> { |
| /// Create a new set of match ranges from a set of matches and the |
| /// corresponding bytes that those matches apply to. |
| fn new(bytes: &'a [u8], matches: &[Match]) -> SubMatches<'a> { |
| if matches.len() == 1 { |
| let mat = matches[0]; |
| SubMatches::Small([jsont::SubMatch { |
| m: &bytes[mat], |
| start: mat.start(), |
| end: mat.end(), |
| }]) |
| } else { |
| let mut match_ranges = vec![]; |
| for &mat in matches { |
| match_ranges.push(jsont::SubMatch { |
| m: &bytes[mat], |
| start: mat.start(), |
| end: mat.end(), |
| }); |
| } |
| SubMatches::Big(match_ranges) |
| } |
| } |
| |
| /// Create an empty set of match ranges. |
| fn empty() -> SubMatches<'static> { |
| SubMatches::Empty |
| } |
| |
| /// Return this set of match ranges as a slice. |
| fn as_slice(&self) -> &[jsont::SubMatch<'_>] { |
| match *self { |
| SubMatches::Empty => &[], |
| SubMatches::Small(ref x) => x, |
| SubMatches::Big(ref x) => x, |
| } |
| } |
| } |
| |
#[cfg(test)]
mod tests {
    use grep_matcher::LineTerminator;
    use grep_regex::{RegexMatcher, RegexMatcherBuilder};
    use grep_searcher::SearcherBuilder;

    use super::{JSONBuilder, JSON};

    /// Haystack shared by the tests that do not need a bespoke input.
    const SHERLOCK: &[u8] = b"\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.
";

    /// Copy everything the printer has written so far into a `String`.
    ///
    /// Panics if the written bytes are not valid UTF-8.
    fn printer_contents(printer: &mut JSON<Vec<u8>>) -> String {
        let written = printer.get_mut().clone();
        String::from_utf8(written).unwrap()
    }

    /// Searching a haystack with a NUL byte while binary detection is set
    /// to quit must report the binary offset in the final message.
    #[test]
    fn binary_detection() {
        use grep_searcher::BinaryDetection;

        const BINARY: &[u8] = b"\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew \x00 from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";

        let matcher = RegexMatcher::new(r"Watson").unwrap();
        let mut printer = JSONBuilder::new().build(vec![]);
        let mut searcher = SearcherBuilder::new()
            .binary_detection(BinaryDetection::quit(b'\x00'))
            .heap_limit(Some(80))
            .build();
        searcher
            .search_reader(&matcher, BINARY, printer.sink(&matcher))
            .unwrap();

        let output = printer_contents(&mut printer);
        let messages: Vec<&str> = output.lines().collect();
        assert_eq!(messages.len(), 3);
        let last = messages.last().unwrap();
        assert!(last.contains(r#""binary_offset":212,"#));
    }

    /// With a limit of one match, only three lines of output are expected
    /// even though the haystack contains more than one matching line.
    #[test]
    fn max_matches() {
        let matcher = RegexMatcher::new(r"Watson").unwrap();
        let mut printer =
            JSONBuilder::new().max_matches(Some(1)).build(vec![]);
        let mut searcher = SearcherBuilder::new().build();
        searcher
            .search_reader(&matcher, SHERLOCK, printer.sink(&matcher))
            .unwrap();

        let output = printer_contents(&mut printer);
        assert_eq!(output.lines().count(), 3);
    }

    /// With a limit of one match and two lines of trailing context, five
    /// lines of output are expected; later matching lines must not add
    /// further output.
    #[test]
    fn max_matches_after_context() {
        let haystack = "\
a
b
c
d
e
d
e
d
e
d
e
";
        let matcher = RegexMatcher::new(r"d").unwrap();
        let mut printer =
            JSONBuilder::new().max_matches(Some(1)).build(vec![]);
        let mut searcher = SearcherBuilder::new().after_context(2).build();
        searcher
            .search_reader(
                &matcher,
                haystack.as_bytes(),
                printer.sink(&matcher),
            )
            .unwrap();

        let output = printer_contents(&mut printer);
        assert_eq!(output.lines().count(), 5);
    }

    /// By default, a search without any match writes nothing at all.
    #[test]
    fn no_match() {
        let matcher = RegexMatcher::new(r"DOES NOT MATCH").unwrap();
        let mut printer = JSONBuilder::new().build(vec![]);
        let mut searcher = SearcherBuilder::new().build();
        searcher
            .search_reader(&matcher, SHERLOCK, printer.sink(&matcher))
            .unwrap();

        let output = printer_contents(&mut printer);
        assert!(output.is_empty());
    }

    /// When `always_begin_end` is enabled, a search without any match still
    /// writes two lines that mention "begin" and "end".
    #[test]
    fn always_begin_end_no_match() {
        let matcher = RegexMatcher::new(r"DOES NOT MATCH").unwrap();
        let mut printer =
            JSONBuilder::new().always_begin_end(true).build(vec![]);
        let mut searcher = SearcherBuilder::new().build();
        searcher
            .search_reader(&matcher, SHERLOCK, printer.sink(&matcher))
            .unwrap();

        let output = printer_contents(&mut printer);
        assert_eq!(output.lines().count(), 2);
        assert!(output.contains("begin") && output.contains("end"));
    }

    /// The second output line must keep the escaped `\r\n` terminator of
    /// the matched line, both with the default line terminator and with
    /// CRLF handling enabled on the matcher and the searcher.
    #[test]
    fn missing_crlf() {
        let haystack = "test\r\n".as_bytes();

        // Default configuration.
        let lf_matcher = RegexMatcherBuilder::new().build("test").unwrap();
        let mut lf_printer = JSONBuilder::new().build(vec![]);
        let mut lf_searcher = SearcherBuilder::new().build();
        lf_searcher
            .search_reader(&lf_matcher, haystack, lf_printer.sink(&lf_matcher))
            .unwrap();
        let lf_output = printer_contents(&mut lf_printer);
        assert_eq!(lf_output.lines().count(), 3);
        let lf_second = lf_output.lines().nth(1).unwrap();
        assert!(
            lf_second.contains(r"test\r\n"),
            r"missing 'test\r\n' in '{}'",
            lf_second,
        );

        // CRLF mode on both the matcher and the searcher.
        let crlf_matcher =
            RegexMatcherBuilder::new().crlf(true).build("test").unwrap();
        let mut crlf_printer = JSONBuilder::new().build(vec![]);
        let mut crlf_searcher = SearcherBuilder::new()
            .line_terminator(LineTerminator::crlf())
            .build();
        crlf_searcher
            .search_reader(
                &crlf_matcher,
                haystack,
                crlf_printer.sink(&crlf_matcher),
            )
            .unwrap();
        let crlf_output = printer_contents(&mut crlf_printer);
        assert_eq!(crlf_output.lines().count(), 3);
        let crlf_second = crlf_output.lines().nth(1).unwrap();
        assert!(
            crlf_second.contains(r"test\r\n"),
            r"missing 'test\r\n' in '{}'",
            crlf_second,
        );
    }
}