blob: 1c75bd02611b34f4ced6f636c4ad0d7ddaff5035 [file] [log] [blame]
use std::mem::take;
use std::str::from_utf8_unchecked;
use crate::TokenSource;
/// Returns a [`TokenSource`](crate::intern::TokenSource) that uses
/// the lines in `data` as tokens. The newline separator (`\r\n` or `\n`) is
/// not included in the emitted tokens.
/// This means that changing the newline separator from `\r\n` to `\n`
/// (or omitting it fully on the last line) is not detected by [`diff`](crate::diff).
pub fn lines(data: &str) -> Lines<'_, false> {
// Splitting happens on the raw bytes; `Lines` converts each piece back into a `&str`.
Lines(ByteLines(data.as_bytes()))
}
/// Returns a [`TokenSource`](crate::intern::TokenSource) that uses
/// the lines in `data` as tokens. The newline separator (`\r\n` or `\n`) is
/// included in the emitted tokens.
/// This means that changing the newline separator from `\r\n` to `\n`
/// (or omitting it fully on the last line) is detected by [`diff`](crate::diff).
pub fn lines_with_terminator(data: &str) -> Lines<'_, true> {
// Splitting happens on the raw bytes; `Lines` converts each piece back into a `&str`.
Lines(ByteLines(data.as_bytes()))
}
/// Returns a [`TokenSource`](crate::intern::TokenSource) that uses
/// the lines in `data` as tokens. A line is a continuous subslice of
/// `data` that ends right after the first `\n` it contains (or at the end of `data`).
/// The newline separator (`\r\n` or `\n`) is included in the emitted tokens.
/// This means that changing the newline separator from `\r\n` to `\n`
/// (or omitting it fully on the last line) is detected by [`diff`](crate::diff).
pub fn byte_lines_with_terminator(data: &[u8]) -> ByteLines<'_, true> {
ByteLines(data)
}
/// Returns a [`TokenSource`](crate::intern::TokenSource) that uses
/// the lines in `data` as tokens. A line is a continuous subslice of
/// `data` which does not contain `\n`.
/// The newline separator (`\r\n` or `\n`) is not included in the emitted tokens.
/// This means that changing the newline separator from `\r\n` to `\n`
/// (or omitting it fully on the last line) is not detected by [`diff`](crate::diff).
pub fn byte_lines(data: &[u8]) -> ByteLines<'_, false> {
ByteLines(data)
}
/// By default a line diff is produced for a string: the tokens are the lines of the
/// string without their line terminator (see [`lines`](crate::sources::lines)).
impl<'a> TokenSource for &'a str {
type Token = &'a str;
type Tokenizer = Lines<'a, false>;
fn tokenize(&self) -> Self::Tokenizer {
lines(self)
}
/// Estimates the number of lines in the string.
///
/// The estimator relates sampled token lengths to the total byte length of the input,
/// so it is fed the terminator-including tokenizer, whose token lengths add up to
/// that total.
fn estimate_tokens(&self) -> u32 {
lines_with_terminator(self).estimate_tokens()
}
}
/// By default a line diff is produced for a bytes
impl<'a> TokenSource for &'a [u8] {
type Token = Self;
type Tokenizer = ByteLines<'a, false>;
fn tokenize(&self) -> Self::Tokenizer {
byte_lines(self)
}
fn estimate_tokens(&self) -> u32 {
byte_lines(self).estimate_tokens()
}
}
/// A [`TokenSource`](crate::intern::TokenSource) that returns the lines of a `str` as tokens.
/// See [`lines`](crate::sources::lines) and [`lines_with_terminator`](crate::sources::lines_with_terminator) for details.
///
/// `INCLUDE_LINE_TERMINATOR` selects whether the emitted tokens keep their
/// trailing `\n` / `\r\n`.
///
/// Invariant: the wrapped bytes must be valid UTF-8; the `Iterator` implementation
/// relies on this when it turns the byte slices back into `&str`.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct Lines<'a, const INCLUDE_LINE_TERMINATOR: bool>(ByteLines<'a, INCLUDE_LINE_TERMINATOR>);
impl<'a, const INCLUDE_LINE_TERMINATOR: bool> Iterator for Lines<'a, INCLUDE_LINE_TERMINATOR> {
    type Item = &'a str;

    /// Yields the next line as a `&str`, or `None` once the input is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        let raw_line = self.0.next()?;
        // SAFETY: this struct may only contain valid UTF-8, and the inner iterator
        // cuts the input exclusively next to the ASCII bytes `\n` / `\r`. Dividing
        // valid UTF-8 at ASCII characters always produces valid UTF-8.
        Some(unsafe { from_utf8_unchecked(raw_line) })
    }
}
/// A `Lines` iterator is its own tokenizer: it is `Copy`, so tokenizing simply
/// hands out a copy of the iterator in its current state.
impl<'a, const INCLUDE_LINE_TERMINATOR: bool> TokenSource for Lines<'a, INCLUDE_LINE_TERMINATOR> {
type Token = &'a str;
type Tokenizer = Self;
fn tokenize(&self) -> Self::Tokenizer {
*self
}
/// Delegates to the estimate of the wrapped byte-line iterator.
fn estimate_tokens(&self) -> u32 {
self.0.estimate_tokens()
}
}
/// A [`TokenSource`](crate::intern::TokenSource) that returns the lines of a byte slice as tokens.
/// See [`byte_lines`](crate::sources::byte_lines) and [`byte_lines_with_terminator`](crate::sources::byte_lines_with_terminator) for details.
///
/// The wrapped slice is the part of the input that has not been emitted yet.
/// `INCLUDE_LINE_TERMINATOR` selects whether the emitted tokens keep their
/// trailing `\n` / `\r\n`.
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct ByteLines<'a, const INCLUDE_LINE_TERMINATOR: bool>(&'a [u8]);
impl<'a, const INCLUDE_LINE_TERMINATOR: bool> Iterator for ByteLines<'a, INCLUDE_LINE_TERMINATOR> {
    type Item = &'a [u8];

    /// Cuts the next line off the front of the remaining input.
    ///
    /// A line ends after the first `\n`. Input left over after the last `\n` is
    /// returned unchanged as a final line; empty remaining input ends the iteration.
    fn next(&mut self) -> Option<Self::Item> {
        let newline_at = match self.0.iter().position(|&byte| byte == b'\n') {
            Some(index) => index,
            // Nothing left at all: the iteration is over.
            None if self.0.is_empty() => return None,
            // Unterminated last line: emit everything that remains, as is.
            None => return Some(take(&mut self.0)),
        };

        let (line, rest) = self.0.split_at(newline_at + 1);
        self.0 = rest;
        if INCLUDE_LINE_TERMINATOR {
            return Some(line);
        }

        // Drop the `\n`, and a `\r` only when it sits directly in front of that `\n`.
        let content = &line[..newline_at];
        let content = match content.split_last() {
            Some((b'\r', head)) => head,
            _ => content,
        };
        Some(content)
    }
}
/// By default a line diff is produced for a string
impl<'a, const INCLUDE_LINE_TERMINATOR: bool> TokenSource
for ByteLines<'a, INCLUDE_LINE_TERMINATOR>
{
type Token = &'a [u8];
type Tokenizer = Self;
fn tokenize(&self) -> Self::Tokenizer {
*self
}
fn estimate_tokens(&self) -> u32 {
let len: usize = self.take(20).map(|line| line.len()).sum();
if len == 0 {
100
} else {
(self.0.len() * 20 / len) as u32
}
}
}