| use core::{iter, slice, str}; |
| |
| #[cfg(all(feature = "alloc", feature = "unicode"))] |
| use alloc::vec; |
| #[cfg(feature = "alloc")] |
| use alloc::{borrow::Cow, string::String, vec::Vec}; |
| |
| #[cfg(feature = "std")] |
| use std::{ffi::OsStr, path::Path}; |
| |
| use memchr::{memchr, memmem, memrchr}; |
| |
| use crate::escape_bytes::EscapeBytes; |
| #[cfg(feature = "alloc")] |
| use crate::ext_vec::ByteVec; |
| #[cfg(feature = "unicode")] |
| use crate::unicode::{ |
| whitespace_len_fwd, whitespace_len_rev, GraphemeIndices, Graphemes, |
| SentenceIndices, Sentences, WordIndices, Words, WordsWithBreakIndices, |
| WordsWithBreaks, |
| }; |
| use crate::{ |
| ascii, |
| bstr::BStr, |
| byteset, |
| utf8::{self, CharIndices, Chars, Utf8Chunks, Utf8Error}, |
| }; |
| |
/// Borrows anything byte-like as a plain `&[u8]`.
///
/// The main use of this oddly named helper is writing byte string literals
/// so that they all share one type. Consider this snippet, which is
/// rejected by the compiler:
///
/// ```ignore
/// let strs = vec![b"a", b"xy"];
/// ```
///
/// It fails because `b"a"` has type `&'static [u8; 1]` while `b"xy"` has
/// type `&'static [u8; 2]`; two different types cannot be stored in the
/// same `Vec`. (Ordinary string literals do not have this problem, since
/// `"a"` and `"xy"` are both `&'static str`.)
///
/// Taking a reference to each literal does not help either:
///
/// ```ignore
/// let strs = vec![&b"a", &b"xy"];
/// ```
///
/// That only yields `& &'static [u8; 1]` and `& &'static [u8; 2]`. What is
/// needed is an explicit conversion to a slice:
///
/// ```
/// let strs = vec![&b"a"[..], &b"xy"[..]];
/// // or
/// let strs = vec![b"a".as_ref(), b"xy".as_ref()];
/// ```
///
/// Both spellings are noisy for something as common as a literal, so this
/// function lets you write the following instead:
///
/// ```
/// use bstr::B;
///
/// let strs = vec![B("a"), B(b"xy")];
/// ```
///
/// As the example shows, string literals and byte string literals can be
/// mixed freely.
#[allow(non_snake_case)]
#[inline]
pub fn B<'a, B: ?Sized + AsRef<[u8]>>(bytes: &'a B) -> &'a [u8] {
    // Inside the function, `B` in type position names the generic parameter.
    <B as AsRef<[u8]>>::as_ref(bytes)
}
| |
| impl ByteSlice for [u8] { |
| #[inline] |
| fn as_bytes(&self) -> &[u8] { |
| self |
| } |
| |
| #[inline] |
| fn as_bytes_mut(&mut self) -> &mut [u8] { |
| self |
| } |
| } |
| |
| impl<const N: usize> ByteSlice for [u8; N] { |
| #[inline] |
| fn as_bytes(&self) -> &[u8] { |
| self |
| } |
| |
| #[inline] |
| fn as_bytes_mut(&mut self) -> &mut [u8] { |
| self |
| } |
| } |
| |
/// Prevents callers from implementing `ByteSlice` themselves by giving it a
/// super trait that cannot be implemented outside this module's parent.
mod private {
    pub trait Sealed {}
}
impl<const N: usize> private::Sealed for [u8; N] {}
impl private::Sealed for [u8] {}
| |
| /// A trait that extends `&[u8]` with string oriented methods. |
| /// |
| /// This trait is sealed and cannot be implemented outside of `bstr`. |
| pub trait ByteSlice: private::Sealed { |
| /// A method for accessing the raw bytes of this type. This is always a |
| /// no-op and callers shouldn't care about it. This only exists for making |
| /// the extension trait work. |
| #[doc(hidden)] |
| fn as_bytes(&self) -> &[u8]; |
| |
| /// A method for accessing the raw bytes of this type, mutably. This is |
| /// always a no-op and callers shouldn't care about it. This only exists |
| /// for making the extension trait work. |
| #[doc(hidden)] |
| fn as_bytes_mut(&mut self) -> &mut [u8]; |
| |
| /// Return this byte slice as a `&BStr`. |
| /// |
| /// Use `&BStr` is useful because of its `fmt::Debug` representation |
| /// and various other trait implementations (such as `PartialEq` and |
| /// `PartialOrd`). In particular, the `Debug` implementation for `BStr` |
| /// shows its bytes as a normal string. For invalid UTF-8, hex escape |
| /// sequences are used. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// println!("{:?}", b"foo\xFFbar".as_bstr()); |
| /// ``` |
| #[inline] |
| fn as_bstr(&self) -> &BStr { |
| BStr::new(self.as_bytes()) |
| } |
| |
| /// Return this byte slice as a `&mut BStr`. |
| /// |
| /// Use `&mut BStr` is useful because of its `fmt::Debug` representation |
| /// and various other trait implementations (such as `PartialEq` and |
| /// `PartialOrd`). In particular, the `Debug` implementation for `BStr` |
| /// shows its bytes as a normal string. For invalid UTF-8, hex escape |
| /// sequences are used. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let mut bytes = *b"foo\xFFbar"; |
| /// println!("{:?}", &mut bytes.as_bstr_mut()); |
| /// ``` |
| #[inline] |
| fn as_bstr_mut(&mut self) -> &mut BStr { |
| BStr::new_mut(self.as_bytes_mut()) |
| } |
| |
/// Borrow the bytes of an OS string slice as a byte string.
///
/// On platforms where the bytes underlying an OS string are accessible,
/// this always succeeds and costs nothing. Everywhere else, `None` is
/// returned when the OS string is not valid UTF-8. (For example, when the
/// underlying bytes are inaccessible on Windows, file paths may be a
/// sequence of arbitrary 16-bit integers, and not every such sequence can
/// be transcoded to valid UTF-8.)
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::ffi::OsStr;
///
/// use bstr::{B, ByteSlice};
///
/// let os_str = OsStr::new("foo");
/// let bs = <[u8]>::from_os_str(os_str).expect("should be valid UTF-8");
/// assert_eq!(bs, B("foo"));
/// ```
#[cfg(feature = "std")]
#[inline]
fn from_os_str(os_str: &OsStr) -> Option<&[u8]> {
    // Unix: the OS string is already a byte sequence, so just expose it.
    #[cfg(unix)]
    #[inline]
    fn bytes_of(os_str: &OsStr) -> Option<&[u8]> {
        use std::os::unix::ffi::OsStrExt;

        Some(OsStrExt::as_bytes(os_str))
    }

    // Elsewhere: only a UTF-8 view is available, and it may not exist.
    #[cfg(not(unix))]
    #[inline]
    fn bytes_of(os_str: &OsStr) -> Option<&[u8]> {
        os_str.to_str().map(str::as_bytes)
    }

    bytes_of(os_str)
}
| |
/// Borrow the bytes of a file path as a byte string.
///
/// On platforms where the bytes underlying a path are accessible, this
/// always succeeds and costs nothing. Everywhere else, `None` is returned
/// when the path is not valid UTF-8. (For example, when the underlying
/// bytes are inaccessible on Windows, file paths may be a sequence of
/// arbitrary 16-bit integers, and not every such sequence can be
/// transcoded to valid UTF-8.)
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::path::Path;
///
/// use bstr::{B, ByteSlice};
///
/// let path = Path::new("foo");
/// let bs = <[u8]>::from_path(path).expect("should be valid UTF-8");
/// assert_eq!(bs, B("foo"));
/// ```
#[cfg(feature = "std")]
#[inline]
fn from_path(path: &Path) -> Option<&[u8]> {
    // A path is a thin wrapper around an OS string; defer to that routine.
    let os_str = path.as_os_str();
    Self::from_os_str(os_str)
}
| |
| /// Safely convert this byte string into a `&str` if it's valid UTF-8. |
| /// |
| /// If this byte string is not valid UTF-8, then an error is returned. The |
| /// error returned indicates the first invalid byte found and the length |
| /// of the error. |
| /// |
| /// In cases where a lossy conversion to `&str` is acceptable, then use one |
| /// of the [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) or |
| /// [`to_str_lossy_into`](trait.ByteSlice.html#method.to_str_lossy_into) |
| /// methods. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// # #[cfg(feature = "alloc")] { |
| /// use bstr::{B, ByteSlice, ByteVec}; |
| /// |
| /// # fn example() -> Result<(), bstr::Utf8Error> { |
| /// let s = B("☃βツ").to_str()?; |
| /// assert_eq!("☃βツ", s); |
| /// |
| /// let mut bstring = <Vec<u8>>::from("☃βツ"); |
| /// bstring.push(b'\xFF'); |
| /// let err = bstring.to_str().unwrap_err(); |
| /// assert_eq!(8, err.valid_up_to()); |
| /// # Ok(()) }; example().unwrap() |
| /// # } |
| /// ``` |
| #[inline] |
| fn to_str(&self) -> Result<&str, Utf8Error> { |
| utf8::validate(self.as_bytes()).map(|_| { |
| // SAFETY: This is safe because of the guarantees provided by |
| // utf8::validate. |
| unsafe { str::from_utf8_unchecked(self.as_bytes()) } |
| }) |
| } |
| |
| /// Unsafely convert this byte string into a `&str`, without checking for |
| /// valid UTF-8. |
| /// |
| /// # Safety |
| /// |
| /// Callers *must* ensure that this byte string is valid UTF-8 before |
| /// calling this method. Converting a byte string into a `&str` that is |
| /// not valid UTF-8 is considered undefined behavior. |
| /// |
| /// This routine is useful in performance sensitive contexts where the |
| /// UTF-8 validity of the byte string is already known and it is |
| /// undesirable to pay the cost of an additional UTF-8 validation check |
| /// that [`to_str`](trait.ByteSlice.html#method.to_str) performs. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// // SAFETY: This is safe because string literals are guaranteed to be |
| /// // valid UTF-8 by the Rust compiler. |
| /// let s = unsafe { B("☃βツ").to_str_unchecked() }; |
| /// assert_eq!("☃βツ", s); |
| /// ``` |
| #[inline] |
| unsafe fn to_str_unchecked(&self) -> &str { |
| str::from_utf8_unchecked(self.as_bytes()) |
| } |
| |
/// Produce a valid UTF-8 string from this byte string, substituting the
/// Unicode replacement codepoint (`U+FFFD`) for invalid UTF-8.
///
/// When the byte string is already valid UTF-8, nothing is copied or
/// allocated and a borrowed string slice is returned. Otherwise an owned
/// string buffer is returned in which the invalid bytes have been
/// replaced by the replacement codepoint.
///
/// Replacement follows the "substitution of maximal subparts" strategy
/// (Unicode Standard, Chapter 3, Section 9). A replacement codepoint is
/// inserted whenever a byte is found that cannot possibly lead to a valid
/// code unit sequence. If the preceding bytes formed a prefix of a
/// well-formed code unit sequence, all of them are replaced by a single
/// replacement codepoint. This is the same strategy used by
/// [W3C's Encoding standard](https://www.w3.org/TR/encoding/).
/// For a more precise description, see the Unicode Standard, Chapter 3,
/// Section 9. See also
/// [Public Review Issue #121](https://www.unicode.org/review/pr-121.html).
///
/// N.B. Rust's standard library also appears to use the same strategy,
/// but it does not appear to be an API guarantee.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::borrow::Cow;
///
/// use bstr::ByteSlice;
///
/// let mut bstring = <Vec<u8>>::from("☃βツ");
/// assert_eq!(Cow::Borrowed("☃βツ"), bstring.to_str_lossy());
///
/// // Add a byte that makes the sequence invalid.
/// bstring.push(b'\xFF');
/// assert_eq!(Cow::Borrowed("☃βツ\u{FFFD}"), bstring.to_str_lossy());
/// ```
///
/// This demonstrates the "maximal subpart" substitution logic.
///
/// ```
/// use bstr::{B, ByteSlice};
///
/// // \x61 is the ASCII codepoint for 'a'.
/// // \xF1\x80\x80 is a valid 3-byte code unit prefix.
/// // \xE1\x80 is a valid 2-byte code unit prefix.
/// // \xC2 is a valid 1-byte code unit prefix.
/// // \x62 is the ASCII codepoint for 'b'.
/// //
/// // In sum, each of the prefixes is replaced by a single replacement
/// // codepoint since none of the prefixes are properly completed. This
/// // is in contrast to other strategies that might insert a replacement
/// // codepoint for every single byte.
/// let bs = B(b"\x61\xF1\x80\x80\xE1\x80\xC2\x62");
/// assert_eq!("a\u{FFFD}\u{FFFD}\u{FFFD}b", bs.to_str_lossy());
/// ```
#[cfg(feature = "alloc")]
#[inline]
fn to_str_lossy(&self) -> Cow<'_, str> {
    let bytes = self.as_bytes();
    let err = match utf8::validate(bytes) {
        Ok(()) => {
            // SAFETY: `utf8::validate` accepted all of `bytes`, so they
            // are valid UTF-8 and can be borrowed as-is.
            return Cow::Borrowed(unsafe { str::from_utf8_unchecked(bytes) });
        }
        Err(err) => err,
    };

    // Slow path: copy the valid prefix, emit one replacement for the first
    // invalid sequence, then let `to_str_lossy_into` handle the remainder.
    let mut lossy = String::with_capacity(bytes.len());
    let (valid, after) = bytes.split_at(err.valid_up_to());
    // SAFETY: `utf8::validate` guarantees that everything before
    // `valid_up_to()` is valid UTF-8.
    lossy.push_str(unsafe { str::from_utf8_unchecked(valid) });
    lossy.push_str("\u{FFFD}");
    // No error length means the input ended mid-sequence, so nothing is
    // left to convert after the replacement.
    if let Some(len) = err.error_len() {
        after[len..].to_str_lossy_into(&mut lossy);
    }
    Cow::Owned(lossy)
}
| |
/// Append the contents of this byte string to the given owned string
/// buffer, substituting the Unicode replacement codepoint (`U+FFFD`) for
/// invalid UTF-8 code unit sequences.
///
/// The replacement codepoint is inserted using the same "substitution of
/// maximal subparts" strategy as
/// [`to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy).
///
/// This routine helps amortize allocation. Unlike `to_str_lossy`,
/// however, it _always_ copies the contents of this byte string into the
/// destination buffer, even when the byte string is valid UTF-8.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use std::borrow::Cow;
///
/// use bstr::ByteSlice;
///
/// let mut bstring = <Vec<u8>>::from("☃βツ");
/// // Add a byte that makes the sequence invalid.
/// bstring.push(b'\xFF');
///
/// let mut dest = String::new();
/// bstring.to_str_lossy_into(&mut dest);
/// assert_eq!("☃βツ\u{FFFD}", dest);
/// ```
#[cfg(feature = "alloc")]
#[inline]
fn to_str_lossy_into(&self, dest: &mut String) {
    let mut remaining = self.as_bytes();
    dest.reserve(remaining.len());
    // Each iteration consumes one valid run plus the invalid sequence that
    // follows it.
    while let Err(err) = utf8::validate(remaining) {
        let (valid, after) = remaining.split_at(err.valid_up_to());
        // SAFETY: `utf8::validate` guarantees that everything before
        // `valid_up_to()` is valid UTF-8.
        dest.push_str(unsafe { str::from_utf8_unchecked(valid) });
        dest.push_str("\u{FFFD}");
        match err.error_len() {
            Some(len) => remaining = &after[len..],
            // The input ended in the middle of a sequence; nothing is left.
            None => return,
        }
    }
    // SAFETY: the loop only exits normally once `utf8::validate` has
    // accepted all of `remaining`.
    dest.push_str(unsafe { str::from_utf8_unchecked(remaining) });
}
| |
/// Borrow this byte string as an OS string slice.
///
/// On platforms where OS strings can be built from arbitrary bytes, this
/// always succeeds and costs nothing. Everywhere else, a UTF-8 decoding
/// error is returned when this byte string is not valid UTF-8. (For
/// example, assuming the representation of `OsStr` is opaque on Windows,
/// file paths may be a sequence of arbitrary 16-bit integers, and there
/// is no obvious mapping from an arbitrary sequence of 8-bit integers to
/// an arbitrary sequence of 16-bit integers. If the representation of
/// `OsStr` is ever opened up, then this will convert any sequence of
/// bytes to an `OsStr` without cost.)
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use bstr::{B, ByteSlice};
///
/// let os_str = b"foo".to_os_str().expect("should be valid UTF-8");
/// assert_eq!(os_str, "foo");
/// ```
#[cfg(feature = "std")]
#[inline]
fn to_os_str(&self) -> Result<&OsStr, Utf8Error> {
    // Unix: any byte sequence is a valid OS string.
    #[cfg(unix)]
    #[inline]
    fn os_str_of(bytes: &[u8]) -> Result<&OsStr, Utf8Error> {
        use std::os::unix::ffi::OsStrExt;

        Ok(<OsStr as OsStrExt>::from_bytes(bytes))
    }

    // Elsewhere: go through `&str`, which requires valid UTF-8.
    #[cfg(not(unix))]
    #[inline]
    fn os_str_of(bytes: &[u8]) -> Result<&OsStr, Utf8Error> {
        let text = bytes.to_str()?;
        Ok(OsStr::new(text))
    }

    os_str_of(self.as_bytes())
}
| |
/// Lossily create an OS string slice from this byte string.
///
/// On platforms where OS strings can be built from arbitrary bytes, this
/// costs nothing and always returns a borrowed slice. Everywhere else,
/// the bytes are checked for UTF-8 validity and lossily converted to
/// valid UTF-8 using the Unicode replacement codepoint; an owned OS
/// string is returned only when a replacement was needed.
///
/// Note that this can prevent the correct roundtripping of file paths when
/// the representation of `OsStr` is opaque.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use bstr::ByteSlice;
///
/// let os_str = b"foo\xFFbar".to_os_str_lossy();
/// assert_eq!(os_str.to_string_lossy(), "foo\u{FFFD}bar");
/// ```
#[cfg(feature = "std")]
#[inline]
fn to_os_str_lossy(&self) -> Cow<'_, OsStr> {
    // Unix: any byte sequence is a valid OS string, so always borrow.
    #[cfg(unix)]
    #[inline]
    fn imp(bytes: &[u8]) -> Cow<'_, OsStr> {
        use std::os::unix::ffi::OsStrExt;

        Cow::Borrowed(OsStr::from_bytes(bytes))
    }

    // Elsewhere: convert lossily to UTF-8 first, preserving whether the
    // result borrowed from `bytes` or had to allocate. The return type
    // spells out the anonymous lifetime so that it matches the unix helper
    // and the outer signature.
    #[cfg(not(unix))]
    #[inline]
    fn imp(bytes: &[u8]) -> Cow<'_, OsStr> {
        use std::ffi::OsString;

        match bytes.to_str_lossy() {
            Cow::Borrowed(x) => Cow::Borrowed(OsStr::new(x)),
            Cow::Owned(x) => Cow::Owned(OsString::from(x)),
        }
    }

    imp(self.as_bytes())
}
| |
/// Borrow this byte string as a path slice.
///
/// On platforms where paths can be built from arbitrary bytes, this
/// always succeeds and costs nothing. Everywhere else, a UTF-8 decoding
/// error is returned when this byte string is not valid UTF-8. (For
/// example, assuming the representation of `Path` is opaque on Windows,
/// file paths may be a sequence of arbitrary 16-bit integers, and there
/// is no obvious mapping from an arbitrary sequence of 8-bit integers to
/// an arbitrary sequence of 16-bit integers. If the representation of
/// `Path` is ever opened up, then this will convert any sequence of bytes
/// to a `Path` without cost.)
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use bstr::ByteSlice;
///
/// let path = b"foo".to_path().expect("should be valid UTF-8");
/// assert_eq!(path.as_os_str(), "foo");
/// ```
#[cfg(feature = "std")]
#[inline]
fn to_path(&self) -> Result<&Path, Utf8Error> {
    // A path is a thin wrapper around an OS string; convert via that.
    let os_str = self.to_os_str()?;
    Ok(Path::new(os_str))
}
| |
/// Lossily create a path slice from this byte string.
///
/// On platforms where paths can be built from arbitrary bytes, this costs
/// nothing and always returns a borrowed slice. Everywhere else, the
/// bytes are checked for UTF-8 validity and lossily converted to valid
/// UTF-8 using the Unicode replacement codepoint.
///
/// Note that this can prevent the correct roundtripping of file paths when
/// the representation of `Path` is opaque.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use bstr::ByteSlice;
///
/// let bs = b"foo\xFFbar";
/// let path = bs.to_path_lossy();
/// assert_eq!(path.to_string_lossy(), "foo\u{FFFD}bar");
/// ```
#[cfg(feature = "std")]
#[inline]
fn to_path_lossy(&self) -> Cow<'_, Path> {
    // Mirror whatever the OS string conversion produced: a borrow stays a
    // borrow, an allocation is moved into a `PathBuf`.
    match self.to_os_str_lossy() {
        Cow::Owned(os_string) => {
            Cow::Owned(std::path::PathBuf::from(os_string))
        }
        Cow::Borrowed(os_str) => Cow::Borrowed(Path::new(os_str)),
    }
}
| |
/// Build a new byte string consisting of this byte string repeated `n`
/// times.
///
/// # Panics
///
/// This function panics if the capacity of the new byte string would
/// overflow.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use bstr::{B, ByteSlice};
///
/// assert_eq!(b"foo".repeatn(4), B("foofoofoofoo"));
/// assert_eq!(b"foo".repeatn(0), B(""));
/// ```
#[cfg(feature = "alloc")]
#[inline]
fn repeatn(&self, n: usize) -> Vec<u8> {
    let unit = self.as_bytes();
    unit.repeat(n)
}
| |
| /// Returns true if and only if this byte string contains the given needle. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// assert!(b"foo bar".contains_str("foo")); |
| /// assert!(b"foo bar".contains_str("bar")); |
| /// assert!(!b"foo".contains_str("foobar")); |
| /// ``` |
| #[inline] |
| fn contains_str<B: AsRef<[u8]>>(&self, needle: B) -> bool { |
| self.find(needle).is_some() |
| } |
| |
| /// Returns true if and only if this byte string has the given prefix. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// assert!(b"foo bar".starts_with_str("foo")); |
| /// assert!(!b"foo bar".starts_with_str("bar")); |
| /// assert!(!b"foo".starts_with_str("foobar")); |
| /// ``` |
| #[inline] |
| fn starts_with_str<B: AsRef<[u8]>>(&self, prefix: B) -> bool { |
| self.as_bytes().starts_with(prefix.as_ref()) |
| } |
| |
| /// Returns true if and only if this byte string has the given suffix. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// assert!(b"foo bar".ends_with_str("bar")); |
| /// assert!(!b"foo bar".ends_with_str("foo")); |
| /// assert!(!b"bar".ends_with_str("foobar")); |
| /// ``` |
| #[inline] |
| fn ends_with_str<B: AsRef<[u8]>>(&self, suffix: B) -> bool { |
| self.as_bytes().ends_with(suffix.as_ref()) |
| } |
| |
| /// Returns the index of the first occurrence of the given needle. |
| /// |
| /// The needle may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. |
| /// |
| /// Note that if you're are searching for the same needle in many |
| /// different small haystacks, it may be faster to initialize a |
| /// [`Finder`](struct.Finder.html) once, and reuse it for each search. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foo bar baz"; |
| /// assert_eq!(Some(0), s.find("foo")); |
| /// assert_eq!(Some(4), s.find("bar")); |
| /// assert_eq!(None, s.find("quux")); |
| /// ``` |
| #[inline] |
| fn find<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> { |
| Finder::new(needle.as_ref()).find(self.as_bytes()) |
| } |
| |
| /// Returns the index of the last occurrence of the given needle. |
| /// |
| /// The needle may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. |
| /// |
| /// Note that if you're are searching for the same needle in many |
| /// different small haystacks, it may be faster to initialize a |
| /// [`FinderReverse`](struct.FinderReverse.html) once, and reuse it for |
| /// each search. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foo bar baz"; |
| /// assert_eq!(Some(0), s.rfind("foo")); |
| /// assert_eq!(Some(4), s.rfind("bar")); |
| /// assert_eq!(Some(8), s.rfind("ba")); |
| /// assert_eq!(None, s.rfind("quux")); |
| /// ``` |
| #[inline] |
| fn rfind<B: AsRef<[u8]>>(&self, needle: B) -> Option<usize> { |
| FinderReverse::new(needle.as_ref()).rfind(self.as_bytes()) |
| } |
| |
| /// Returns an iterator of the non-overlapping occurrences of the given |
| /// needle. The iterator yields byte offset positions indicating the start |
| /// of each match. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foo bar foo foo quux foo"; |
| /// let matches: Vec<usize> = s.find_iter("foo").collect(); |
| /// assert_eq!(matches, vec![0, 8, 12, 21]); |
| /// ``` |
| /// |
| /// An empty string matches at every position, including the position |
| /// immediately following the last byte: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let matches: Vec<usize> = b"foo".find_iter("").collect(); |
| /// assert_eq!(matches, vec![0, 1, 2, 3]); |
| /// |
| /// let matches: Vec<usize> = b"".find_iter("").collect(); |
| /// assert_eq!(matches, vec![0]); |
| /// ``` |
| #[inline] |
| fn find_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>( |
| &'h self, |
| needle: &'n B, |
| ) -> Find<'h, 'n> { |
| Find::new(self.as_bytes(), needle.as_ref()) |
| } |
| |
| /// Returns an iterator of the non-overlapping occurrences of the given |
| /// needle in reverse. The iterator yields byte offset positions indicating |
| /// the start of each match. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foo bar foo foo quux foo"; |
| /// let matches: Vec<usize> = s.rfind_iter("foo").collect(); |
| /// assert_eq!(matches, vec![21, 12, 8, 0]); |
| /// ``` |
| /// |
| /// An empty string matches at every position, including the position |
| /// immediately following the last byte: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let matches: Vec<usize> = b"foo".rfind_iter("").collect(); |
| /// assert_eq!(matches, vec![3, 2, 1, 0]); |
| /// |
| /// let matches: Vec<usize> = b"".rfind_iter("").collect(); |
| /// assert_eq!(matches, vec![0]); |
| /// ``` |
| #[inline] |
| fn rfind_iter<'h, 'n, B: ?Sized + AsRef<[u8]>>( |
| &'h self, |
| needle: &'n B, |
| ) -> FindReverse<'h, 'n> { |
| FindReverse::new(self.as_bytes(), needle.as_ref()) |
| } |
| |
| /// Returns the index of the first occurrence of the given byte. If the |
| /// byte does not occur in this byte string, then `None` is returned. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// assert_eq!(Some(10), b"foo bar baz".find_byte(b'z')); |
| /// assert_eq!(None, b"foo bar baz".find_byte(b'y')); |
| /// ``` |
| #[inline] |
| fn find_byte(&self, byte: u8) -> Option<usize> { |
| memchr(byte, self.as_bytes()) |
| } |
| |
| /// Returns the index of the last occurrence of the given byte. If the |
| /// byte does not occur in this byte string, then `None` is returned. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// assert_eq!(Some(10), b"foo bar baz".rfind_byte(b'z')); |
| /// assert_eq!(None, b"foo bar baz".rfind_byte(b'y')); |
| /// ``` |
| #[inline] |
| fn rfind_byte(&self, byte: u8) -> Option<usize> { |
| memrchr(byte, self.as_bytes()) |
| } |
| |
| /// Returns the index of the first occurrence of the given codepoint. |
| /// If the codepoint does not occur in this byte string, then `None` is |
| /// returned. |
| /// |
| /// Note that if one searches for the replacement codepoint, `\u{FFFD}`, |
| /// then only explicit occurrences of that encoding will be found. Invalid |
| /// UTF-8 sequences will not be matched. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// assert_eq!(Some(10), b"foo bar baz".find_char('z')); |
| /// assert_eq!(Some(4), B("αβγγδ").find_char('γ')); |
| /// assert_eq!(None, b"foo bar baz".find_char('y')); |
| /// ``` |
| #[inline] |
| fn find_char(&self, ch: char) -> Option<usize> { |
| self.find(ch.encode_utf8(&mut [0; 4])) |
| } |
| |
| /// Returns the index of the last occurrence of the given codepoint. |
| /// If the codepoint does not occur in this byte string, then `None` is |
| /// returned. |
| /// |
| /// Note that if one searches for the replacement codepoint, `\u{FFFD}`, |
| /// then only explicit occurrences of that encoding will be found. Invalid |
| /// UTF-8 sequences will not be matched. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// assert_eq!(Some(10), b"foo bar baz".rfind_char('z')); |
| /// assert_eq!(Some(6), B("αβγγδ").rfind_char('γ')); |
| /// assert_eq!(None, b"foo bar baz".rfind_char('y')); |
| /// ``` |
| #[inline] |
| fn rfind_char(&self, ch: char) -> Option<usize> { |
| self.rfind(ch.encode_utf8(&mut [0; 4])) |
| } |
| |
| /// Returns the index of the first occurrence of any of the bytes in the |
| /// provided set. |
| /// |
| /// The `byteset` may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but |
| /// note that passing a `&str` which contains multibyte characters may not |
| /// behave as you expect: each byte in the `&str` is treated as an |
| /// individual member of the byte set. |
| /// |
| /// Note that order is irrelevant for the `byteset` parameter, and |
| /// duplicate bytes present in its body are ignored. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the set of bytes and the haystack. That is, this |
| /// runs in `O(byteset.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// assert_eq!(b"foo bar baz".find_byteset(b"zr"), Some(6)); |
| /// assert_eq!(b"foo baz bar".find_byteset(b"bzr"), Some(4)); |
| /// assert_eq!(None, b"foo baz bar".find_byteset(b"\t\n")); |
| /// // The empty byteset never matches. |
| /// assert_eq!(None, b"abc".find_byteset(b"")); |
| /// assert_eq!(None, b"".find_byteset(b"")); |
| /// ``` |
| #[inline] |
| fn find_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> { |
| byteset::find(self.as_bytes(), byteset.as_ref()) |
| } |
| |
| /// Returns the index of the first occurrence of a byte that is not a |
| /// member of the provided set. |
| /// |
| /// The `byteset` may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but |
| /// note that passing a `&str` which contains multibyte characters may not |
| /// behave as you expect: each byte in the `&str` is treated as an |
| /// individual member of the byte set. |
| /// |
| /// Note that order is irrelevant for the `byteset` parameter, and |
| /// duplicate bytes present in its body are ignored. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the set of bytes and the haystack. That is, this |
| /// runs in `O(byteset.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// assert_eq!(b"foo bar baz".find_not_byteset(b"fo "), Some(4)); |
| /// assert_eq!(b"\t\tbaz bar".find_not_byteset(b" \t\r\n"), Some(2)); |
| /// assert_eq!(b"foo\nbaz\tbar".find_not_byteset(b"\t\n"), Some(0)); |
| /// // The negation of the empty byteset matches everything. |
| /// assert_eq!(Some(0), b"abc".find_not_byteset(b"")); |
| /// // But an empty string never contains anything. |
| /// assert_eq!(None, b"".find_not_byteset(b"")); |
| /// ``` |
| #[inline] |
| fn find_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> { |
| // Forward scan for the first haystack byte that is NOT in the set. |
| let (haystack, members) = (self.as_bytes(), byteset.as_ref()); |
| byteset::find_not(haystack, members) |
| } |
| |
| /// Returns the index of the last occurrence of any of the bytes in the |
| /// provided set. |
| /// |
| /// The `byteset` may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but |
| /// note that passing a `&str` which contains multibyte characters may not |
| /// behave as you expect: each byte in the `&str` is treated as an |
| /// individual member of the byte set. |
| /// |
| /// Note that order is irrelevant for the `byteset` parameter, and duplicate |
| /// bytes present in its body are ignored. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the set of bytes and the haystack. That is, this |
| /// runs in `O(byteset.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// assert_eq!(b"foo bar baz".rfind_byteset(b"agb"), Some(9)); |
| /// assert_eq!(b"foo baz bar".rfind_byteset(b"rabz "), Some(10)); |
| /// assert_eq!(b"foo baz bar".rfind_byteset(b"\n123"), None); |
| /// ``` |
| #[inline] |
| fn rfind_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> { |
| // Reverse scan for the last haystack byte that is in the set. |
| let (haystack, members) = (self.as_bytes(), byteset.as_ref()); |
| byteset::rfind(haystack, members) |
| } |
| |
| /// Returns the index of the last occurrence of a byte that is not a member |
| /// of the provided set. |
| /// |
| /// The `byteset` may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`, but |
| /// note that passing a `&str` which contains multibyte characters may not |
| /// behave as you expect: each byte in the `&str` is treated as an |
| /// individual member of the byte set. |
| /// |
| /// Note that order is irrelevant for the `byteset` parameter, and |
| /// duplicate bytes present in its body are ignored. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the set of bytes and the haystack. That is, this |
| /// runs in `O(byteset.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// assert_eq!(b"foo bar baz,\t".rfind_not_byteset(b",\t"), Some(10)); |
| /// assert_eq!(b"foo baz bar".rfind_not_byteset(b"rabz "), Some(2)); |
| /// assert_eq!(None, b"foo baz bar".rfind_not_byteset(b"barfoz ")); |
| /// ``` |
| #[inline] |
| fn rfind_not_byteset<B: AsRef<[u8]>>(&self, byteset: B) -> Option<usize> { |
| // Reverse scan for the last haystack byte that is NOT in the set. |
| let (haystack, members) = (self.as_bytes(), byteset.as_ref()); |
| byteset::rfind_not(haystack, members) |
| } |
| |
| /// Returns an iterator over the fields in a byte string, separated |
| /// by contiguous whitespace (according to the Unicode property |
| /// `White_Space`). |
| /// |
| /// # Example |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B(" foo\tbar\t\u{2003}\nquux \n"); |
| /// let fields: Vec<&[u8]> = s.fields().collect(); |
| /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]); |
| /// ``` |
| /// |
| /// A byte string consisting of just whitespace yields no elements: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// assert_eq!(0, B(" \n\t\u{2003}\n \t").fields().count()); |
| /// ``` |
| #[cfg(feature = "unicode")] |
| #[inline] |
| fn fields(&self) -> Fields<'_> { |
| // The iterator borrows the underlying bytes for its whole lifetime. |
| let haystack = self.as_bytes(); |
| Fields::new(haystack) |
| } |
| |
| /// Returns an iterator over the fields in a byte string, separated by |
| /// contiguous codepoints satisfying the given predicate. |
| /// |
| /// If this byte string is not valid UTF-8, then the given closure will |
| /// be called with a Unicode replacement codepoint when invalid UTF-8 |
| /// bytes are seen. |
| /// |
| /// # Example |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = b"123foo999999bar1quux123456"; |
| /// let fields: Vec<&[u8]> = s.fields_with(|c| c.is_numeric()).collect(); |
| /// assert_eq!(fields, vec![B("foo"), B("bar"), B("quux")]); |
| /// ``` |
| /// |
| /// A byte string consisting of all codepoints satisfying the predicate |
| /// yields no elements: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// assert_eq!(0, b"1911354563".fields_with(|c| c.is_numeric()).count()); |
| /// ``` |
| #[inline] |
| fn fields_with<F: FnMut(char) -> bool>(&self, f: F) -> FieldsWith<'_, F> { |
| // Ownership of the predicate moves into the returned iterator. |
| let haystack = self.as_bytes(); |
| FieldsWith::new(haystack, f) |
| } |
| |
| /// Returns an iterator over substrings of this byte string, separated |
| /// by the given byte string. Each element yielded is guaranteed not to |
| /// include the splitter substring. |
| /// |
| /// The splitter may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let x: Vec<&[u8]> = b"Mary had a little lamb".split_str(" ").collect(); |
| /// assert_eq!(x, vec![ |
| /// B("Mary"), B("had"), B("a"), B("little"), B("lamb"), |
| /// ]); |
| /// |
| /// let x: Vec<&[u8]> = b"".split_str("X").collect(); |
| /// assert_eq!(x, vec![b""]); |
| /// |
| /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".split_str("X").collect(); |
| /// assert_eq!(x, vec![B("lion"), B(""), B("tiger"), B("leopard")]); |
| /// |
| /// let x: Vec<&[u8]> = b"lion::tiger::leopard".split_str("::").collect(); |
| /// assert_eq!(x, vec![B("lion"), B("tiger"), B("leopard")]); |
| /// ``` |
| /// |
| /// If a string contains multiple contiguous separators, you will end up |
| /// with empty strings yielded by the iterator: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let x: Vec<&[u8]> = b"||||a||b|c".split_str("|").collect(); |
| /// assert_eq!(x, vec![ |
| /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"), |
| /// ]); |
| /// |
| /// let x: Vec<&[u8]> = b"(///)".split_str("/").collect(); |
| /// assert_eq!(x, vec![B("("), B(""), B(""), B(")")]); |
| /// ``` |
| /// |
| /// Separators at the start or end of a string are neighbored by empty |
| /// strings. |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let x: Vec<&[u8]> = b"010".split_str("0").collect(); |
| /// assert_eq!(x, vec![B(""), B("1"), B("")]); |
| /// ``` |
| /// |
| /// When the empty string is used as a separator, it splits every **byte** |
| /// in the byte string, along with the beginning and end of the byte |
| /// string. |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let x: Vec<&[u8]> = b"rust".split_str("").collect(); |
| /// assert_eq!(x, vec![ |
| /// B(""), B("r"), B("u"), B("s"), B("t"), B(""), |
| /// ]); |
| /// |
| /// // Splitting by an empty string is not UTF-8 aware. Elements yielded |
| /// // may not be valid UTF-8! |
| /// let x: Vec<&[u8]> = B("☃").split_str("").collect(); |
| /// assert_eq!(x, vec![ |
| /// B(""), B(b"\xE2"), B(b"\x98"), B(b"\x83"), B(""), |
| /// ]); |
| /// ``` |
| /// |
| /// Contiguous separators, especially whitespace, can lead to possibly |
| /// surprising behavior. For example, this code is correct: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let x: Vec<&[u8]> = b" a b c".split_str(" ").collect(); |
| /// assert_eq!(x, vec![ |
| /// B(""), B(""), B(""), B(""), B("a"), B(""), B("b"), B("c"), |
| /// ]); |
| /// ``` |
| /// |
| /// It does *not* give you `["a", "b", "c"]`. For that behavior, use |
| /// [`fields`](#method.fields) instead. |
| #[inline] |
| fn split_str<'h, 's, B: ?Sized + AsRef<[u8]>>( |
| &'h self, |
| splitter: &'s B, |
| ) -> Split<'h, 's> { |
| // The iterator borrows both the haystack and the splitter bytes. |
| let haystack = self.as_bytes(); |
| let needle = splitter.as_ref(); |
| Split::new(haystack, needle) |
| } |
| |
| /// Returns an iterator over substrings of this byte string, separated by |
| /// the given byte string, in reverse. Each element yielded is guaranteed |
| /// not to include the splitter substring. |
| /// |
| /// The splitter may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let x: Vec<&[u8]> = |
| /// b"Mary had a little lamb".rsplit_str(" ").collect(); |
| /// assert_eq!(x, vec![ |
| /// B("lamb"), B("little"), B("a"), B("had"), B("Mary"), |
| /// ]); |
| /// |
| /// let x: Vec<&[u8]> = b"".rsplit_str("X").collect(); |
| /// assert_eq!(x, vec![b""]); |
| /// |
| /// let x: Vec<&[u8]> = b"lionXXtigerXleopard".rsplit_str("X").collect(); |
| /// assert_eq!(x, vec![B("leopard"), B("tiger"), B(""), B("lion")]); |
| /// |
| /// let x: Vec<&[u8]> = b"lion::tiger::leopard".rsplit_str("::").collect(); |
| /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lion")]); |
| /// ``` |
| /// |
| /// If a string contains multiple contiguous separators, you will end up |
| /// with empty strings yielded by the iterator: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let x: Vec<&[u8]> = b"||||a||b|c".rsplit_str("|").collect(); |
| /// assert_eq!(x, vec![ |
| /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""), |
| /// ]); |
| /// |
| /// let x: Vec<&[u8]> = b"(///)".rsplit_str("/").collect(); |
| /// assert_eq!(x, vec![B(")"), B(""), B(""), B("(")]); |
| /// ``` |
| /// |
| /// Separators at the start or end of a string are neighbored by empty |
| /// strings. |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let x: Vec<&[u8]> = b"010".rsplit_str("0").collect(); |
| /// assert_eq!(x, vec![B(""), B("1"), B("")]); |
| /// ``` |
| /// |
| /// When the empty string is used as a separator, it splits every **byte** |
| /// in the byte string, along with the beginning and end of the byte |
| /// string. |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let x: Vec<&[u8]> = b"rust".rsplit_str("").collect(); |
| /// assert_eq!(x, vec![ |
| /// B(""), B("t"), B("s"), B("u"), B("r"), B(""), |
| /// ]); |
| /// |
| /// // Splitting by an empty string is not UTF-8 aware. Elements yielded |
| /// // may not be valid UTF-8! |
| /// let x: Vec<&[u8]> = B("☃").rsplit_str("").collect(); |
| /// assert_eq!(x, vec![B(""), B(b"\x83"), B(b"\x98"), B(b"\xE2"), B("")]); |
| /// ``` |
| /// |
| /// Contiguous separators, especially whitespace, can lead to possibly |
| /// surprising behavior. For example, this code is correct: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let x: Vec<&[u8]> = b" a b c".rsplit_str(" ").collect(); |
| /// assert_eq!(x, vec![ |
| /// B("c"), B("b"), B(""), B("a"), B(""), B(""), B(""), B(""), |
| /// ]); |
| /// ``` |
| /// |
| /// It does *not* give you `["a", "b", "c"]`. |
| #[inline] |
| fn rsplit_str<'h, 's, B: ?Sized + AsRef<[u8]>>( |
| &'h self, |
| splitter: &'s B, |
| ) -> SplitReverse<'h, 's> { |
| // The iterator borrows both the haystack and the splitter bytes. |
| let haystack = self.as_bytes(); |
| let needle = splitter.as_ref(); |
| SplitReverse::new(haystack, needle) |
| } |
| |
| /// Split this byte string at the first occurrence of `splitter`. |
| /// |
| /// If the `splitter` is found in the byte string, returns a tuple |
| /// containing the parts of the string before and after the first occurrence |
| /// of `splitter` respectively. Otherwise, if there are no occurrences of |
| /// `splitter` in the byte string, returns `None`. |
| /// |
| /// The splitter may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. |
| /// |
| /// If you need to split on the *last* instance of a delimiter instead, see |
| /// the [`ByteSlice::rsplit_once_str`](#method.rsplit_once_str) method. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// assert_eq!( |
| /// B("foo,bar").split_once_str(","), |
| /// Some((B("foo"), B("bar"))), |
| /// ); |
| /// assert_eq!( |
| /// B("foo,bar,baz").split_once_str(","), |
| /// Some((B("foo"), B("bar,baz"))), |
| /// ); |
| /// assert_eq!(B("foo").split_once_str(","), None); |
| /// assert_eq!(B("foo,").split_once_str(b","), Some((B("foo"), B("")))); |
| /// assert_eq!(B(",foo").split_once_str(b","), Some((B(""), B("foo")))); |
| /// ``` |
| #[inline] |
| fn split_once_str<'a, B: ?Sized + AsRef<[u8]>>( |
| &'a self, |
| splitter: &B, |
| ) -> Option<(&'a [u8], &'a [u8])> { |
| let haystack = self.as_bytes(); |
| let needle = splitter.as_ref(); |
| // Locate the first match; a missing match yields `None`. |
| Finder::new(needle).find(haystack).map(|at| { |
| let (before, rest) = haystack.split_at(at); |
| // Drop the matched splitter from the front of the second half. |
| (before, &rest[needle.len()..]) |
| }) |
| } |
| |
| /// Split this byte string at the last occurrence of `splitter`. |
| /// |
| /// If the `splitter` is found in the byte string, returns a tuple |
| /// containing the parts of the string before and after the last occurrence |
| /// of `splitter`, respectively. Otherwise, if there are no occurrences of |
| /// `splitter` in the byte string, returns `None`. |
| /// |
| /// The splitter may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. |
| /// |
| /// If you need to split on the *first* instance of a delimiter instead, see |
| /// the [`ByteSlice::split_once_str`](#method.split_once_str) method. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// assert_eq!( |
| /// B("foo,bar").rsplit_once_str(","), |
| /// Some((B("foo"), B("bar"))), |
| /// ); |
| /// assert_eq!( |
| /// B("foo,bar,baz").rsplit_once_str(","), |
| /// Some((B("foo,bar"), B("baz"))), |
| /// ); |
| /// assert_eq!(B("foo").rsplit_once_str(","), None); |
| /// assert_eq!(B("foo,").rsplit_once_str(b","), Some((B("foo"), B("")))); |
| /// assert_eq!(B(",foo").rsplit_once_str(b","), Some((B(""), B("foo")))); |
| /// ``` |
| #[inline] |
| fn rsplit_once_str<'a, B: ?Sized + AsRef<[u8]>>( |
| &'a self, |
| splitter: &B, |
| ) -> Option<(&'a [u8], &'a [u8])> { |
| let haystack = self.as_bytes(); |
| let needle = splitter.as_ref(); |
| // Locate the last match; a missing match yields `None`. |
| FinderReverse::new(needle).rfind(haystack).map(|at| { |
| let (before, rest) = haystack.split_at(at); |
| // Drop the matched splitter from the front of the second half. |
| (before, &rest[needle.len()..]) |
| }) |
| } |
| |
| /// Returns an iterator of at most `limit` substrings of this byte string, |
| /// separated by the given byte string. If `limit` substrings are yielded, |
| /// then the last substring will contain the remainder of this byte string. |
| /// |
| /// The splitter may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let x: Vec<_> = b"Mary had a little lamb".splitn_str(3, " ").collect(); |
| /// assert_eq!(x, vec![B("Mary"), B("had"), B("a little lamb")]); |
| /// |
| /// let x: Vec<_> = b"".splitn_str(3, "X").collect(); |
| /// assert_eq!(x, vec![b""]); |
| /// |
| /// let x: Vec<_> = b"lionXXtigerXleopard".splitn_str(3, "X").collect(); |
| /// assert_eq!(x, vec![B("lion"), B(""), B("tigerXleopard")]); |
| /// |
| /// let x: Vec<_> = b"lion::tiger::leopard".splitn_str(2, "::").collect(); |
| /// assert_eq!(x, vec![B("lion"), B("tiger::leopard")]); |
| /// |
| /// let x: Vec<_> = b"abcXdef".splitn_str(1, "X").collect(); |
| /// assert_eq!(x, vec![B("abcXdef")]); |
| /// |
| /// let x: Vec<_> = b"abcdef".splitn_str(2, "X").collect(); |
| /// assert_eq!(x, vec![B("abcdef")]); |
| /// |
| /// let x: Vec<_> = b"abcXdef".splitn_str(0, "X").collect(); |
| /// assert!(x.is_empty()); |
| /// ``` |
| #[inline] |
| fn splitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>( |
| &'h self, |
| limit: usize, |
| splitter: &'s B, |
| ) -> SplitN<'h, 's> { |
| // Note the argument order: the constructor takes the limit last. |
| let haystack = self.as_bytes(); |
| let needle = splitter.as_ref(); |
| SplitN::new(haystack, needle, limit) |
| } |
| |
| /// Returns an iterator of at most `limit` substrings of this byte string, |
| /// separated by the given byte string, in reverse. If `limit` substrings |
| /// are yielded, then the last substring will contain the remainder of this |
| /// byte string. |
| /// |
| /// The splitter may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let x: Vec<_> = |
| /// b"Mary had a little lamb".rsplitn_str(3, " ").collect(); |
| /// assert_eq!(x, vec![B("lamb"), B("little"), B("Mary had a")]); |
| /// |
| /// let x: Vec<_> = b"".rsplitn_str(3, "X").collect(); |
| /// assert_eq!(x, vec![b""]); |
| /// |
| /// let x: Vec<_> = b"lionXXtigerXleopard".rsplitn_str(3, "X").collect(); |
| /// assert_eq!(x, vec![B("leopard"), B("tiger"), B("lionX")]); |
| /// |
| /// let x: Vec<_> = b"lion::tiger::leopard".rsplitn_str(2, "::").collect(); |
| /// assert_eq!(x, vec![B("leopard"), B("lion::tiger")]); |
| /// |
| /// let x: Vec<_> = b"abcXdef".rsplitn_str(1, "X").collect(); |
| /// assert_eq!(x, vec![B("abcXdef")]); |
| /// |
| /// let x: Vec<_> = b"abcdef".rsplitn_str(2, "X").collect(); |
| /// assert_eq!(x, vec![B("abcdef")]); |
| /// |
| /// let x: Vec<_> = b"abcXdef".rsplitn_str(0, "X").collect(); |
| /// assert!(x.is_empty()); |
| /// ``` |
| #[inline] |
| fn rsplitn_str<'h, 's, B: ?Sized + AsRef<[u8]>>( |
| &'h self, |
| limit: usize, |
| splitter: &'s B, |
| ) -> SplitNReverse<'h, 's> { |
| // Note the argument order: the constructor takes the limit last. |
| let haystack = self.as_bytes(); |
| let needle = splitter.as_ref(); |
| SplitNReverse::new(haystack, needle, limit) |
| } |
| |
| /// Replace all matches of the given needle with the given replacement, and |
| /// return the result as a new `Vec<u8>`. |
| /// |
| /// This routine is useful as a convenience. If you need to reuse an |
| /// allocation, use [`replace_into`](#method.replace_into) instead. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"this is old".replace("old", "new"); |
| /// assert_eq!(s, "this is new".as_bytes()); |
| /// ``` |
| /// |
| /// When the pattern doesn't match: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"this is old".replace("nada nada", "limonada"); |
| /// assert_eq!(s, "this is old".as_bytes()); |
| /// ``` |
| /// |
| /// When the needle is an empty string: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foo".replace("", "Z"); |
| /// assert_eq!(s, "ZfZoZoZ".as_bytes()); |
| /// ``` |
| #[cfg(feature = "alloc")] |
| #[inline] |
| fn replace<N: AsRef<[u8]>, R: AsRef<[u8]>>( |
| &self, |
| needle: N, |
| replacement: R, |
| ) -> Vec<u8> { |
| // Reserve room for a result the same size as the input up front. |
| let capacity = self.as_bytes().len(); |
| let mut out = Vec::with_capacity(capacity); |
| self.replace_into(needle, replacement, &mut out); |
| out |
| } |
| |
| /// Replace up to `limit` matches of the given needle with the given |
| /// replacement, and return the result as a new `Vec<u8>`. |
| /// |
| /// This routine is useful as a convenience. If you need to reuse an |
| /// allocation, use [`replacen_into`](#method.replacen_into) instead. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foofoo".replacen("o", "z", 2); |
| /// assert_eq!(s, "fzzfoo".as_bytes()); |
| /// ``` |
| /// |
| /// When the pattern doesn't match: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foofoo".replacen("a", "z", 2); |
| /// assert_eq!(s, "foofoo".as_bytes()); |
| /// ``` |
| /// |
| /// When the needle is an empty string: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foo".replacen("", "Z", 2); |
| /// assert_eq!(s, "ZfZoo".as_bytes()); |
| /// ``` |
| #[cfg(feature = "alloc")] |
| #[inline] |
| fn replacen<N: AsRef<[u8]>, R: AsRef<[u8]>>( |
| &self, |
| needle: N, |
| replacement: R, |
| limit: usize, |
| ) -> Vec<u8> { |
| // Reserve room for a result the same size as the input up front. |
| let capacity = self.as_bytes().len(); |
| let mut out = Vec::with_capacity(capacity); |
| self.replacen_into(needle, replacement, limit, &mut out); |
| out |
| } |
| |
| /// Replace all matches of the given needle with the given replacement, |
| /// and write the result into the provided `Vec<u8>`. |
| /// |
| /// This does **not** clear `dest` before writing to it. |
| /// |
| /// This routine is useful for reusing an allocation. For a more convenient |
| /// API, use [`replace`](#method.replace) instead. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"this is old"; |
| /// |
| /// let mut dest = vec![]; |
| /// s.replace_into("old", "new", &mut dest); |
| /// assert_eq!(dest, "this is new".as_bytes()); |
| /// ``` |
| /// |
| /// When the pattern doesn't match: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"this is old"; |
| /// |
| /// let mut dest = vec![]; |
| /// s.replace_into("nada nada", "limonada", &mut dest); |
| /// assert_eq!(dest, "this is old".as_bytes()); |
| /// ``` |
| /// |
| /// When the needle is an empty string: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foo"; |
| /// |
| /// let mut dest = vec![]; |
| /// s.replace_into("", "Z", &mut dest); |
| /// assert_eq!(dest, "ZfZoZoZ".as_bytes()); |
| /// ``` |
| #[cfg(feature = "alloc")] |
| #[inline] |
| fn replace_into<N: AsRef<[u8]>, R: AsRef<[u8]>>( |
| &self, |
| needle: N, |
| replacement: R, |
| dest: &mut Vec<u8>, |
| ) { |
| let needle = needle.as_ref(); |
| let replacement = replacement.as_ref(); |
| let haystack = self.as_bytes(); |
| |
| // Fold over the match offsets, threading through the end of the input |
| // that has already been copied into `dest`. |
| let copied = self.find_iter(needle).fold(0usize, |copied, at| { |
| dest.push_str(&haystack[copied..at]); |
| dest.push_str(replacement); |
| at + needle.len() |
| }); |
| // Whatever follows the final match is copied verbatim. |
| dest.push_str(&haystack[copied..]); |
| } |
| |
| /// Replace up to `limit` matches of the given needle with the given |
| /// replacement, and write the result into the provided `Vec<u8>`. |
| /// |
| /// This does **not** clear `dest` before writing to it. |
| /// |
| /// This routine is useful for reusing an allocation. For a more convenient |
| /// API, use [`replacen`](#method.replacen) instead. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foofoo"; |
| /// |
| /// let mut dest = vec![]; |
| /// s.replacen_into("o", "z", 2, &mut dest); |
| /// assert_eq!(dest, "fzzfoo".as_bytes()); |
| /// ``` |
| /// |
| /// When the pattern doesn't match: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foofoo"; |
| /// |
| /// let mut dest = vec![]; |
| /// s.replacen_into("a", "z", 2, &mut dest); |
| /// assert_eq!(dest, "foofoo".as_bytes()); |
| /// ``` |
| /// |
| /// When the needle is an empty string: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let s = b"foo"; |
| /// |
| /// let mut dest = vec![]; |
| /// s.replacen_into("", "Z", 2, &mut dest); |
| /// assert_eq!(dest, "ZfZoo".as_bytes()); |
| /// ``` |
| #[cfg(feature = "alloc")] |
| #[inline] |
| fn replacen_into<N: AsRef<[u8]>, R: AsRef<[u8]>>( |
| &self, |
| needle: N, |
| replacement: R, |
| limit: usize, |
| dest: &mut Vec<u8>, |
| ) { |
| let needle = needle.as_ref(); |
| let replacement = replacement.as_ref(); |
| let haystack = self.as_bytes(); |
| |
| // Only the first `limit` match offsets are consumed; the fold threads |
| // through the end of the input already copied into `dest`. |
| let matches = self.find_iter(needle).take(limit); |
| let copied = matches.fold(0usize, |copied, at| { |
| dest.push_str(&haystack[copied..at]); |
| dest.push_str(replacement); |
| at + needle.len() |
| }); |
| // Everything after the last replaced match is copied verbatim. |
| dest.push_str(&haystack[copied..]); |
| } |
| |
| /// Returns an iterator over the bytes in this byte string. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = b"foobar"; |
| /// let bytes: Vec<u8> = bs.bytes().collect(); |
| /// assert_eq!(bytes, bs); |
| /// ``` |
| #[inline] |
| fn bytes(&self) -> Bytes<'_> { |
| // `Bytes` wraps the slice iterator over the underlying bytes. |
| let it = self.as_bytes().iter(); |
| Bytes { it } |
| } |
| |
| /// Returns an iterator over the Unicode scalar values in this byte string. |
| /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint |
| /// is yielded instead. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"; |
| /// let chars: Vec<char> = bs.chars().collect(); |
| /// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars); |
| /// ``` |
| /// |
| /// Codepoints can also be iterated over in reverse: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"; |
| /// let chars: Vec<char> = bs.chars().rev().collect(); |
| /// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars); |
| /// ``` |
| #[inline] |
| fn chars(&self) -> Chars<'_> { |
| // The iterator borrows the underlying bytes. |
| let haystack = self.as_bytes(); |
| Chars::new(haystack) |
| } |
| |
| /// Returns an iterator over the Unicode scalar values in this byte string |
| /// along with their starting and ending byte index positions. If invalid |
| /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded |
| /// instead. |
| /// |
| /// Note that this is slightly different from the `CharIndices` iterator |
| /// provided by the standard library. Aside from working on possibly |
| /// invalid UTF-8, this iterator provides both the corresponding starting |
| /// and ending byte indices of each codepoint yielded. The ending position |
| /// is necessary to slice the original byte string when invalid UTF-8 bytes |
| /// are converted into a Unicode replacement codepoint, since a single |
| /// replacement codepoint can substitute anywhere from 1 to 3 invalid bytes |
| /// (inclusive). |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"; |
| /// let chars: Vec<(usize, usize, char)> = bs.char_indices().collect(); |
| /// assert_eq!(chars, vec![ |
| /// (0, 3, '☃'), |
| /// (3, 4, '\u{FFFD}'), |
| /// (4, 8, '𝞃'), |
| /// (8, 10, '\u{FFFD}'), |
| /// (10, 11, 'a'), |
| /// ]); |
| /// ``` |
| /// |
| /// Codepoints can also be iterated over in reverse: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61"; |
| /// let chars: Vec<(usize, usize, char)> = bs |
| /// .char_indices() |
| /// .rev() |
| /// .collect(); |
| /// assert_eq!(chars, vec![ |
| /// (10, 11, 'a'), |
| /// (8, 10, '\u{FFFD}'), |
| /// (4, 8, '𝞃'), |
| /// (3, 4, '\u{FFFD}'), |
| /// (0, 3, '☃'), |
| /// ]); |
| /// ``` |
| #[inline] |
| fn char_indices(&self) -> CharIndices<'_> { |
| // The iterator borrows the underlying bytes. |
| let haystack = self.as_bytes(); |
| CharIndices::new(haystack) |
| } |
| |
| /// Iterate over chunks of valid UTF-8. |
| /// |
| /// The iterator returned yields chunks of valid UTF-8 separated by invalid |
| /// UTF-8 bytes, if they exist. Invalid UTF-8 bytes are always 1-3 bytes, |
| /// which are determined via the "substitution of maximal subparts" |
| /// strategy described in the docs for the |
| /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy) |
| /// method. |
| /// |
| /// # Examples |
| /// |
| /// This example shows how to gather all valid and invalid chunks from a |
| /// byte slice: |
| /// |
| /// ``` |
| /// use bstr::{ByteSlice, Utf8Chunk}; |
| /// |
| /// let bytes = b"foo\xFD\xFEbar\xFF"; |
| /// |
| /// let (mut valid_chunks, mut invalid_chunks) = (vec![], vec![]); |
| /// for chunk in bytes.utf8_chunks() { |
| /// if !chunk.valid().is_empty() { |
| /// valid_chunks.push(chunk.valid()); |
| /// } |
| /// if !chunk.invalid().is_empty() { |
| /// invalid_chunks.push(chunk.invalid()); |
| /// } |
| /// } |
| /// |
| /// assert_eq!(valid_chunks, vec!["foo", "bar"]); |
| /// assert_eq!(invalid_chunks, vec![b"\xFD", b"\xFE", b"\xFF"]); |
| /// ``` |
| #[inline] |
| fn utf8_chunks(&self) -> Utf8Chunks<'_> { |
| // The iterator starts out holding the entire byte string. |
| let bytes = self.as_bytes(); |
| Utf8Chunks { bytes } |
| } |
| |
| /// Returns an iterator over the grapheme clusters in this byte string. |
| /// If invalid UTF-8 is encountered, then the Unicode replacement codepoint |
| /// is yielded instead. |
| /// |
| /// # Examples |
| /// |
| /// This example shows how multiple codepoints can combine to form a |
| /// single grapheme cluster: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes(); |
| /// let graphemes: Vec<&str> = bs.graphemes().collect(); |
| /// assert_eq!(vec!["à̖", "🇺🇸"], graphemes); |
| /// ``` |
| /// |
| /// This shows that graphemes can be iterated over in reverse: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes(); |
| /// let graphemes: Vec<&str> = bs.graphemes().rev().collect(); |
| /// assert_eq!(vec!["🇺🇸", "à̖"], graphemes); |
| /// ``` |
| #[cfg(feature = "unicode")] |
| #[inline] |
| fn graphemes(&self) -> Graphemes<'_> { |
| // The iterator borrows the underlying bytes. |
| let haystack = self.as_bytes(); |
| Graphemes::new(haystack) |
| } |
| |
| /// Returns an iterator over the grapheme clusters in this byte string |
| /// along with their starting and ending byte index positions. If invalid |
| /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded |
| /// instead. |
| /// |
| /// # Examples |
| /// |
| /// This example shows how to get the byte offsets of each individual |
| /// grapheme cluster: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = "a\u{0300}\u{0316}\u{1F1FA}\u{1F1F8}".as_bytes(); |
| /// let graphemes: Vec<(usize, usize, &str)> = |
| /// bs.grapheme_indices().collect(); |
| /// assert_eq!(vec![(0, 5, "à̖"), (5, 13, "🇺🇸")], graphemes); |
| /// ``` |
| /// |
| /// This example shows what happens when invalid UTF-8 is encountered. Note |
| /// that the offsets are valid indices into the original string, and do |
| /// not necessarily correspond to the length of the `&str` returned! |
| /// |
| /// ``` |
| /// # #[cfg(all(feature = "alloc"))] { |
| /// use bstr::{ByteSlice, ByteVec}; |
| /// |
| /// let mut bytes = vec![]; |
| /// bytes.push_str("a\u{0300}\u{0316}"); |
| /// bytes.push(b'\xFF'); |
| /// bytes.push_str("\u{1F1FA}\u{1F1F8}"); |
| /// |
| /// let graphemes: Vec<(usize, usize, &str)> = |
| /// bytes.grapheme_indices().collect(); |
| /// assert_eq!( |
| /// graphemes, |
| /// vec![(0, 5, "à̖"), (5, 6, "\u{FFFD}"), (6, 14, "🇺🇸")] |
| /// ); |
| /// # } |
| /// ``` |
| #[cfg(feature = "unicode")] |
| #[inline] |
| fn grapheme_indices(&self) -> GraphemeIndices<'_> { |
| // The iterator borrows the underlying bytes. |
| let haystack = self.as_bytes(); |
| GraphemeIndices::new(haystack) |
| } |
| |
| /// Returns an iterator over the words in this byte string. If invalid |
| /// UTF-8 is encountered, then the Unicode replacement codepoint is yielded |
| /// instead. |
| /// |
| /// This is similar to |
| /// [`words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks), |
| /// except it only returns elements that contain a "word" character. A word |
| /// character is defined by UTS #18 (Annex C) to be the combination of the |
| /// `Alphabetic` and `Join_Control` properties, along with the |
| /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general |
| /// categories. |
| /// |
| /// Since words are made up of one or more codepoints, this iterator |
| /// yields `&str` elements. When invalid UTF-8 is encountered, replacement |
| /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#; |
| /// let words: Vec<&str> = bs.words().collect(); |
| /// assert_eq!(words, vec![ |
| /// "The", "quick", "brown", "fox", "can't", |
| /// "jump", "32.3", "feet", "right", |
| /// ]); |
| /// ``` |
#[cfg(feature = "unicode")]
#[inline]
fn words(&self) -> Words<'_> {
    // The iterator borrows the underlying bytes for as long as `self` lives.
    let bytes = self.as_bytes();
    Words::new(bytes)
}
| |
| /// Returns an iterator over the words in this byte string along with |
| /// their starting and ending byte index positions. |
| /// |
| /// This is similar to |
| /// [`words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices), |
| /// except it only returns elements that contain a "word" character. A word |
| /// character is defined by UTS #18 (Annex C) to be the combination of the |
| /// `Alphabetic` and `Join_Control` properties, along with the |
| /// `Decimal_Number`, `Mark` and `Connector_Punctuation` general |
| /// categories. |
| /// |
| /// Since words are made up of one or more codepoints, this iterator |
| /// yields `&str` elements. When invalid UTF-8 is encountered, replacement |
| /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). |
| /// |
| /// # Examples |
| /// |
| /// This example shows how to get the byte offsets of each individual |
| /// word: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = b"can't jump 32.3 feet"; |
| /// let words: Vec<(usize, usize, &str)> = bs.word_indices().collect(); |
| /// assert_eq!(words, vec![ |
| /// (0, 5, "can't"), |
| /// (6, 10, "jump"), |
| /// (11, 15, "32.3"), |
| /// (16, 20, "feet"), |
| /// ]); |
| /// ``` |
#[cfg(feature = "unicode")]
#[inline]
fn word_indices(&self) -> WordIndices<'_> {
    // The iterator borrows the underlying bytes for as long as `self` lives.
    let bytes = self.as_bytes();
    WordIndices::new(bytes)
}
| |
| /// Returns an iterator over the words in this byte string, along with |
| /// all breaks between the words. Concatenating all elements yielded by |
| /// the iterator results in the original string (modulo Unicode replacement |
| /// codepoint substitutions if invalid UTF-8 is encountered). |
| /// |
| /// Since words are made up of one or more codepoints, this iterator |
| /// yields `&str` elements. When invalid UTF-8 is encountered, replacement |
| /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#; |
| /// let words: Vec<&str> = bs.words_with_breaks().collect(); |
| /// assert_eq!(words, vec![ |
| /// "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", |
| /// " ", "fox", " ", "can't", " ", "jump", " ", "32.3", " ", "feet", |
| /// ",", " ", "right", "?", |
| /// ]); |
| /// ``` |
#[cfg(feature = "unicode")]
#[inline]
fn words_with_breaks(&self) -> WordsWithBreaks<'_> {
    // The iterator borrows the underlying bytes for as long as `self` lives.
    let bytes = self.as_bytes();
    WordsWithBreaks::new(bytes)
}
| |
| /// Returns an iterator over the words and their byte offsets in this |
| /// byte string, along with all breaks between the words. Concatenating |
| /// all elements yielded by the iterator results in the original string |
| /// (modulo Unicode replacement codepoint substitutions if invalid UTF-8 is |
| /// encountered). |
| /// |
| /// Since words are made up of one or more codepoints, this iterator |
| /// yields `&str` elements. When invalid UTF-8 is encountered, replacement |
| /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). |
| /// |
| /// # Examples |
| /// |
| /// This example shows how to get the byte offsets of each individual |
| /// word: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = b"can't jump 32.3 feet"; |
| /// let words: Vec<(usize, usize, &str)> = |
| /// bs.words_with_break_indices().collect(); |
| /// assert_eq!(words, vec![ |
| /// (0, 5, "can't"), |
| /// (5, 6, " "), |
| /// (6, 10, "jump"), |
| /// (10, 11, " "), |
| /// (11, 15, "32.3"), |
| /// (15, 16, " "), |
| /// (16, 20, "feet"), |
| /// ]); |
| /// ``` |
#[cfg(feature = "unicode")]
#[inline]
fn words_with_break_indices(&self) -> WordsWithBreakIndices<'_> {
    // The iterator borrows the underlying bytes for as long as `self` lives.
    let bytes = self.as_bytes();
    WordsWithBreakIndices::new(bytes)
}
| |
| /// Returns an iterator over the sentences in this byte string. |
| /// |
| /// Typically, a sentence will include its trailing punctuation and |
| /// whitespace. Concatenating all elements yielded by the iterator |
| /// results in the original string (modulo Unicode replacement codepoint |
| /// substitutions if invalid UTF-8 is encountered). |
| /// |
| /// Since sentences are made up of one or more codepoints, this iterator |
| /// yields `&str` elements. When invalid UTF-8 is encountered, replacement |
| /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = b"I want this. Not that. Right now."; |
| /// let sentences: Vec<&str> = bs.sentences().collect(); |
| /// assert_eq!(sentences, vec![ |
| /// "I want this. ", |
| /// "Not that. ", |
| /// "Right now.", |
| /// ]); |
| /// ``` |
#[cfg(feature = "unicode")]
#[inline]
fn sentences(&self) -> Sentences<'_> {
    // The iterator borrows the underlying bytes for as long as `self` lives.
    let bytes = self.as_bytes();
    Sentences::new(bytes)
}
| |
| /// Returns an iterator over the sentences in this byte string along with |
| /// their starting and ending byte index positions. |
| /// |
| /// Typically, a sentence will include its trailing punctuation and |
| /// whitespace. Concatenating all elements yielded by the iterator |
| /// results in the original string (modulo Unicode replacement codepoint |
| /// substitutions if invalid UTF-8 is encountered). |
| /// |
| /// Since sentences are made up of one or more codepoints, this iterator |
| /// yields `&str` elements. When invalid UTF-8 is encountered, replacement |
| /// codepoints are [substituted](index.html#handling-of-invalid-utf-8). |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let bs = b"I want this. Not that. Right now."; |
| /// let sentences: Vec<(usize, usize, &str)> = |
| /// bs.sentence_indices().collect(); |
| /// assert_eq!(sentences, vec![ |
| /// (0, 13, "I want this. "), |
| /// (13, 23, "Not that. "), |
| /// (23, 33, "Right now."), |
| /// ]); |
| /// ``` |
#[cfg(feature = "unicode")]
#[inline]
fn sentence_indices(&self) -> SentenceIndices<'_> {
    // The iterator borrows the underlying bytes for as long as `self` lives.
    let bytes = self.as_bytes();
    SentenceIndices::new(bytes)
}
| |
| /// An iterator over all lines in a byte string, without their |
| /// terminators. |
| /// |
| /// For this iterator, the only line terminators recognized are `\r\n` and |
| /// `\n`. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = b"\ |
| /// foo |
| /// |
| /// bar\r |
| /// baz |
| /// |
| /// |
| /// quux"; |
| /// let lines: Vec<&[u8]> = s.lines().collect(); |
| /// assert_eq!(lines, vec![ |
| /// B("foo"), B(""), B("bar"), B("baz"), B(""), B(""), B("quux"), |
| /// ]); |
| /// ``` |
| #[inline] |
| fn lines(&self) -> Lines<'_> { |
| Lines::new(self.as_bytes()) |
| } |
| |
| /// An iterator over all lines in a byte string, including their |
| /// terminators. |
| /// |
| /// For this iterator, the only line terminator recognized is `\n`. (Since |
| /// line terminators are included, this also handles `\r\n` line endings.) |
| /// |
| /// Line terminators are only included if they are present in the original |
| /// byte string. For example, the last line in a byte string may not end |
| /// with a line terminator. |
| /// |
| /// Concatenating all elements yielded by this iterator is guaranteed to |
| /// yield the original byte string. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = b"\ |
| /// foo |
| /// |
| /// bar\r |
| /// baz |
| /// |
| /// |
| /// quux"; |
| /// let lines: Vec<&[u8]> = s.lines_with_terminator().collect(); |
| /// assert_eq!(lines, vec![ |
| /// B("foo\n"), |
| /// B("\n"), |
| /// B("bar\r\n"), |
| /// B("baz\n"), |
| /// B("\n"), |
| /// B("\n"), |
| /// B("quux"), |
| /// ]); |
| /// ``` |
| #[inline] |
| fn lines_with_terminator(&self) -> LinesWithTerminator<'_> { |
| LinesWithTerminator::new(self.as_bytes()) |
| } |
| |
| /// Return a byte string slice with leading and trailing whitespace |
| /// removed. |
| /// |
| /// Whitespace is defined according to the terms of the `White_Space` |
| /// Unicode property. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B(" foo\tbar\t\u{2003}\n"); |
| /// assert_eq!(s.trim(), B("foo\tbar")); |
| /// ``` |
#[cfg(feature = "unicode")]
#[inline]
fn trim(&self) -> &[u8] {
    // Strip the front first, then strip the back of what remains.
    let without_leading = self.trim_start();
    without_leading.trim_end()
}
| |
| /// Return a byte string slice with leading whitespace removed. |
| /// |
| /// Whitespace is defined according to the terms of the `White_Space` |
| /// Unicode property. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B(" foo\tbar\t\u{2003}\n"); |
| /// assert_eq!(s.trim_start(), B("foo\tbar\t\u{2003}\n")); |
| /// ``` |
#[cfg(feature = "unicode")]
#[inline]
fn trim_start(&self) -> &[u8] {
    let bytes = self.as_bytes();
    // Slice from the offset reported by the forward whitespace scan.
    let first_kept = whitespace_len_fwd(bytes);
    &bytes[first_kept..]
}
| |
| /// Return a byte string slice with trailing whitespace removed. |
| /// |
| /// Whitespace is defined according to the terms of the `White_Space` |
| /// Unicode property. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B(" foo\tbar\t\u{2003}\n"); |
| /// assert_eq!(s.trim_end(), B(" foo\tbar")); |
| /// ``` |
#[cfg(feature = "unicode")]
#[inline]
fn trim_end(&self) -> &[u8] {
    let bytes = self.as_bytes();
    // Slice up to the offset reported by the reverse whitespace scan.
    let kept_len = whitespace_len_rev(bytes);
    &bytes[..kept_len]
}
| |
| /// Return a byte string slice with leading and trailing characters |
| /// satisfying the given predicate removed. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = b"123foo5bar789"; |
| /// assert_eq!(s.trim_with(|c| c.is_numeric()), B("foo5bar")); |
| /// ``` |
| #[inline] |
| fn trim_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] { |
| self.trim_start_with(&mut trim).trim_end_with(&mut trim) |
| } |
| |
| /// Return a byte string slice with leading characters satisfying the given |
| /// predicate removed. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = b"123foo5bar789"; |
| /// assert_eq!(s.trim_start_with(|c| c.is_numeric()), B("foo5bar789")); |
| /// ``` |
| #[inline] |
| fn trim_start_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] { |
| for (s, _, ch) in self.char_indices() { |
| if !trim(ch) { |
| return &self.as_bytes()[s..]; |
| } |
| } |
| b"" |
| } |
| |
| /// Return a byte string slice with trailing characters satisfying the |
| /// given predicate removed. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = b"123foo5bar789"; |
| /// assert_eq!(s.trim_end_with(|c| c.is_numeric()), B("123foo5bar")); |
| /// ``` |
| #[inline] |
| fn trim_end_with<F: FnMut(char) -> bool>(&self, mut trim: F) -> &[u8] { |
| for (_, e, ch) in self.char_indices().rev() { |
| if !trim(ch) { |
| return &self.as_bytes()[..e]; |
| } |
| } |
| b"" |
| } |
| |
| /// Returns a new `Vec<u8>` containing the lowercase equivalent of this |
| /// byte string. |
| /// |
| /// In this case, lowercase is defined according to the `Lowercase` Unicode |
| /// property. |
| /// |
| /// If invalid UTF-8 is seen, or if a character has no lowercase variant, |
| /// then it is written to the returned `Vec<u8>` unchanged. |
| /// |
| /// Note that some characters in this byte string may expand into multiple |
| /// characters when changing the case, so the number of bytes in the |
| /// returned `Vec<u8>` may not be equivalent to the number of bytes in |
| /// this byte string. |
| /// |
| /// If you'd like to reuse an allocation for performance reasons, then use |
| /// [`to_lowercase_into`](#method.to_lowercase_into) instead. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B("HELLO Β"); |
| /// assert_eq!("hello β".as_bytes(), s.to_lowercase().as_bytes()); |
| /// ``` |
| /// |
| /// Scripts without case are not changed: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B("农历新年"); |
| /// assert_eq!("农历新年".as_bytes(), s.to_lowercase().as_bytes()); |
| /// ``` |
| /// |
| /// Invalid UTF-8 remains as is: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ"); |
| /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), s.to_lowercase().as_bytes()); |
| /// ``` |
#[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_lowercase(&self) -> Vec<u8> {
    // Start from an empty buffer and let the `_into` variant fill it.
    let mut lowered = vec![];
    self.to_lowercase_into(&mut lowered);
    lowered
}
| |
| /// Writes the lowercase equivalent of this byte string into the given |
| /// buffer. The buffer is not cleared before written to. |
| /// |
| /// In this case, lowercase is defined according to the `Lowercase` |
| /// Unicode property. |
| /// |
| /// If invalid UTF-8 is seen, or if a character has no lowercase variant, |
| /// then it is written to the given buffer unchanged. |
| /// |
| /// Note that some characters in this byte string may expand into multiple |
| /// characters when changing the case, so the number of bytes written to |
| /// the given byte string may not be equivalent to the number of bytes in |
| /// this byte string. |
| /// |
| /// If you don't need to amortize allocation and instead prefer |
| /// convenience, then use [`to_lowercase`](#method.to_lowercase) instead. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B("HELLO Β"); |
| /// |
| /// let mut buf = vec![]; |
| /// s.to_lowercase_into(&mut buf); |
| /// assert_eq!("hello β".as_bytes(), buf.as_bytes()); |
| /// ``` |
| /// |
| /// Scripts without case are not changed: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B("农历新年"); |
| /// |
| /// let mut buf = vec![]; |
| /// s.to_lowercase_into(&mut buf); |
| /// assert_eq!("农历新年".as_bytes(), buf.as_bytes()); |
| /// ``` |
| /// |
| /// Invalid UTF-8 remains as is: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ"); |
| /// |
| /// let mut buf = vec![]; |
| /// s.to_lowercase_into(&mut buf); |
| /// assert_eq!(B(b"foo\xFFbar\xE2\x98baz"), buf.as_bytes()); |
| /// ``` |
#[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_lowercase_into(&self, buf: &mut Vec<u8>) {
    // TODO: This is the best we can do given what std exposes I think.
    // If we roll our own case handling, then we might be able to do this
    // a bit faster. We shouldn't roll our own case handling unless we
    // need to, e.g., for doing caseless matching or case folding.

    // TODO(BUG): This doesn't handle any special casing rules.

    let bytes = self.as_bytes();
    buf.reserve(bytes.len());
    for (start, end, ch) in self.char_indices() {
        match ch {
            // Replacement codepoint: copy the original source bytes for
            // this span through untouched.
            '\u{FFFD}' => buf.push_str(&bytes[start..end]),
            // ASCII takes the cheap single-character path.
            c if c.is_ascii() => buf.push_char(c.to_ascii_lowercase()),
            // Everything else may map to more than one codepoint.
            c => {
                for lower in c.to_lowercase() {
                    buf.push_char(lower);
                }
            }
        }
    }
}
| |
| /// Returns a new `Vec<u8>` containing the ASCII lowercase equivalent of |
| /// this byte string. |
| /// |
| /// In this case, lowercase is only defined in ASCII letters. Namely, the |
| /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged. |
| /// In particular, the length of the byte string returned is always |
| /// equivalent to the length of this byte string. |
| /// |
| /// If you'd like to reuse an allocation for performance reasons, then use |
| /// [`make_ascii_lowercase`](#method.make_ascii_lowercase) to perform |
| /// the conversion in place. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B("HELLO Β"); |
| /// assert_eq!("hello Β".as_bytes(), s.to_ascii_lowercase().as_bytes()); |
| /// ``` |
| /// |
| /// Invalid UTF-8 remains as is: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B(b"FOO\xFFBAR\xE2\x98BAZ"); |
| /// assert_eq!(s.to_ascii_lowercase(), B(b"foo\xFFbar\xE2\x98baz")); |
| /// ``` |
#[cfg(feature = "alloc")]
#[inline]
fn to_ascii_lowercase(&self) -> Vec<u8> {
    // Delegate to the slice's own ASCII lowercasing routine.
    let bytes = self.as_bytes();
    bytes.to_ascii_lowercase()
}
| |
| /// Convert this byte string to its lowercase ASCII equivalent in place. |
| /// |
| /// In this case, lowercase is only defined in ASCII letters. Namely, the |
| /// letters `A-Z` are converted to `a-z`. All other bytes remain unchanged. |
| /// |
| /// If you don't need to do the conversion in |
| /// place and instead prefer convenience, then use |
| /// [`to_ascii_lowercase`](#method.to_ascii_lowercase) instead. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let mut s = <Vec<u8>>::from("HELLO Β"); |
| /// s.make_ascii_lowercase(); |
| /// assert_eq!(s, "hello Β".as_bytes()); |
| /// ``` |
| /// |
| /// Invalid UTF-8 remains as is: |
| /// |
| /// ``` |
| /// # #[cfg(feature = "alloc")] { |
| /// use bstr::{B, ByteSlice, ByteVec}; |
| /// |
| /// let mut s = <Vec<u8>>::from_slice(b"FOO\xFFBAR\xE2\x98BAZ"); |
| /// s.make_ascii_lowercase(); |
| /// assert_eq!(s, B(b"foo\xFFbar\xE2\x98baz")); |
| /// # } |
| /// ``` |
| #[inline] |
| fn make_ascii_lowercase(&mut self) { |
| self.as_bytes_mut().make_ascii_lowercase(); |
| } |
| |
| /// Returns a new `Vec<u8>` containing the uppercase equivalent of this |
| /// byte string. |
| /// |
| /// In this case, uppercase is defined according to the `Uppercase` |
| /// Unicode property. |
| /// |
| /// If invalid UTF-8 is seen, or if a character has no uppercase variant, |
| /// then it is written to the returned `Vec<u8>` unchanged. |
| /// |
| /// Note that some characters in this byte string may expand into multiple |
| /// characters when changing the case, so the number of bytes in the |
| /// returned `Vec<u8>` may not be equivalent to the number of bytes in |
| /// this byte string. |
| /// |
| /// If you'd like to reuse an allocation for performance reasons, then use |
| /// [`to_uppercase_into`](#method.to_uppercase_into) instead. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B("hello β"); |
| /// assert_eq!(s.to_uppercase(), B("HELLO Β")); |
| /// ``` |
| /// |
| /// Scripts without case are not changed: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B("农历新年"); |
| /// assert_eq!(s.to_uppercase(), B("农历新年")); |
| /// ``` |
| /// |
| /// Invalid UTF-8 remains as is: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B(b"foo\xFFbar\xE2\x98baz"); |
| /// assert_eq!(s.to_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ")); |
| /// ``` |
#[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_uppercase(&self) -> Vec<u8> {
    // Start from an empty buffer and let the `_into` variant fill it.
    let mut raised = vec![];
    self.to_uppercase_into(&mut raised);
    raised
}
| |
| /// Writes the uppercase equivalent of this byte string into the given |
| /// buffer. The buffer is not cleared before written to. |
| /// |
| /// In this case, uppercase is defined according to the `Uppercase` |
| /// Unicode property. |
| /// |
| /// If invalid UTF-8 is seen, or if a character has no uppercase variant, |
| /// then it is written to the given buffer unchanged. |
| /// |
| /// Note that some characters in this byte string may expand into multiple |
| /// characters when changing the case, so the number of bytes written to |
| /// the given byte string may not be equivalent to the number of bytes in |
| /// this byte string. |
| /// |
| /// If you don't need to amortize allocation and instead prefer |
| /// convenience, then use [`to_uppercase`](#method.to_uppercase) instead. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B("hello β"); |
| /// |
| /// let mut buf = vec![]; |
| /// s.to_uppercase_into(&mut buf); |
| /// assert_eq!(buf, B("HELLO Β")); |
| /// ``` |
| /// |
| /// Scripts without case are not changed: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B("农历新年"); |
| /// |
| /// let mut buf = vec![]; |
| /// s.to_uppercase_into(&mut buf); |
| /// assert_eq!(buf, B("农历新年")); |
| /// ``` |
| /// |
| /// Invalid UTF-8 remains as is: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B(b"foo\xFFbar\xE2\x98baz"); |
| /// |
| /// let mut buf = vec![]; |
| /// s.to_uppercase_into(&mut buf); |
| /// assert_eq!(buf, B(b"FOO\xFFBAR\xE2\x98BAZ")); |
| /// ``` |
#[cfg(all(feature = "alloc", feature = "unicode"))]
#[inline]
fn to_uppercase_into(&self, buf: &mut Vec<u8>) {
    // TODO: This is the best we can do given what std exposes I think.
    // If we roll our own case handling, then we might be able to do this
    // a bit faster. We shouldn't roll our own case handling unless we
    // need to, e.g., for doing caseless matching or case folding.
    let bytes = self.as_bytes();
    buf.reserve(bytes.len());
    for (start, end, ch) in self.char_indices() {
        match ch {
            // Replacement codepoint: copy the original source bytes for
            // this span through untouched.
            '\u{FFFD}' => buf.push_str(&bytes[start..end]),
            // ASCII takes the cheap single-character path.
            c if c.is_ascii() => buf.push_char(c.to_ascii_uppercase()),
            // Everything else may map to more than one codepoint.
            c => {
                for raised in c.to_uppercase() {
                    buf.push_char(raised);
                }
            }
        }
    }
}
| |
| /// Returns a new `Vec<u8>` containing the ASCII uppercase equivalent of |
| /// this byte string. |
| /// |
| /// In this case, uppercase is only defined in ASCII letters. Namely, the |
| /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged. |
| /// In particular, the length of the byte string returned is always |
| /// equivalent to the length of this byte string. |
| /// |
| /// If you'd like to reuse an allocation for performance reasons, then use |
| /// [`make_ascii_uppercase`](#method.make_ascii_uppercase) to perform |
| /// the conversion in place. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B("hello β"); |
| /// assert_eq!(s.to_ascii_uppercase(), B("HELLO β")); |
| /// ``` |
| /// |
| /// Invalid UTF-8 remains as is: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = B(b"foo\xFFbar\xE2\x98baz"); |
| /// assert_eq!(s.to_ascii_uppercase(), B(b"FOO\xFFBAR\xE2\x98BAZ")); |
| /// ``` |
#[cfg(feature = "alloc")]
#[inline]
fn to_ascii_uppercase(&self) -> Vec<u8> {
    // Delegate to the slice's own ASCII uppercasing routine.
    let bytes = self.as_bytes();
    bytes.to_ascii_uppercase()
}
| |
| /// Convert this byte string to its uppercase ASCII equivalent in place. |
| /// |
| /// In this case, uppercase is only defined in ASCII letters. Namely, the |
| /// letters `a-z` are converted to `A-Z`. All other bytes remain unchanged. |
| /// |
| /// If you don't need to do the conversion in |
| /// place and instead prefer convenience, then use |
| /// [`to_ascii_uppercase`](#method.to_ascii_uppercase) instead. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let mut s = <Vec<u8>>::from("hello β"); |
| /// s.make_ascii_uppercase(); |
| /// assert_eq!(s, B("HELLO β")); |
| /// ``` |
| /// |
| /// Invalid UTF-8 remains as is: |
| /// |
| /// ``` |
| /// # #[cfg(feature = "alloc")] { |
| /// use bstr::{B, ByteSlice, ByteVec}; |
| /// |
| /// let mut s = <Vec<u8>>::from_slice(b"foo\xFFbar\xE2\x98baz"); |
| /// s.make_ascii_uppercase(); |
| /// assert_eq!(s, B(b"FOO\xFFBAR\xE2\x98BAZ")); |
| /// # } |
| /// ``` |
| #[inline] |
| fn make_ascii_uppercase(&mut self) { |
| self.as_bytes_mut().make_ascii_uppercase(); |
| } |
| |
| /// Escapes this byte string into a sequence of `char` values. |
| /// |
| /// When the sequence of `char` values is concatenated into a string, the |
| /// result is always valid UTF-8. Any unprintable or invalid UTF-8 in this |
| /// byte string are escaped using `\xNN` notation. Moreover, the |
| /// characters `\0`, `\r`, `\n`, `\t` and `\` are escaped as well. |
| /// |
| /// This is useful when one wants to get a human readable view of the raw |
| /// bytes that is also valid UTF-8. |
| /// |
| /// The iterator returned implements the `Display` trait. So one can do |
| /// `b"foo\xFFbar".escape_bytes().to_string()` to get a `String` with its |
| /// bytes escaped. |
| /// |
| /// The dual of this function is [`ByteVec::unescape_bytes`]. |
| /// |
| /// Note that this is similar to, but not equivalent to the `Debug` |
| /// implementation on [`BStr`] and [`BString`]. The `Debug` implementations |
| /// also use the debug representation for all Unicode codepoints. However, |
| /// this escaping routine only escapes individual bytes. All Unicode |
| /// codepoints above `U+007F` are passed through unchanged without any |
| /// escaping. |
| /// |
| /// # Examples |
| /// |
| /// ``` |
| /// # #[cfg(feature = "alloc")] { |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// assert_eq!(r"foo\xFFbar", b"foo\xFFbar".escape_bytes().to_string()); |
| /// assert_eq!(r"foo\nbar", b"foo\nbar".escape_bytes().to_string()); |
| /// assert_eq!(r"foo\tbar", b"foo\tbar".escape_bytes().to_string()); |
| /// assert_eq!(r"foo\\bar", b"foo\\bar".escape_bytes().to_string()); |
| /// assert_eq!(r"foo☃bar", B("foo☃bar").escape_bytes().to_string()); |
| /// # } |
| /// ``` |
| #[inline] |
| fn escape_bytes(&self) -> EscapeBytes<'_> { |
| EscapeBytes::new(self.as_bytes()) |
| } |
| |
| /// Reverse the bytes in this string, in place. |
| /// |
| /// This is not necessarily a well formed operation! For example, if this |
| /// byte string contains valid UTF-8 that isn't ASCII, then reversing the |
| /// string will likely result in invalid UTF-8 and otherwise non-sensical |
| /// content. |
| /// |
| /// Note that this is equivalent to the generic `[u8]::reverse` method. |
| /// This method is provided to permit callers to explicitly differentiate |
| /// between reversing bytes, codepoints and graphemes. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let mut s = <Vec<u8>>::from("hello"); |
| /// s.reverse_bytes(); |
| /// assert_eq!(s, "olleh".as_bytes()); |
| /// ``` |
| #[inline] |
| fn reverse_bytes(&mut self) { |
| self.as_bytes_mut().reverse(); |
| } |
| |
| /// Reverse the codepoints in this string, in place. |
| /// |
| /// If this byte string is valid UTF-8, then its reversal by codepoint |
| /// is also guaranteed to be valid UTF-8. |
| /// |
| /// This operation is equivalent to the following, but without allocating: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let mut s = <Vec<u8>>::from("foo☃bar"); |
| /// |
| /// let mut chars: Vec<char> = s.chars().collect(); |
| /// chars.reverse(); |
| /// |
| /// let reversed: String = chars.into_iter().collect(); |
| /// assert_eq!(reversed, "rab☃oof"); |
| /// ``` |
| /// |
| /// Note that this is not necessarily a well formed operation. For example, |
| /// if this byte string contains grapheme clusters with more than one |
| /// codepoint, then those grapheme clusters will not necessarily be |
| /// preserved. If you'd like to preserve grapheme clusters, then use |
| /// [`reverse_graphemes`](#method.reverse_graphemes) instead. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let mut s = <Vec<u8>>::from("foo☃bar"); |
| /// s.reverse_chars(); |
| /// assert_eq!(s, "rab☃oof".as_bytes()); |
| /// ``` |
| /// |
| /// This example shows that not all reversals lead to a well formed string. |
| /// For example, in this case, combining marks are used to put accents over |
| /// some letters, and those accent marks must appear after the codepoints |
| /// they modify. |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let mut s = <Vec<u8>>::from("résumé"); |
| /// s.reverse_chars(); |
| /// assert_eq!(s, B(b"\xCC\x81emus\xCC\x81er")); |
| /// ``` |
| /// |
| /// A word of warning: the above example relies on the fact that |
| /// `résumé` is in decomposed normal form, which means there are separate |
| /// codepoints for the accents above `e`. If it is instead in composed |
| /// normal form, then the example works: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let mut s = <Vec<u8>>::from("résumé"); |
| /// s.reverse_chars(); |
| /// assert_eq!(s, B("émusér")); |
| /// ``` |
| /// |
| /// The point here is to be cautious and not assume that just because |
| /// `reverse_chars` works in one case, that it therefore works in all |
| /// cases. |
| #[inline] |
| fn reverse_chars(&mut self) { |
| let mut i = 0; |
| loop { |
| let (_, size) = utf8::decode(&self.as_bytes()[i..]); |
| if size == 0 { |
| break; |
| } |
| if size > 1 { |
| self.as_bytes_mut()[i..i + size].reverse_bytes(); |
| } |
| i += size; |
| } |
| self.reverse_bytes(); |
| } |
| |
| /// Reverse the graphemes in this string, in place. |
| /// |
| /// If this byte string is valid UTF-8, then its reversal by grapheme |
| /// is also guaranteed to be valid UTF-8. |
| /// |
| /// This operation is equivalent to the following, but without allocating: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let mut s = <Vec<u8>>::from("foo☃bar"); |
| /// |
| /// let mut graphemes: Vec<&str> = s.graphemes().collect(); |
| /// graphemes.reverse(); |
| /// |
| /// let reversed = graphemes.concat(); |
| /// assert_eq!(reversed, "rab☃oof"); |
| /// ``` |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let mut s = <Vec<u8>>::from("foo☃bar"); |
| /// s.reverse_graphemes(); |
| /// assert_eq!(s, "rab☃oof".as_bytes()); |
| /// ``` |
| /// |
| /// This example shows how this correctly handles grapheme clusters, |
| /// unlike `reverse_chars`. |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// let mut s = <Vec<u8>>::from("résumé"); |
| /// s.reverse_graphemes(); |
| /// assert_eq!(s, "émusér".as_bytes()); |
| /// ``` |
#[cfg(feature = "unicode")]
#[inline]
fn reverse_graphemes(&mut self) {
    use crate::unicode::decode_grapheme;

    // Pass 1: reverse the bytes of every multi-byte grapheme in place.
    // Pass 2 (the final whole-buffer reversal) then restores each
    // grapheme's internal byte order while flipping the grapheme order.
    let mut offset = 0;
    loop {
        let width = decode_grapheme(&self.as_bytes()[offset..]).1;
        match width {
            // A zero-length decode ends the scan.
            0 => break,
            // A single byte is its own reversal.
            1 => {}
            _ => self.as_bytes_mut()[offset..offset + width].reverse_bytes(),
        }
        offset += width;
    }
    self.reverse_bytes();
}
| |
| /// Returns true if and only if every byte in this byte string is ASCII. |
| /// |
| /// ASCII is an encoding that defines 128 codepoints. A byte corresponds to |
| /// an ASCII codepoint if and only if it is in the inclusive range |
| /// `[0, 127]`. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// assert!(B("abc").is_ascii()); |
| /// assert!(!B("☃βツ").is_ascii()); |
| /// assert!(!B(b"\xFF").is_ascii()); |
| /// ``` |
| #[inline] |
| fn is_ascii(&self) -> bool { |
| ascii::first_non_ascii_byte(self.as_bytes()) == self.as_bytes().len() |
| } |
| |
| /// Returns true if and only if the entire byte string is valid UTF-8. |
| /// |
| /// If you need location information about where a byte string's first |
| /// invalid UTF-8 byte is, then use the [`to_str`](#method.to_str) method. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// assert!(B("abc").is_utf8()); |
| /// assert!(B("☃βツ").is_utf8()); |
| /// // invalid bytes |
| /// assert!(!B(b"abc\xFF").is_utf8()); |
| /// // surrogate encoding |
| /// assert!(!B(b"\xED\xA0\x80").is_utf8()); |
| /// // incomplete sequence |
| /// assert!(!B(b"\xF0\x9D\x9Ca").is_utf8()); |
| /// // overlong sequence |
| /// assert!(!B(b"\xF0\x82\x82\xAC").is_utf8()); |
| /// ``` |
| #[inline] |
| fn is_utf8(&self) -> bool { |
| utf8::validate(self.as_bytes()).is_ok() |
| } |
| |
| /// Returns the last byte in this byte string, if it's non-empty. If this |
| /// byte string is empty, this returns `None`. |
| /// |
| /// Note that this is like the generic `[u8]::last`, except this returns |
| /// the byte by value instead of a reference to the byte. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::ByteSlice; |
| /// |
| /// assert_eq!(Some(b'z'), b"baz".last_byte()); |
| /// assert_eq!(None, b"".last_byte()); |
| /// ``` |
| #[inline] |
| fn last_byte(&self) -> Option<u8> { |
| let bytes = self.as_bytes(); |
| bytes.get(bytes.len().saturating_sub(1)).map(|&b| b) |
| } |
| |
| /// Returns the index of the first non-ASCII byte in this byte string (if |
| /// any such indices exist). Specifically, it returns the index of the |
| /// first byte with a value greater than or equal to `0x80`. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{ByteSlice, B}; |
| /// |
| /// assert_eq!(Some(3), b"abc\xff".find_non_ascii_byte()); |
| /// assert_eq!(None, b"abcde".find_non_ascii_byte()); |
| /// assert_eq!(Some(0), B("😀").find_non_ascii_byte()); |
| /// ``` |
| #[inline] |
| fn find_non_ascii_byte(&self) -> Option<usize> { |
| let index = ascii::first_non_ascii_byte(self.as_bytes()); |
| if index == self.as_bytes().len() { |
| None |
| } else { |
| Some(index) |
| } |
| } |
| } |
| |
| /// A single substring searcher fixed to a particular needle. |
| /// |
| /// The purpose of this type is to permit callers to construct a substring |
| /// searcher that can be used to search haystacks without the overhead of |
| /// constructing the searcher in the first place. This is a somewhat niche |
| /// concern when it's necessary to re-use the same needle to search multiple |
| /// different haystacks with as little overhead as possible. In general, using |
| /// [`ByteSlice::find`](trait.ByteSlice.html#method.find) |
| /// or |
| /// [`ByteSlice::find_iter`](trait.ByteSlice.html#method.find_iter) |
| /// is good enough, but `Finder` is useful when you can meaningfully observe |
| /// searcher construction time in a profile. |
| /// |
| /// When the `std` feature is enabled, then this type has an `into_owned` |
| /// version which permits building a `Finder` that is not connected to the |
| /// lifetime of its needle. |
| #[derive(Clone, Debug)] |
| pub struct Finder<'a>(memmem::Finder<'a>); |
| |
| impl<'a> Finder<'a> { |
| /// Create a new finder for the given needle. |
| #[inline] |
| pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> Finder<'a> { |
| Finder(memmem::Finder::new(needle.as_ref())) |
| } |
| |
| /// Convert this finder into its owned variant, such that it no longer |
| /// borrows the needle. |
| /// |
| /// If this is already an owned finder, then this is a no-op. Otherwise, |
| /// this copies the needle. |
| /// |
| /// This is only available when the `alloc` feature is enabled. |
| #[cfg(feature = "alloc")] |
| #[inline] |
| pub fn into_owned(self) -> Finder<'static> { |
| Finder(self.0.into_owned()) |
| } |
| |
| /// Returns the needle that this finder searches for. |
| /// |
| /// Note that the lifetime of the needle returned is tied to the lifetime |
| /// of the finder, and may be shorter than the `'a` lifetime. Namely, a |
| /// finder's needle can be either borrowed or owned, so the lifetime of the |
| /// needle returned must necessarily be the shorter of the two. |
| #[inline] |
| pub fn needle(&self) -> &[u8] { |
| self.0.needle() |
| } |
| |
| /// Returns the index of the first occurrence of this needle in the given |
| /// haystack. |
| /// |
| /// The haystack may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::Finder; |
| /// |
| /// let haystack = "foo bar baz"; |
| /// assert_eq!(Some(0), Finder::new("foo").find(haystack)); |
| /// assert_eq!(Some(4), Finder::new("bar").find(haystack)); |
| /// assert_eq!(None, Finder::new("quux").find(haystack)); |
| /// ``` |
| #[inline] |
| pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> { |
| self.0.find(haystack.as_ref()) |
| } |
| } |
| |
| /// A single substring reverse searcher fixed to a particular needle. |
| /// |
| /// The purpose of this type is to permit callers to construct a substring |
| /// searcher that can be used to search haystacks without the overhead of |
| /// constructing the searcher in the first place. This is a somewhat niche |
| /// concern when it's necessary to re-use the same needle to search multiple |
| /// different haystacks with as little overhead as possible. In general, using |
| /// [`ByteSlice::rfind`](trait.ByteSlice.html#method.rfind) |
| /// or |
| /// [`ByteSlice::rfind_iter`](trait.ByteSlice.html#method.rfind_iter) |
| /// is good enough, but `FinderReverse` is useful when you can meaningfully |
| /// observe searcher construction time in a profile. |
| /// |
| /// When the `std` feature is enabled, then this type has an `into_owned` |
| /// version which permits building a `FinderReverse` that is not connected to |
| /// the lifetime of its needle. |
| #[derive(Clone, Debug)] |
| pub struct FinderReverse<'a>(memmem::FinderRev<'a>); |
| |
| impl<'a> FinderReverse<'a> { |
| /// Create a new reverse finder for the given needle. |
| #[inline] |
| pub fn new<B: ?Sized + AsRef<[u8]>>(needle: &'a B) -> FinderReverse<'a> { |
| FinderReverse(memmem::FinderRev::new(needle.as_ref())) |
| } |
| |
| /// Convert this finder into its owned variant, such that it no longer |
| /// borrows the needle. |
| /// |
| /// If this is already an owned finder, then this is a no-op. Otherwise, |
| /// this copies the needle. |
| /// |
| /// This is only available when the `alloc` feature is enabled. |
| #[cfg(feature = "alloc")] |
| #[inline] |
| pub fn into_owned(self) -> FinderReverse<'static> { |
| FinderReverse(self.0.into_owned()) |
| } |
| |
| /// Returns the needle that this finder searches for. |
| /// |
| /// Note that the lifetime of the needle returned is tied to the lifetime |
| /// of this finder, and may be shorter than the `'a` lifetime. Namely, |
| /// a finder's needle can be either borrowed or owned, so the lifetime of |
| /// the needle returned must necessarily be the shorter of the two. |
| #[inline] |
| pub fn needle(&self) -> &[u8] { |
| self.0.needle() |
| } |
| |
| /// Returns the index of the last occurrence of this needle in the given |
| /// haystack. |
| /// |
| /// The haystack may be any type that can be cheaply converted into a |
| /// `&[u8]`. This includes, but is not limited to, `&str` and `&[u8]`. |
| /// |
| /// # Complexity |
| /// |
| /// This routine is guaranteed to have worst case linear time complexity |
| /// with respect to both the needle and the haystack. That is, this runs |
| /// in `O(needle.len() + haystack.len())` time. |
| /// |
| /// This routine is also guaranteed to have worst case constant space |
| /// complexity. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::FinderReverse; |
| /// |
| /// let haystack = "foo bar baz"; |
| /// assert_eq!(Some(0), FinderReverse::new("foo").rfind(haystack)); |
| /// assert_eq!(Some(4), FinderReverse::new("bar").rfind(haystack)); |
| /// assert_eq!(None, FinderReverse::new("quux").rfind(haystack)); |
| /// ``` |
| #[inline] |
| pub fn rfind<B: AsRef<[u8]>>(&self, haystack: B) -> Option<usize> { |
| self.0.rfind(haystack.as_ref()) |
| } |
| } |
| |
| /// An iterator over non-overlapping substring matches. |
| /// |
| /// Matches are reported by the byte offset at which they begin. |
| /// |
| /// `'h` is the lifetime of the haystack while `'n` is the lifetime of the |
| /// needle. |
| #[derive(Clone, Debug)] |
| pub struct Find<'h, 'n> { |
| it: memmem::FindIter<'h, 'n>, |
| haystack: &'h [u8], |
| needle: &'n [u8], |
| } |
| |
| impl<'h, 'n> Find<'h, 'n> { |
| fn new(haystack: &'h [u8], needle: &'n [u8]) -> Find<'h, 'n> { |
| Find { it: memmem::find_iter(haystack, needle), haystack, needle } |
| } |
| } |
| |
| impl<'h, 'n> Iterator for Find<'h, 'n> { |
| type Item = usize; |
| |
| #[inline] |
| fn next(&mut self) -> Option<usize> { |
| self.it.next() |
| } |
| } |
| |
| /// An iterator over non-overlapping substring matches in reverse. |
| /// |
| /// Matches are reported by the byte offset at which they begin. |
| /// |
| /// `'h` is the lifetime of the haystack while `'n` is the lifetime of the |
| /// needle. |
| #[derive(Clone, Debug)] |
| pub struct FindReverse<'h, 'n> { |
| it: memmem::FindRevIter<'h, 'n>, |
| haystack: &'h [u8], |
| needle: &'n [u8], |
| } |
| |
| impl<'h, 'n> FindReverse<'h, 'n> { |
| fn new(haystack: &'h [u8], needle: &'n [u8]) -> FindReverse<'h, 'n> { |
| FindReverse { |
| it: memmem::rfind_iter(haystack, needle), |
| haystack, |
| needle, |
| } |
| } |
| |
| fn haystack(&self) -> &'h [u8] { |
| self.haystack |
| } |
| |
| fn needle(&self) -> &'n [u8] { |
| self.needle |
| } |
| } |
| |
| impl<'h, 'n> Iterator for FindReverse<'h, 'n> { |
| type Item = usize; |
| |
| #[inline] |
| fn next(&mut self) -> Option<usize> { |
| self.it.next() |
| } |
| } |
| |
/// An iterator over the bytes in a byte string.
///
/// Bytes are yielded by value. `'a` is the lifetime of the byte string being
/// traversed.
#[derive(Clone, Debug)]
pub struct Bytes<'a> {
    it: slice::Iter<'a, u8>,
}

impl<'a> Bytes<'a> {
    /// Views the remaining underlying data as a subslice of the original data.
    /// The returned slice has the same lifetime as the original slice, so the
    /// iterator can continue to be used while it exists.
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        let remaining: &'a [u8] = self.it.as_slice();
        remaining
    }
}

impl<'a> Iterator for Bytes<'a> {
    type Item = u8;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        self.it.next().copied()
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.it.size_hint()
    }
}

impl<'a> DoubleEndedIterator for Bytes<'a> {
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        self.it.next_back().copied()
    }
}

impl<'a> ExactSizeIterator for Bytes<'a> {
    #[inline]
    fn len(&self) -> usize {
        self.it.len()
    }
}

impl<'a> iter::FusedIterator for Bytes<'a> {}
| |
/// An iterator over the fields in a byte string, separated by whitespace.
///
/// For this iterator, whitespace is defined by the Unicode property
/// `White_Space`.
///
/// Contiguous runs of whitespace act as a single separator, so the fields
/// in `foo\t\t\n \nbar` are `foo` and `bar`.
///
/// `'a` is the lifetime of the byte string being split.
#[cfg(feature = "unicode")]
#[derive(Clone, Debug)]
pub struct Fields<'a> {
    it: FieldsWith<'a, fn(char) -> bool>,
}

#[cfg(feature = "unicode")]
impl<'a> Fields<'a> {
    /// Build a whitespace-separated field iterator over `bytes`.
    fn new(bytes: &'a [u8]) -> Fields<'a> {
        // A plain function pointer keeps the iterator type nameable.
        let is_space: fn(char) -> bool = char::is_whitespace;
        Fields { it: bytes.fields_with(is_space) }
    }
}

#[cfg(feature = "unicode")]
impl<'a> Iterator for Fields<'a> {
    type Item = &'a [u8];

    #[inline]
    fn next(&mut self) -> Option<&'a [u8]> {
        let Fields { it } = self;
        it.next()
    }
}
| |
| /// An iterator over fields in the byte string, separated by a predicate over |
| /// codepoints. |
| /// |
| /// This iterator splits a byte string based on its predicate function such |
| /// that the elements returned are separated by contiguous runs of codepoints |
| /// for which the predicate returns true. |
| /// |
| /// `'a` is the lifetime of the byte string being split, while `F` is the type |
| /// of the predicate, i.e., `FnMut(char) -> bool`. |
| #[derive(Clone, Debug)] |
| pub struct FieldsWith<'a, F> { |
| f: F, |
| bytes: &'a [u8], |
| chars: CharIndices<'a>, |
| } |
| |
| impl<'a, F: FnMut(char) -> bool> FieldsWith<'a, F> { |
| fn new(bytes: &'a [u8], f: F) -> FieldsWith<'a, F> { |
| FieldsWith { f, bytes, chars: bytes.char_indices() } |
| } |
| } |
| |
| impl<'a, F: FnMut(char) -> bool> Iterator for FieldsWith<'a, F> { |
| type Item = &'a [u8]; |
| |
| #[inline] |
| fn next(&mut self) -> Option<&'a [u8]> { |
| let (start, mut end); |
| loop { |
| match self.chars.next() { |
| None => return None, |
| Some((s, e, ch)) => { |
| if !(self.f)(ch) { |
| start = s; |
| end = e; |
| break; |
| } |
| } |
| } |
| } |
| while let Some((_, e, ch)) = self.chars.next() { |
| if (self.f)(ch) { |
| break; |
| } |
| end = e; |
| } |
| Some(&self.bytes[start..end]) |
| } |
| } |
| |
| /// An iterator over substrings in a byte string, split by a separator. |
| /// |
| /// `'h` is the lifetime of the byte string being split (the haystack), while |
| /// `'s` is the lifetime of the byte string doing the splitting. |
| #[derive(Clone, Debug)] |
| pub struct Split<'h, 's> { |
| finder: Find<'h, 's>, |
| /// The end position of the previous match of our splitter. The element |
| /// we yield corresponds to the substring starting at `last` up to the |
| /// beginning of the next match of the splitter. |
| last: usize, |
| /// Only set when iteration is complete. A corner case here is when a |
| /// splitter is matched at the end of the haystack. At that point, we still |
| /// need to yield an empty string following it. |
| done: bool, |
| } |
| |
| impl<'h, 's> Split<'h, 's> { |
| fn new(haystack: &'h [u8], splitter: &'s [u8]) -> Split<'h, 's> { |
| let finder = haystack.find_iter(splitter); |
| Split { finder, last: 0, done: false } |
| } |
| } |
| |
| impl<'h, 's> Iterator for Split<'h, 's> { |
| type Item = &'h [u8]; |
| |
| #[inline] |
| fn next(&mut self) -> Option<&'h [u8]> { |
| let haystack = self.finder.haystack; |
| match self.finder.next() { |
| Some(start) => { |
| let next = &haystack[self.last..start]; |
| self.last = start + self.finder.needle.len(); |
| Some(next) |
| } |
| None => { |
| if self.last >= haystack.len() { |
| if !self.done { |
| self.done = true; |
| Some(b"") |
| } else { |
| None |
| } |
| } else { |
| let s = &haystack[self.last..]; |
| self.last = haystack.len(); |
| self.done = true; |
| Some(s) |
| } |
| } |
| } |
| } |
| } |
| |
| /// An iterator over substrings in a byte string, split by a separator, in |
| /// reverse. |
| /// |
| /// `'h` is the lifetime of the byte string being split (the haystack), while |
| /// `'s` is the lifetime of the byte string doing the splitting. |
| #[derive(Clone, Debug)] |
| pub struct SplitReverse<'h, 's> { |
| finder: FindReverse<'h, 's>, |
| /// The end position of the previous match of our splitter. The element |
| /// we yield corresponds to the substring starting at `last` up to the |
| /// beginning of the next match of the splitter. |
| last: usize, |
| /// Only set when iteration is complete. A corner case here is when a |
| /// splitter is matched at the end of the haystack. At that point, we still |
| /// need to yield an empty string following it. |
| done: bool, |
| } |
| |
| impl<'h, 's> SplitReverse<'h, 's> { |
| fn new(haystack: &'h [u8], splitter: &'s [u8]) -> SplitReverse<'h, 's> { |
| let finder = haystack.rfind_iter(splitter); |
| SplitReverse { finder, last: haystack.len(), done: false } |
| } |
| } |
| |
| impl<'h, 's> Iterator for SplitReverse<'h, 's> { |
| type Item = &'h [u8]; |
| |
| #[inline] |
| fn next(&mut self) -> Option<&'h [u8]> { |
| let haystack = self.finder.haystack(); |
| match self.finder.next() { |
| Some(start) => { |
| let nlen = self.finder.needle().len(); |
| let next = &haystack[start + nlen..self.last]; |
| self.last = start; |
| Some(next) |
| } |
| None => { |
| if self.last == 0 { |
| if !self.done { |
| self.done = true; |
| Some(b"") |
| } else { |
| None |
| } |
| } else { |
| let s = &haystack[..self.last]; |
| self.last = 0; |
| self.done = true; |
| Some(s) |
| } |
| } |
| } |
| } |
| } |
| |
| /// An iterator over at most `n` substrings in a byte string, split by a |
| /// separator. |
| /// |
| /// `'h` is the lifetime of the byte string being split (the haystack), while |
| /// `'s` is the lifetime of the byte string doing the splitting. |
| #[derive(Clone, Debug)] |
| pub struct SplitN<'h, 's> { |
| split: Split<'h, 's>, |
| limit: usize, |
| count: usize, |
| } |
| |
| impl<'h, 's> SplitN<'h, 's> { |
| fn new( |
| haystack: &'h [u8], |
| splitter: &'s [u8], |
| limit: usize, |
| ) -> SplitN<'h, 's> { |
| let split = haystack.split_str(splitter); |
| SplitN { split, limit, count: 0 } |
| } |
| } |
| |
| impl<'h, 's> Iterator for SplitN<'h, 's> { |
| type Item = &'h [u8]; |
| |
| #[inline] |
| fn next(&mut self) -> Option<&'h [u8]> { |
| self.count += 1; |
| if self.count > self.limit || self.split.done { |
| None |
| } else if self.count == self.limit { |
| Some(&self.split.finder.haystack[self.split.last..]) |
| } else { |
| self.split.next() |
| } |
| } |
| } |
| |
| /// An iterator over at most `n` substrings in a byte string, split by a |
| /// separator, in reverse. |
| /// |
| /// `'h` is the lifetime of the byte string being split (the haystack), while |
| /// `'s` is the lifetime of the byte string doing the splitting. |
| #[derive(Clone, Debug)] |
| pub struct SplitNReverse<'h, 's> { |
| split: SplitReverse<'h, 's>, |
| limit: usize, |
| count: usize, |
| } |
| |
| impl<'h, 's> SplitNReverse<'h, 's> { |
| fn new( |
| haystack: &'h [u8], |
| splitter: &'s [u8], |
| limit: usize, |
| ) -> SplitNReverse<'h, 's> { |
| let split = haystack.rsplit_str(splitter); |
| SplitNReverse { split, limit, count: 0 } |
| } |
| } |
| |
| impl<'h, 's> Iterator for SplitNReverse<'h, 's> { |
| type Item = &'h [u8]; |
| |
| #[inline] |
| fn next(&mut self) -> Option<&'h [u8]> { |
| self.count += 1; |
| if self.count > self.limit || self.split.done { |
| None |
| } else if self.count == self.limit { |
| Some(&self.split.finder.haystack()[..self.split.last]) |
| } else { |
| self.split.next() |
| } |
| } |
| } |
| |
| /// An iterator over all lines in a byte string, without their terminators. |
| /// |
| /// For this iterator, the only line terminators recognized are `\r\n` and |
| /// `\n`. |
| /// |
| /// `'a` is the lifetime of the byte string being iterated over. |
| #[derive(Clone, Debug)] |
| pub struct Lines<'a> { |
| it: LinesWithTerminator<'a>, |
| } |
| |
| impl<'a> Lines<'a> { |
| fn new(bytes: &'a [u8]) -> Lines<'a> { |
| Lines { it: LinesWithTerminator::new(bytes) } |
| } |
| |
| /// Return a copy of the rest of the underlying bytes without affecting the |
| /// iterator itself. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = b"\ |
| /// foo |
| /// bar\r |
| /// baz"; |
| /// let mut lines = s.lines(); |
| /// assert_eq!(lines.next(), Some(B("foo"))); |
| /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz")); |
| /// ``` |
| pub fn as_bytes(&self) -> &'a [u8] { |
| self.it.bytes |
| } |
| } |
| |
| impl<'a> Iterator for Lines<'a> { |
| type Item = &'a [u8]; |
| |
| #[inline] |
| fn next(&mut self) -> Option<&'a [u8]> { |
| Some(trim_last_terminator(self.it.next()?)) |
| } |
| } |
| |
| impl<'a> DoubleEndedIterator for Lines<'a> { |
| #[inline] |
| fn next_back(&mut self) -> Option<Self::Item> { |
| Some(trim_last_terminator(self.it.next_back()?)) |
| } |
| } |
| |
| impl<'a> iter::FusedIterator for Lines<'a> {} |
| |
| /// An iterator over all lines in a byte string, including their terminators. |
| /// |
| /// For this iterator, the only line terminator recognized is `\n`. (Since |
| /// line terminators are included, this also handles `\r\n` line endings.) |
| /// |
| /// Line terminators are only included if they are present in the original |
| /// byte string. For example, the last line in a byte string may not end with |
| /// a line terminator. |
| /// |
| /// Concatenating all elements yielded by this iterator is guaranteed to yield |
| /// the original byte string. |
| /// |
| /// `'a` is the lifetime of the byte string being iterated over. |
| #[derive(Clone, Debug)] |
| pub struct LinesWithTerminator<'a> { |
| bytes: &'a [u8], |
| } |
| |
| impl<'a> LinesWithTerminator<'a> { |
| fn new(bytes: &'a [u8]) -> LinesWithTerminator<'a> { |
| LinesWithTerminator { bytes } |
| } |
| |
| /// Return a copy of the rest of the underlying bytes without affecting the |
| /// iterator itself. |
| /// |
| /// # Examples |
| /// |
| /// Basic usage: |
| /// |
| /// ``` |
| /// use bstr::{B, ByteSlice}; |
| /// |
| /// let s = b"\ |
| /// foo |
| /// bar\r |
| /// baz"; |
| /// let mut lines = s.lines_with_terminator(); |
| /// assert_eq!(lines.next(), Some(B("foo\n"))); |
| /// assert_eq!(lines.as_bytes(), B("bar\r\nbaz")); |
| /// ``` |
| pub fn as_bytes(&self) -> &'a [u8] { |
| self.bytes |
| } |
| } |
| |
| impl<'a> Iterator for LinesWithTerminator<'a> { |
| type Item = &'a [u8]; |
| |
| #[inline] |
| fn next(&mut self) -> Option<&'a [u8]> { |
| match self.bytes.find_byte(b'\n') { |
| None if self.bytes.is_empty() => None, |
| None => { |
| let line = self.bytes; |
| self.bytes = b""; |
| Some(line) |
| } |
| Some(end) => { |
| let line = &self.bytes[..end + 1]; |
| self.bytes = &self.bytes[end + 1..]; |
| Some(line) |
| } |
| } |
| } |
| } |
| |
| impl<'a> DoubleEndedIterator for LinesWithTerminator<'a> { |
| #[inline] |
| fn next_back(&mut self) -> Option<Self::Item> { |
| let end = self.bytes.len().checked_sub(1)?; |
| match self.bytes[..end].rfind_byte(b'\n') { |
| None => { |
| let line = self.bytes; |
| self.bytes = b""; |
| Some(line) |
| } |
| Some(end) => { |
| let line = &self.bytes[end + 1..]; |
| self.bytes = &self.bytes[..end + 1]; |
| Some(line) |
| } |
| } |
| } |
| } |
| |
| impl<'a> iter::FusedIterator for LinesWithTerminator<'a> {} |
| |
/// Strips a single trailing line terminator (`\n` or `\r\n`) from `s`.
///
/// Returns `s` unchanged when it does not end in `\n`. A trailing `\r` is
/// removed only when it immediately precedes the removed `\n`; a lone
/// trailing `\r` is left in place.
fn trim_last_terminator(s: &[u8]) -> &[u8] {
    match s.strip_suffix(b"\n") {
        None => s,
        // Only look for `\r` once a `\n` has actually been removed.
        Some(line) => line.strip_suffix(b"\r").unwrap_or(line),
    }
}
| |
#[cfg(all(test, feature = "std"))]
mod tests {
    use alloc::{string::String, vec::Vec};

    use crate::{
        ext_slice::{ByteSlice, Lines, LinesWithTerminator, B},
        tests::LOSSY_TESTS,
    };

    /// Checks both lossy conversion routines against the shared fixture
    /// table, and checks that the standard library agrees with that table.
    #[test]
    fn to_str_lossy() {
        for (index, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
            let want = expected.as_bytes();

            let lossy = B(input).to_str_lossy();
            assert_eq!(
                want,
                lossy.as_bytes(),
                "to_str_lossy(ith: {:?}, given: {:?})",
                index,
                input,
            );

            let mut buf = String::new();
            B(input).to_str_lossy_into(&mut buf);
            assert_eq!(want, buf.as_bytes(), "to_str_lossy_into");

            let std_lossy = String::from_utf8_lossy(input);
            assert_eq!(want, std_lossy.as_bytes(), "std");
        }
    }

    /// Checks that the line iterators yield the expected elements going
    /// forwards and the same elements in reverse order going backwards.
    #[test]
    fn lines_iteration() {
        // `$lines` is expanded twice, so a fresh iterator is built for each
        // direction.
        macro_rules! check {
            ($lines:expr, $expected:expr) => {
                let mut expected: Vec<&[u8]> = Vec::from($expected);
                assert_eq!($lines.collect::<Vec<_>>(), expected);
                expected.reverse();
                assert_eq!($lines.rev().collect::<Vec<_>>(), expected);
            };
        }

        check!(Lines::new(b""), []);
        check!(LinesWithTerminator::new(b""), []);

        check!(Lines::new(b"\n"), [B("")]);
        check!(Lines::new(b"\r\n"), [B("")]);
        check!(LinesWithTerminator::new(b"\n"), [B("\n")]);

        check!(Lines::new(b"a"), [B("a")]);
        check!(LinesWithTerminator::new(b"a"), [B("a")]);

        check!(Lines::new(b"abc"), [B("abc")]);
        check!(LinesWithTerminator::new(b"abc"), [B("abc")]);

        check!(Lines::new(b"abc\n"), [B("abc")]);
        check!(Lines::new(b"abc\r\n"), [B("abc")]);
        check!(LinesWithTerminator::new(b"abc\n"), [B("abc\n")]);

        check!(Lines::new(b"abc\n\n"), [B("abc"), B("")]);
        check!(LinesWithTerminator::new(b"abc\n\n"), [B("abc\n"), B("\n")]);

        check!(Lines::new(b"abc\n\ndef"), [B("abc"), B(""), B("def")]);
        check!(
            LinesWithTerminator::new(b"abc\n\ndef"),
            [B("abc\n"), B("\n"), B("def")]
        );

        check!(Lines::new(b"abc\n\ndef\n"), [B("abc"), B(""), B("def")]);
        check!(
            LinesWithTerminator::new(b"abc\n\ndef\n"),
            [B("abc\n"), B("\n"), B("def\n")]
        );

        check!(Lines::new(b"\na\nb\n"), [B(""), B("a"), B("b")]);
        check!(
            LinesWithTerminator::new(b"\na\nb\n"),
            [B("\n"), B("a\n"), B("b\n")]
        );

        check!(Lines::new(b"\n\n\n"), [B(""), B(""), B("")]);
        check!(
            LinesWithTerminator::new(b"\n\n\n"),
            [B("\n"), B("\n"), B("\n")]
        );
    }
}