//
// https://vt100.net/emu/dec_ansi_parser
//
// The parser is heavily inspired by the vte (https://crates.io/crates/vte) crate.
// We tried to use that crate, but it doesn't work for the opposite direction
// (terminal -> sequence), because there are a couple of exceptions we have to handle
// and it doesn't make much sense to add them to the vte crate. An example is the Esc
// key, where we need to know whether additional input is available before deciding if
// the Esc char is dispatched immediately (the user hit just the Esc key) or if it
// starts an escape/CSI/... sequence.
//
/// Capacity of the engine's CSI parameter buffer. Parameters beyond this limit are not
/// stored; they are only counted and reported as the ignored count.
const MAX_PARAMETERS: usize = 30;
/// Value a CSI parameter has when no digits were received for it (for example both
/// parameters of `ESC [ ; m`).
const DEFAULT_PARAMETER_VALUE: u64 = 0;
/// Capacity of the buffer collecting a single UTF-8 sequence. Despite the name, the
/// buffer holds bytes (`u8`), and the longest sequence the engine starts is 4 bytes.
const MAX_UTF8_CODE_POINTS: usize = 4;
/// A parser engine state.
///
/// All these variant names come from the
/// [A parser for DEC’s ANSI-compatible video terminals](https://vt100.net/emu/dec_ansi_parser)
/// description.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum State {
/// Initial state.
///
/// No sequence is in progress; the engine returns here after every dispatched or
/// cancelled sequence.
Ground,
/// Escape sequence started.
///
/// `Esc` received with a flag that there's more data available.
Escape,
/// Escape sequence and we're collecting intermediates.
///
/// # Notes
///
/// This implementation doesn't collect intermediates. It just handles the state
/// to distinguish between (im)proper sequences.
EscapeIntermediate,
/// CSI sequence started.
///
/// `Esc` followed by the `[` received.
CsiEntry,
/// CSI sequence should be consumed, but not dispatched.
///
/// Bytes are swallowed until a final byte (or an invalid byte) brings the engine
/// back to `Ground`.
CsiIgnore,
/// CSI sequence and we're collecting parameters.
CsiParameter,
/// CSI sequence and we're collecting intermediates.
///
/// # Notes
///
/// This implementation doesn't collect intermediates. It just handles the state
/// to distinguish between (im)proper sequences.
CsiIntermediate,
/// Possible UTF-8 sequence: a lead byte of a multi-byte sequence was received and
/// we're collecting the remaining (continuation) bytes.
Utf8,
}
/// Receiver of everything the `Engine` recognizes in the byte stream.
///
/// The engine calls exactly one of these methods each time it finishes a unit of input.
pub(crate) trait Provide {
/// Called with a single character: a plain ASCII byte, a control character passed
/// through from inside a sequence, a decoded multi-byte UTF-8 character, or a lone
/// `Esc` (`'\x1B'`).
fn provide_char(&mut self, ch: char);
/// Called with the final character of a completed escape sequence (`Esc` + final).
/// Intermediate bytes are not collected, so they are not reported.
fn provide_esc_sequence(&mut self, ch: char);
/// Called with a completed CSI sequence: the stored `parameters`, the number of
/// parameters that did not fit into the buffer (`ignored_count`) and the final
/// character `ch`.
fn provide_csi_sequence(&mut self, parameters: &[u64], ignored_count: usize, ch: char);
}
/// Byte-at-a-time state machine recognizing characters, escape sequences and CSI
/// sequences.
///
/// Bytes are fed in through `advance` and the results are reported to a `Provide`
/// implementation. All buffers are fixed-size arrays stored inline in the struct.
pub(crate) struct Engine {
/// Stored CSI parameters; only the first `parameters_count` entries are meaningful.
parameters: [u64; MAX_PARAMETERS],
/// Number of valid entries in `parameters`.
parameters_count: usize,
/// Value of the parameter currently being accumulated from digits.
parameter: u64,
/// Number of parameters dropped because `parameters` was already full.
ignored_parameters_count: usize,
/// Current state of the state machine.
state: State,
/// Bytes of the UTF-8 sequence being collected; only the first `utf8_points_count`
/// entries are meaningful.
utf8_points: [u8; MAX_UTF8_CODE_POINTS],
/// Number of bytes collected so far in `utf8_points`.
utf8_points_count: usize,
/// Total length (in bytes) of the UTF-8 sequence, as announced by its lead byte.
utf8_points_expected_count: usize,
}
impl Default for Engine {
    /// Creates an engine in the `Ground` state with nothing collected: every parameter
    /// slot holds `DEFAULT_PARAMETER_VALUE`, the UTF-8 buffer is zeroed and all
    /// counters are `0`.
    fn default() -> Self {
        let parameters = [DEFAULT_PARAMETER_VALUE; MAX_PARAMETERS];
        let utf8_points = [0; MAX_UTF8_CODE_POINTS];

        Self {
            state: State::Ground,
            // CSI parameter collection
            parameters,
            parameters_count: 0,
            parameter: DEFAULT_PARAMETER_VALUE,
            ignored_parameters_count: 0,
            // UTF-8 byte collection
            utf8_points,
            utf8_points_count: 0,
            utf8_points_expected_count: 0,
        }
    }
}
impl Engine {
    /// Switches the engine to `state`.
    ///
    /// Entering `Ground` or `Escape` begins a fresh unit of input, so everything that
    /// was collected so far (CSI parameters, UTF-8 bytes) is discarded first. Clearing
    /// on `Escape` matters when an `Esc` interrupts an unfinished CSI sequence: without
    /// it, the parameters of the abandoned sequence would be dispatched together with
    /// the next one.
    fn set_state(&mut self, state: State) {
        if state == State::Ground || state == State::Escape {
            self.parameters_count = 0;
            self.parameter = DEFAULT_PARAMETER_VALUE;
            self.ignored_parameters_count = 0;
            self.utf8_points_count = 0;
            self.utf8_points_expected_count = 0;
        }
        self.state = state;
    }

    /// Finishes the parameter currently being accumulated.
    ///
    /// The value is appended to `parameters` if there is room, otherwise it is only
    /// counted as ignored. Either way, accumulation restarts from the default value.
    fn store_parameter(&mut self) {
        if self.parameters_count < MAX_PARAMETERS {
            self.parameters[self.parameters_count] = self.parameter;
            self.parameters_count += 1;
        } else {
            self.ignored_parameters_count += 1;
        }
        self.parameter = DEFAULT_PARAMETER_VALUE;
    }

    /// Handles the `Esc` byte (`0x1B`), which needs the `more` flag to be interpreted.
    ///
    /// Returns `true` if `byte` was `Esc` and has been consumed, `false` if the byte is
    /// something else and must be processed by the current state handler.
    fn handle_possible_esc(&mut self, provider: &mut dyn Provide, byte: u8, more: bool) -> bool {
        if byte != 0x1B {
            return false;
        }
        match (self.state, more) {
            // More input means possible Esc sequence, just switch state and wait
            (State::Ground, true) => self.set_state(State::Escape),
            // No more input means Esc key, dispatch it
            (State::Ground, false) => provider.provide_char('\x1B'),
            // More input means possible Esc sequence, dispatch the previous Esc char
            // and keep waiting with the current one
            (State::Escape, true) => provider.provide_char('\x1B'),
            // No more input means Esc key, dispatch the previous & current Esc char
            (State::Escape, false) => {
                provider.provide_char('\x1B');
                provider.provide_char('\x1B');
                self.set_state(State::Ground);
            }
            // Discard any state
            // More input means possible Esc sequence
            (_, true) => self.set_state(State::Escape),
            // Discard any state
            // No more input means Esc key, dispatch it
            (_, false) => {
                provider.provide_char('\x1B');
                self.set_state(State::Ground);
            }
        }
        true
    }

    /// Handles a byte that may be a whole character or the start of a UTF-8 sequence.
    ///
    /// ASCII bytes (`0x00..=0x7F`) are dispatched immediately. A lead byte of a 2, 3 or
    /// 4 byte sequence is stored and the engine switches to the `Utf8` state.
    ///
    /// Returns `true` if the byte was consumed, `false` for bytes that cannot start a
    /// character (continuation bytes `0x80..=0xBF` and `0xF8..=0xFF`).
    fn handle_possible_utf8_code_points(&mut self, provider: &mut dyn Provide, byte: u8) -> bool {
        let expected_count = match byte {
            // 0xxx_xxxx - single byte character
            0x00..=0x7F => {
                provider.provide_char(byte as char);
                return true;
            }
            // 110x_xxxx - lead byte of a two byte sequence
            0xC0..=0xDF => 2,
            // 1110_xxxx - lead byte of a three byte sequence
            0xE0..=0xEF => 3,
            // 1111_0xxx - lead byte of a four byte sequence
            0xF0..=0xF7 => 4,
            _ => return false,
        };

        self.utf8_points[0] = byte;
        self.utf8_points_count = 1;
        self.utf8_points_expected_count = expected_count;
        self.set_state(State::Utf8);
        true
    }

    /// Processes a byte in the `Ground` state.
    ///
    /// Every ASCII byte (control characters included) is dispatched as a character and
    /// a UTF-8 lead byte starts a multi-byte sequence. Bytes that cannot start a
    /// character are silently dropped, which is why the "consumed" flag is not used.
    fn advance_ground_state(&mut self, provider: &mut dyn Provide, byte: u8) {
        self.handle_possible_utf8_code_points(provider, byte);
    }

    /// Processes the byte following an `Esc`.
    fn advance_escape_state(&mut self, provider: &mut dyn Provide, byte: u8) {
        match byte {
            // Esc is consumed by `handle_possible_esc` before any state handler runs
            0x1B => unreachable!(),
            // Intermediate bytes to collect
            0x20..=0x2F => {
                self.set_state(State::EscapeIntermediate);
            }
            // Escape followed by '[' (0x5B)
            // -> CSI sequence start
            0x5B => self.set_state(State::CsiEntry),
            // Escape sequence final character
            0x30..=0x4F | 0x51..=0x57 | 0x59 | 0x5A | 0x5C | 0x60..=0x7E => {
                provider.provide_esc_sequence(byte as char);
                self.set_state(State::Ground);
            }
            // Execute
            0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
            // TODO Does it mean we should ignore the whole sequence?
            // Ignore
            0x7F => {}
            // Other bytes are considered as invalid -> cancel whatever we have
            _ => self.set_state(State::Ground),
        };
    }

    /// Processes a byte of an escape sequence after at least one intermediate byte.
    fn advance_escape_intermediate_state(&mut self, provider: &mut dyn Provide, byte: u8) {
        match byte {
            // Esc is consumed by `handle_possible_esc` before any state handler runs
            0x1B => unreachable!(),
            // Intermediate bytes to collect
            0x20..=0x2F => {}
            // Escape followed by '[' (0x5B)
            // -> CSI sequence start
            0x5B => self.set_state(State::CsiEntry),
            // Escape sequence final character
            0x30..=0x5A | 0x5C..=0x7E => {
                provider.provide_esc_sequence(byte as char);
                self.set_state(State::Ground);
            }
            // Execute
            0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
            // TODO Does it mean we should ignore the whole sequence?
            // Ignore
            0x7F => {}
            // Other bytes are considered as invalid -> cancel whatever we have
            _ => self.set_state(State::Ground),
        };
    }

    /// Processes the first byte(s) after `Esc [`.
    fn advance_csi_entry_state(&mut self, provider: &mut dyn Provide, byte: u8) {
        match byte {
            // Esc is consumed by `handle_possible_esc` before any state handler runs
            0x1B => unreachable!(),
            // Semicolon = parameter delimiter
            0x3B => {
                self.store_parameter();
                self.set_state(State::CsiParameter);
            }
            // '0' ..= '9' = parameter value
            0x30..=0x39 => {
                self.parameter = (byte as u64) - 0x30;
                self.set_state(State::CsiParameter);
            }
            0x3A => self.set_state(State::CsiIgnore),
            // CSI sequence final character
            // -> dispatch CSI sequence
            0x40..=0x7E => {
                provider.provide_csi_sequence(
                    &self.parameters[..self.parameters_count],
                    self.ignored_parameters_count,
                    byte as char,
                );
                self.set_state(State::Ground);
            }
            // Execute
            0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
            // TODO Does it mean we should ignore the whole sequence?
            // Ignore
            0x7F => {}
            // Collect rest as parameters: the raw byte value becomes a parameter of
            // its own (e.g. the '?' of `Esc [ ? 25 h` is reported as 63)
            _ => {
                self.parameter = byte as u64;
                self.store_parameter();
            }
        };
    }

    /// Swallows the rest of a CSI sequence that must not be dispatched.
    fn advance_csi_ignore_state(&mut self, provider: &mut dyn Provide, byte: u8) {
        match byte {
            // Esc is consumed by `handle_possible_esc` before any state handler runs
            0x1B => unreachable!(),
            // Execute
            0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
            // TODO Does it mean we should ignore the whole sequence?
            // Ignore
            0x20..=0x3F | 0x7F => {}
            // CSI sequence final character -> sequence is over, nothing to dispatch
            0x40..=0x7E => self.set_state(State::Ground),
            // Other bytes are considered as invalid -> cancel whatever we have
            _ => self.set_state(State::Ground),
        };
    }

    /// Processes a byte while CSI parameters are being collected.
    fn advance_csi_parameter_state(&mut self, provider: &mut dyn Provide, byte: u8) {
        match byte {
            // Esc is consumed by `handle_possible_esc` before any state handler runs
            0x1B => unreachable!(),
            // '0' ..= '9' = parameter value
            // Saturating arithmetic keeps absurdly long numbers from overflowing
            0x30..=0x39 => {
                self.parameter = self.parameter.saturating_mul(10);
                self.parameter = self.parameter.saturating_add((byte as u64) - 0x30);
            }
            // Semicolon = parameter delimiter
            0x3B => self.store_parameter(),
            // CSI sequence final character
            // -> dispatch CSI sequence
            0x40..=0x7E => {
                self.store_parameter();
                provider.provide_csi_sequence(
                    &self.parameters[..self.parameters_count],
                    self.ignored_parameters_count,
                    byte as char,
                );
                self.set_state(State::Ground);
            }
            // Intermediates to collect
            0x20..=0x2F => {
                self.store_parameter();
                self.set_state(State::CsiIntermediate);
            }
            // Ignore
            0x3A | 0x3C..=0x3F => self.set_state(State::CsiIgnore),
            // Execute
            0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
            // TODO Does it mean we should ignore the whole sequence?
            // Ignore
            0x7F => {}
            // Other bytes are considered as invalid -> cancel whatever we have
            _ => self.set_state(State::Ground),
        };
    }

    /// Processes a byte of a CSI sequence after at least one intermediate byte.
    fn advance_csi_intermediate_state(&mut self, provider: &mut dyn Provide, byte: u8) {
        match byte {
            // Esc is consumed by `handle_possible_esc` before any state handler runs
            0x1B => unreachable!(),
            // Intermediates to collect
            0x20..=0x2F => {}
            // CSI sequence final character
            // -> dispatch CSI sequence
            0x40..=0x7E => {
                provider.provide_csi_sequence(
                    &self.parameters[..self.parameters_count],
                    self.ignored_parameters_count,
                    byte as char,
                );
                self.set_state(State::Ground);
            }
            // Execute
            0x00..=0x17 | 0x19 | 0x1C..=0x1F => provider.provide_char(byte as char),
            // TODO Does it mean we should ignore the whole sequence?
            // Ignore
            0x7F => {}
            // Other bytes are considered as invalid -> cancel whatever we have
            _ => self.set_state(State::Ground),
        }
    }

    /// Processes a byte while a multi-byte UTF-8 sequence is being collected.
    ///
    /// A continuation byte is appended to the buffer and, once the announced length is
    /// reached, the sequence is decoded and dispatched (sequences that do not decode
    /// are dropped). Any other byte means the sequence is broken: it is discarded and
    /// the offending byte is processed again as if it arrived in the `Ground` state, so
    /// a character following a truncated sequence is not lost.
    fn advance_utf8_state(&mut self, provider: &mut dyn Provide, byte: u8) {
        if byte & 0b1100_0000 != 0b1000_0000 {
            self.set_state(State::Ground);
            self.advance_ground_state(provider, byte);
            return;
        }

        // In this state `utf8_points_count` is always lower than the expected count,
        // which never exceeds the buffer size, so the index is in bounds
        self.utf8_points[self.utf8_points_count] = byte;
        self.utf8_points_count += 1;

        if self.utf8_points_count == self.utf8_points_expected_count {
            if let Some(ch) = std::str::from_utf8(&self.utf8_points[..self.utf8_points_count])
                .ok()
                .and_then(|s| s.chars().next())
            {
                provider.provide_char(ch);
            }
            self.set_state(State::Ground);
        }
    }

    /// Feeds one input byte into the engine.
    ///
    /// # Arguments
    ///
    /// * `provider` - receives whatever the byte completes (character, escape
    ///   sequence, CSI sequence).
    /// * `byte` - the next byte of the input.
    /// * `more` - whether more input is already available after this byte. It is used
    ///   to tell a lone `Esc` key press from the start of an escape sequence.
    pub(crate) fn advance(&mut self, provider: &mut dyn Provide, byte: u8, more: bool) {
        if self.handle_possible_esc(provider, byte, more) {
            return;
        }
        match self.state {
            State::Ground => self.advance_ground_state(provider, byte),
            State::Escape => self.advance_escape_state(provider, byte),
            State::EscapeIntermediate => self.advance_escape_intermediate_state(provider, byte),
            State::CsiEntry => self.advance_csi_entry_state(provider, byte),
            State::CsiIgnore => self.advance_csi_ignore_state(provider, byte),
            State::CsiParameter => self.advance_csi_parameter_state(provider, byte),
            State::CsiIntermediate => self.advance_csi_intermediate_state(provider, byte),
            State::Utf8 => self.advance_utf8_state(provider, byte),
        };
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone `Esc` is dispatched immediately; an `Esc` with more input pending is held
    /// back until the following byte decides what it was.
    #[test]
    fn esc_char() {
        let mut engine = Engine::default();
        let mut provider = CharProvider::default();

        // No more input means that the Esc character should be dispatched immediately
        engine.advance(&mut provider, 0x1B, false);
        assert_eq!(provider.chars, &['\x1B']);

        // There's more input so the machine should wait before dispatching Esc character
        engine.advance(&mut provider, 0x1B, true);
        assert_eq!(provider.chars, &['\x1B']);

        // Another Esc character, but no more input, machine should dispatch the postponed Esc
        // character and the new one too.
        engine.advance(&mut provider, 0x1B, false);
        assert_eq!(provider.chars, &['\x1B', '\x1B', '\x1B']);
    }

    /// `Esc` followed directly by a final character is an escape sequence.
    #[test]
    fn esc_without_intermediates() {
        let mut engine = Engine::default();
        let mut provider = EscProvider::default();

        let input = b"\x1B0\x1B~";
        advance(&mut engine, &mut provider, input, false);

        assert_eq!(provider.chars.len(), 2);
        assert_eq!(provider.chars[0], '0');
        assert_eq!(provider.chars[1], '~');
    }

    /// `Esc [ m` is dispatched with an empty parameter list.
    #[test]
    fn csi_without_parameters() {
        let mut engine = Engine::default();
        let mut provider = CsiProvider::default();

        let input = b"\x1B\x5Bm";
        advance(&mut engine, &mut provider, input, false);

        assert_eq!(provider.parameters.len(), 1);
        assert_eq!(provider.parameters[0], &[]);
        assert_eq!(provider.chars.len(), 1);
        assert_eq!(provider.chars[0], 'm');
    }

    /// A bare `;` separates two parameters, both of which get the default value.
    #[test]
    fn csi_with_two_default_parameters() {
        let mut engine = Engine::default();
        let mut provider = CsiProvider::default();

        let input = b"\x1B\x5B;m";
        advance(&mut engine, &mut provider, input, false);

        assert_eq!(provider.parameters.len(), 1);
        assert_eq!(
            provider.parameters[0],
            &[DEFAULT_PARAMETER_VALUE, DEFAULT_PARAMETER_VALUE]
        );
        assert_eq!(provider.chars.len(), 1);
        assert_eq!(provider.chars[0], 'm');
    }

    /// A trailing `;` adds one more parameter with the default value.
    #[test]
    fn csi_with_trailing_semicolon() {
        let mut engine = Engine::default();
        let mut provider = CsiProvider::default();

        let input = b"\x1B\x5B123;m";
        advance(&mut engine, &mut provider, input, false);

        assert_eq!(provider.parameters.len(), 1);
        assert_eq!(provider.parameters[0], &[123, DEFAULT_PARAMETER_VALUE]);
        assert_eq!(provider.chars.len(), 1);
        assert_eq!(provider.chars[0], 'm');
    }

    /// Exactly `MAX_PARAMETERS` parameters are all stored.
    #[test]
    fn csi_max_parameters() {
        let mut engine = Engine::default();
        let mut provider = CsiProvider::default();

        let input = b"\x1B\x5B1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;25;26;27;28;29;30m";
        advance(&mut engine, &mut provider, input, false);

        assert_eq!(provider.parameters.len(), 1);
        assert_eq!(provider.parameters[0].len(), MAX_PARAMETERS);
        assert_eq!(
            provider.parameters[0],
            &[
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                24, 25, 26, 27, 28, 29, 30
            ]
        );
        assert_eq!(provider.chars.len(), 1);
        assert_eq!(provider.chars[0], 'm');
    }

    /// Characters encoded as 1, 2, 3 and 4 UTF-8 bytes are all decoded.
    #[test]
    fn test_parse_utf8_character() {
        let mut engine = Engine::default();
        let mut provider = CharProvider::default();

        advance(&mut engine, &mut provider, &[b'a'], false);
        assert_eq!(provider.chars.len(), 1);
        assert_eq!(provider.chars[0], 'a');

        advance(&mut engine, &mut provider, &[0xC3, 0xB1], false);
        assert_eq!(provider.chars.len(), 2);
        assert_eq!(provider.chars[1], 'ñ');

        advance(&mut engine, &mut provider, &[0xE2, 0x81, 0xA1], false);
        assert_eq!(provider.chars.len(), 3);
        assert_eq!(provider.chars[2], '\u{2061}');

        advance(&mut engine, &mut provider, &[0xF0, 0x90, 0x8C, 0xBC], false);
        assert_eq!(provider.chars.len(), 4);
        assert_eq!(provider.chars[3], '𐌼');
    }

    /// Feeds all `bytes` into the engine.
    ///
    /// Every byte except the last one is flagged as having more input behind it; the
    /// last byte gets the caller's `more` flag.
    fn advance(engine: &mut Engine, provider: &mut dyn Provide, bytes: &[u8], more: bool) {
        let len = bytes.len();

        for (i, byte) in bytes.iter().enumerate() {
            engine.advance(provider, *byte, i + 1 < len || more);
        }
    }

    /// Records dispatched characters only.
    #[derive(Default)]
    struct CharProvider {
        chars: Vec<char>,
    }

    impl Provide for CharProvider {
        fn provide_char(&mut self, ch: char) {
            self.chars.push(ch);
        }

        fn provide_esc_sequence(&mut self, _ch: char) {}

        fn provide_csi_sequence(&mut self, _parameters: &[u64], _ignored_count: usize, _ch: char) {}
    }

    /// Records dispatched CSI sequences only (parameters and final characters).
    #[derive(Default)]
    struct CsiProvider {
        parameters: Vec<Vec<u64>>,
        chars: Vec<char>,
    }

    impl Provide for CsiProvider {
        fn provide_char(&mut self, _ch: char) {}

        fn provide_esc_sequence(&mut self, _ch: char) {}

        fn provide_csi_sequence(&mut self, parameters: &[u64], _ignored_count: usize, ch: char) {
            self.parameters.push(parameters.to_vec());
            self.chars.push(ch);
        }
    }

    /// Records the final characters of dispatched escape sequences only.
    #[derive(Default)]
    struct EscProvider {
        chars: Vec<char>,
    }

    impl Provide for EscProvider {
        fn provide_char(&mut self, _ch: char) {}

        fn provide_esc_sequence(&mut self, ch: char) {
            self.chars.push(ch);
        }

        fn provide_csi_sequence(&mut self, _parameters: &[u64], _ignored_count: usize, _ch: char) {}
    }
}