blob: f4c979b5bdc7c9ebd4e4d25253be931ced6f3f32 [file] [log] [blame] [edit]
//! Simple hand-written ungrammar lexer
use crate::error::{bail, Result};
/// The kind of a lexed token.
///
/// `Node` and `Token` carry the text they were lexed from; every other
/// variant stands for a single punctuation character.
#[derive(PartialEq, Eq, Debug)]
pub(crate) enum TokenKind {
    /// A bare identifier; holds the identifier text.
    Node(String),
    /// A single-quoted literal; holds the text between the quotes with
    /// backslash escapes already resolved.
    Token(String),
    /// `=`
    Eq,
    /// `*`
    Star,
    /// `|`
    Pipe,
    /// `?`
    QMark,
    /// `:`
    Colon,
    /// `(`
    LParen,
    /// `)`
    RParen,
}
/// A single lexed token: what it is and where it starts in the input.
#[derive(Debug)]
pub(crate) struct Token {
    /// What was lexed.
    pub(crate) kind: TokenKind,
    /// Position of the token's first character.
    pub(crate) loc: Location,
}
/// A position in the lexer input.
///
/// Both fields are zero-based: the default value, `line: 0, column: 0`,
/// denotes the very start of the input.
#[derive(Debug, Default, Clone, Copy)]
pub(crate) struct Location {
    /// Number of `\n` characters that precede this position.
    pub(crate) line: usize,
    /// Number of characters (not bytes) between the start of the current
    /// line and this position.
    pub(crate) column: usize,
}
impl Location {
    /// Moves this location forward past `text`.
    ///
    /// Every `\n` in `text` bumps `line`. If `text` contains at least one
    /// newline, `column` restarts and becomes the number of characters after
    /// the last newline; otherwise `column` simply grows by the number of
    /// characters in `text`.
    fn advance(&mut self, text: &str) {
        let newline_count = text.matches('\n').count();
        // `rsplit` always yields at least one piece: whatever follows the
        // last newline, or the whole text when there is no newline.
        let last_line = text.rsplit('\n').next().unwrap_or(text);
        let width = last_line.chars().count();

        if newline_count == 0 {
            self.column += width;
        } else {
            self.line += newline_count;
            self.column = width;
        }
    }
}
/// Splits `input` into tokens, recording where each token starts.
///
/// Whitespace and `//` line comments are skipped and produce no tokens.
///
/// # Errors
///
/// Returns the error reported by `advance` for the first piece of input that
/// cannot be lexed, annotated with the location at which that piece starts.
pub(crate) fn tokenize(mut input: &str) -> Result<Vec<Token>> {
    let mut tokens = Vec::new();
    let mut loc = Location::default();

    while !input.is_empty() {
        let before = input;
        skip_ws(&mut input);
        skip_comment(&mut input);

        // Lex a token only when no trivia was skipped this round; that also
        // guarantees `input` is still non-empty when `advance` is called.
        let skipped_trivia = input.len() != before.len();
        if !skipped_trivia {
            let kind = advance(&mut input).map_err(|err| err.with_location(loc))?;
            tokens.push(Token { kind, loc });
        }

        // Move the location past everything consumed this iteration.
        let consumed = before.len() - input.len();
        loc.advance(&before[..consumed]);
    }

    Ok(tokens)
}
fn skip_ws(input: &mut &str) {
*input = input.trim_start_matches(is_whitespace)
}
/// Drops one leading `//` line comment from `input`, if there is one.
///
/// The comment runs up to and including the next `\n`, or to the end of the
/// input when no newline follows. Input that does not start with `//` is left
/// untouched.
fn skip_comment(input: &mut &str) {
    let text: &str = *input;
    if !text.starts_with("//") {
        return;
    }
    let comment_len = match text.find('\n') {
        // Consume the terminating newline as part of the comment.
        Some(newline) => newline + 1,
        None => text.len(),
    };
    *input = &text[comment_len..];
}
/// Lexes exactly one token from the front of `input` and advances `input`
/// past it.
///
/// # Errors
///
/// Fails on an unterminated or badly escaped quoted literal, on `\r`, and on
/// any other character that cannot start a token. `input` is left unchanged
/// in that case.
///
/// # Panics
///
/// Panics if `input` is empty.
fn advance(input: &mut &str) -> Result<TokenKind> {
    let text: &str = *input;
    let mut rest = text.chars();
    let first = rest.next().unwrap();

    let kind = match first {
        '=' => TokenKind::Eq,
        '*' => TokenKind::Star,
        '?' => TokenKind::QMark,
        '(' => TokenKind::LParen,
        ')' => TokenKind::RParen,
        '|' => TokenKind::Pipe,
        ':' => TokenKind::Colon,
        '\'' => {
            // Quoted literal: collect characters up to the closing quote,
            // resolving backslash escapes along the way.
            let mut literal = String::new();
            loop {
                match rest.next() {
                    Some('\'') => break,
                    Some('\\') => match rest.next() {
                        Some(escaped) if is_escapable(escaped) => literal.push(escaped),
                        _ => bail!("invalid escape in token literal"),
                    },
                    Some(other) => literal.push(other),
                    None => bail!("unclosed token literal"),
                }
            }
            TokenKind::Token(literal)
        }
        _ if is_ident_char(first) => {
            // Identifier: the longest run of identifier characters starting
            // at `first`.
            let end = text
                .find(|ch: char| !is_ident_char(ch))
                .unwrap_or(text.len());
            let (name, remainder) = text.split_at(end);
            rest = remainder.chars();
            TokenKind::Node(name.to_owned())
        }
        '\r' => bail!("unexpected `\\r`, only Unix-style line endings allowed"),
        other => bail!("unexpected character: `{}`", other),
    };

    *input = rest.as_str();
    Ok(kind)
}
/// Returns `true` for the characters that may follow a backslash: a
/// backslash or a single quote.
fn is_escapable(c: char) -> bool {
    c == '\\' || c == '\''
}
/// Returns `true` for a space, a tab or a line feed.
///
/// A carriage return is deliberately not included.
fn is_whitespace(c: char) -> bool {
    [' ', '\t', '\n'].contains(&c)
}
/// Returns `true` for the characters allowed in an identifier: ASCII letters
/// and `_`.
///
/// Digits are not identifier characters.
fn is_ident_char(c: char) -> bool {
    c == '_' || c.is_ascii_alphabetic()
}