lex: support backwards seeking and more tokens

The lexer now records every token it encounters
internally, so that peeking and other useful
operations are possible.  This will come in handy
for the parser.  While I was at it, I also added
more primitive tokens like + - * / because I will
probably need them later anyway.
main
anna 2 years ago
parent 8ae3bb2f57
commit 68254757a3
Signed by: fef
GPG Key ID: EC22E476DC2D3D84

@ -4,11 +4,13 @@ use std::str::Chars;
mod cursor; mod cursor;
use cursor::Cursor; use cursor::Cursor;
mod token; pub(crate) mod token;
use token::Token; use token::Token;
pub struct Lexer<'a> { pub struct Lexer<'a> {
cursor: Cursor<'a>, cursor: Cursor<'a>,
history: Vec<Token>,
offset: usize,
token_line: usize, token_line: usize,
token_col: usize, token_col: usize,
} }
@ -39,7 +41,13 @@ impl Iterator for Lexer<'_> {
type Item = Result<Token, SyntaxError>; type Item = Result<Token, SyntaxError>;
fn next(&mut self) -> Option<Result<Token, SyntaxError>> { fn next(&mut self) -> Option<Result<Token, SyntaxError>> {
Some(match self.cursor.next()? { if self.offset > 0 {
let tmp = self.history[self.history.len() - self.offset];
self.offset -= 1;
return Some(Ok(tmp));
}
let result = match self.cursor.next()? {
c if c.is_ascii_whitespace() => { c if c.is_ascii_whitespace() => {
self.cursor.skip_whitespace(); self.cursor.skip_whitespace();
self.cursor.chop(); self.cursor.chop();
@ -47,11 +55,18 @@ impl Iterator for Lexer<'_> {
} }
',' => self.token_ok(token::Kind::Comma), ',' => self.token_ok(token::Kind::Comma),
';' => self.token_ok(token::Kind::Semi), ';' => self.token_ok(token::Kind::Semi),
'=' => self.token_ok(token::Kind::Eq),
'{' => self.token_ok(token::Kind::OBrace), '{' => self.token_ok(token::Kind::OBrace),
'}' => self.token_ok(token::Kind::CBrace), '}' => self.token_ok(token::Kind::CBrace),
'[' => self.token_ok(token::Kind::OBracket), '[' => self.token_ok(token::Kind::OBracket),
']' => self.token_ok(token::Kind::CBracket), ']' => self.token_ok(token::Kind::CBracket),
'=' => self.token_ok(token::Kind::Eq),
'+' => self.token_ok(token::Kind::Plus),
'-' => self.token_ok(token::Kind::Minus),
'*' => self.token_ok(token::Kind::Asterisk),
'/' => self.token_ok(token::Kind::Slash),
'%' => self.token_ok(token::Kind::Percent),
'#' => self.read_comment(), '#' => self.read_comment(),
'"' => self.read_string_literal(), '"' => self.read_string_literal(),
'0' => self.read_prefix_int_literal(), '0' => self.read_prefix_int_literal(),
@ -59,7 +74,12 @@ impl Iterator for Lexer<'_> {
_c @ 'A'..='Z' => self.read_ident(), _c @ 'A'..='Z' => self.read_ident(),
_c @ 'a'..='z' => self.read_keyword_or_ident(), // keywords are always lowercase _c @ 'a'..='z' => self.read_keyword_or_ident(), // keywords are always lowercase
c => self.syntax_error(format!("Unexpected character '{}'", c)), c => self.syntax_error(format!("Unexpected character '{}'", c)),
}) };
if let Ok(token) = result {
self.history.push(token);
}
Some(result)
} }
} }
@ -67,11 +87,43 @@ impl<'a> Lexer<'a> {
pub fn new(stream: Chars<'a>) -> Lexer<'a> { pub fn new(stream: Chars<'a>) -> Lexer<'a> {
Lexer { Lexer {
cursor: Cursor::new(stream), cursor: Cursor::new(stream),
history: Vec::new(),
offset: 0,
token_line: 1, token_line: 1,
token_col: 1, token_col: 1,
} }
} }
/// Returns the next token without consuming it.
///
/// NOTE(review): only successful tokens are recorded in `history`, so if
/// `next()` yields an `Err` here, `prev()` cannot rewind it — peeking an
/// error effectively consumes it.  Confirm this is the intended contract.
pub fn peek(&mut self) -> Option<Result<Token, SyntaxError>> {
    let token = self.next()?;
    self.prev();
    Some(token)
}
/// Steps one token backwards and returns the token that `next()` will
/// re-deliver, or `None` when already rewound to the start of history.
///
/// Fixes two defects in the previous version: it always returned
/// `history.last()` regardless of how far we had already rewound, and it
/// incremented `offset` without bound — once `offset` exceeded
/// `history.len()`, the replay index `history.len() - offset` in `next()`
/// would underflow and panic.
pub fn prev(&mut self) -> Option<&Token> {
    // Index of the token one step further back than the current rewind
    // point; `checked_sub` refuses to rewind past the first token.
    let idx = self.history.len().checked_sub(self.offset + 1)?;
    self.offset += 1;
    Some(&self.history[idx])
}
/// Consumes the next token and returns it if its kind matches `kind`;
/// otherwise produces a syntax error.  EOF is also a syntax error.
///
/// Fixes a use-after-move: the old body wrote `t?` up to three times, but
/// the first `t?` moves the `Result` (the error type is not `Copy`), so
/// the later `t?.kind` uses could not compile.  Bind the unwrapped token
/// once instead.
pub fn expect_kind(&mut self, kind: token::Kind) -> Result<Token, SyntaxError> {
    match self.next() {
        Some(result) => {
            let token = result?;
            if token.kind == kind {
                Ok(token)
            } else {
                self.syntax_error(format!("Expected {}, got {}", kind, token.kind))
            }
        }
        None => self.syntax_error("Unexpected EOF"),
    }
}
/// Like `next()`, but treats end-of-input as a syntax error instead of
/// returning `None`.
pub fn require_next(&mut self) -> Result<Token, SyntaxError> {
    if let Some(token) = self.next() {
        token
    } else {
        self.syntax_error("Unexpected EOF")
    }
}
fn read_keyword_or_ident(&mut self) -> Result<Token, SyntaxError> { fn read_keyword_or_ident(&mut self) -> Result<Token, SyntaxError> {
let current = self.cursor.current().unwrap(); let current = self.cursor.current().unwrap();
for kw in &KEYWORDS { for kw in &KEYWORDS {
@ -124,7 +176,7 @@ impl<'a> Lexer<'a> {
Some('o') => self.read_int_literal(8), Some('o') => self.read_int_literal(8),
Some('b') => self.read_int_literal(2), Some('b') => self.read_int_literal(2),
Some(c) => self.syntax_error(format!("Unexpected character '{}'", c)), Some(c) => self.syntax_error(format!("Unexpected character '{}'", c)),
None => self.syntax_error(String::from("Unexpected end-of-file")), None => self.syntax_error("Unexpected end-of-file"),
} }
} }
@ -175,7 +227,7 @@ impl<'a> Lexer<'a> {
true true
} }
fn syntax_error<T>(&mut self, msg: String) -> Result<T, SyntaxError> { fn syntax_error<T>(&mut self, msg: &str) -> Result<T, SyntaxError> {
Err(SyntaxError { Err(SyntaxError {
line: self.cursor.line(), line: self.cursor.line(),
col: self.cursor.col(), col: self.cursor.col(),

@ -18,17 +18,23 @@ impl fmt::Display for Token {
} }
} }
#[derive(Debug, Clone, Copy)] #[derive(Debug, Clone, Copy, PartialEq)]
pub enum Kind { pub enum Kind {
Ident, Ident,
OBrace, OBrace,
CBrace, CBrace,
OBracket, OBracket,
CBracket, CBracket,
Eq,
Comma, Comma,
Semi, Semi,
Eq,
Plus,
Minus,
Asterisk,
Slash,
Percent,
DependKeyword, DependKeyword,
IncludeKeyword, IncludeKeyword,
ModuleKeyword, ModuleKeyword,
@ -52,10 +58,16 @@ impl fmt::Display for Kind {
Kind::CBrace => "cbrace", Kind::CBrace => "cbrace",
Kind::OBracket => "obracket", Kind::OBracket => "obracket",
Kind::CBracket => "cbracket", Kind::CBracket => "cbracket",
Kind::Eq => "eq",
Kind::Comma => "comma", Kind::Comma => "comma",
Kind::Semi => "semi", Kind::Semi => "semi",
Kind::Eq => "eq",
Kind::Plus => "plus",
Kind::Minus => "minus",
Kind::Asterisk => "asterisk",
Kind::Slash => "slash",
Kind::Percent => "percent",
Kind::DependKeyword => "keyword", Kind::DependKeyword => "keyword",
Kind::IncludeKeyword => "keyword", Kind::IncludeKeyword => "keyword",
Kind::ModuleKeyword => "keyword", Kind::ModuleKeyword => "keyword",

Loading…
Cancel
Save