lex: support backwards seeking and more tokens
The lexer now records every token it encounters internally so that peeking and other useful operations are possible. This will come in handy for the parser. Also, while I was at it, I added more primitive tokens like + - * / because I'm probably going to need them later anyway.
This commit is contained in:
parent
8ae3bb2f57
commit
68254757a3
|
@ -4,11 +4,13 @@ use std::str::Chars;
|
||||||
mod cursor;
|
mod cursor;
|
||||||
use cursor::Cursor;
|
use cursor::Cursor;
|
||||||
|
|
||||||
mod token;
|
pub(crate) mod token;
|
||||||
use token::Token;
|
use token::Token;
|
||||||
|
|
||||||
pub struct Lexer<'a> {
|
pub struct Lexer<'a> {
|
||||||
cursor: Cursor<'a>,
|
cursor: Cursor<'a>,
|
||||||
|
history: Vec<Token>,
|
||||||
|
offset: usize,
|
||||||
token_line: usize,
|
token_line: usize,
|
||||||
token_col: usize,
|
token_col: usize,
|
||||||
}
|
}
|
||||||
|
@ -39,7 +41,13 @@ impl Iterator for Lexer<'_> {
|
||||||
type Item = Result<Token, SyntaxError>;
|
type Item = Result<Token, SyntaxError>;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Result<Token, SyntaxError>> {
|
fn next(&mut self) -> Option<Result<Token, SyntaxError>> {
|
||||||
Some(match self.cursor.next()? {
|
if self.offset > 0 {
|
||||||
|
let tmp = self.history[self.history.len() - self.offset];
|
||||||
|
self.offset -= 1;
|
||||||
|
return Some(Ok(tmp));
|
||||||
|
}
|
||||||
|
|
||||||
|
let result = match self.cursor.next()? {
|
||||||
c if c.is_ascii_whitespace() => {
|
c if c.is_ascii_whitespace() => {
|
||||||
self.cursor.skip_whitespace();
|
self.cursor.skip_whitespace();
|
||||||
self.cursor.chop();
|
self.cursor.chop();
|
||||||
|
@ -47,11 +55,18 @@ impl Iterator for Lexer<'_> {
|
||||||
}
|
}
|
||||||
',' => self.token_ok(token::Kind::Comma),
|
',' => self.token_ok(token::Kind::Comma),
|
||||||
';' => self.token_ok(token::Kind::Semi),
|
';' => self.token_ok(token::Kind::Semi),
|
||||||
'=' => self.token_ok(token::Kind::Eq),
|
|
||||||
'{' => self.token_ok(token::Kind::OBrace),
|
'{' => self.token_ok(token::Kind::OBrace),
|
||||||
'}' => self.token_ok(token::Kind::CBrace),
|
'}' => self.token_ok(token::Kind::CBrace),
|
||||||
'[' => self.token_ok(token::Kind::OBracket),
|
'[' => self.token_ok(token::Kind::OBracket),
|
||||||
']' => self.token_ok(token::Kind::CBracket),
|
']' => self.token_ok(token::Kind::CBracket),
|
||||||
|
|
||||||
|
'=' => self.token_ok(token::Kind::Eq),
|
||||||
|
'+' => self.token_ok(token::Kind::Plus),
|
||||||
|
'-' => self.token_ok(token::Kind::Minus),
|
||||||
|
'*' => self.token_ok(token::Kind::Asterisk),
|
||||||
|
'/' => self.token_ok(token::Kind::Slash),
|
||||||
|
'%' => self.token_ok(token::Kind::Percent),
|
||||||
|
|
||||||
'#' => self.read_comment(),
|
'#' => self.read_comment(),
|
||||||
'"' => self.read_string_literal(),
|
'"' => self.read_string_literal(),
|
||||||
'0' => self.read_prefix_int_literal(),
|
'0' => self.read_prefix_int_literal(),
|
||||||
|
@ -59,7 +74,12 @@ impl Iterator for Lexer<'_> {
|
||||||
_c @ 'A'..='Z' => self.read_ident(),
|
_c @ 'A'..='Z' => self.read_ident(),
|
||||||
_c @ 'a'..='z' => self.read_keyword_or_ident(), // keywords are always lowercase
|
_c @ 'a'..='z' => self.read_keyword_or_ident(), // keywords are always lowercase
|
||||||
c => self.syntax_error(format!("Unexpected character '{}'", c)),
|
c => self.syntax_error(format!("Unexpected character '{}'", c)),
|
||||||
})
|
};
|
||||||
|
|
||||||
|
if let Ok(token) = result {
|
||||||
|
self.history.push(token);
|
||||||
|
}
|
||||||
|
Some(result)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -67,11 +87,43 @@ impl<'a> Lexer<'a> {
|
||||||
pub fn new(stream: Chars<'a>) -> Lexer<'a> {
|
pub fn new(stream: Chars<'a>) -> Lexer<'a> {
|
||||||
Lexer {
|
Lexer {
|
||||||
cursor: Cursor::new(stream),
|
cursor: Cursor::new(stream),
|
||||||
|
history: Vec::new(),
|
||||||
|
offset: 0,
|
||||||
token_line: 1,
|
token_line: 1,
|
||||||
token_col: 1,
|
token_col: 1,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the upcoming token without consuming it.
///
/// The token is read with `next()` and the lexer is then rewound by one
/// step with `prev()`, so the following `next()` call yields the same
/// token again.
///
/// Returns `None` when `next()` has nothing more to return. A syntax
/// error is passed through as `Some(Err(..))`.
pub fn peek(&mut self) -> Option<Result<Token, SyntaxError>> {
    let result = self.next()?;

    // Only successful tokens are recorded in the history. Rewinding after
    // an error would make the next read replay an older, unrelated token,
    // so the rewind is skipped in that case.
    if result.is_ok() {
        self.prev();
    }

    Some(result)
}
|
||||||
|
|
||||||
|
/// Steps one token back in the recorded history.
///
/// After a successful call, the next `next()` yields the returned token
/// again instead of reading new input.
///
/// Returns `None`, leaving the lexer untouched, when there is no earlier
/// token to step back to (empty history, or already rewound to its start).
pub fn prev(&mut self) -> Option<&Token> {
    // Index of the token that `next()` will replay once `offset` has been
    // bumped. `checked_sub` refuses to rewind past the start of the history,
    // which would otherwise underflow the index computation in `next()`.
    let index = self.history.len().checked_sub(self.offset + 1)?;
    self.offset += 1;
    self.history.get(index)
}
|
||||||
|
|
||||||
|
/// Consumes the next token and checks that it is of the given `kind`.
///
/// # Errors
///
/// Returns a `SyntaxError` if
/// - there are no more tokens ("Unexpected EOF"),
/// - reading the next token itself failed (that error is propagated), or
/// - the token has a different kind than `kind`.
pub fn expect_kind(&mut self, kind: token::Kind) -> Result<Token, SyntaxError> {
    // Unwrap the result exactly once; applying `?` repeatedly to the same
    // `Result` would move out of it more than once.
    let token = match self.next() {
        Some(result) => result?,
        None => return self.syntax_error("Unexpected EOF"),
    };

    if token.kind == kind {
        Ok(token)
    } else {
        // `syntax_error` takes a `&str`, so borrow the formatted message.
        self.syntax_error(&format!("Expected {}, got {}", kind, token.kind))
    }
}
|
||||||
|
|
||||||
|
/// Like `next()`, but treats running out of tokens as a syntax error.
///
/// # Errors
///
/// Propagates any error produced by `next()`, and returns an
/// "Unexpected EOF" `SyntaxError` when `next()` yields `None`.
pub fn require_next(&mut self) -> Result<Token, SyntaxError> {
    if let Some(outcome) = self.next() {
        return outcome;
    }

    self.syntax_error("Unexpected EOF")
}
|
||||||
|
|
||||||
fn read_keyword_or_ident(&mut self) -> Result<Token, SyntaxError> {
|
fn read_keyword_or_ident(&mut self) -> Result<Token, SyntaxError> {
|
||||||
let current = self.cursor.current().unwrap();
|
let current = self.cursor.current().unwrap();
|
||||||
for kw in &KEYWORDS {
|
for kw in &KEYWORDS {
|
||||||
|
@ -124,7 +176,7 @@ impl<'a> Lexer<'a> {
|
||||||
Some('o') => self.read_int_literal(8),
|
Some('o') => self.read_int_literal(8),
|
||||||
Some('b') => self.read_int_literal(2),
|
Some('b') => self.read_int_literal(2),
|
||||||
Some(c) => self.syntax_error(format!("Unexpected character '{}'", c)),
|
Some(c) => self.syntax_error(format!("Unexpected character '{}'", c)),
|
||||||
None => self.syntax_error(String::from("Unexpected end-of-file")),
|
None => self.syntax_error("Unexpected end-of-file"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -175,7 +227,7 @@ impl<'a> Lexer<'a> {
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
fn syntax_error<T>(&mut self, msg: String) -> Result<T, SyntaxError> {
|
fn syntax_error<T>(&mut self, msg: &str) -> Result<T, SyntaxError> {
|
||||||
Err(SyntaxError {
|
Err(SyntaxError {
|
||||||
line: self.cursor.line(),
|
line: self.cursor.line(),
|
||||||
col: self.cursor.col(),
|
col: self.cursor.col(),
|
||||||
|
|
|
@ -18,17 +18,23 @@ impl fmt::Display for Token {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||||
pub enum Kind {
|
pub enum Kind {
|
||||||
Ident,
|
Ident,
|
||||||
OBrace,
|
OBrace,
|
||||||
CBrace,
|
CBrace,
|
||||||
OBracket,
|
OBracket,
|
||||||
CBracket,
|
CBracket,
|
||||||
Eq,
|
|
||||||
Comma,
|
Comma,
|
||||||
Semi,
|
Semi,
|
||||||
|
|
||||||
|
Eq,
|
||||||
|
Plus,
|
||||||
|
Minus,
|
||||||
|
Asterisk,
|
||||||
|
Slash,
|
||||||
|
Percent,
|
||||||
|
|
||||||
DependKeyword,
|
DependKeyword,
|
||||||
IncludeKeyword,
|
IncludeKeyword,
|
||||||
ModuleKeyword,
|
ModuleKeyword,
|
||||||
|
@ -52,10 +58,16 @@ impl fmt::Display for Kind {
|
||||||
Kind::CBrace => "cbrace",
|
Kind::CBrace => "cbrace",
|
||||||
Kind::OBracket => "obracket",
|
Kind::OBracket => "obracket",
|
||||||
Kind::CBracket => "cbracket",
|
Kind::CBracket => "cbracket",
|
||||||
Kind::Eq => "eq",
|
|
||||||
Kind::Comma => "comma",
|
Kind::Comma => "comma",
|
||||||
Kind::Semi => "semi",
|
Kind::Semi => "semi",
|
||||||
|
|
||||||
|
Kind::Eq => "eq",
|
||||||
|
Kind::Plus => "plus",
|
||||||
|
Kind::Minus => "minus",
|
||||||
|
Kind::Asterisk => "asterisk",
|
||||||
|
Kind::Slash => "slash",
|
||||||
|
Kind::Percent => "percent",
|
||||||
|
|
||||||
Kind::DependKeyword => "keyword",
|
Kind::DependKeyword => "keyword",
|
||||||
Kind::IncludeKeyword => "keyword",
|
Kind::IncludeKeyword => "keyword",
|
||||||
Kind::ModuleKeyword => "keyword",
|
Kind::ModuleKeyword => "keyword",
|
||||||
|
|
Loading…
Reference in New Issue