From 68254757a3381b057784a98017517c19137431d9 Mon Sep 17 00:00:00 2001 From: fef Date: Mon, 11 Jul 2022 15:57:01 +0200 Subject: [PATCH] lex: support backwards seeking and more tokens The lexer now records every token it encounters internally such that peeking and other useful stuff is possible. Will come in handy for the parser. Also, while I was at it, I added more primitive tokens like + - * / because I'm probably gonna need them later anyway. --- src/lex/mod.rs | 64 +++++++++++++++++++++++++++++++++++++++++++----- src/lex/token.rs | 18 +++++++++++--- 2 files changed, 73 insertions(+), 9 deletions(-) diff --git a/src/lex/mod.rs b/src/lex/mod.rs index e6b97f0..b8274d8 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -4,11 +4,13 @@ use std::str::Chars; mod cursor; use cursor::Cursor; -mod token; +pub(crate) mod token; use token::Token; pub struct Lexer<'a> { cursor: Cursor<'a>, + history: Vec, + offset: usize, token_line: usize, token_col: usize, } @@ -39,7 +41,13 @@ impl Iterator for Lexer<'_> { type Item = Result; fn next(&mut self) -> Option> { - Some(match self.cursor.next()? { + if self.offset > 0 { + let tmp = self.history[self.history.len() - self.offset]; + self.offset -= 1; + return Some(Ok(tmp)); + } + + let result = match self.cursor.next()? 
{ c if c.is_ascii_whitespace() => { self.cursor.skip_whitespace(); self.cursor.chop(); @@ -47,11 +55,18 @@ impl Iterator for Lexer<'_> { } ',' => self.token_ok(token::Kind::Comma), ';' => self.token_ok(token::Kind::Semi), - '=' => self.token_ok(token::Kind::Eq), '{' => self.token_ok(token::Kind::OBrace), '}' => self.token_ok(token::Kind::CBrace), '[' => self.token_ok(token::Kind::OBracket), ']' => self.token_ok(token::Kind::CBracket), + + '=' => self.token_ok(token::Kind::Eq), + '+' => self.token_ok(token::Kind::Plus), + '-' => self.token_ok(token::Kind::Minus), + '*' => self.token_ok(token::Kind::Asterisk), + '/' => self.token_ok(token::Kind::Slash), + '%' => self.token_ok(token::Kind::Percent), + '#' => self.read_comment(), '"' => self.read_string_literal(), '0' => self.read_prefix_int_literal(), @@ -59,7 +74,12 @@ impl Iterator for Lexer<'_> { _c @ 'A'..='Z' => self.read_ident(), _c @ 'a'..='z' => self.read_keyword_or_ident(), // keywords are always lowercase c => self.syntax_error(format!("Unexpected character '{}'", c)), - }) + }; + + if let Ok(token) = result { + self.history.push(token); + } + Some(result) } } @@ -67,11 +87,43 @@ impl<'a> Lexer<'a> { pub fn new(stream: Chars<'a>) -> Lexer<'a> { Lexer { cursor: Cursor::new(stream), + history: Vec::new(), + offset: 0, token_line: 1, token_col: 1, } } + pub fn peek(&mut self) -> Option> { + let t = self.next()?; + self.prev(); + Some(t) + } + + pub fn prev(&mut self) -> Option<&Token> { + let prev = self.history.last()?; + self.offset += 1; + Some(prev) + } + + pub fn expect_kind(&mut self, kind: token::Kind) -> Result { + match self.next() { + Some(t) => if t?.kind == kind { + Ok(t?) 
+ } else { + self.syntax_error(format!("Expected {}, got {}", kind, t?.kind)) + } + None => self.syntax_error("Unexpected EOF"), + } + } + + pub fn require_next(&mut self) -> Result { + match self.next() { + Some(t) => t, + None => self.syntax_error("Unexpected EOF"), + } + } + fn read_keyword_or_ident(&mut self) -> Result { let current = self.cursor.current().unwrap(); for kw in &KEYWORDS { @@ -124,7 +176,7 @@ impl<'a> Lexer<'a> { Some('o') => self.read_int_literal(8), Some('b') => self.read_int_literal(2), Some(c) => self.syntax_error(format!("Unexpected character '{}'", c)), - None => self.syntax_error(String::from("Unexpected end-of-file")), + None => self.syntax_error("Unexpected end-of-file"), } } @@ -175,7 +227,7 @@ impl<'a> Lexer<'a> { true } - fn syntax_error(&mut self, msg: String) -> Result { + fn syntax_error(&mut self, msg: &str) -> Result { Err(SyntaxError { line: self.cursor.line(), col: self.cursor.col(), diff --git a/src/lex/token.rs b/src/lex/token.rs index ae749c1..3f971e9 100644 --- a/src/lex/token.rs +++ b/src/lex/token.rs @@ -18,17 +18,23 @@ impl fmt::Display for Token { } } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq)] pub enum Kind { Ident, OBrace, CBrace, OBracket, CBracket, - Eq, Comma, Semi, + Eq, + Plus, + Minus, + Asterisk, + Slash, + Percent, + DependKeyword, IncludeKeyword, ModuleKeyword, @@ -52,10 +58,16 @@ impl fmt::Display for Kind { Kind::CBrace => "cbrace", Kind::OBracket => "obracket", Kind::CBracket => "cbracket", - Kind::Eq => "eq", Kind::Comma => "comma", Kind::Semi => "semi", + Kind::Eq => "eq", + Kind::Plus => "plus", + Kind::Minus => "minus", + Kind::Asterisk => "asterisk", + Kind::Slash => "slash", + Kind::Percent => "percent", + Kind::DependKeyword => "keyword", Kind::IncludeKeyword => "keyword", Kind::ModuleKeyword => "keyword",