lex: support backwards seeking and more tokens

The lexer now records every token it encounters internally, so that peeking and other useful operations are possible. This will come in handy for the parser. Also, while I was at it, I added more primitive tokens like + - * / because I'm probably going to need them later anyway.
2022-07-11 15:57:01 +02:00 · 2022-07-11 15:57:01 +02:00 · 68254757a3
parent 8ae3bb2f57
commit 68254757a3
2 changed files with 73 additions and 9 deletions
--- a/src/lex/mod.rs
+++ b/src/lex/mod.rs
@ -4,11 +4,13 @@ use std::str::Chars;
 mod cursor;
 use cursor::Cursor;
-mod token;
+pub(crate) mod token;
 use token::Token;
 /// Lexer that reads characters through a [`Cursor`] and yields [`Token`]s.
 ///
 /// Tokens that were lexed successfully are kept in `history`, which lets the
 /// lexer step backwards and hand out an already-seen token again.
 pub struct Lexer<'a> {
    /// Character source the tokens are read from.
    cursor: Cursor<'a>,
    /// Every token successfully lexed so far, oldest first.
    history: Vec<Token>,
    /// Number of tokens, counted from the end of `history`, that are replayed
    /// before any new input is read. Zero means "read fresh input".
    offset: usize,
    // NOTE(review): presumably the line/column at which the token currently
    // being lexed starts (both are initialised to 1) — confirm against their users.
    token_line: usize,
    token_col: usize,
 }
@ -39,7 +41,13 @@ impl Iterator for Lexer<'_> {
    type Item = Result<Token, SyntaxError>;
    fn next(&mut self) -> Option<Result<Token, SyntaxError>> {
-        Some(match self.cursor.next()? {
+        if self.offset > 0 {
            let tmp = self.history[self.history.len() - self.offset];
            self.offset -= 1;
            return Some(Ok(tmp));
        }
        let result = match self.cursor.next()? {
            c if c.is_ascii_whitespace() => {
                self.cursor.skip_whitespace();
                self.cursor.chop();
@ -47,11 +55,18 @@ impl Iterator for Lexer<'_> {
            }
            ',' => self.token_ok(token::Kind::Comma),
            ';' => self.token_ok(token::Kind::Semi),
            '=' => self.token_ok(token::Kind::Eq),
            '{' => self.token_ok(token::Kind::OBrace),
            '}' => self.token_ok(token::Kind::CBrace),
            '[' => self.token_ok(token::Kind::OBracket),
            ']' => self.token_ok(token::Kind::CBracket),
            '=' => self.token_ok(token::Kind::Eq),
            '+' => self.token_ok(token::Kind::Plus),
            '-' => self.token_ok(token::Kind::Minus),
            '*' => self.token_ok(token::Kind::Asterisk),
            '/' => self.token_ok(token::Kind::Slash),
            '%' => self.token_ok(token::Kind::Percent),
            '#' => self.read_comment(),
            '"' => self.read_string_literal(),
            '0' => self.read_prefix_int_literal(),
@ -59,7 +74,12 @@ impl Iterator for Lexer<'_> {
            _c @ 'A'..='Z' => self.read_ident(),
            _c @ 'a'..='z' => self.read_keyword_or_ident(), // keywords are always lowercase
            c => self.syntax_error(format!("Unexpected character '{}'", c)),
-        })
+        };
        if let Ok(token) = result {
            self.history.push(token);
        }
        Some(result)
    }
 }
@ -67,11 +87,43 @@ impl<'a> Lexer<'a> {
    pub fn new(stream: Chars<'a>) -> Lexer<'a> {
        Lexer {
            cursor: Cursor::new(stream),
            history: Vec::new(),
            offset: 0,
            token_line: 1,
            token_col: 1,
        }
    }
    /// Returns the next token without consuming it.
    ///
    /// On success the lexer is rewound by one token, so the following call to
    /// `next()` yields the same token again. Returns `None` when `next()`
    /// does.
    ///
    /// A syntax error is returned as-is and is *not* rewound: failed tokens
    /// are never recorded in the history, so stepping back after an error
    /// would replay the previously lexed token instead of the error.
    pub fn peek(&mut self) -> Option<Result<Token, SyntaxError>> {
        let result = self.next()?;
        if result.is_ok() {
            // Only a successfully lexed token is in the history and can be un-read.
            self.prev();
        }
        Some(result)
    }
    /// Steps the lexer back by one token and returns the token stepped over.
    ///
    /// The returned token is the one the next call to `next()` will yield
    /// again. Returns `None`, leaving the lexer untouched, when there is no
    /// earlier token to go back to (empty history, or already rewound to the
    /// very first token).
    pub fn prev(&mut self) -> Option<&Token> {
        // `next()` replays `history[len - offset]`, so after this step the
        // replayed token sits at `len - (offset + 1)`. `checked_sub` rejects
        // a rewind past the start instead of letting that index underflow.
        let index = self.history.len().checked_sub(self.offset + 1)?;
        self.offset += 1;
        self.history.get(index)
    }
    /// Consumes the next token and checks that it is of the given `kind`.
    ///
    /// # Errors
    ///
    /// Returns a `SyntaxError` if the lexer fails on the next token, if the
    /// token has a different kind than `kind`, or if the input is exhausted.
    pub fn expect_kind(&mut self, kind: token::Kind) -> Result<Token, SyntaxError> {
        match self.next() {
            Some(result) => {
                // Unwrap the result exactly once; `?` moves out of it, so it
                // cannot be applied to the same value repeatedly.
                let token = result?;
                if token.kind == kind {
                    Ok(token)
                } else {
                    self.syntax_error(&format!("Expected {}, got {}", kind, token.kind))
                }
            }
            None => self.syntax_error("Unexpected EOF"),
        }
    }
    /// Like `next()`, but treats the end of input as a syntax error.
    ///
    /// # Errors
    ///
    /// Passes through any error from `next()`, and returns a `SyntaxError`
    /// when there are no more tokens.
    pub fn require_next(&mut self) -> Result<Token, SyntaxError> {
        if let Some(result) = self.next() {
            return result;
        }
        self.syntax_error("Unexpected EOF")
    }
    fn read_keyword_or_ident(&mut self) -> Result<Token, SyntaxError> {
        let current = self.cursor.current().unwrap();
        for kw in &KEYWORDS {
@ -124,7 +176,7 @@ impl<'a> Lexer<'a> {
            Some('o') => self.read_int_literal(8),
            Some('b') => self.read_int_literal(2),
            Some(c) => self.syntax_error(format!("Unexpected character '{}'", c)),
-            None => self.syntax_error(String::from("Unexpected end-of-file")),
+            None => self.syntax_error("Unexpected end-of-file"),
        }
    }
@ -175,7 +227,7 @@ impl<'a> Lexer<'a> {
        true
    }
-    fn syntax_error<T>(&mut self, msg: String) -> Result<T, SyntaxError> {
+    fn syntax_error<T>(&mut self, msg: &str) -> Result<T, SyntaxError> {
        Err(SyntaxError {
            line: self.cursor.line(),
            col: self.cursor.col(),
--- a/src/lex/token.rs
+++ b/src/lex/token.rs
@ -18,17 +18,23 @@ impl fmt::Display for Token {
    }
 }
-#[derive(Debug, Clone, Copy)]
+#[derive(Debug, Clone, Copy, PartialEq)]
 pub enum Kind {
    Ident,
    OBrace,
    CBrace,
    OBracket,
    CBracket,
    Eq,
    Comma,
    Semi,
    Eq,
    Plus,
    Minus,
    Asterisk,
    Slash,
    Percent,
    DependKeyword,
    IncludeKeyword,
    ModuleKeyword,
@ -52,10 +58,16 @@ impl fmt::Display for Kind {
                Kind::CBrace => "cbrace",
                Kind::OBracket => "obracket",
                Kind::CBracket => "cbracket",
                Kind::Eq => "eq",
                Kind::Comma => "comma",
                Kind::Semi => "semi",
                Kind::Eq => "eq",
                Kind::Plus => "plus",
                Kind::Minus => "minus",
                Kind::Asterisk => "asterisk",
                Kind::Slash => "slash",
                Kind::Percent => "percent",
                Kind::DependKeyword => "keyword",
                Kind::IncludeKeyword => "keyword",
                Kind::ModuleKeyword => "keyword",