mod cursor;
|
|
use cursor::Cursor;
|
|
|
|
pub(crate) mod token;
|
|
use token::{Position, Token};
|
|
|
|
use crate::error::Error;
|
|
|
|
/// A rewindable tokenizer: yields `Result<Token, Error>` via `Iterator`,
/// and keeps every produced token so the stream can be stepped back
/// (`prev`) or check-pointed (`save`/`restore`) for speculative parsing.
pub struct Lexer {
    // Name of the source file; cloned into every token's/error's `Position`.
    file: String,
    // Character stream over the raw source text.
    cursor: Cursor,
    // All tokens produced so far, in order; enables rewind/replay.
    history: Vec<Token>,
    // How many tokens have been "un-read": while offset > 0, `next`
    // replays `history[history.len() - offset]` instead of lexing.
    offset: usize,
    // Line where the token currently being scanned started.
    token_line: usize,
    // Column where the token currently being scanned started.
    token_col: usize,
}
|
|
|
|
/// A simple bookmark for restoring the lexer's state in case of a failed lookahead.
///
/// Created by [`Lexer::save`] and consumed by [`Lexer::restore`].
pub struct Bookmark {
    /// Index within `Lexer.history` of the next token to be (re)emitted.
    index: usize,
}
|
|
|
|
// Digit characters in value order. The first `base` entries are exactly the
// digits valid in that base, so `NUMERALS[0..base]` is the digit set used by
// `read_int_literal` (bases 2..=16). Lowercase only: candidates are
// lowercased before the lookup.
static NUMERALS: [char; 16] = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
];
|
|
|
|
/// One entry of the `KEYWORDS` table: a keyword's spelling and the token
/// kind it lexes to.
struct KeywordMap {
    // The keyword's source spelling (at least 2 chars — asserted in `kw`).
    raw: &'static str,
    // Token kind emitted when this keyword is matched.
    kind: token::Kind,
}
|
|
|
|
const fn kw(raw: &'static str, kind: token::Kind) -> KeywordMap {
|
|
assert!(raw.len() >= 2);
|
|
KeywordMap { raw, kind }
|
|
}
|
|
|
|
// Keyword lookup table, probed in order by `read_keyword_or_ident`.
// Entries must be lowercase (keyword matching is only attempted for tokens
// starting with 'a'..='z') and at least two characters long (enforced by
// `kw` at compile time).
// NOTE(review): the matcher bails out after the first partial match, which
// assumes no keyword is a prefix of another keyword — true for this set;
// re-verify when adding entries.
static KEYWORDS: [KeywordMap; 10] = [
    kw("depend", token::Kind::DependKeyword),
    kw("else", token::Kind::ElseKeyword),
    kw("false", token::Kind::FalseKeyword),
    kw("if", token::Kind::IfKeyword),
    kw("include", token::Kind::IncludeKeyword),
    kw("set", token::Kind::SetKeyword),
    kw("source", token::Kind::SourceKeyword),
    kw("target", token::Kind::TargetKeyword),
    kw("true", token::Kind::TrueKeyword),
    kw("type", token::Kind::TypeKeyword),
];
|
|
|
|
impl Iterator for Lexer {
|
|
type Item = Result<Token, Error>;
|
|
|
|
fn next(&mut self) -> Option<Result<Token, Error>> {
|
|
if self.offset > 0 {
|
|
let tmp = self.history[self.history.len() - self.offset].clone();
|
|
self.offset -= 1;
|
|
return Some(Ok(tmp));
|
|
}
|
|
|
|
let result = match self.cursor.next()? {
|
|
c if c.is_ascii_whitespace() => {
|
|
self.cursor.skip_whitespace();
|
|
self.cursor.chop();
|
|
self.token_line = self.cursor.line();
|
|
self.token_col = self.cursor.col();
|
|
self.next()?
|
|
}
|
|
',' => self.token_ok(token::Kind::Comma),
|
|
';' => self.token_ok(token::Kind::Semi),
|
|
'{' => self.token_ok(token::Kind::OBrace),
|
|
'}' => self.token_ok(token::Kind::CBrace),
|
|
'[' => self.token_ok(token::Kind::OBracket),
|
|
']' => self.token_ok(token::Kind::CBracket),
|
|
'(' => self.token_ok(token::Kind::OParen),
|
|
')' => self.token_ok(token::Kind::CParen),
|
|
|
|
'=' if self.skip_if_match("=") => self.token_ok(token::Kind::EqEq),
|
|
'=' => self.token_ok(token::Kind::Eq),
|
|
'!' if self.skip_if_match("=") => self.token_ok(token::Kind::BangEq),
|
|
'!' => self.token_ok(token::Kind::Bang),
|
|
'>' if self.skip_if_match(">=") => self.token_ok(token::Kind::GtGtEq),
|
|
'>' if self.skip_if_match("=") => self.token_ok(token::Kind::GtEq),
|
|
'>' if self.skip_if_match("<") => self.token_ok(token::Kind::GtGt),
|
|
'>' => self.token_ok(token::Kind::Gt),
|
|
'<' if self.skip_if_match("<=") => self.token_ok(token::Kind::LtLtEq),
|
|
'<' if self.skip_if_match("=") => self.token_ok(token::Kind::LtEq),
|
|
'<' if self.skip_if_match("<") => self.token_ok(token::Kind::LtLt),
|
|
'<' => self.token_ok(token::Kind::Lt),
|
|
'|' if self.skip_if_match("|=") => self.token_ok(token::Kind::PipePipeEq),
|
|
'|' if self.skip_if_match("|") => self.token_ok(token::Kind::PipePipe),
|
|
'|' if self.skip_if_match("=") => self.token_ok(token::Kind::PipeEq),
|
|
'|' => self.token_ok(token::Kind::Pipe),
|
|
'&' if self.skip_if_match("&=") => self.token_ok(token::Kind::AmpAmpEq),
|
|
'&' if self.skip_if_match("&") => self.token_ok(token::Kind::AmpAmp),
|
|
'&' if self.skip_if_match("=") => self.token_ok(token::Kind::AmpEq),
|
|
'&' => self.token_ok(token::Kind::Amp),
|
|
'^' if self.skip_if_match("=") => self.token_ok(token::Kind::CaretEq),
|
|
'^' => self.token_ok(token::Kind::Caret),
|
|
|
|
'+' if self.skip_if_match("=") => self.token_ok(token::Kind::PlusEq),
|
|
'+' => self.token_ok(token::Kind::Plus),
|
|
'-' if self.skip_if_match("=") => self.token_ok(token::Kind::MinusEq),
|
|
'-' => self.token_ok(token::Kind::Minus),
|
|
'*' if self.skip_if_match("=") => self.token_ok(token::Kind::AsteriskEq),
|
|
'*' => self.token_ok(token::Kind::Asterisk),
|
|
'/' if self.skip_if_match("=") => self.token_ok(token::Kind::SlashEq),
|
|
'/' => self.token_ok(token::Kind::Slash),
|
|
'%' if self.skip_if_match("=") => self.token_ok(token::Kind::PercentEq),
|
|
'%' => self.token_ok(token::Kind::Percent),
|
|
|
|
'#' => {
|
|
// this can't fail
|
|
self.read_comment().unwrap();
|
|
// we don't need comments for now and they would
|
|
// only confuse the parser so let's just Not
|
|
self.next()?
|
|
}
|
|
'"' => self.read_string_literal(),
|
|
'0' => self.read_prefix_int_literal(),
|
|
_c @ '1'..='9' => self.read_int_literal(10),
|
|
_c @ 'A'..='Z' => self.read_ident(),
|
|
_c @ 'a'..='z' => self.read_keyword_or_ident(), // keywords are always lowercase
|
|
c => self.syntax_error(format!("Unexpected character '{}'", c)),
|
|
};
|
|
|
|
Some(result)
|
|
}
|
|
}
|
|
|
|
impl Lexer {
|
|
pub fn new(filename: String, raw: String) -> Lexer {
|
|
Lexer {
|
|
file: filename,
|
|
cursor: Cursor::new(raw),
|
|
history: Vec::new(),
|
|
offset: 0,
|
|
token_line: 1,
|
|
token_col: 1,
|
|
}
|
|
}
|
|
|
|
/// Save the lexer's state to perform a speculative lookahead.
|
|
pub fn save(&mut self) -> Bookmark {
|
|
Bookmark {
|
|
index: self.history.len() - self.offset,
|
|
}
|
|
}
|
|
|
|
/// Restore the lexer's state in case of a failed lookahead.
|
|
pub fn restore(&mut self, bookmark: Bookmark) {
|
|
self.offset = self.history.len() - bookmark.index;
|
|
}
|
|
|
|
pub fn current(&self) -> Option<&Token> {
|
|
if self.history.len() > 0 {
|
|
Some(&self.history[self.history.len() - self.offset - 1])
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
pub fn peek(&mut self) -> Option<Result<Token, Error>> {
|
|
let t = self.next()?;
|
|
self.prev();
|
|
Some(t)
|
|
}
|
|
|
|
pub fn peek_or_err(&mut self) -> Result<Token, Error> {
|
|
let token = self.require_next()?;
|
|
self.prev();
|
|
Ok(token)
|
|
}
|
|
|
|
pub fn prev(&mut self) -> Option<&Token> {
|
|
if self.offset < self.history.len() - 1 {
|
|
self.offset += 1;
|
|
let prev = &self.history[self.history.len() - self.offset];
|
|
Some(prev)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
pub fn expect_kind(&mut self, kind: token::Kind) -> Result<Token, Error> {
|
|
self.expect_kinds(&[kind])
|
|
}
|
|
|
|
pub fn expect_kinds(&mut self, kinds: &[token::Kind]) -> Result<Token, Error> {
|
|
match self.next() {
|
|
Some(Ok(t)) => {
|
|
if kinds.contains(&t.kind) {
|
|
Ok(t)
|
|
} else {
|
|
self.syntax_error(if kinds.len() == 1 {
|
|
format!("Expected {}, got {}", kinds[0], t.kind)
|
|
} else {
|
|
format!("Expected one of {:?}, got {}", kinds, t.kind)
|
|
})
|
|
}
|
|
}
|
|
Some(Err(e)) => Err(e),
|
|
None => self.syntax_error(String::from("Unexpected EOF")),
|
|
}
|
|
}
|
|
|
|
pub fn require_next(&mut self) -> Result<Token, Error> {
|
|
match self.next() {
|
|
Some(t) => t,
|
|
None => self.syntax_error(String::from("Unexpected EOF")),
|
|
}
|
|
}
|
|
|
|
fn read_keyword_or_ident(&mut self) -> Result<Token, Error> {
|
|
let current = self.cursor.current().unwrap();
|
|
for kw in &KEYWORDS {
|
|
// keywords are always at least 2 characters long as per the language spec
|
|
let first_char = kw.raw.chars().next().unwrap();
|
|
if current == first_char && self.skip_if_match(&kw.raw[1..]) {
|
|
// We need to account for identifiers that just happen to start
|
|
// with a keyword name. For example, "settings" begins with the
|
|
// "set" keyword, but is obviously still an identifier.
|
|
let is_really_a_keyword = match self.cursor.peek() {
|
|
Some(c) if c.is_ident_part() => false,
|
|
_ => true,
|
|
};
|
|
if is_really_a_keyword {
|
|
return self.token_ok(kw.kind);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
self.read_ident()
|
|
}
|
|
|
|
fn read_ident(&mut self) -> Result<Token, Error> {
|
|
for c in &mut self.cursor {
|
|
if !c.is_ident_part() {
|
|
self.cursor.prev();
|
|
break;
|
|
}
|
|
}
|
|
|
|
self.token_ok(token::Kind::Ident)
|
|
}
|
|
|
|
fn read_comment(&mut self) -> Result<Token, Error> {
|
|
assert_eq!(self.cursor.current(), Some('#'));
|
|
self.cursor.seek_while(|c| c != '\n');
|
|
self.token_ok(token::Kind::Comment)
|
|
}
|
|
|
|
fn read_string_literal(&mut self) -> Result<Token, Error> {
|
|
assert_eq!(self.cursor.current(), Some('"'));
|
|
self.cursor.chop();
|
|
let mut raw = String::new();
|
|
for c in &mut self.cursor {
|
|
// TODO: string escape sequences are a thing
|
|
if c == '"' {
|
|
self.cursor.chop();
|
|
return self.token_raw_ok(token::Kind::StringLiteral, raw);
|
|
} else {
|
|
raw.push(c);
|
|
}
|
|
}
|
|
|
|
self.token_ok(token::Kind::StringLiteral)
|
|
}
|
|
|
|
fn read_prefix_int_literal(&mut self) -> Result<Token, Error> {
|
|
assert_eq!(self.cursor.current(), Some('0'));
|
|
match self.cursor.next() {
|
|
Some('x') => self.read_int_literal(16),
|
|
Some('o') => self.read_int_literal(8),
|
|
Some('b') => self.read_int_literal(2),
|
|
Some(c) => self.syntax_error(format!("Unexpected character '{}'", c)),
|
|
None => self.syntax_error(String::from("Unexpected end-of-file")),
|
|
}
|
|
}
|
|
|
|
fn read_int_literal(&mut self, base: usize) -> Result<Token, Error> {
|
|
assert!(base >= 2 && base <= 16);
|
|
|
|
for c in &mut self.cursor {
|
|
if !NUMERALS[0..base].contains(&c.to_ascii_lowercase()) {
|
|
self.cursor.prev();
|
|
break;
|
|
}
|
|
}
|
|
|
|
self.token_ok(token::Kind::IntLiteral)
|
|
}
|
|
|
|
fn token(&mut self, kind: token::Kind, raw: String) -> Token {
|
|
let t = Token {
|
|
kind,
|
|
pos: Position {
|
|
file: self.file.clone(),
|
|
line: self.token_line,
|
|
col: self.token_col,
|
|
},
|
|
raw,
|
|
};
|
|
self.token_line = self.cursor.line();
|
|
self.token_col = self.cursor.col();
|
|
self.history.push(t.clone());
|
|
t
|
|
}
|
|
|
|
fn token_ok<T>(&mut self, kind: token::Kind) -> Result<Token, T> {
|
|
let raw = self.cursor.chop();
|
|
Ok(self.token(kind, raw))
|
|
}
|
|
|
|
fn token_raw_ok<T>(&mut self, kind: token::Kind, raw: String) -> Result<Token, T> {
|
|
Ok(self.token(kind, raw))
|
|
}
|
|
|
|
fn skip_if_match(&mut self, s: &str) -> bool {
|
|
let mut n: usize = 0;
|
|
for c in s.chars() {
|
|
n += 1;
|
|
if self.cursor.next() != Some(c) {
|
|
self.cursor.seek_back(n);
|
|
return false;
|
|
}
|
|
}
|
|
true
|
|
}
|
|
|
|
fn syntax_error<T>(&mut self, msg: String) -> Result<T, Error> {
|
|
Err(Error::syntax_error(
|
|
Position {
|
|
file: self.file.clone(),
|
|
line: self.cursor.line(),
|
|
col: self.cursor.col(),
|
|
},
|
|
msg,
|
|
))
|
|
}
|
|
}
|
|
|
|
/// Character classification for identifiers.
trait IsIdentifier {
    /// May this character begin an identifier? (ASCII letter or `_`.)
    fn is_ident_start(&self) -> bool;
    /// May this character continue an identifier? (Start chars plus digits.)
    fn is_ident_part(&self) -> bool;
}

impl IsIdentifier for char {
    fn is_ident_start(&self) -> bool {
        matches!(self, 'a'..='z' | 'A'..='Z' | '_')
    }

    fn is_ident_part(&self) -> bool {
        // A continuation character is a start character or an ASCII digit.
        self.is_ident_start() || self.is_ascii_digit()
    }
}
|