mod cursor;
|
|
use cursor::Cursor;
|
|
|
|
pub(crate) mod token;
|
|
use token::{Position, Token};
|
|
|
|
use crate::error::Error;
|
|
|
|
/// A rewindable tokenizer: yields `Result<Token, Error>` via `Iterator`,
/// and keeps every produced token so the stream can be stepped back
/// (`prev`) or check-pointed (`save`/`restore`) for speculative parsing.
pub struct Lexer {
    // Name of the source file; cloned into every token's/error's `Position`.
    file: String,
    // Character stream over the raw source text.
    cursor: Cursor,
    // All tokens produced so far, in order; enables rewind/replay.
    history: Vec<Token>,
    // How many tokens have been "un-read": while offset > 0, `next`
    // replays `history[history.len() - offset]` instead of lexing.
    offset: usize,
    // Line where the token currently being scanned started.
    token_line: usize,
    // Column where the token currently being scanned started.
    token_col: usize,
}
|
|
|
|
/// A simple bookmark for restoring the lexer's state in case of a failed lookahead.
///
/// Created by [`Lexer::save`] and consumed by [`Lexer::restore`].
pub struct Bookmark {
    /// Index within `Lexer.history` of the next token to be (re)emitted.
    index: usize,
}
|
|
|
|
// Digit characters in value order. The first `base` entries are exactly the
// digits valid in that base, so `NUMERALS[0..base]` is the digit set used by
// `read_int_literal` (bases 2..=16). Lowercase only: candidates are
// lowercased before the lookup.
static NUMERALS: [char; 16] = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
];
|
|
|
|
/// One entry of the `KEYWORDS` table: a keyword's spelling and the token
/// kind it lexes to.
struct KeywordMap {
    // The keyword's source spelling (at least 2 chars — asserted in `kw`).
    raw: &'static str,
    // Token kind emitted when this keyword is matched.
    kind: token::Kind,
}
|
|
|
|
const fn kw(raw: &'static str, kind: token::Kind) -> KeywordMap {
|
|
assert!(raw.len() >= 2);
|
|
KeywordMap { raw, kind }
|
|
}
|
|
|
|
// Keyword lookup table, probed in order by `read_keyword_or_ident`.
// Entries must be lowercase (keyword matching is only attempted for tokens
// starting with 'a'..='z') and at least two characters long (enforced by
// `kw` at compile time).
// NOTE(review): the matcher bails out after the first partial match, which
// assumes no keyword is a prefix of another keyword — true for this set;
// re-verify when adding entries.
static KEYWORDS: [KeywordMap; 10] = [
    kw("depend", token::Kind::DependKeyword),
    kw("else", token::Kind::ElseKeyword),
    kw("false", token::Kind::FalseKeyword),
    kw("if", token::Kind::IfKeyword),
    kw("include", token::Kind::IncludeKeyword),
    kw("set", token::Kind::SetKeyword),
    kw("source", token::Kind::SourceKeyword),
    kw("target", token::Kind::TargetKeyword),
    kw("true", token::Kind::TrueKeyword),
    kw("type", token::Kind::TypeKeyword),
];
|
|
|
|
impl Iterator for Lexer {
|
|
type Item = Result<Token, Error>;
|
|
|
|
fn next(&mut self) -> Option<Result<Token, Error>> {
|
|
if self.offset > 0 {
|
|
let tmp = self.history[self.history.len() - self.offset].clone();
|
|
self.offset -= 1;
|
|
return Some(Ok(tmp));
|
|
}
|
|
|
|
let result = match self.cursor.next()? {
|
|
c if c.is_ascii_whitespace() => {
|
|
self.cursor.skip_whitespace();
|
|
self.cursor.chop();
|
|
self.token_line = self.cursor.line();
|
|
self.token_col = self.cursor.col();
|
|
self.next()?
|
|
}
|
|
',' => self.token_ok(token::Kind::Comma),
|
|
';' => self.token_ok(token::Kind::Semi),
|
|
'{' => self.token_ok(token::Kind::OBrace),
|
|
'}' => self.token_ok(token::Kind::CBrace),
|
|
'[' => self.token_ok(token::Kind::OBracket),
|
|
']' => self.token_ok(token::Kind::CBracket),
|
|
'(' => self.token_ok(token::Kind::OParen),
|
|
')' => self.token_ok(token::Kind::CParen),
|
|
|
|
'=' if self.skip_if_match("=") => self.token_ok(token::Kind::EqEq),
|
|
'=' => self.token_ok(token::Kind::Eq),
|
|
'!' if self.skip_if_match("=") => self.token_ok(token::Kind::BangEq),
|
|
'!' => self.token_ok(token::Kind::Bang),
|
|
'>' if self.skip_if_match(">=") => self.token_ok(token::Kind::GtGtEq),
|
|
'>' if self.skip_if_match("=") => self.token_ok(token::Kind::GtEq),
|
|
'>' if self.skip_if_match("<") => self.token_ok(token::Kind::GtGt),
|
|
'>' => self.token_ok(token::Kind::Gt),
|
|
'<' if self.skip_if_match("<=") => self.token_ok(token::Kind::LtLtEq),
|
|
'<' if self.skip_if_match("=") => self.token_ok(token::Kind::LtEq),
|
|
'<' if self.skip_if_match("<") => self.token_ok(token::Kind::LtLt),
|
|
'<' => self.token_ok(token::Kind::Lt),
|
|
'|' if self.skip_if_match("|=") => self.token_ok(token::Kind::PipePipeEq),
|
|
'|' if self.skip_if_match("|") => self.token_ok(token::Kind::PipePipe),
|
|
'|' if self.skip_if_match("=") => self.token_ok(token::Kind::PipeEq),
|
|
'|' => self.token_ok(token::Kind::Pipe),
|
|
'&' if self.skip_if_match("&=") => self.token_ok(token::Kind::AmpAmpEq),
|
|
'&' if self.skip_if_match("&") => self.token_ok(token::Kind::AmpAmp),
|
|
'&' if self.skip_if_match("=") => self.token_ok(token::Kind::AmpEq),
|
|
'&' => self.token_ok(token::Kind::Amp),
|
|
'^' if self.skip_if_match("=") => self.token_ok(token::Kind::CaretEq),
|
|
'^' => self.token_ok(token::Kind::Caret),
|
|
|
|
'+' if self.skip_if_match("=") => self.token_ok(token::Kind::PlusEq),
|
|
'+' => self.token_ok(token::Kind::Plus),
|
|
'-' if self.skip_if_match("=") => self.token_ok(token::Kind::MinusEq),
|
|
'-' => self.token_ok(token::Kind::Minus),
|
|
'*' if self.skip_if_match("=") => self.token_ok(token::Kind::AsteriskEq),
|
|
'*' => self.token_ok(token::Kind::Asterisk),
|
|
'/' if self.skip_if_match("=") => self.token_ok(token::Kind::SlashEq),
|
|
'/' => self.token_ok(token::Kind::Slash),
|
|
'%' if self.skip_if_match("=") => self.token_ok(token::Kind::PercentEq),
|
|
'%' => self.token_ok(token::Kind::Percent),
|
|
|
|
'#' => {
|
|
// this can't fail
|
|
self.read_comment().unwrap();
|
|
// we don't need comments for now and they would
|
|
// only confuse the parser so let's just Not
|
|
self.next()?
|
|
}
|
|
'"' => self.read_string_literal(),
|
|
'0' => self.read_prefix_int_literal(),
|
|
_c @ '1'..='9' => self.read_int_literal(10),
|
|
_c @ 'A'..='Z' => self.read_ident(),
|
|
_c @ 'a'..='z' => self.read_keyword_or_ident(), // keywords are always lowercase
|
|
c => self.syntax_error(format!("Unexpected character '{}'", c)),
|
|
};
|
|
|
|
Some(result)
|
|
}
|
|
}
|
|
|
|
impl Lexer {
|
|
pub fn new(filename: String, raw: String) -> Lexer {
|
|
Lexer {
|
|
file: filename,
|
|
cursor: Cursor::new(raw),
|
|
history: Vec::new(),
|
|
offset: 0,
|
|
token_line: 1,
|
|
token_col: 1,
|
|
}
|
|
}
|
|
|
|
/// Save the lexer's state to perform a speculative lookahead.
|
|
pub fn save(&mut self) -> Bookmark {
|
|
Bookmark {
|
|
index: self.history.len() - self.offset,
|
|
}
|
|
}
|
|
|
|
/// Restore the lexer's state in case of a failed lookahead.
|
|
pub fn restore(&mut self, bookmark: Bookmark) {
|
|
self.offset = self.history.len() - bookmark.index;
|
|
}
|
|
|
|
pub fn current(&self) -> Option<&Token> {
|
|
if self.history.len() > 0 {
|
|
Some(&self.history[self.history.len() - self.offset - 1])
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
pub fn peek(&mut self) -> Option<Result<Token, Error>> {
|
|
let t = self.next()?;
|
|
self.prev();
|
|
Some(t)
|
|
}
|
|
|
|
pub fn peek_or_err(&mut self) -> Result<Token, Error> {
|
|
let token = self.require_next()?;
|
|
self.prev();
|
|
Ok(token)
|
|
}
|
|
|
|
pub fn prev(&mut self) -> Option<&Token> {
|
|
if self.offset < self.history.len() - 1 {
|
|
self.offset += 1;
|
|
let prev = &self.history[self.history.len() - self.offset];
|
|
Some(prev)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
pub fn expect_kind(&mut self, kind: token::Kind) -> Result<Token, Error> {
|
|
self.expect_kinds(&[kind])
|
|
}
|
|
|
|
pub fn expect_kinds(&mut self, kinds: &[token::Kind]) -> Result<Token, Error> {
|
|
match self.next() {
|
|
Some(Ok(t)) => {
|
|
if kinds.contains(&t.kind) {
|
|
Ok(t)
|
|
} else {
|
|
self.syntax_error(if kinds.len() == 1 {
|
|
format!("Expected {}, got {}", kinds[0], t.kind)
|
|
} else {
|
|
format!("Expected one of {:?}, got {}", kinds, t.kind)
|
|
})
|
|
}
|
|
}
|
|
Some(Err(e)) => Err(e),
|
|
None => self.syntax_error(String::from("Unexpected EOF")),
|
|
}
|
|
}
|
|
|
|
pub fn require_next(&mut self) -> Result<Token, Error> {
|
|
match self.next() {
|
|
Some(t) => t,
|
|
None => self.syntax_error(String::from("Unexpected EOF")),
|
|
}
|
|
}
|
|
|
|
fn read_keyword_or_ident(&mut self) -> Result<Token, Error> {
|
|
let current = self.cursor.current().unwrap();
|
|
for kw in &KEYWORDS {
|
|
// keywords are always at least 2 characters long as per the language spec
|
|
let first_char = kw.raw.chars().next().unwrap();
|
|
if current == first_char && self.skip_if_match(&kw.raw[1..]) {
|
|
// We need to account for identifiers that just happen to start
|
|
// with a keyword name. For example, "settings" begins with the
|
|
// "set" keyword, but is obviously still an identifier.
|
|
let is_really_a_keyword = match self.cursor.peek() {
|
|
Some(c) if c.is_ident_part() => false,
|
|
_ => true,
|
|
};
|
|
if is_really_a_keyword {
|
|
return self.token_ok(kw.kind);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
self.read_ident()
|
|
}
|
|
|
|
fn read_ident(&mut self) -> Result<Token, Error> {
|
|
for c in &mut self.cursor {
|
|
if !c.is_ident_part() {
|
|
self.cursor.prev();
|
|
break;
|
|
}
|
|
}
|
|
|
|
self.token_ok(token::Kind::Ident)
|
|
}
|
|
|
|
fn read_comment(&mut self) -> Result<Token, Error> {
|
|
assert_eq!(self.cursor.current(), Some('#'));
|
|
self.cursor.seek_while(|c| c != '\n');
|
|
self.token_ok(token::Kind::Comment)
|
|
}
|
|
|
|
fn read_string_literal(&mut self) -> Result<Token, Error> {
|
|
assert_eq!(self.cursor.current(), Some('"'));
|
|
self.cursor.chop();
|
|
let mut raw = String::new();
|
|
for c in &mut self.cursor {
|
|
// TODO: string escape sequences are a thing
|
|
if c == '"' {
|
|
self.cursor.chop();
|
|
return self.token_raw_ok(token::Kind::StringLiteral, raw);
|
|
} else {
|
|
raw.push(c);
|
|
}
|
|
}
|
|
|
|
self.token_ok(token::Kind::StringLiteral)
|
|
}
|
|
|
|
fn read_prefix_int_literal(&mut self) -> Result<Token, Error> {
|
|
assert_eq!(self.cursor.current(), Some('0'));
|
|
match self.cursor.next() {
|
|
Some('x') => self.read_int_literal(16),
|
|
Some('o') => self.read_int_literal(8),
|
|
Some('b') => self.read_int_literal(2),
|
|
Some(c) => self.syntax_error(format!("Unexpected character '{}'", c)),
|
|
None => self.syntax_error(String::from("Unexpected end-of-file")),
|
|
}
|
|
}
|
|
|
|
fn read_int_literal(&mut self, base: usize) -> Result<Token, Error> {
|
|
assert!(base >= 2 && base <= 16);
|
|
|
|
for c in &mut self.cursor {
|
|
if !NUMERALS[0..base].contains(&c.to_ascii_lowercase()) {
|
|
self.cursor.prev();
|
|
break;
|
|
}
|
|
}
|
|
|
|
self.token_ok(token::Kind::IntLiteral)
|
|
}
|
|
|
|
fn token(&mut self, kind: token::Kind, raw: String) -> Token {
|
|
let t = Token {
|
|
kind,
|
|
pos: Position {
|
|
file: self.file.clone(),
|
|
line: self.token_line,
|
|
col: self.token_col,
|
|
},
|
|
raw,
|
|
};
|
|
self.token_line = self.cursor.line();
|
|
self.token_col = self.cursor.col();
|
|
self.history.push(t.clone());
|
|
t
|
|
}
|
|
|
|
fn token_ok<T>(&mut self, kind: token::Kind) -> Result<Token, T> {
|
|
let raw = self.cursor.chop();
|
|
Ok(self.token(kind, raw))
|
|
}
|
|
|
|
fn token_raw_ok<T>(&mut self, kind: token::Kind, raw: String) -> Result<Token, T> {
|
|
Ok(self.token(kind, raw))
|
|
}
|
|
|
|
fn skip_if_match(&mut self, s: &str) -> bool {
|
|
let mut n: usize = 0;
|
|
for c in s.chars() {
|
|
n += 1;
|
|
if self.cursor.next() != Some(c) {
|
|
self.cursor.seek_back(n);
|
|
return false;
|
|
}
|
|
}
|
|
true
|
|
}
|
|
|
|
fn syntax_error<T>(&mut self, msg: String) -> Result<T, Error> {
|
|
Err(Error::syntax_error(
|
|
Position {
|
|
file: self.file.clone(),
|
|
line: self.cursor.line(),
|
|
col: self.cursor.col(),
|
|
},
|
|
msg,
|
|
))
|
|
}
|
|
}
|
|
|
|
/// Character classification for identifiers.
trait IsIdentifier {
    /// May this character begin an identifier? (ASCII letter or `_`.)
    fn is_ident_start(&self) -> bool;
    /// May this character continue an identifier? (Start chars plus digits.)
    fn is_ident_part(&self) -> bool;
}

impl IsIdentifier for char {
    fn is_ident_start(&self) -> bool {
        matches!(self, 'a'..='z' | 'A'..='Z' | '_')
    }

    fn is_ident_part(&self) -> bool {
        // A continuation character is a start character or an ASCII digit.
        self.is_ident_start() || self.is_ascii_digit()
    }
}
|