use std::fmt;
use std::str::Chars;
mod cursor;
use cursor::Cursor;
pub(crate) mod token;
use token::Token;
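
/// A hand-written lexer over a character [`Cursor`] that produces [`Token`]s
/// on demand and records them in a small history so callers can peek at or
/// un-read tokens.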
pub struct Lexer<'a> {
    cursor: Cursor<'a>,
    /// Tokens already produced, kept so `peek`/`prev` can replay them.
    history: Vec<Token>,
    /// How many history entries still have to be replayed by `next`.
    offset: usize,
    token_line: usize,
    token_col: usize,
}

/// Digits in value order; the slice `[0..base]` covers the digits that are
/// valid in a given base.
static NUMERALS: [char; 16] = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
];

struct KeywordMap {
    raw: &'static str,
    kind: token::Kind,
}

const fn kw(raw: &'static str, kind: token::Kind) -> KeywordMap {
    KeywordMap { raw, kind }
}

static KEYWORDS: [KeywordMap; 6] = [
    kw("depend", token::Kind::DependKeyword),
    kw("include", token::Kind::IncludeKeyword),
    kw("module", token::Kind::ModuleKeyword),
    kw("set", token::Kind::SetKeyword),
    kw("source", token::Kind::SourceKeyword),
    kw("type", token::Kind::TypeKeyword),
];
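
// Tokens are produced lazily through this `Iterator` impl. Every token that
// is successfully lexed is also pushed onto `history`, which is what allows
// `peek` and `prev` to hand tokens back without re-reading the input.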
impl Iterator for Lexer<'_> {
    type Item = Result<Token, SyntaxError>;

    fn next(&mut self) -> Option<Result<Token, SyntaxError>> {
        // Replay tokens that were un-read via `prev`. (`Token` needs to be
        // `Clone` so history entries can be handed out again.)
        if self.offset > 0 {
            let tmp = self.history[self.history.len() - self.offset].clone();
            self.offset -= 1;
            return Some(Ok(tmp));
        }
        let result = match self.cursor.next()? {
            c if c.is_ascii_whitespace() => {
                self.cursor.skip_whitespace();
                self.cursor.chop();
                // Whitespace yields no token; return the next real one
                // directly so it is only recorded in the history once.
                return self.next();
            }
            ',' => self.token_ok(token::Kind::Comma),
            ';' => self.token_ok(token::Kind::Semi),
            '{' => self.token_ok(token::Kind::OBrace),
            '}' => self.token_ok(token::Kind::CBrace),
            '[' => self.token_ok(token::Kind::OBracket),
            ']' => self.token_ok(token::Kind::CBracket),
            '=' => self.token_ok(token::Kind::Eq),
            '+' => self.token_ok(token::Kind::Plus),
            '-' => self.token_ok(token::Kind::Minus),
            '*' => self.token_ok(token::Kind::Asterisk),
            '/' => self.token_ok(token::Kind::Slash),
            '%' => self.token_ok(token::Kind::Percent),
            '#' => self.read_comment(),
            '"' => self.read_string_literal(),
            '0' => self.read_prefix_int_literal(),
            '1'..='9' => self.read_int_literal(10),
            'A'..='Z' => self.read_ident(),
            'a'..='z' => self.read_keyword_or_ident(), // keywords are always lowercase
            c => self.syntax_error(format!("Unexpected character '{}'", c)),
        };
        if let Ok(token) = &result {
            self.history.push(token.clone());
        }
        Some(result)
    }
}

impl<'a> Lexer<'a> {
    pub fn new(stream: Chars<'a>) -> Lexer<'a> {
        Lexer {
            cursor: Cursor::new(stream),
            history: Vec::new(),
            offset: 0,
            token_line: 1,
            token_col: 1,
        }
    }

    /// Returns the next token without consuming it.
    pub fn peek(&mut self) -> Option<Result<Token, SyntaxError>> {
        let t = self.next()?;
        // Errors are never recorded in the history, so only rewind when a
        // token was actually produced.
        if t.is_ok() {
            self.prev();
        }
        Some(t)
    }

    /// Un-reads the most recently consumed token so that the next call to
    /// `next` returns it again.
    pub fn prev(&mut self) -> Option<&Token> {
        // Never rewind past the start of the recorded history.
        if self.offset >= self.history.len() {
            return None;
        }
        self.offset += 1;
        Some(&self.history[self.history.len() - self.offset])
    }

    /// Consumes the next token and checks that it has the expected kind.
    pub fn expect_kind(&mut self, kind: token::Kind) -> Result<Token, SyntaxError> {
        match self.next() {
            Some(t) => {
                let t = t?;
                if t.kind == kind {
                    Ok(t)
                } else {
                    self.syntax_error(format!("Expected {}, got {}", kind, t.kind))
                }
            }
            None => self.syntax_error("Unexpected EOF"),
        }
    }

    /// Consumes the next token, turning end-of-input into a syntax error.
    pub fn require_next(&mut self) -> Result<Token, SyntaxError> {
        match self.next() {
            Some(t) => t,
            None => self.syntax_error("Unexpected EOF"),
        }
    }

    fn read_keyword_or_ident(&mut self) -> Result<Token, SyntaxError> {
        let current = self.cursor.current().unwrap();
        for kw in &KEYWORDS {
            // Keywords are always at least 2 characters long as per the
            // language spec, so compare the first character here and the
            // rest through `skip_if_match`.
            let first_char = kw.raw.chars().next().unwrap();
            if current == first_char && self.skip_if_match(&kw.raw[1..]) {
                return self.token_ok(kw.kind);
            }
        }
        self.read_ident()
    }

    fn read_ident(&mut self) -> Result<Token, SyntaxError> {
        // Consume alphanumerics and underscores, then step back off the
        // first character that is not part of the identifier.
        for c in &mut self.cursor {
            if !c.is_ascii_alphanumeric() && c != '_' {
                self.cursor.prev();
                break;
            }
        }
        self.token_ok(token::Kind::Ident)
    }

    fn read_comment(&mut self) -> Result<Token, SyntaxError> {
        // Comments run from `#` to the end of the line.
        self.cursor.seek_while(|c| c != '\n');
        self.token_ok(token::Kind::Comment)
    }

    fn read_string_literal(&mut self) -> Result<Token, SyntaxError> {
        assert!(self.cursor.current() == Some('"'));
        self.cursor.chop();
        let mut raw = String::new();
        for c in &mut self.cursor {
            if c == '"' {
                self.cursor.chop();
                return self.token_raw_ok(token::Kind::StringLiteral, raw);
            }
            raw.push(c);
        }
        // The input ended before the closing quote was found.
        self.syntax_error("Unterminated string literal")
    }

    fn read_prefix_int_literal(&mut self) -> Result<Token, SyntaxError> {
        // The leading `0` has already been consumed by the caller.
        assert!(self.cursor.current() == Some('0'));
        match self.cursor.next() {
            Some('x') => self.read_int_literal(16),
            Some('o') => self.read_int_literal(8),
            Some('b') => self.read_int_literal(2),
            Some(c) => self.syntax_error(format!("Unexpected character '{}'", c)),
            None => self.syntax_error("Unexpected end-of-file"),
        }
    }

    fn read_int_literal(&mut self, base: usize) -> Result<Token, SyntaxError> {
        assert!((2..=16).contains(&base));
        // Consume digits that are valid in `base`, then step back off the
        // first character that is not one of them.
        for c in &mut self.cursor {
            if !NUMERALS[0..base].contains(&c.to_ascii_lowercase()) {
                self.cursor.prev();
                break;
            }
        }
        self.token_ok(token::Kind::IntLiteral)
    }

    /// Builds a token positioned at the start of the current lexeme and
    /// advances the recorded start position to the cursor's location.
    fn token(&mut self, kind: token::Kind, raw: String) -> Token {
        let t = Token {
            kind,
            line: self.token_line,
            col: self.token_col,
            raw,
        };
        self.token_line = self.cursor.line();
        self.token_col = self.cursor.col();
        t
    }

    fn token_ok<T>(&mut self, kind: token::Kind) -> Result<Token, T> {
        let raw = self.cursor.chop();
        Ok(self.token(kind, raw))
    }

    fn token_raw_ok<T>(&mut self, kind: token::Kind, raw: String) -> Result<Token, T> {
        Ok(self.token(kind, raw))
    }

    /// Consumes `s` if it exactly matches the upcoming input; otherwise the
    /// cursor is rewound and the input is left untouched.
    fn skip_if_match(&mut self, s: &str) -> bool {
        let mut n: usize = 0;
        for c in s.chars() {
            match self.cursor.next() {
                Some(actual) if actual == c => n += 1,
                Some(_) => {
                    // The mismatched character was consumed too, so rewind
                    // past it as well as the matched prefix.
                    self.cursor.seek_back(n + 1);
                    return false;
                }
                None => {
                    self.cursor.seek_back(n);
                    return false;
                }
            }
        }
        true
    }

    fn syntax_error<T>(&mut self, msg: impl Into<String>) -> Result<T, SyntaxError> {
        Err(SyntaxError {
            line: self.cursor.line(),
            col: self.cursor.col(),
            msg: msg.into(),
        })
    }
}

#[derive(Debug)]
pub struct SyntaxError {
    pub line: usize,
    pub col: usize,
    pub msg: String,
}

impl fmt::Display for SyntaxError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "SyntaxError in {}:{}: {}", self.line, self.col, self.msg)
    }
}
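
// A minimal usage sketch, not part of the original file: it lexes one short
// statement and checks the kinds of the resulting tokens. The sample input is
// an assumption about the language; the test also relies on `token::Kind`
// implementing `PartialEq` (it is already compared with `==` in `expect_kind`
// above) and on `Cursor` behaving as it is used throughout this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lexes_a_simple_set_statement() {
        let src = "set answer = 0x2a; # trailing comment";
        let mut kinds = Vec::new();
        for result in Lexer::new(src.chars()) {
            kinds.push(result.expect("sample input should lex cleanly").kind);
        }
        assert!(
            kinds
                == vec![
                    token::Kind::SetKeyword,
                    token::Kind::Ident,
                    token::Kind::Eq,
                    token::Kind::IntLiteral,
                    token::Kind::Semi,
                    token::Kind::Comment,
                ]
        );
    }
}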