add lexer and test file
There are most likely going to be a lot of changes until we reach the final syntax, but this is a good start, I think.
parent 0557369397
commit 8ae3bb2f57
6 changed files with 365 additions and 62 deletions
@@ -2,6 +2,5 @@
 name = "gaybuild"
 version = "0.1.0"
 edition = "2021"
-author = "anna <owo@fef.moe>"
 
 [dependencies]
@@ -3,12 +3,42 @@ use std::str::Chars;
 /// A cursor for iterating over individual characters in a stream.
 /// Supports backwards seeking.
 pub struct Cursor<'a> {
-    stream: Chars<'a>, // where we pull our characters from
-    offset: usize, // how many chars we are behind the stream position (when seeking back)
-    history: Vec<char>, // complete list of all characters we've read so far
-    line_lengths: Vec<usize>, // length of all previous lines (for seeking back)
-    line: usize, // current line (starting from 1)
-    col: usize, // current column in line (starting from 1)
+    stream: Chars<'a>,        // where we pull our characters from
+    offset: usize,            // how many chars we are behind the stream position (when seeking back)
+    history: Vec<char>,       // complete list of all characters we've read so far
+    line_lengths: Vec<usize>, // length of all previous lines (for seeking back)
+    line: usize,              // current line (starting from 1)
+    col: usize,               // current column in line (starting from 1)
+    pos: usize,               // current position in the whole stream
+    chop: usize,              // start of the current chop window (see chop())
+    current: Option<char>,    // character most recently returned by next()
+}
+
+impl Iterator for Cursor<'_> {
+    type Item = char;
+
+    fn next(&mut self) -> Option<char> {
+        let c = if self.offset > 0 {
+            let tmp = self.history[self.history.len() - self.offset];
+            self.offset -= 1;
+            tmp
+        } else {
+            let tmp = self.stream.next()?;
+            self.history.push(tmp);
+            tmp
+        };
+
+        self.current = Some(c);
+
+        if c == '\n' {
+            self.new_line();
+        } else {
+            self.col += 1;
+        }
+        self.pos += 1;
+
+        Some(c)
+    }
 }
 
 impl<'a> Cursor<'a> {
@@ -20,29 +50,12 @@ impl<'a> Cursor<'a> {
             line_lengths: Vec::new(),
             line: 1,
             col: 0, // increments in first call to next()
+            pos: 0,
+            chop: 0,
+            current: None,
         }
     }
 
-    /// Advance the cursor by a single character.
-    pub fn next(&mut self) -> Option<char> {
-        let c = if self.offset > 0 {
-            self.offset -= 1;
-            self.history[self.history.len() - self.offset]
-        } else {
-            let tmp = self.stream.next()?;
-            self.history.push(tmp);
-            tmp
-        };
-
-        if c == '\n' {
-            self.new_line();
-        } else {
-            self.col += 1;
-        }
-
-        Some(c)
-    }
-
     /// Reverse the cursor by a single character.
     pub fn prev(&mut self) -> Option<char> {
         if self.history.len() == 0 {
@@ -55,23 +68,11 @@ impl<'a> Cursor<'a> {
             } else {
                 self.col -= 1;
             }
+            self.pos -= 1;
             Some(c)
         }
     }
 
-    /// Seek forward and return all characters that were encountered.
-    pub fn seek(&mut self, n: usize) -> Vec<char> {
-        // TODO: implement this properly
-        let mut v = Vec::with_capacity(n);
-        for _ in 0..n {
-            match self.next() {
-                Some(c) => v.push(c),
-                None => break,
-            }
-        }
-        v
-    }
-
     /// Seek backward and return all characters that were encountered.
     pub fn seek_back(&mut self, n: usize) -> Vec<char> {
         // TODO: implement this properly as well
@@ -87,7 +88,7 @@ impl<'a> Cursor<'a> {
     }
 
     /// Seek forward until the `test` callback returns false.
-    pub fn seek_until(&mut self, test: fn(c: char) -> bool) -> Vec<char> {
+    pub fn seek_while(&mut self, test: fn(c: char) -> bool) -> Vec<char> {
         let mut v = Vec::new();
         while let Some(c) = self.peek() {
             if test(c) {
@@ -100,21 +101,6 @@ impl<'a> Cursor<'a> {
         v
     }
 
-    /// Seek backward until the test callback returns false.
-    pub fn seek_back_until(&mut self, test: fn(c: char) -> bool) -> Vec<char> {
-        let mut v = Vec::new();
-        while let Some(c) = self.peek_back() {
-            if test(c) {
-                v.push(c);
-                self.prev();
-            } else {
-                break;
-            }
-        }
-        v.reverse(); // TODO: again, probably not ideal
-        v
-    }
-
     /// Return the next character without actually advancing the cursor.
     pub fn peek(&mut self) -> Option<char> {
         if self.offset > 0 {
@@ -126,13 +112,15 @@ impl<'a> Cursor<'a> {
         }
     }
 
-    /// Return the previous character without actually reversing the cursor.
-    pub fn peek_back(&self) -> Option<char> {
-        self.history.last().and_then(|c| Some(*c))
+    pub fn skip_whitespace(&mut self) {
+        self.seek_while(|c| c.is_ascii_whitespace());
     }
 
-    pub fn skip_whitespace(&mut self) {
-        self.seek_until(|c| !c.is_ascii_whitespace());
+    pub fn chop(&mut self) -> String {
+        assert!(self.pos >= self.chop);
+        let s = String::from_iter(self.history[self.chop..self.pos].into_iter());
+        self.chop = self.pos;
+        s
     }
 
     /// Return the line number (starting from 1) of the last
@@ -149,6 +137,10 @@ impl<'a> Cursor<'a> {
         self.col
     }
 
+    pub fn current(&self) -> Option<char> {
+        self.current
+    }
+
     fn new_line(&mut self) {
         self.line_lengths.push(self.col);
         self.col = 0;
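For reference, a minimal sketch (not part of this commit) of how the new cursor API could be exercised from a unit test inside src/lex/cursor.rs. It assumes the Cursor::new(Chars) constructor that Lexer::new calls below, and only touches methods visible in this diff (seek_while, chop, skip_whitespace, peek):

#[cfg(test)]
mod tests {
    use super::Cursor;

    #[test]
    fn seek_and_chop() {
        let mut cur = Cursor::new("set FOO".chars());

        // seek_while() consumes characters as long as the predicate holds
        assert_eq!(cur.seek_while(|c| c.is_ascii_alphabetic()), vec!['s', 'e', 't']);

        // chop() returns everything read since the last chop and resets the mark
        assert_eq!(cur.chop(), "set");

        // skip_whitespace() advances past the blank; peek() does not advance
        cur.skip_whitespace();
        assert_eq!(cur.peek(), Some('F'));
    }
}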
203 src/lex/mod.rs
@@ -1 +1,204 @@
use std::fmt;
use std::str::Chars;

mod cursor;
use cursor::Cursor;

mod token;
use token::Token;

pub struct Lexer<'a> {
    cursor: Cursor<'a>,
    token_line: usize,
    token_col: usize,
}

static NUMERALS: [char; 16] = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
];

struct KeywordMap {
    raw: &'static str,
    kind: token::Kind,
}

const fn kw(raw: &'static str, kind: token::Kind) -> KeywordMap {
    KeywordMap { raw, kind }
}

static KEYWORDS: [KeywordMap; 6] = [
    kw("depend", token::Kind::DependKeyword),
    kw("include", token::Kind::IncludeKeyword),
    kw("module", token::Kind::ModuleKeyword),
    kw("set", token::Kind::SetKeyword),
    kw("source", token::Kind::SourceKeyword),
    kw("type", token::Kind::TypeKeyword),
];

impl Iterator for Lexer<'_> {
    type Item = Result<Token, SyntaxError>;

    fn next(&mut self) -> Option<Result<Token, SyntaxError>> {
        Some(match self.cursor.next()? {
            c if c.is_ascii_whitespace() => {
                self.cursor.skip_whitespace();
                self.cursor.chop();
                self.next()?
            }
            ',' => self.token_ok(token::Kind::Comma),
            ';' => self.token_ok(token::Kind::Semi),
            '=' => self.token_ok(token::Kind::Eq),
            '{' => self.token_ok(token::Kind::OBrace),
            '}' => self.token_ok(token::Kind::CBrace),
            '[' => self.token_ok(token::Kind::OBracket),
            ']' => self.token_ok(token::Kind::CBracket),
            '#' => self.read_comment(),
            '"' => self.read_string_literal(),
            '0' => self.read_prefix_int_literal(),
            _c @ '1'..='9' => self.read_int_literal(10),
            _c @ 'A'..='Z' => self.read_ident(),
            _c @ 'a'..='z' => self.read_keyword_or_ident(), // keywords are always lowercase
            c => self.syntax_error(format!("Unexpected character '{}'", c)),
        })
    }
}

impl<'a> Lexer<'a> {
    pub fn new(stream: Chars<'a>) -> Lexer<'a> {
        Lexer {
            cursor: Cursor::new(stream),
            token_line: 1,
            token_col: 1,
        }
    }

    fn read_keyword_or_ident(&mut self) -> Result<Token, SyntaxError> {
        let current = self.cursor.current().unwrap();
        for kw in &KEYWORDS {
            // keywords are always at least 2 characters long as per the language spec
            let first_char = kw.raw.chars().next().unwrap();
            if current == first_char && self.skip_if_match(&kw.raw[1..]) {
                return self.token_ok(kw.kind);
            }
        }

        self.read_ident()
    }

    fn read_ident(&mut self) -> Result<Token, SyntaxError> {
        for c in &mut self.cursor {
            if !c.is_ascii_alphanumeric() && c != '_' {
                self.cursor.prev();
                break;
            }
        }

        self.token_ok(token::Kind::Ident)
    }

    fn read_comment(&mut self) -> Result<Token, SyntaxError> {
        self.cursor.seek_while(|c| c != '\n');
        self.token_ok(token::Kind::Comment)
    }

    fn read_string_literal(&mut self) -> Result<Token, SyntaxError> {
        assert!(self.cursor.current() == Some('"'));
        self.cursor.chop();
        let mut raw = String::new();
        for c in &mut self.cursor {
            if c == '"' {
                self.cursor.chop();
                return self.token_raw_ok(token::Kind::StringLiteral, raw);
            } else {
                raw.push(c);
            }
        }

        self.token_ok(token::Kind::StringLiteral)
    }

    fn read_prefix_int_literal(&mut self) -> Result<Token, SyntaxError> {
        assert!(self.cursor.next() == Some('0'));
        match self.cursor.next() {
            Some('x') => self.read_int_literal(16),
            Some('o') => self.read_int_literal(8),
            Some('b') => self.read_int_literal(2),
            Some(c) => self.syntax_error(format!("Unexpected character '{}'", c)),
            None => self.syntax_error(String::from("Unexpected end-of-file")),
        }
    }

    fn read_int_literal(&mut self, base: usize) -> Result<Token, SyntaxError> {
        assert!(base >= 2 && base <= 16);

        for c in &mut self.cursor {
            if !NUMERALS[0..base].contains(&c.to_ascii_lowercase()) {
                self.cursor.prev();
                break;
            }
        }

        self.token_ok(token::Kind::IntLiteral)
    }

    fn token(&mut self, kind: token::Kind, raw: String) -> Token {
        let t = Token {
            kind,
            line: self.token_line,
            col: self.token_col,
            raw: raw,
        };
        self.token_line = self.cursor.line();
        self.token_col = self.cursor.col();
        t
    }

    fn token_ok<T>(&mut self, kind: token::Kind) -> Result<Token, T> {
        let raw = self.cursor.chop();
        Ok(self.token(kind, raw))
    }

    fn token_raw_ok<T>(&mut self, kind: token::Kind, raw: String) -> Result<Token, T> {
        Ok(self.token(kind, raw))
    }

    fn skip_if_match(&mut self, s: &str) -> bool {
        let mut n: usize = 0;
        for c in s.chars() {
            if self.cursor.next() == Some(c) {
                n += 1;
            } else {
                self.cursor.seek_back(n);
                return false;
            }
        }
        true
    }

    fn syntax_error<T>(&mut self, msg: String) -> Result<T, SyntaxError> {
        Err(SyntaxError {
            line: self.cursor.line(),
            col: self.cursor.col(),
            msg: String::from(msg),
        })
    }
}

#[derive(Debug)]
pub struct SyntaxError {
    pub line: usize,
    pub col: usize,
    pub msg: String,
}

impl fmt::Display for SyntaxError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "SyntaxError in {}:{}: {}",
            self.line,
            self.col,
            self.msg.as_str()
        )
    }
}
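For reference, a minimal sketch (not part of this commit) of driving the Lexer over an in-memory string instead of a file, e.g. from src/main.rs below where use lex::Lexer; is in scope. The dump helper is hypothetical; the expected output follows from the Display impls in src/lex/token.rs:

// Lex a string and print one token (or syntax error) per line.
fn dump(src: &str) {
    for token in Lexer::new(src.chars()) {
        match token {
            Ok(t) => println!("{}", t),
            Err(e) => eprintln!("{}", e),
        }
    }
}

// dump("set BUILD_PREFIX = \"build\";") should print roughly:
//   keyword: "set"
//   ident: "BUILD_PREFIX"
//   eq: "="
//   string: "build"
//   semi: ";"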
72 src/lex/token.rs Normal file
@@ -0,0 +1,72 @@
use std::fmt;

/// A single syntactic element.
#[derive(Debug)]
pub struct Token {
    pub kind: Kind,
    /// line of the first character (starting from 1)
    pub line: usize,
    /// column of the first character (starting from 1)
    pub col: usize,
    /// raw text
    pub raw: String,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}: \"{}\"", self.kind, self.raw)
    }
}

#[derive(Debug, Clone, Copy)]
pub enum Kind {
    Ident,
    OBrace,
    CBrace,
    OBracket,
    CBracket,
    Eq,
    Comma,
    Semi,

    DependKeyword,
    IncludeKeyword,
    ModuleKeyword,
    SetKeyword,
    SourceKeyword,
    TypeKeyword,

    StringLiteral,
    IntLiteral,
    Comment,
}

impl fmt::Display for Kind {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "{}",
            match self {
                Kind::Ident => "ident",
                Kind::OBrace => "obrace",
                Kind::CBrace => "cbrace",
                Kind::OBracket => "obracket",
                Kind::CBracket => "cbracket",
                Kind::Eq => "eq",
                Kind::Comma => "comma",
                Kind::Semi => "semi",

                Kind::DependKeyword => "keyword",
                Kind::IncludeKeyword => "keyword",
                Kind::ModuleKeyword => "keyword",
                Kind::SetKeyword => "keyword",
                Kind::SourceKeyword => "keyword",
                Kind::TypeKeyword => "keyword",

                Kind::StringLiteral => "string",
                Kind::IntLiteral => "int",
                Kind::Comment => "comment",
            }
        )
    }
}

11 src/main.rs
@@ -1,3 +1,12 @@
+use std::fs;
+
+mod lex;
+use lex::Lexer;
+
 fn main() {
-    println!("Hello, world!");
+    let s = fs::read_to_string("test.gaybuild").unwrap();
+    let lexer = Lexer::new(s.chars());
+    for token in lexer {
+        println!("{}", token.unwrap());
+    }
 }

28 test.gaybuild Normal file
@@ -0,0 +1,28 @@
set RUSTC_EXE = "rustc";
set ASM_EXE = "clang";
set CC_EXE = "clang";
set LINK_EXE = "ld.lld";
set BUILD_PREFIX = "build";

module kern {
    type exe;
    depend [
        libk,
        arch,
    ];
    source "kern/lib.rs";
}

module libk {
    type static; # static library
    depend arch;
    source "libk/lib.rs";
}

module arch {
    type static;
    source [
        "arch/lib.rs",
        "arch/**.nasm",
    ];
}