diff --git a/Cargo.toml b/Cargo.toml index e126bd7..5babab1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,5 @@ name = "gaybuild" version = "0.1.0" edition = "2021" -author = "anna " [dependencies] diff --git a/src/lex/cursor.rs b/src/lex/cursor.rs index dd069aa..5db8271 100644 --- a/src/lex/cursor.rs +++ b/src/lex/cursor.rs @@ -3,12 +3,42 @@ use std::str::Chars; /// A cursor for iterating over individual characters in a stream. /// Supports backwards seeking. pub struct Cursor<'a> { - stream: Chars<'a>, // where we pull our characters from - offset: usize, // how many chars we are behind the stream position (when seeking back) - history: Vec, // complete list of all characters we've read so far - line_lengths: Vec, // length of all previous lines (for seeking back) - line: usize, // current line (starting from 1) - col: usize, // current column in line (starting from 1) + stream: Chars<'a>, + offset: usize, // where we pull our characters from + history: Vec, // how many chars we are behind the stream position (when seeking back) + line_lengths: Vec, // complete list of all characters we've read so far + line: usize, // length of all previous lines (for seeking back) + col: usize, // current line (starting from 1) + pos: usize, // current column in line (starting from 1) + chop: usize, + current: Option, +} + +impl Iterator for Cursor<'_> { + type Item = char; + + fn next(&mut self) -> Option { + let c = if self.offset > 0 { + let tmp = self.history[self.history.len() - self.offset]; + self.offset -= 1; + tmp + } else { + let tmp = self.stream.next()?; + self.history.push(tmp); + tmp + }; + + self.current = Some(c); + + if c == '\n' { + self.new_line(); + } else { + self.col += 1; + } + self.pos += 1; + + Some(c) + } } impl<'a> Cursor<'a> { @@ -20,29 +50,12 @@ impl<'a> Cursor<'a> { line_lengths: Vec::new(), line: 1, col: 0, // increments in first call to next() + pos: 0, + chop: 0, + current: None, } } - /// Advance the cursor by a single character. - pub fn next(&mut self) -> Option { - let c = if self.offset > 0 { - self.offset -= 1; - self.history[self.history.len() - self.offset] - } else { - let tmp = self.stream.next()?; - self.history.push(tmp); - tmp - }; - - if c == '\n' { - self.new_line(); - } else { - self.col += 1; - } - - Some(c) - } - /// Reverse the cursor by a single character. pub fn prev(&mut self) -> Option { if self.history.len() == 0 { @@ -55,23 +68,11 @@ impl<'a> Cursor<'a> { } else { self.col -= 1; } + self.pos -= 1; Some(c) } } - /// Seek forward and return all characters that were encountered. - pub fn seek(&mut self, n: usize) -> Vec { - // TODO: implement this properly - let mut v = Vec::with_capacity(n); - for _ in 0..n { - match self.next() { - Some(c) => v.push(c), - None => break, - } - } - v - } - /// Seek backward and return all characters that were encountered. pub fn seek_back(&mut self, n: usize) -> Vec { // TODO: implement this properly as well @@ -87,7 +88,7 @@ impl<'a> Cursor<'a> { } /// Seek forward until the `test` callback returns false. - pub fn seek_until(&mut self, test: fn(c: char) -> bool) -> Vec { + pub fn seek_while(&mut self, test: fn(c: char) -> bool) -> Vec { let mut v = Vec::new(); while let Some(c) = self.peek() { if test(c) { @@ -100,21 +101,6 @@ impl<'a> Cursor<'a> { v } - /// Seek backward until the test callback returns false. - pub fn seek_back_until(&mut self, test: fn(c: char) -> bool) -> Vec { - let mut v = Vec::new(); - while let Some(c) = self.peek_back() { - if test(c) { - v.push(c); - self.prev(); - } else { - break; - } - } - v.reverse(); // TODO: again, probably not ideal - v - } - /// Return the next character without actually advancing the cursor. pub fn peek(&mut self) -> Option { if self.offset > 0 { @@ -126,13 +112,15 @@ impl<'a> Cursor<'a> { } } - /// Return the previous character without actually reversing the cursor. - pub fn peek_back(&self) -> Option { - self.history.last().and_then(|c| Some(*c)) + pub fn skip_whitespace(&mut self) { + self.seek_while(|c| c.is_ascii_whitespace()); } - pub fn skip_whitespace(&mut self) { - self.seek_until(|c| !c.is_ascii_whitespace()); + pub fn chop(&mut self) -> String { + assert!(self.pos >= self.chop); + let s = String::from_iter(self.history[self.chop..self.pos].into_iter()); + self.chop = self.pos; + s } /// Return the line number (starting from 1) of the last @@ -149,6 +137,10 @@ impl<'a> Cursor<'a> { self.col } + pub fn current(&self) -> Option { + self.current + } + fn new_line(&mut self) { self.line_lengths.push(self.col); self.col = 0; diff --git a/src/lex/mod.rs b/src/lex/mod.rs index 0e02b79..e6b97f0 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -1 +1,204 @@ +use std::fmt; +use std::str::Chars; + mod cursor; +use cursor::Cursor; + +mod token; +use token::Token; + +pub struct Lexer<'a> { + cursor: Cursor<'a>, + token_line: usize, + token_col: usize, +} + +static NUMERALS: [char; 16] = [ + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', +]; + +struct KeywordMap { + raw: &'static str, + kind: token::Kind, +} + +const fn kw(raw: &'static str, kind: token::Kind) -> KeywordMap { + KeywordMap { raw, kind } +} + +static KEYWORDS: [KeywordMap; 6] = [ + kw("depend", token::Kind::DependKeyword), + kw("include", token::Kind::IncludeKeyword), + kw("module", token::Kind::ModuleKeyword), + kw("set", token::Kind::SetKeyword), + kw("source", token::Kind::SourceKeyword), + kw("type", token::Kind::TypeKeyword), +]; + +impl Iterator for Lexer<'_> { + type Item = Result; + + fn next(&mut self) -> Option> { + Some(match self.cursor.next()? { + c if c.is_ascii_whitespace() => { + self.cursor.skip_whitespace(); + self.cursor.chop(); + self.next()? + } + ',' => self.token_ok(token::Kind::Comma), + ';' => self.token_ok(token::Kind::Semi), + '=' => self.token_ok(token::Kind::Eq), + '{' => self.token_ok(token::Kind::OBrace), + '}' => self.token_ok(token::Kind::CBrace), + '[' => self.token_ok(token::Kind::OBracket), + ']' => self.token_ok(token::Kind::CBracket), + '#' => self.read_comment(), + '"' => self.read_string_literal(), + '0' => self.read_prefix_int_literal(), + _c @ '1'..='9' => self.read_int_literal(10), + _c @ 'A'..='Z' => self.read_ident(), + _c @ 'a'..='z' => self.read_keyword_or_ident(), // keywords are always lowercase + c => self.syntax_error(format!("Unexpected character '{}'", c)), + }) + } +} + +impl<'a> Lexer<'a> { + pub fn new(stream: Chars<'a>) -> Lexer<'a> { + Lexer { + cursor: Cursor::new(stream), + token_line: 1, + token_col: 1, + } + } + + fn read_keyword_or_ident(&mut self) -> Result { + let current = self.cursor.current().unwrap(); + for kw in &KEYWORDS { + // keywords are always at least 2 characters long as per the language spec + let first_char = kw.raw.chars().next().unwrap(); + if current == first_char && self.skip_if_match(&kw.raw[1..]) { + return self.token_ok(kw.kind); + } + } + + self.read_ident() + } + + fn read_ident(&mut self) -> Result { + for c in &mut self.cursor { + if !c.is_ascii_alphanumeric() && c != '_' { + self.cursor.prev(); + break; + } + } + + self.token_ok(token::Kind::Ident) + } + + fn read_comment(&mut self) -> Result { + self.cursor.seek_while(|c| c != '\n'); + self.token_ok(token::Kind::Comment) + } + + fn read_string_literal(&mut self) -> Result { + assert!(self.cursor.current() == Some('"')); + self.cursor.chop(); + let mut raw = String::new(); + for c in &mut self.cursor { + if c == '"' { + self.cursor.chop(); + return self.token_raw_ok(token::Kind::StringLiteral, raw); + } else { + raw.push(c); + } + } + + self.token_ok(token::Kind::StringLiteral) + } + + fn read_prefix_int_literal(&mut self) -> Result { + assert!(self.cursor.next() == Some('0')); + match self.cursor.next() { + Some('x') => self.read_int_literal(16), + Some('o') => self.read_int_literal(8), + Some('b') => self.read_int_literal(2), + Some(c) => self.syntax_error(format!("Unexpected character '{}'", c)), + None => self.syntax_error(String::from("Unexpected end-of-file")), + } + } + + fn read_int_literal(&mut self, base: usize) -> Result { + assert!(base >= 2 && base <= 16); + + for c in &mut self.cursor { + if !NUMERALS[0..base].contains(&c.to_ascii_lowercase()) { + self.cursor.prev(); + break; + } + } + + self.token_ok(token::Kind::IntLiteral) + } + + fn token(&mut self, kind: token::Kind, raw: String) -> Token { + let t = Token { + kind, + line: self.token_line, + col: self.token_col, + raw: raw, + }; + self.token_line = self.cursor.line(); + self.token_col = self.cursor.col(); + t + } + + fn token_ok(&mut self, kind: token::Kind) -> Result { + let raw = self.cursor.chop(); + Ok(self.token(kind, raw)) + } + + fn token_raw_ok(&mut self, kind: token::Kind, raw: String) -> Result { + Ok(self.token(kind, raw)) + } + + fn skip_if_match(&mut self, s: &str) -> bool { + let mut n: usize = 0; + for c in s.chars() { + if self.cursor.next() == Some(c) { + n += 1; + } else { + self.cursor.seek_back(n); + return false; + } + } + true + } + + fn syntax_error(&mut self, msg: String) -> Result { + Err(SyntaxError { + line: self.cursor.line(), + col: self.cursor.col(), + msg: String::from(msg), + }) + } +} + +#[derive(Debug)] +pub struct SyntaxError { + pub line: usize, + pub col: usize, + pub msg: String, +} + +impl fmt::Display for SyntaxError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "SyntaxError in {}:{}: {}", + self.line, + self.col, + self.msg.as_str() + ) + } +} diff --git a/src/lex/token.rs b/src/lex/token.rs new file mode 100644 index 0000000..ae749c1 --- /dev/null +++ b/src/lex/token.rs @@ -0,0 +1,72 @@ +use std::fmt; + +/// A single syntactic element. +#[derive(Debug)] +pub struct Token { + pub kind: Kind, + /// line of the first character (starting from 1) + pub line: usize, + /// column of the first character (starting from 1) + pub col: usize, + /// raw text + pub raw: String, +} + +impl fmt::Display for Token { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}: \"{}\"", self.kind, self.raw) + } +} + +#[derive(Debug, Clone, Copy)] +pub enum Kind { + Ident, + OBrace, + CBrace, + OBracket, + CBracket, + Eq, + Comma, + Semi, + + DependKeyword, + IncludeKeyword, + ModuleKeyword, + SetKeyword, + SourceKeyword, + TypeKeyword, + + StringLiteral, + IntLiteral, + Comment, +} + +impl fmt::Display for Kind { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{}", + match self { + Kind::Ident => "ident", + Kind::OBrace => "obrace", + Kind::CBrace => "cbrace", + Kind::OBracket => "obracket", + Kind::CBracket => "cbracket", + Kind::Eq => "eq", + Kind::Comma => "comma", + Kind::Semi => "semi", + + Kind::DependKeyword => "keyword", + Kind::IncludeKeyword => "keyword", + Kind::ModuleKeyword => "keyword", + Kind::SetKeyword => "keyword", + Kind::SourceKeyword => "keyword", + Kind::TypeKeyword => "keyword", + + Kind::StringLiteral => "string", + Kind::IntLiteral => "int", + Kind::Comment => "comment", + } + ) + } +} diff --git a/src/main.rs b/src/main.rs index e7a11a9..5357465 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,12 @@ +use std::fs; + +mod lex; +use lex::Lexer; + fn main() { - println!("Hello, world!"); + let s = fs::read_to_string("test.gaybuild").unwrap(); + let lexer = Lexer::new(s.chars()); + for token in lexer { + println!("{}", token.unwrap()); + } } diff --git a/test.gaybuild b/test.gaybuild new file mode 100644 index 0000000..c0d3fbf --- /dev/null +++ b/test.gaybuild @@ -0,0 +1,28 @@ +set RUSTC_EXE = "rustc"; +set ASM_EXE = "clang"; +set CC_EXE = "clang"; +set LINK_EXE = "ld.lld"; +set BUILD_PREFIX = "build"; + +module kern { + type exe; + depend [ + libk, + arch, + ]; + source "kern/lib.rs"; +} + +module libk { + type static; # static library + depend arch; + source "libk/lib.rs"; +} + +module arch { + type static; + source [ + "arch/lib.rs", + "arch/**.nasm", + ]; +}