add lexer and test file

There will most likely be a lot of changes until we reach the final syntax, but this is a good start, I think.
2022-07-10 21:56:00 +02:00 · 2022-07-10 21:56:00 +02:00 · 8ae3bb2f57
commit 8ae3bb2f57
parent 0557369397
6 changed files with 365 additions and 62 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -2,6 +2,5 @@
 name = "gaybuild"
 version = "0.1.0"
 edition = "2021"
-author = "anna <owo@fef.moe>"

 [dependencies]
--- a/src/lex/cursor.rs
+++ b/src/lex/cursor.rs
@ -3,12 +3,42 @@ use std::str::Chars;
 /// A cursor for iterating over individual characters in a stream.
 /// Supports backwards seeking.
 pub struct Cursor<'a> {
-    stream: Chars<'a>, // where we pull our characters from
-    offset: usize, // how many chars we are behind the stream position (when seeking back)
-    history: Vec<char>, // complete list of all characters we've read so far
-    line_lengths: Vec<usize>, // length of all previous lines (for seeking back)
-    line: usize, // current line (starting from 1)
-    col: usize, // current column in line (starting from 1)
+    stream: Chars<'a>, // where we pull our characters from
+    offset: usize,            // how many chars we are behind the stream position (when seeking back)
+    history: Vec<char>, // complete list of all characters we've read so far
+    line_lengths: Vec<usize>, // length of all previous lines (for seeking back)
+    line: usize,        // current line (starting from 1)
+    col: usize,         // current column in line (starting from 1)
+    pos: usize,         // position in `history` (bumped by next(), decremented by prev())
+    chop: usize, // value of `pos` at the last chop(); start of the next chopped string
+    current: Option<char>, // character most recently returned by next() (None before the first call)
+}
+
+impl Iterator for Cursor<'_> {
+    type Item = char;
+
+    /// Advance the cursor by a single character.
+    ///
+    /// Characters are replayed from `history` while the cursor is behind the
+    /// stream (`offset > 0`); otherwise a fresh character is pulled from the
+    /// stream and recorded. Returns `None` once the stream is exhausted.
+    fn next(&mut self) -> Option<char> {
+        let ch = match self.offset {
+            // caught up with the stream: pull a new character and remember it
+            0 => {
+                let fresh = self.stream.next()?;
+                self.history.push(fresh);
+                fresh
+            }
+            // seeked back earlier: replay the character we are `behind` chars away from
+            behind => {
+                let replayed = self.history[self.history.len() - behind];
+                self.offset = behind - 1;
+                replayed
+            }
+        };
+
+        self.current = Some(ch);
+
+        match ch {
+            '\n' => self.new_line(),
+            _ => self.col += 1,
+        }
+        self.pos += 1;
+
+        Some(ch)
+    }
 }

 impl<'a> Cursor<'a> {
@ -20,29 +50,12 @@ impl<'a> Cursor<'a> {
            line_lengths: Vec::new(),
            line: 1,
            col: 0, // increments in first call to next()
+            pos: 0,
+            chop: 0,
+            current: None,
        }
    }

-    /// Advance the cursor by a single character.
-    pub fn next(&mut self) -> Option<char> {
-        let c = if self.offset > 0 {
-            self.offset -= 1;
-            self.history[self.history.len() - self.offset]
-        } else {
-            let tmp = self.stream.next()?;
-            self.history.push(tmp);
-            tmp
-        };
-
-        if c == '\n' {
-            self.new_line();
-        } else {
-            self.col += 1;
-        }
-
-        Some(c)
-    }
-
    /// Reverse the cursor by a single character.
    pub fn prev(&mut self) -> Option<char> {
        if self.history.len() == 0 {
@ -55,23 +68,11 @@ impl<'a> Cursor<'a> {
            } else {
                self.col -= 1;
            }
+            self.pos -= 1;
            Some(c)
        }
    }

-    /// Seek forward and return all characters that were encountered.
-    pub fn seek(&mut self, n: usize) -> Vec<char> {
-        // TODO: implement this properly
-        let mut v = Vec::with_capacity(n);
-        for _ in 0..n {
-            match self.next() {
-                Some(c) => v.push(c),
-                None => break,
-            }
-        }
-        v
-    }
-
    /// Seek backward and return all characters that were encountered.
    pub fn seek_back(&mut self, n: usize) -> Vec<char> {
        // TODO: implement this properly as well
@ -87,7 +88,7 @@ impl<'a> Cursor<'a> {
    }

    /// Seek forward until the `test` callback returns false.
-    pub fn seek_until(&mut self, test: fn(c: char) -> bool) -> Vec<char> {
+    pub fn seek_while(&mut self, test: fn(c: char) -> bool) -> Vec<char> {
        let mut v = Vec::new();
        while let Some(c) = self.peek() {
            if test(c) {
@ -100,21 +101,6 @@ impl<'a> Cursor<'a> {
        v
    }

-    /// Seek backward until the test callback returns false.
-    pub fn seek_back_until(&mut self, test: fn(c: char) -> bool) -> Vec<char> {
-        let mut v = Vec::new();
-        while let Some(c) = self.peek_back() {
-            if test(c) {
-                v.push(c);
-                self.prev();
-            } else {
-                break;
-            }
-        }
-        v.reverse(); // TODO: again, probably not ideal
-        v
-    }
-
    /// Return the next character without actually advancing the cursor.
    pub fn peek(&mut self) -> Option<char> {
        if self.offset > 0 {
@ -126,13 +112,15 @@ impl<'a> Cursor<'a> {
        }
    }

-    /// Return the previous character without actually reversing the cursor.
-    pub fn peek_back(&self) -> Option<char> {
-        self.history.last().and_then(|c| Some(*c))
+    pub fn skip_whitespace(&mut self) { // seek forward over a run of ASCII whitespace
+        self.seek_while(|c| c.is_ascii_whitespace());
    }

-    pub fn skip_whitespace(&mut self) {
-        self.seek_until(|c| !c.is_ascii_whitespace());
+    /// Return the characters between the previous chop mark and the current
+    /// position as a `String`, then move the chop mark up to the current position.
+    ///
+    /// Panics if the cursor has been moved back behind the chop mark.
+    pub fn chop(&mut self) -> String {
+        let (start, end) = (self.chop, self.pos);
+        assert!(end >= start);
+        let chopped: String = self.history[start..end].iter().collect();
+        self.chop = end;
+        chopped
    }

    /// Return the line number (starting from 1) of the last
@ -149,6 +137,10 @@ impl<'a> Cursor<'a> {
        self.col
    }

+    pub fn current(&self) -> Option<char> { // character most recently returned by next(); None before the first call
+        self.current
+    }
+
    fn new_line(&mut self) {
        self.line_lengths.push(self.col);
        self.col = 0;
--- a/src/lex/mod.rs
+++ b/src/lex/mod.rs
@ -1 +1,204 @@
+use std::fmt;
+use std::str::Chars;
+
 mod cursor;
+use cursor::Cursor;
+
+mod token;
+use token::Token;
+
+pub struct Lexer<'a> { // yields `Result<Token, SyntaxError>` items through its `Iterator` impl
+    cursor: Cursor<'a>, // character source the tokens are read from
+    token_line: usize, // line stamped onto the next token built by token()
+    token_col: usize, // column stamped onto the next token built by token()
+}
+
+static NUMERALS: [char; 16] = [ // digits in ascending value; read_int_literal accepts the first `base` entries
+    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
+];
+
+struct KeywordMap { // one entry of the KEYWORDS table
+    raw: &'static str, // keyword spelling as it appears in the source text
+    kind: token::Kind, // token kind emitted when `raw` is matched
+}
+
+/// Shorthand for building one `KeywordMap` entry of the `KEYWORDS` table.
+const fn kw(raw: &'static str, kind: token::Kind) -> KeywordMap {
+    KeywordMap { kind, raw }
+}
+
+static KEYWORDS: [KeywordMap; 6] = [ // tried in this order by read_keyword_or_ident
+    kw("depend", token::Kind::DependKeyword),
+    kw("include", token::Kind::IncludeKeyword),
+    kw("module", token::Kind::ModuleKeyword),
+    kw("set", token::Kind::SetKeyword),
+    kw("source", token::Kind::SourceKeyword),
+    kw("type", token::Kind::TypeKeyword),
+];
+
+impl Iterator for Lexer<'_> {
+    type Item = Result<Token, SyntaxError>;
+
+    /// Read the next token from the input.
+    ///
+    /// Leading ASCII whitespace is skipped and discarded. Returns `None` at
+    /// the end of the input, `Some(Err(_))` when the first character of a
+    /// token cannot start any known token, and `Some(Ok(token))` otherwise.
+    fn next(&mut self) -> Option<Result<Token, SyntaxError>> {
+        let first = self.cursor.next()?;
+
+        if first.is_ascii_whitespace() {
+            // drop the whole whitespace run so it does not end up in the next token's raw text
+            self.cursor.skip_whitespace();
+            self.cursor.chop();
+            return self.next();
+        }
+
+        // Stamp the token with the position of its first character. Without
+        // this the token would carry the position where the previous token
+        // ended, which is wrong whenever whitespace or a newline sits in between.
+        // NOTE(review): assumes Cursor::line()/col() describe the character just read.
+        self.token_line = self.cursor.line();
+        self.token_col = self.cursor.col();
+
+        Some(match first {
+            ',' => self.token_ok(token::Kind::Comma),
+            ';' => self.token_ok(token::Kind::Semi),
+            '=' => self.token_ok(token::Kind::Eq),
+            '{' => self.token_ok(token::Kind::OBrace),
+            '}' => self.token_ok(token::Kind::CBrace),
+            '[' => self.token_ok(token::Kind::OBracket),
+            ']' => self.token_ok(token::Kind::CBracket),
+            '#' => self.read_comment(),
+            '"' => self.read_string_literal(),
+            '0' => self.read_prefix_int_literal(),
+            '1'..='9' => self.read_int_literal(10),
+            'A'..='Z' => self.read_ident(),
+            'a'..='z' => self.read_keyword_or_ident(), // keywords are always lowercase
+            c => self.syntax_error(format!("Unexpected character '{}'", c)),
+        })
+    }
+}
+
+impl<'a> Lexer<'a> {
+    /// Create a lexer that reads from `stream`.
+    pub fn new(stream: Chars<'a>) -> Lexer<'a> {
+        Lexer {
+            cursor: Cursor::new(stream),
+            token_line: 1,
+            token_col: 1,
+        }
+    }
+
+    /// Whether `c` may appear inside an identifier (ASCII letter, digit or `_`).
+    fn is_ident_char(c: char) -> bool {
+        c.is_ascii_alphanumeric() || c == '_'
+    }
+
+    /// Lex a word whose first (lowercase) character has already been consumed.
+    ///
+    /// Emits a keyword token if the whole word is one of `KEYWORDS`, and an
+    /// identifier otherwise.
+    fn read_keyword_or_ident(&mut self) -> Result<Token, SyntaxError> {
+        let current = self.cursor.current().unwrap();
+        for kw in &KEYWORDS {
+            // keywords are always at least 2 characters long as per the language spec
+            let mut chars = kw.raw.chars();
+            let first_char = chars.next().unwrap();
+            let rest = chars.as_str();
+            if current != first_char || !self.skip_if_match(rest) {
+                continue;
+            }
+
+            // Only a keyword if the word ends here; otherwise something like
+            // `settings` would be split into the keyword `set` and `tings`.
+            let word_continues =
+                matches!(self.cursor.peek(), Some(c) if Self::is_ident_char(c));
+            if !word_continues {
+                return self.token_ok(kw.kind);
+            }
+            // undo the match so the remaining keywords and read_ident start after the first char
+            self.cursor.seek_back(rest.chars().count());
+        }
+
+        self.read_ident()
+    }
+
+    /// Consume identifier characters up to (not including) the first
+    /// character that cannot be part of an identifier, and emit an `Ident`.
+    fn read_ident(&mut self) -> Result<Token, SyntaxError> {
+        while let Some(c) = self.cursor.next() {
+            if !Self::is_ident_char(c) {
+                // that character belongs to the next token, give it back
+                self.cursor.prev();
+                break;
+            }
+        }
+
+        self.token_ok(token::Kind::Ident)
+    }
+
+    /// Consume everything up to (not including) the next newline and emit a `Comment`.
+    fn read_comment(&mut self) -> Result<Token, SyntaxError> {
+        self.cursor.seek_while(|c| c != '\n');
+        self.token_ok(token::Kind::Comment)
+    }
+
+    /// Lex a string literal whose opening `"` has already been consumed.
+    ///
+    /// The token's raw text is the literal's contents without the quotes.
+    /// Returns a `SyntaxError` if the input ends before the closing quote.
+    fn read_string_literal(&mut self) -> Result<Token, SyntaxError> {
+        assert!(self.cursor.current() == Some('"'));
+        self.cursor.chop(); // discard the opening quote
+        let mut raw = String::new();
+        while let Some(c) = self.cursor.next() {
+            if c == '"' {
+                self.cursor.chop(); // discard contents + closing quote, `raw` already holds the text
+                return self.token_raw_ok(token::Kind::StringLiteral, raw);
+            }
+            raw.push(c);
+        }
+
+        // ran out of input without seeing the closing quote
+        self.syntax_error(String::from("Unterminated string literal"))
+    }
+
+    /// Lex an integer literal that starts with `0`, which has already been consumed.
+    ///
+    /// The `0` must be followed by a base prefix: `x` (16), `o` (8) or `b` (2).
+    /// NOTE(review): a bare `0` is therefore rejected, and a prefix followed by
+    /// no digits is accepted; confirm against the language spec.
+    fn read_prefix_int_literal(&mut self) -> Result<Token, SyntaxError> {
+        // the caller already consumed the '0'; check it without reading another character
+        assert!(self.cursor.current() == Some('0'));
+        match self.cursor.next() {
+            Some('x') => self.read_int_literal(16),
+            Some('o') => self.read_int_literal(8),
+            Some('b') => self.read_int_literal(2),
+            Some(c) => self.syntax_error(format!("Unexpected character '{}'", c)),
+            None => self.syntax_error(String::from("Unexpected end-of-file")),
+        }
+    }
+
+    /// Consume digits valid in `base` (2 to 16, case-insensitive) and emit an `IntLiteral`.
+    fn read_int_literal(&mut self, base: usize) -> Result<Token, SyntaxError> {
+        assert!(base >= 2 && base <= 16);
+
+        while let Some(c) = self.cursor.next() {
+            if !NUMERALS[0..base].contains(&c.to_ascii_lowercase()) {
+                // not a digit of this base: it belongs to the next token
+                self.cursor.prev();
+                break;
+            }
+        }
+
+        self.token_ok(token::Kind::IntLiteral)
+    }
+
+    /// Build a token of `kind` with text `raw`, stamped with the stored
+    /// token position, then update the stored position from the cursor.
+    fn token(&mut self, kind: token::Kind, raw: String) -> Token {
+        let t = Token {
+            kind,
+            line: self.token_line,
+            col: self.token_col,
+            raw,
+        };
+        self.token_line = self.cursor.line();
+        self.token_col = self.cursor.col();
+        t
+    }
+
+    /// Emit a token of `kind` whose raw text is everything consumed since the last chop.
+    fn token_ok<T>(&mut self, kind: token::Kind) -> Result<Token, T> {
+        let raw = self.cursor.chop();
+        Ok(self.token(kind, raw))
+    }
+
+    /// Emit a token of `kind` with an explicitly supplied raw text.
+    fn token_raw_ok<T>(&mut self, kind: token::Kind, raw: String) -> Result<Token, T> {
+        Ok(self.token(kind, raw))
+    }
+
+    /// Consume `s` if the upcoming characters spell exactly `s`.
+    ///
+    /// Returns `true` with the cursor placed after `s` on a match. On a
+    /// mismatch returns `false` and leaves the cursor where it started.
+    fn skip_if_match(&mut self, s: &str) -> bool {
+        let mut n: usize = 0;
+        for c in s.chars() {
+            // peek first so a mismatching character is never consumed
+            if self.cursor.peek() != Some(c) {
+                self.cursor.seek_back(n);
+                return false;
+            }
+            self.cursor.next();
+            n += 1;
+        }
+        true
+    }
+
+    /// Build an `Err(SyntaxError)` carrying `msg` and the cursor's current position.
+    fn syntax_error<T>(&mut self, msg: String) -> Result<T, SyntaxError> {
+        Err(SyntaxError {
+            line: self.cursor.line(),
+            col: self.cursor.col(),
+            msg,
+        })
+    }
+}
+
+#[derive(Debug)]
+pub struct SyntaxError { // error produced by the lexer when it cannot form a token
+    pub line: usize, // cursor line at the time the error was raised
+    pub col: usize, // cursor column at the time the error was raised
+    pub msg: String, // human-readable description
+}
+
+impl fmt::Display for SyntaxError {
+    /// Format as `SyntaxError in <line>:<col>: <msg>`.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let SyntaxError { line, col, msg } = self;
+        write!(f, "SyntaxError in {}:{}: {}", line, col, msg)
+    }
+}
--- a/src/lex/token.rs
+++ b/src/lex/token.rs
@ -0,0 +1,72 @@
+use std::fmt;
+
+/// A single syntactic element.
+#[derive(Debug)]
+pub struct Token {
+    pub kind: Kind, // which category of token this is
+    /// line of the first character (starting from 1)
+    pub line: usize,
+    /// column of the first character (starting from 1)
+    pub col: usize,
+    /// raw text (for string literals the lexer stores the contents without the surrounding quotes)
+    pub raw: String,
+}
+
+impl fmt::Display for Token {
+    /// Format as `<kind>: "<raw>"`.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let Token { kind, raw, .. } = self;
+        write!(f, "{}: \"{}\"", kind, raw)
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub enum Kind { // category of a Token
+    Ident,
+    OBrace, // `{`
+    CBrace, // `}`
+    OBracket, // `[`
+    CBracket, // `]`
+    Eq, // `=`
+    Comma, // `,`
+    Semi, // `;`
+
+    DependKeyword, // `depend`
+    IncludeKeyword, // `include`
+    ModuleKeyword, // `module`
+    SetKeyword, // `set`
+    SourceKeyword, // `source`
+    TypeKeyword, // `type`
+
+    StringLiteral, // text enclosed in double quotes
+    IntLiteral, // decimal, or `0x` / `0o` / `0b` prefixed
+    Comment, // `#` up to the end of the line
+}
+
+impl fmt::Display for Kind {
+    /// Write the lowercase category name of this kind; every keyword variant
+    /// is printed as `keyword`.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let name = match self {
+            Kind::Ident => "ident",
+            Kind::OBrace => "obrace",
+            Kind::CBrace => "cbrace",
+            Kind::OBracket => "obracket",
+            Kind::CBracket => "cbracket",
+            Kind::Eq => "eq",
+            Kind::Comma => "comma",
+            Kind::Semi => "semi",
+
+            Kind::DependKeyword
+            | Kind::IncludeKeyword
+            | Kind::ModuleKeyword
+            | Kind::SetKeyword
+            | Kind::SourceKeyword
+            | Kind::TypeKeyword => "keyword",
+
+            Kind::StringLiteral => "string",
+            Kind::IntLiteral => "int",
+            Kind::Comment => "comment",
+        };
+        f.write_str(name)
+    }
+}
--- a/src/main.rs
+++ b/src/main.rs
@ -1,3 +1,12 @@
+use std::fs;
+
+mod lex;
+use lex::Lexer;
+
 fn main() {
-    println!("Hello, world!");
+    let s = fs::read_to_string("test.gaybuild").unwrap(); // panics if the file cannot be read
+    let lexer = Lexer::new(s.chars());
+    for token in lexer { // print one token per line
+        println!("{}", token.unwrap()); // panics on the first SyntaxError
+    }
 }
--- a/test.gaybuild
+++ b/test.gaybuild
@ -0,0 +1,28 @@
+set RUSTC_EXE = "rustc";
+set ASM_EXE = "clang";
+set CC_EXE = "clang";
+set LINK_EXE = "ld.lld";
+set BUILD_PREFIX = "build";
+
+module kern {
+    type exe;
+    depend [
+        libk,
+        arch,
+    ];
+    source "kern/lib.rs";
+}
+
+module libk {
+    type static; # static library
+    depend arch;
+    source "libk/lib.rs";
+}
+
+module arch {
+    type static;
+    source [
+        "arch/lib.rs",
+        "arch/**.nasm",
+    ];
+}