From c7e42449729b24d043b5fdb7425fd4521058b14b Mon Sep 17 00:00:00 2001 From: fef Date: Sat, 23 Jul 2022 20:51:15 +0200 Subject: [PATCH] cursor: refactor to only work on strings --- src/error.rs | 55 ++++++++++++++-------------- src/lex/cursor.rs | 91 +++++++++++++++++++++-------------------------- src/lex/mod.rs | 57 +++++++++++++++++------------ src/lex/token.rs | 12 ++++--- src/main.rs | 2 +- 5 files changed, 109 insertions(+), 108 deletions(-) diff --git a/src/error.rs b/src/error.rs index 1380621..3bcace4 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,27 +1,27 @@ -use std::fmt; -use std::fmt::Formatter; +use std::{fmt, io}; + use crate::ast::tree::Type; +use crate::lex::token::Position; /// This is just a wrapper for the actual error types. /// I have no idea whether this is good design (probably not), /// but idc for now. Shouldn't be too hard to change the API /// later on bc each component of the compiler has its own /// wrappers for instantiating errors anyway. -#[derive(Debug)] pub struct Error { e: Box, } impl Error { - pub fn syntax_error(file: String, line: usize, col: usize, msg: String) -> Error { + pub fn syntax_error(pos: Position, msg: String) -> Error { Error { - e: Box::new(SyntaxError::new(file, line, col, msg)) + e: Box::new(SyntaxError::new(pos, msg)), } } - pub fn type_error(file: String, line: usize, col: usize, expected: Type, actual: Type) -> Error { + pub fn type_error(pos: Position, expected: Type, actual: Type) -> Error { Error { - e: Box::new(TypeError::new(file, line, col, expected, actual)) + e: Box::new(TypeError::new(pos, expected, actual)), } } @@ -56,13 +56,6 @@ trait ErrorDetails { fn name(&self) -> &str; } -#[derive(Debug)] -struct Position { - file: String, - line: usize, - col: usize, -} - #[derive(Debug)] pub struct SyntaxError { pos: Position, @@ -72,31 +65,35 @@ pub struct SyntaxError { #[derive(Debug)] pub struct TypeError { pos: Position, - expected: Type, - actual: Type, + msg: String, } -impl fmt::Display for dyn ErrorDetails { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - write!(f, "{} in {}:{}:{}: {}", self.name(), self.file(), self.line(), self.col(), self.msg()) +impl fmt::Debug for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let e = &self.e; + write!( + f, + "{} in {}:{}:{}: {}", + e.name(), + e.file(), + e.line(), + e.col(), + e.msg() + ) } } impl SyntaxError { - pub fn new(file: String, line: usize, col: usize, msg: String) -> SyntaxError { - SyntaxError { - pos: Position { file, line, col }, - msg, - } + pub fn new(pos: Position, msg: String) -> SyntaxError { + SyntaxError { pos, msg } } } impl TypeError { - pub fn new(file: String, line: usize, col: usize, expected: Type, actual: Type) -> TypeError { + pub fn new(pos: Position, expected: Type, actual: Type) -> TypeError { TypeError { - pos: Position { file, line, col }, - expected, - actual, + pos, + msg: format!("Expected type {}, got {} instead", expected, actual), } } } @@ -137,7 +134,7 @@ impl ErrorDetails for TypeError { } fn msg(&self) -> &String { - &format!("Expected type {}, got {} instead", self.expected, self.actual) + &self.msg } fn name(&self) -> &str { diff --git a/src/lex/cursor.rs b/src/lex/cursor.rs index 5db8271..e89cb3a 100644 --- a/src/lex/cursor.rs +++ b/src/lex/cursor.rs @@ -1,52 +1,41 @@ -use std::str::Chars; - -/// A cursor for iterating over individual characters in a stream. -/// Supports backwards seeking. -pub struct Cursor<'a> { - stream: Chars<'a>, - offset: usize, // where we pull our characters from - history: Vec, // how many chars we are behind the stream position (when seeking back) - line_lengths: Vec, // complete list of all characters we've read so far - line: usize, // length of all previous lines (for seeking back) - col: usize, // current line (starting from 1) - pos: usize, // current column in line (starting from 1) - chop: usize, - current: Option, +/// Convenience helper for iterating over the characters in a string. +/// Supports backwards seeking and tracks line/column numbers. +pub struct Cursor { + raw: Vec, // array of all characters + pos: usize, // index (in `raw`) of the *next* character to be read + line_lengths: Vec, // previous line lengths (for seeking back) + line: usize, // current line (counting from 1) + col: usize, // current column (counting from 1) + chop: usize, // value of `pos` when `chop()` was called the last time + current: Option, // current character } -impl Iterator for Cursor<'_> { +impl Iterator for Cursor { type Item = char; fn next(&mut self) -> Option { - let c = if self.offset > 0 { - let tmp = self.history[self.history.len() - self.offset]; - self.offset -= 1; - tmp - } else { - let tmp = self.stream.next()?; - self.history.push(tmp); - tmp - }; + if self.pos < self.raw.len() { + let c = self.raw[self.pos]; + self.pos += 1; - self.current = Some(c); + if c == '\n' { + self.new_line(); + } else { + self.col += 1; + } - if c == '\n' { - self.new_line(); + self.current = Some(c); + Some(c) } else { - self.col += 1; + None } - self.pos += 1; - - Some(c) } } -impl<'a> Cursor<'a> { - pub fn new(stream: Chars<'a>) -> Cursor<'a> { +impl Cursor { + pub fn new(raw: String) -> Cursor { Cursor { - stream, - offset: 0, - history: Vec::new(), + raw: Vec::from_iter(raw.chars()), line_lengths: Vec::new(), line: 1, col: 0, // increments in first call to next() @@ -58,18 +47,20 @@ impl<'a> Cursor<'a> { /// Reverse the cursor by a single character. pub fn prev(&mut self) -> Option { - if self.history.len() == 0 { - None - } else { - self.offset += 1; - let c = self.history[self.history.len() - self.offset]; + if self.pos > 0 { + self.pos -= 1; + let c = self.raw[self.pos]; + if self.col == 0 { self.prev_line(); } else { self.col -= 1; } - self.pos -= 1; + + self.current = Some(c); Some(c) + } else { + None } } @@ -103,22 +94,20 @@ impl<'a> Cursor<'a> { /// Return the next character without actually advancing the cursor. pub fn peek(&mut self) -> Option { - if self.offset > 0 { - Some(self.history[self.history.len() - self.offset]) - } else { - let c = self.next()?; - self.prev(); - Some(c) - } + let c = self.next()?; + self.prev(); + Some(c) } pub fn skip_whitespace(&mut self) { self.seek_while(|c| c.is_ascii_whitespace()); } + /// Return a string of every character since + /// the last time this method was called. pub fn chop(&mut self) -> String { assert!(self.pos >= self.chop); - let s = String::from_iter(self.history[self.chop..self.pos].into_iter()); + let s = String::from_iter(self.raw[self.chop..self.pos].into_iter()); self.chop = self.pos; s } @@ -149,7 +138,7 @@ impl<'a> Cursor<'a> { fn prev_line(&mut self) { assert!(self.line > 0); - assert!(self.col == 0); + assert_eq!(self.col, 0); self.col = self.line_lengths.pop().unwrap(); self.line -= 1; } diff --git a/src/lex/mod.rs b/src/lex/mod.rs index 1a96540..83beb4d 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -1,16 +1,14 @@ -use std::str::Chars; - mod cursor; use cursor::Cursor; pub(crate) mod token; -use token::Token; +use token::{Position, Token}; use crate::error::Error; -pub struct Lexer<'a> { +pub struct Lexer { file: String, - cursor: Cursor<'a>, + cursor: Cursor, history: Vec, offset: usize, token_line: usize, @@ -27,6 +25,7 @@ struct KeywordMap { } const fn kw(raw: &'static str, kind: token::Kind) -> KeywordMap { + assert!(raw.len() >= 2); KeywordMap { raw, kind } } @@ -39,7 +38,7 @@ static KEYWORDS: [KeywordMap; 6] = [ kw("type", token::Kind::TypeKeyword), ]; -impl Iterator for Lexer<'_> { +impl Iterator for Lexer { type Item = Result; fn next(&mut self) -> Option> { @@ -70,7 +69,8 @@ impl Iterator for Lexer<'_> { '%' => self.token_ok(token::Kind::Percent), '#' => { - self.read_comment().unwrap(); // this can't fail + // this can't fail + self.read_comment().unwrap(); // we don't need comments for now and they would // only confuse the parser so let's just Not self.next()? @@ -90,11 +90,11 @@ impl Iterator for Lexer<'_> { } } -impl<'a> Lexer<'a> { - pub fn new(file: String, stream: Chars<'a>) -> Lexer<'a> { +impl Lexer { + pub fn new(filename: String, raw: String) -> Lexer { Lexer { - file, - cursor: Cursor::new(stream), + file: filename, + cursor: Cursor::new(raw), history: Vec::new(), offset: 0, token_line: 1, @@ -123,12 +123,18 @@ impl<'a> Lexer<'a> { } pub fn expect_kind(&mut self, kind: token::Kind) -> Result { + self.expect_kinds(&[kind]) + } + + pub fn expect_kinds(&mut self, kinds: &[token::Kind]) -> Result { match self.next() { - Some(Ok(t)) => if t.kind == kind { - Ok(t) - } else { - self.syntax_error(format!("Expected {}, got {}", kind, t.kind)) - }, + Some(Ok(t)) => { + if kinds.contains(&t.kind) { + Ok(t) + } else { + self.syntax_error(format!("Expected one of {:?}, got {}", kinds, t.kind)) + } + } Some(Err(e)) => Err(e), None => self.syntax_error(String::from("Unexpected EOF")), } @@ -189,7 +195,7 @@ impl<'a> Lexer<'a> { } fn read_prefix_int_literal(&mut self) -> Result { - assert_eq!(self.cursor.next(), Some('0')); + assert_eq!(self.cursor.current(), Some('0')); match self.cursor.next() { Some('x') => self.read_int_literal(16), Some('o') => self.read_int_literal(8), @@ -215,8 +221,11 @@ impl<'a> Lexer<'a> { fn token(&mut self, kind: token::Kind, raw: String) -> Token { let t = Token { kind, - line: self.token_line, - col: self.token_col, + pos: Position { + file: self.file.clone(), + line: self.token_line, + col: self.token_col, + }, raw, }; self.token_line = self.cursor.line(); @@ -248,10 +257,12 @@ impl<'a> Lexer<'a> { fn syntax_error(&mut self, msg: String) -> Result { Err(Error::syntax_error( - self.file.clone(), - self.cursor.line(), - self.cursor.col(), - msg + Position { + file: self.file.clone(), + line: self.cursor.line(), + col: self.cursor.col(), + }, + msg, )) } } diff --git a/src/lex/token.rs b/src/lex/token.rs index 4b9beda..dd1f35d 100644 --- a/src/lex/token.rs +++ b/src/lex/token.rs @@ -4,14 +4,18 @@ use std::fmt; #[derive(Debug, Clone)] pub struct Token { pub kind: Kind, - /// line of the first character (starting from 1) - pub line: usize, - /// column of the first character (starting from 1) - pub col: usize, + pub pos: Position, /// raw text pub raw: String, } +#[derive(Debug, Clone)] +pub struct Position { + pub file: String, + pub line: usize, + pub col: usize, +} + impl fmt::Display for Token { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "{}: \"{}\"", self.kind, self.raw) diff --git a/src/main.rs b/src/main.rs index 4abc633..fd3686d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,7 +7,7 @@ mod error; fn main() { let s = fs::read_to_string("test.gaybuild").unwrap(); - let lexer = Lexer::new(String::from("test.gaybuild"), s.chars()); + let lexer = Lexer::new(String::from("test.gaybuild"), s); for token in lexer { println!("{}", token.unwrap()); }