From d8422261f6df4b3e6f82998d3c9f9f3c3f44e714 Mon Sep 17 00:00:00 2001
From: Nareshkumar Rao
Date: Tue, 27 May 2025 23:46:52 +0200
Subject: [PATCH] wip

---
 .gitignore    |   1 +
 Cargo.lock    |  16 ++++
 Cargo.toml    |   7 ++
 src/ast.rs    |  23 ++++++
 src/format.rs |  51 +++++++++++++
 src/lexer.rs  | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.rs   |  40 ++++++++++
 src/parse.rs  |  13 ++++
 src/token.rs  |  71 ++++++++++++++++++
 9 files changed, 420 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 src/ast.rs
 create mode 100644 src/format.rs
 create mode 100644 src/lexer.rs
 create mode 100644 src/main.rs
 create mode 100644 src/parse.rs
 create mode 100644 src/token.rs

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ea8c4bf
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/target
diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..d55859b
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,16 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "anyhow"
+version = "1.0.98"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487"
+
+[[package]]
+name = "ts-parser"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+]
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..83a84b5
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,7 @@
+[package]
+name = "ts-parser"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+anyhow = "1.0.98"
diff --git a/src/ast.rs b/src/ast.rs
new file mode 100644
index 0000000..2d921ef
--- /dev/null
+++ b/src/ast.rs
@@ -0,0 +1,23 @@
+use crate::token::Token;
+
+pub struct Module {
+    pub statements: Vec<Statement>,
+}
+
+pub enum Statement {
+    FunctionDeclaration {
+        name: Token,
+        parameters: Vec<ParameterDeclaration>,
+    },
+    Expression,
+}
+
+pub struct ParameterDeclaration {
+    name: Token,
+    typename: Token,
+}
+
+pub enum Expression {
+    Identifier(Token),
+    FunctionCall {},
+}
diff --git a/src/format.rs b/src/format.rs
new file mode 100644
index 0000000..1449eec
--- /dev/null
+++ b/src/format.rs
@@ -0,0 +1,51 @@
+use crate::token::{CommentKind, KeywordKind, LiteralKind, Number, Token, TokenKind};
+
+pub trait Formatter {
+    fn format(self, options: FormatterOptions) -> anyhow::Result<String>;
+}
+pub struct FormatterOptions {}
+
+impl Formatter for &[Token] {
+    fn format(self, options: FormatterOptions) -> anyhow::Result<String> {
+        let mut result = String::new();
+        for t in self {
+            let kind = &t.kind;
+            let s = match kind {
+                TokenKind::Identifier(i) => i.clone(),
+                TokenKind::Literal(kind) => match kind {
+                    LiteralKind::String(s) => {
+                        format!("\"{s}\"")
+                    }
+                    LiteralKind::Number(number) => match number {
+                        Number::Integer(i) => format!("{i}"),
+                        Number::Float(f) => format!("{f}"),
+                    },
+                },
+                TokenKind::Keyword(kind) => format!(
+                    "{}",
+                    match kind {
+                        KeywordKind::function => "function ",
+                        KeywordKind::string => "string",
+                        KeywordKind::number => "number",
+                    }
+                ),
+                TokenKind::Comment(kind, s) => match kind {
+                    CommentKind::Line => format!("// {s}"),
+                    CommentKind::Block => format!("/* {s} */"),
+                },
+                TokenKind::LeftParen => "(".to_string(),
+                TokenKind::RightParen => ")".to_string(),
+                TokenKind::LeftCurly => " {".to_string(),
+                TokenKind::RightCurly => "}".to_string(),
+                TokenKind::Comma => ", ".to_string(),
+                TokenKind::Colon => ": ".to_string(),
+                TokenKind::Semicolon => ";".to_string(),
+                TokenKind::Period => ".".to_string(),
+                TokenKind::NewLine => "\n".to_string(),
+                TokenKind::EndOfFile => "".to_string(),
+            };
+            result += &s;
+        }
+        Ok(result)
+    }
+}
diff --git a/src/lexer.rs b/src/lexer.rs
new file mode 100644
index 0000000..4e70ba0
--- /dev/null
+++ b/src/lexer.rs
@@ -0,0 +1,198 @@
+use anyhow::bail;
+
+use crate::token::{CommentKind, KeywordKind, LiteralKind, Token, TokenKind, TokenLocation};
+
+pub struct Lexer {
+    file: Option<String>,
+    source: Vec<char>,
+    tokens: Vec<Token>,
+    line: usize,
+    current_line_offset: usize,
+    start: usize,
+    current: usize,
+}
+
+impl Lexer {
+    pub fn new(source: &str, file: Option<String>) -> Lexer {
+        Lexer {
+            source: source.chars().collect::<Vec<char>>(),
+            tokens: Vec::new(),
+            line: 1,
+            start: 0,
+            current: 0,
+            file,
+            current_line_offset: 0,
+        }
+    }
+
+    pub fn lex(mut self) -> Vec<Token> {
+        while self.current < self.source.len() {
+            self.start = self.current;
+            self.next_token();
+        }
+        self.clean_newlines();
+        self.tokens
+    }
+
+    fn next_token(&mut self) {
+        let c = self.consume();
+
+        let t = match c {
+            '(' => Some(TokenKind::LeftParen),
+            ')' => Some(TokenKind::RightParen),
+            '{' => Some(TokenKind::LeftCurly),
+            '}' => Some(TokenKind::RightCurly),
+            ',' => Some(TokenKind::Comma),
+            ':' => Some(TokenKind::Colon),
+            ';' => Some(TokenKind::Semicolon),
+            '.' => Some(TokenKind::Period),
+            '\n' => {
+                self.line += 1;
+                self.current_line_offset = self.current;
+                Some(TokenKind::NewLine)
+            }
+            _ => None,
+        };
+
+        if let Some(t) = t {
+            self.push(t);
+            return;
+        }
+
+        if c == '/' {
+            let p = self.peek();
+            let t = match p {
+                '/' => {
+                    while !self.is_eof() {
+                        let c = self.consume();
+                        if c == '\n' {
+                            self.line += 1;
+                            self.current_line_offset = self.current;
+                            break;
+                        }
+                    }
+                    let s = self.current_scan(2, 0);
+                    TokenKind::Comment(CommentKind::Line, s)
+                }
+                '*' => {
+                    // Scan ahead to the closing "*/"; the consume() is
+                    // required here, otherwise this loop never advances.
+                    while !self.is_eof() && !self.peek_match("*/") {
+                        self.consume();
+                    }
+                    if self.is_eof() {
+                        todo!("Expected */ before EOF");
+                    }
+                    self.current += 2;
+                    let s = self.current_scan(2, 2);
+                    TokenKind::Comment(CommentKind::Block, s)
+                }
+                _ => todo!("forward slash"),
+            };
+            self.push(t);
+        }
+
+        if c == '"' {
+            while !self.is_eof() {
+                let c = self.consume();
+                match c {
+                    '"' => break,
+                    '\n' => todo!("Expected closing string before new line"),
+                    _ => (),
+                }
+            }
+            if self.is_eof() {
+                todo!("Expected closing string before EOL")
+            }
+            let s = self.current_scan(1, 1);
+            self.push(TokenKind::Literal(LiteralKind::String(s)));
+            return;
+        }
+
+        if c.is_ascii_alphabetic() {
+            while !self.is_eof() {
+                let p = self.peek();
+                if p.is_alphanumeric() || p == '_' {
+                    self.consume();
+                } else {
+                    break;
+                }
+            }
+            if self.is_eof() {
+                todo!("Not sure if handling is necessary")
+            }
+            let s = self.current_scan(0, 0);
+            if let Ok(k) = TryInto::<KeywordKind>::try_into(s.as_str()) {
+                self.push(TokenKind::Keyword(k));
+                return;
+            }
+            self.push(TokenKind::Identifier(s));
+            return;
+        }
+    }
+
+    fn clean_newlines(&mut self) {
+        while let Some(TokenKind::NewLine) = self.tokens.first().map(|t| &t.kind) {
+            self.tokens.remove(0);
+        }
+
+        let mut i = 0;
+        loop {
+            let w = self
+                .tokens
+                .get(i..(i + 3))
+                .map(|ts| ts.iter().map(|t| &t.kind).collect::<Vec<_>>());
+            match w.as_deref() {
+                Some([TokenKind::NewLine, TokenKind::NewLine, TokenKind::NewLine]) => {
+                    self.tokens.remove(i + 2);
+                }
+                Some(_) => {
+                    i += 1;
+                }
+                None => break,
+            }
+        }
+    }
+
+    fn current_scan(&self, start_offset: usize, end_offset: usize) -> String {
+        self.source[(self.start + start_offset)..(self.current - end_offset)]
+            .iter()
+            .collect::<String>()
+    }
+
+    fn push(&mut self, kind: TokenKind) {
+        self.tokens.push(Token {
+            kind,
+            location: TokenLocation {
+                file: self.file.clone(),
+                line: self.line,
+                column: self.start.saturating_sub(self.current_line_offset),
+            },
+        });
+    }
+
+    fn peek(&self) -> char {
+        self.source[self.current]
+    }
+    fn peek_n(&self, n: usize) -> Option<char> {
+        self.source.get(self.current + n).copied()
+    }
+    fn peek_match(&self, m: &str) -> bool {
+        let c = self.current;
+        let s = self
+            .source
+            .get(c..(c + m.len()))
+            .map(|s| s.iter().collect::<String>());
+        if let Some(s) = s { s == m } else { false }
+    }
+
+    fn consume(&mut self) -> char {
+        let c = self.source[self.current];
+        self.current += 1;
+        c
+    }
+
+    fn is_eof(&self) -> bool {
+        self.current == self.source.len()
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..0345762
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,40 @@
+use token::Token;
+
+mod ast;
+mod format;
+mod lexer;
+mod parse;
+mod token;
+
+fn main() {
+    println!("Hello, world!");
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{
+        format::Formatter,
+        lexer::{self, Lexer},
+    };
+
+    const BASIC: &str = r#"
+function hello(name: string){
+    console.log("Hey, ", name);
+}
+
+
+console.log("Starting!");
+
+hello();
+"#;
+    #[test]
+    fn lex() {
+        println!("Running lex");
+        let lexer = Lexer::new(BASIC, Some("basic.file".to_string()));
+        let tokens = lexer.lex();
+        println!(
+            "{}",
+            tokens.format(crate::format::FormatterOptions {}).unwrap()
+        );
+    }
+}
diff --git a/src/parse.rs b/src/parse.rs
new file mode 100644
index 0000000..1970f90
--- /dev/null
+++ b/src/parse.rs
@@ -0,0 +1,13 @@
+use crate::token::Token;
+
+pub struct Parser {
+    tokens: Vec<Token>,
+}
+
+impl Parser {
+    pub fn new(tokens: Vec<Token>) -> Parser {
+        Self { tokens }
+    }
+
+    fn parse(&mut self) {}
+}
diff --git a/src/token.rs b/src/token.rs
new file mode 100644
index 0000000..51c7e07
--- /dev/null
+++ b/src/token.rs
@@ -0,0 +1,71 @@
+use anyhow::anyhow;
+
+#[derive(Debug)]
+pub struct Token {
+    pub kind: TokenKind,
+    pub location: TokenLocation,
+}
+
+#[derive(Debug)]
+pub enum TokenKind {
+    Identifier(String),
+    Literal(LiteralKind),
+    Keyword(KeywordKind),
+    Comment(CommentKind, String),
+
+    LeftParen,
+    RightParen,
+    LeftCurly,
+    RightCurly,
+    Comma,
+    Colon,
+    Semicolon,
+    Period,
+    NewLine,
+    EndOfFile,
+}
+
+#[derive(Debug)]
+pub enum CommentKind {
+    Line,
+    Block,
+}
+
+#[allow(non_camel_case_types)]
+#[derive(Debug)]
+pub enum KeywordKind {
+    function,
+    string,
+    number,
+}
+
+impl TryFrom<&str> for KeywordKind {
+    type Error = anyhow::Error;
+    fn try_from(value: &str) -> Result<Self, Self::Error> {
+        match value {
+            "function" => Ok(Self::function),
+            "string" => Ok(Self::string),
+            "number" => Ok(Self::number),
+            _ => Err(anyhow!("unknown keyword")),
+        }
+    }
+}
+
+#[derive(Debug)]
+pub enum LiteralKind {
+    String(String),
+    Number(Number),
+}
+
+#[derive(Debug)]
+pub enum Number {
+    Integer(usize),
+    Float(f64),
+}
+
+#[derive(Debug)]
+pub struct TokenLocation {
+    pub file: Option<String>,
+    pub line: usize,
+    pub column: usize,
+}
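
Reviewer note, not part of the patch: a minimal sketch of how the pieces above
fit together, assuming it is pasted into the `tests` module in src/main.rs
alongside the existing `lex` test. The test name, the sample source, and the
"greet.ts" label are made up for illustration; the sample deliberately uses
only tokens the WIP lexer already handles (keywords, identifiers, strings,
punctuation, newlines).

    #[test]
    fn roundtrip_sketch() {
        use crate::format::{Formatter, FormatterOptions};
        use crate::lexer::Lexer;

        // Lex a small snippet into a token stream.
        let source = "function greet(name: string){\nconsole.log(\"hi\", name);\n}\n";
        let tokens = Lexer::new(source, Some("greet.ts".to_string())).lex();

        // Print the tokens back out through the formatter and check that a
        // few landmarks survived the round trip. Keyword formatting adds the
        // trailing space after "function"; string literals get re-quoted.
        let out = tokens.format(FormatterOptions {}).unwrap();
        assert!(out.contains("function greet("));
        assert!(out.contains("\"hi\""));
    }

Since `Formatter::format` is implemented for `&[Token]` and takes `self` by
value, calling `tokens.format(...)` on a `Vec<Token>` works through deref, the
same way the existing `lex` test does it.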