From 4a241b3c38b37eda072aec180fc2d72ad7a7199e Mon Sep 17 00:00:00 2001
From: hardglitch <49201692+hardglitch@users.noreply.github.com>
Date: Mon, 6 Oct 2025 12:12:57 -0700
Subject: [PATCH 1/5] Fixed lexer::scan()

---
 src/lexer.rs | 35 +++++++++++++----------------------
 1 file changed, 13 insertions(+), 22 deletions(-)

diff --git a/src/lexer.rs b/src/lexer.rs
index 4032b69..800d473 100644
--- a/src/lexer.rs
+++ b/src/lexer.rs
@@ -42,50 +42,41 @@ pub struct Lexer;
 impl Lexer {
     pub fn scan(src: &str) -> Result<Vec<Token>, LexerError> {
         let src = src.trim(); // Sanitize src : Trim the leading whitespaces
-
-        let mut tokens: Vec<Token> = vec![];
+
+        let mut tokens: Vec<Token> = Vec::new();
         let mut slice_start_index = 0;
-        let mut current_index = 0;
         let mut previous_char = ' ';
-
-        // This is faster than using an iterator
-        let len = src.len();
-        let bytes = src.as_bytes();
-        let mut i = 0;
-        while i < len {
-            let c = bytes[i] as char;
-            i += 1;
-
+
+        // Работаем с безопасными UTF-8 индексами
+        for (current_index, c) in src.char_indices() {
             match c {
                 // TODO: Handle char over code 127 for escaped chars
                 // Handle Escaped chars : "\" + any charcode below 127
                 '{' | '}' | '\\' | '\n' if previous_char == '\\' => {}
                 '{' | '}' | '\\' | '\n' => {
-                    // End of slice chars
                     if slice_start_index < current_index {
-                        // Close slice
+                        // Выделяем корректный UTF-8 срез
                         let slice = &src[slice_start_index..current_index];
-                        // Get the corresponding token(s)
                         let slice_tokens = Self::tokenize(slice)?;
-                        tokens.extend_from_slice(&slice_tokens.as_slice());
+                        tokens.extend_from_slice(&slice_tokens);
                         slice_start_index = current_index;
                     }
                 }
-                // Others chars
                 _ => {}
             }
-            current_index += 1;
             previous_char = c;
         }
-        // Manage last token (should always be "}")
-        if slice_start_index < current_index {
-            let slice = &src[slice_start_index..current_index];
+
+        // Обработка последнего токена
+        if slice_start_index < src.len() {
+            let slice = &src[slice_start_index..];
             if slice != "}" {
                 return Err(LexerError::InvalidLastChar);
             }
             tokens.push(Token::ClosingBracket);
         }
-        return Ok(tokens);
+
+        Ok(tokens)
     }
 
     /// Get a string slice cut but the scanner and return the coreesponding token(s)
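
A note on the scan() change above: char_indices() yields byte offsets that
are guaranteed to fall on UTF-8 character boundaries, whereas the removed
loop advanced one byte at a time and cast each byte to char. A minimal
standalone sketch of the difference (illustrative only, not part of the
patch):

    fn main() {
        let src = "abé{"; // 'é' is two bytes in UTF-8
        // New approach: every offset from char_indices() is safe to slice at.
        for (i, c) in src.char_indices() {
            if c == '{' {
                assert_eq!(&src[..i], "abé"); // i == 4, a valid boundary
            }
        }
        // Old approach: one counter per byte, each byte cast to char.
        // src.as_bytes()[2] as char yields 'Ã' (0xC3), not 'é', and slicing
        // at byte offset 3 (&src[..3]) would panic: not a char boundary.
    }

This is why the slice &src[slice_start_index..current_index] can no longer
panic in the middle of a multi-byte character.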
From 3691a91a856cbd8b08eca73d290573cc6a2a4758 Mon Sep 17 00:00:00 2001
From: hardglitch <49201692+hardglitch@users.noreply.github.com>
Date: Mon, 6 Oct 2025 12:27:31 -0700
Subject: [PATCH 2/5] Update utils::StrUtils

---
 src/utils.rs | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/src/utils.rs b/src/utils.rs
index b2f7e3e..953e8b7 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -6,34 +6,20 @@ pub trait StrUtils {
 }
 
 impl StrUtils for str {
     // Split the string at the first whitespace
-    // ex : split_first_whitespace("\b I'm a bold string") -> ("\b", "I'm a bold string")
     fn split_first_whitespace(&self) -> (&str, &str) {
-        let mut first_whitespace_index = 0;
-
-        let len = self.len();
-        let bytes = self.as_bytes();
-        let mut i = 0;
-        // Faster than an iterator
-        while i < len {
-            let c = bytes[i] as char;
-            i += 1;
-
+        for (i, c) in self.char_indices() {
             if c.is_whitespace() {
-                break;
-            } else {
-                first_whitespace_index += 1;
+                let first = &self[..i];
+                // +c.len_utf8() чтобы срез начинался после пробела
+                let second = &self[i + c.len_utf8()..];
+                return (first, second);
             }
         }
-        if first_whitespace_index > 0 && first_whitespace_index != self.len() {
-            return (&self[0..first_whitespace_index], &self[first_whitespace_index + 1..]);
-        } else {
-            return (self, "");
-        }
+        (self, "")
     }
 
     fn is_only_whitespace(&self) -> bool {
-        // TODO
-        false
+        self.chars().all(|c| c.is_whitespace())
     }
 }

From 41c5a93ac53b8ba2019cc1e7e5d6ea41c3e49fa2 Mon Sep 17 00:00:00 2001
From: hardglitch <49201692+hardglitch@users.noreply.github.com>
Date: Mon, 6 Oct 2025 12:50:54 -0700
Subject: [PATCH 3/5] Update utils.rs

---
 src/utils.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils.rs b/src/utils.rs
index 953e8b7..90d9639 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -10,7 +10,7 @@ impl StrUtils for str {
         for (i, c) in self.char_indices() {
             if c.is_whitespace() {
                 let first = &self[..i];
-                // +c.len_utf8() чтобы срез начинался после пробела
+                // +c.len_utf8() so that the slice starts after the whitespace
                 let second = &self[i + c.len_utf8()..];
                 return (first, second);
             }
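
A note on the split_first_whitespace() rewrite above: the removed version
counted characters into first_whitespace_index but then used that count as
a byte index, and its "+ 1" assumed the whitespace char itself was one byte
wide, so any multi-byte character before the split point could panic or
mis-slice. A standalone sketch of the new logic, with the trait method
inlined as a free function (illustrative only, not part of the patches):

    fn split_first_whitespace(s: &str) -> (&str, &str) {
        for (i, c) in s.char_indices() {
            if c.is_whitespace() {
                // i is a byte offset; skip c.len_utf8() bytes so the tail
                // starts right after the whitespace character
                return (&s[..i], &s[i + c.len_utf8()..]);
            }
        }
        (s, "") // no whitespace found
    }

    fn main() {
        assert_eq!(split_first_whitespace(r"\b I'm a bold string"), (r"\b", "I'm a bold string"));
        // A multi-byte char before the split point: byte and char counts
        // differ here, which is exactly where the old version went wrong.
        assert_eq!(split_first_whitespace("né x"), ("né", "x"));
        assert_eq!(split_first_whitespace("solo"), ("solo", ""));
    }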
From bb8b5e9c6974d8ddb97c5ecc6824bfb31bc61b6e Mon Sep 17 00:00:00 2001
From: hardglitch <49201692+hardglitch@users.noreply.github.com>
Date: Mon, 6 Oct 2025 12:54:52 -0700
Subject: [PATCH 4/5] Update lexer.rs

---
 src/lexer.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/lexer.rs b/src/lexer.rs
index 800d473..499ca05 100644
--- a/src/lexer.rs
+++ b/src/lexer.rs
@@ -47,7 +47,7 @@ impl Lexer {
         let mut slice_start_index = 0;
         let mut previous_char = ' ';
 
-        // Работаем с безопасными UTF-8 индексами
+        // Working with safe UTF-8 indices
         for (current_index, c) in src.char_indices() {
             match c {
                 // TODO: Handle char over code 127 for escaped chars
@@ -55,7 +55,7 @@
                 '{' | '}' | '\\' | '\n' if previous_char == '\\' => {}
                 '{' | '}' | '\\' | '\n' => {
                     if slice_start_index < current_index {
-                        // Выделяем корректный UTF-8 срез
+                        // Extract a valid UTF-8 slice
                         let slice = &src[slice_start_index..current_index];
                         let slice_tokens = Self::tokenize(slice)?;
                         tokens.extend_from_slice(&slice_tokens);
@@ -67,7 +67,7 @@
             previous_char = c;
         }
 
-        // Обработка последнего токена
+        // Handling the last token
         if slice_start_index < src.len() {
             let slice = &src[slice_start_index..];
             if slice != "}" {

From 4b45af06922512e3c30c6263fec16a63c8dc2fe4 Mon Sep 17 00:00:00 2001
From: hardglitch <49201692+hardglitch@users.noreply.github.com>
Date: Mon, 6 Oct 2025 15:49:46 -0700
Subject: [PATCH 5/5] Update lexer::tokenize()

---
 src/lexer.rs | 46 +++++++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/src/lexer.rs b/src/lexer.rs
index 499ca05..697c226 100644
--- a/src/lexer.rs
+++ b/src/lexer.rs
@@ -79,7 +79,7 @@ impl Lexer {
         Ok(tokens)
     }
 
-    /// Get a string slice cut but the scanner and return the coreesponding token(s)
+    /// Get a string slice cut by the scanner and return the corresponding token(s)
     fn tokenize(slice: &str) -> Result<Vec<Token>, LexerError> {
         let mut starting_chars = slice.trim_matches(' ').chars().take(2);
         return match (starting_chars.next(), starting_chars.next()) {
@@ -88,18 +88,18 @@
                 '{' | '}' | '\\' => {
                     // Handle escaped chars
                     let tail = slice.get(1..).unwrap_or("");
-                    return Ok(vec![Token::PlainText(tail)]); // No recursive tokenize here, juste some plain text because the char is escaped
+                    Ok(vec![Token::PlainText(tail)]) // Escaped single char -> plain text
                 }
                 '\'' => {
-                    // Escaped unicode in hex value : \'f0
+                    // Escaped unicode hex value: \'f0
                    let tail = slice.get(1..).unwrap_or("");
                     if tail.len() < 2 {
                         return Err(LexerError::InvalidUnicode(tail.into()));
                     }
-                    let byte = u8::from_str_radix(&tail[1..3], 16)?; // f0
+                    let byte = u8::from_str_radix(&tail[1..3], 16)?;
                     let mut ret = vec![Token::ControlSymbol((ControlWord::Unicode, Property::Value(byte as i32)))];
                     recursive_tokenize!(&tail[3..], ret);
-                    return Ok(ret);
+                    Ok(ret)
                 }
                 '\n' => {
                     // CRLF
@@ -107,38 +107,46 @@
                     if let Some(tail) = slice.get(2..) {
                         recursive_tokenize!(tail, ret);
                     }
-                    return Ok(ret);
+                    Ok(ret)
                 }
                 'a'..='z' => {
                     // Identify control word
-                    // ex: parse "\b Words in bold" -> (Token::ControlWord(ControlWord::Bold), Token::ControlWordArgument("Words in bold")
                     let (mut ident, tail) = slice.split_first_whitespace();
-                    // if ident end with semicolon, strip it for correct value parsing
-                    ident = if ident.chars().last().unwrap_or(' ') == ';' { &ident[0..ident.len() - 1] } else { ident };
-                    let control_word = ControlWord::from(ident)?;
+                    ident = if ident.ends_with(';') { &ident[..ident.len() - 1] } else { ident };
+
+                    // Try parse control word, fallback for symbols like "-" in \pntext
+                    let control_word = match ControlWord::from(ident) {
+                        Ok(cw) => cw,
+                        Err(_) => {
+                            // Treat as plain text if it cannot be parsed as control word
+                            return Ok(vec![Token::PlainText(slice)]);
+                        }
+                    };
+                    let mut ret = vec![Token::ControlSymbol(control_word)];
                     recursive_tokenize!(tail, ret);
-
-                    // \u1234 \u1234 is ok, but \u1234 \u1234 is lost a space, \u1234 \u1234 lost two spaces, and so on
-                    // \u1234 1 -> No need to walk in here, it will enter plain text
-                    if control_word.0 == ControlWord::Unicode && tail.len() > 0 && tail.trim() == "" {
+
+                    // Handle special case for \u1234 followed only by trailing spaces
+                    if control_word.0 == ControlWord::Unicode && !tail.is_empty() && tail.trim().is_empty() {
                         ret.push(Token::PlainText(tail));
                     }
-                    return Ok(ret);
+
+                    Ok(ret)
                 }
                 '*' => Ok(vec![Token::IgnorableDestination]),
                 _ => Ok(vec![]),
             },
-            (Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), // Ignore the CRLF if it's not escaped
+            (Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), // Ignore CRLF if not escaped
             // Handle brackets
             (Some('{'), None) => Ok(vec![Token::OpeningBracket]),
             (Some('}'), None) => Ok(vec![Token::ClosingBracket]),
            ...
             // Else, it's plain text
             _ => {
                 let text = slice.trim();
-                if text == "" {
-                    return Ok(vec![]);
+                if text.is_empty() {
+                    Ok(vec![])
+                } else {
+                    Ok(vec![Token::PlainText(slice)])
                 }
-                return Ok(vec![Token::PlainText(slice)]);
             }
         };
     }
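
A note on the tokenize() change above: the one behavioral change is the
error fallback. ControlWord::from(ident)? used to abort the whole scan on
any identifier it did not recognize; the patch degrades such slices to
plain text instead, which is what the bare "-" marker inside \pntext
groups needs. A self-contained sketch of the pattern, where Tok and
parse_control_word are simplified stand-ins for the crate's Token and
ControlWord::from (illustrative only, not part of the patch):

    #[derive(Debug, PartialEq)]
    enum Tok<'a> {
        ControlWord(&'a str),
        PlainText(&'a str),
    }

    // Stand-in for ControlWord::from: accept only alphabetic identifiers.
    fn parse_control_word(ident: &str) -> Result<&str, ()> {
        if !ident.is_empty() && ident.chars().all(|c| c.is_ascii_alphabetic()) {
            Ok(ident)
        } else {
            Err(())
        }
    }

    fn tokenize(slice: &str) -> Tok<'_> {
        match parse_control_word(slice.trim_start_matches('\\')) {
            Ok(cw) => Tok::ControlWord(cw),
            // Previously the `?` operator bubbled the error up and the
            // scan failed; now an unknown identifier becomes plain text.
            Err(_) => Tok::PlainText(slice),
        }
    }

    fn main() {
        assert_eq!(tokenize(r"\pard"), Tok::ControlWord("pard"));
        // The bare "-" marker inside \pntext groups no longer errors out:
        assert_eq!(tokenize("-"), Tok::PlainText("-"));
    }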