diff --git a/src/lexer.rs b/src/lexer.rs index 4032b69..697c226 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -42,53 +42,44 @@ pub struct Lexer; impl Lexer { pub fn scan(src: &str) -> Result, LexerError> { let src = src.trim(); // Sanitize src : Trim the leading whitespaces - - let mut tokens: Vec = vec![]; + + let mut tokens: Vec = Vec::new(); let mut slice_start_index = 0; - let mut current_index = 0; let mut previous_char = ' '; - - // This is faster than using an iterator - let len = src.len(); - let bytes = src.as_bytes(); - let mut i = 0; - while i < len { - let c = bytes[i] as char; - i += 1; - + + // Working with safe UTF-8 indices + for (current_index, c) in src.char_indices() { match c { // TODO: Handle char over code 127 for escaped chars // Handle Escaped chars : "\" + any charcode below 127 '{' | '}' | '\\' | '\n' if previous_char == '\\' => {} '{' | '}' | '\\' | '\n' => { - // End of slice chars if slice_start_index < current_index { - // Close slice + // Extract a valid UTF-8 slice let slice = &src[slice_start_index..current_index]; - // Get the corresponding token(s) let slice_tokens = Self::tokenize(slice)?; - tokens.extend_from_slice(&slice_tokens.as_slice()); + tokens.extend_from_slice(&slice_tokens); slice_start_index = current_index; } } - // Others chars _ => {} } - current_index += 1; previous_char = c; } - // Manage last token (should always be "}") - if slice_start_index < current_index { - let slice = &src[slice_start_index..current_index]; + + // Handling the last token + if slice_start_index < src.len() { + let slice = &src[slice_start_index..]; if slice != "}" { return Err(LexerError::InvalidLastChar); } tokens.push(Token::ClosingBracket); } - return Ok(tokens); + + Ok(tokens) } - /// Get a string slice cut but the scanner and return the coreesponding token(s) + /// Get a string slice cut by the scanner and return the corresponding token(s) fn tokenize(slice: &str) -> Result, LexerError> { let mut starting_chars = slice.trim_matches(' 
').chars().take(2); return match (starting_chars.next(), starting_chars.next()) { @@ -97,18 +88,18 @@ impl Lexer { '{' | '}' | '\\' => { // Handle escaped chars let tail = slice.get(1..).unwrap_or(""); - return Ok(vec![Token::PlainText(tail)]); // No recursive tokenize here, juste some plain text because the char is escaped + Ok(vec![Token::PlainText(tail)]) // Escaped single char -> plain text } '\'' => { - // Escaped unicode in hex value : \'f0 + // Escaped unicode hex value: \'f0 let tail = slice.get(1..).unwrap_or(""); if tail.len() < 2 { return Err(LexerError::InvalidUnicode(tail.into())); } - let byte = u8::from_str_radix(&tail[1..3], 16)?; // f0 + let byte = u8::from_str_radix(&tail[1..3], 16)?; let mut ret = vec![Token::ControlSymbol((ControlWord::Unicode, Property::Value(byte as i32)))]; recursive_tokenize!(&tail[3..], ret); - return Ok(ret); + Ok(ret) } '\n' => { // CRLF @@ -116,29 +107,36 @@ impl Lexer { if let Some(tail) = slice.get(2..) { recursive_tokenize!(tail, ret); } - return Ok(ret); + Ok(ret) } 'a'..='z' => { // Identify control word - // ex: parse "\b Words in bold" -> (Token::ControlWord(ControlWord::Bold), Token::ControlWordArgument("Words in bold") let (mut ident, tail) = slice.split_first_whitespace(); - // if ident end with semicolon, strip it for correct value parsing - ident = if ident.chars().last().unwrap_or(' ') == ';' { &ident[0..ident.len() - 1] } else { ident }; - let control_word = ControlWord::from(ident)?; + ident = if ident.ends_with(';') { &ident[..ident.len() - 1] } else { ident }; + + // Try parse control word, fallback for symbols like "-" in \pntext + let control_word = match ControlWord::from(ident) { + Ok(cw) => cw, + Err(_) => { + // Treat as plain text if it cannot be parsed as control word + return Ok(vec![Token::PlainText(slice)]); + } + }; + let mut ret = vec![Token::ControlSymbol(control_word)]; recursive_tokenize!(tail, ret); - - // \u1234 \u1234 is ok, but \u1234 \u1234 is lost a space, \u1234 \u1234 lost two 
spaces, and so on - // \u1234 1 -> No need to walk in here, it will enter plain text - if control_word.0 == ControlWord::Unicode && tail.len() > 0 && tail.trim() == "" { + + // Handle special case for \u1234 followed by a whitespace-only tail + if control_word.0 == ControlWord::Unicode && !tail.is_empty() && tail.trim().is_empty() { ret.push(Token::PlainText(tail)); } - return Ok(ret); + + Ok(ret) } '*' => Ok(vec![Token::IgnorableDestination]), _ => Ok(vec![]), }, - (Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), // Ignore the CRLF if it's not escaped + (Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), // Ignore CRLF if not escaped // Handle brackets (Some('{'), None) => Ok(vec![Token::OpeningBracket]), (Some('}'), None) => Ok(vec![Token::ClosingBracket]), @@ -148,10 +146,11 @@ impl Lexer { // Else, it's plain text _ => { let text = slice.trim(); - if text == "" { - return Ok(vec![]); + if text.is_empty() { + Ok(vec![]) + } else { + Ok(vec![Token::PlainText(slice)]) } - return Ok(vec![Token::PlainText(slice)]); } }; } diff --git a/src/utils.rs b/src/utils.rs index b2f7e3e..90d9639 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -6,34 +6,20 @@ pub trait StrUtils { impl StrUtils for str { // Split the string at the first whitespace - // ex : split_first_whitespace("\b I'm a bold string") -> ("\b", "I'm a bold string") fn split_first_whitespace(&self) -> (&str, &str) { - let mut first_whitespace_index = 0; - - let len = self.len(); - let bytes = self.as_bytes(); - let mut i = 0; - // Faster than an iterator - while i < len { - let c = bytes[i] as char; - i += 1; - + for (i, c) in self.char_indices() { if c.is_whitespace() { - break; - } else { - first_whitespace_index += 1; + let first = &self[..i]; + // +c.len_utf8() so that the slice starts after the whitespace + let second = &self[i + c.len_utf8()..]; + return (first, second); } } - if first_whitespace_index > 0 && first_whitespace_index != self.len() { - return 
(&self[0..first_whitespace_index], &self[first_whitespace_index + 1..]); - } else { - return (self, ""); - } + (self, "") } fn is_only_whitespace(&self) -> bool { - // TODO - false + self.chars().all(|c| c.is_whitespace()) } }