Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 40 additions & 41 deletions src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,53 +42,44 @@ pub struct Lexer;
impl Lexer {
pub fn scan(src: &str) -> Result<Vec<Token>, LexerError> {
let src = src.trim(); // Sanitize src: trim leading and trailing whitespace

let mut tokens: Vec<Token> = vec![];
let mut tokens: Vec<Token> = Vec::new();
let mut slice_start_index = 0;
let mut current_index = 0;
let mut previous_char = ' ';

// This is faster than using an iterator
let len = src.len();
let bytes = src.as_bytes();
let mut i = 0;
while i < len {
let c = bytes[i] as char;
i += 1;


// Working with safe UTF-8 indices
for (current_index, c) in src.char_indices() {
match c {
// TODO: Handle char over code 127 for escaped chars
// Handle Escaped chars : "\" + any charcode below 127
'{' | '}' | '\\' | '\n' if previous_char == '\\' => {}
'{' | '}' | '\\' | '\n' => {
// End of slice chars
if slice_start_index < current_index {
// Close slice
// Extract a valid UTF-8 slice
let slice = &src[slice_start_index..current_index];
// Get the corresponding token(s)
let slice_tokens = Self::tokenize(slice)?;
tokens.extend_from_slice(&slice_tokens.as_slice());
tokens.extend_from_slice(&slice_tokens);
slice_start_index = current_index;
}
}
// Others chars
_ => {}
}
current_index += 1;
previous_char = c;
}
// Manage last token (should always be "}")
if slice_start_index < current_index {
let slice = &src[slice_start_index..current_index];

// Handling the last token
if slice_start_index < src.len() {
let slice = &src[slice_start_index..];
if slice != "}" {
return Err(LexerError::InvalidLastChar);
}
tokens.push(Token::ClosingBracket);
}
return Ok(tokens);

Ok(tokens)
}

/// Get a string slice cut but the scanner and return the coreesponding token(s)
/// Get a string slice cut by the scanner and return the corresponding token(s)
fn tokenize(slice: &str) -> Result<Vec<Token>, LexerError> {
let mut starting_chars = slice.trim_matches(' ').chars().take(2);
return match (starting_chars.next(), starting_chars.next()) {
Expand All @@ -97,48 +88,55 @@ impl Lexer {
'{' | '}' | '\\' => {
// Handle escaped chars
let tail = slice.get(1..).unwrap_or("");
return Ok(vec![Token::PlainText(tail)]); // No recursive tokenize here, juste some plain text because the char is escaped
Ok(vec![Token::PlainText(tail)]) // Escaped single char -> plain text
}
'\'' => {
// Escaped unicode in hex value : \'f0
// Escaped unicode hex value: \'f0
let tail = slice.get(1..).unwrap_or("");
if tail.len() < 2 {
return Err(LexerError::InvalidUnicode(tail.into()));
}
let byte = u8::from_str_radix(&tail[1..3], 16)?; // f0
let byte = u8::from_str_radix(&tail[1..3], 16)?;
let mut ret = vec![Token::ControlSymbol((ControlWord::Unicode, Property::Value(byte as i32)))];
recursive_tokenize!(&tail[3..], ret);
return Ok(ret);
Ok(ret)
}
'\n' => {
// CRLF
let mut ret = vec![Token::CRLF];
if let Some(tail) = slice.get(2..) {
recursive_tokenize!(tail, ret);
}
return Ok(ret);
Ok(ret)
}
'a'..='z' => {
// Identify control word
// ex: parse "\b Words in bold" -> (Token::ControlWord(ControlWord::Bold), Token::ControlWordArgument("Words in bold"))
let (mut ident, tail) = slice.split_first_whitespace();
// If ident ends with a semicolon, strip it for correct value parsing
ident = if ident.chars().last().unwrap_or(' ') == ';' { &ident[0..ident.len() - 1] } else { ident };
let control_word = ControlWord::from(ident)?;
ident = if ident.ends_with(';') { &ident[..ident.len() - 1] } else { ident };

// Try parse control word, fallback for symbols like "-" in \pntext
let control_word = match ControlWord::from(ident) {
Ok(cw) => cw,
Err(_) => {
// Treat as plain text if it cannot be parsed as control word
return Ok(vec![Token::PlainText(slice)]);
}
};

let mut ret = vec![Token::ControlSymbol(control_word)];
recursive_tokenize!(tail, ret);

// `\u1234 \u1234` is ok, but `\u1234  \u1234` loses one space, `\u1234   \u1234` loses two spaces, and so on
// `\u1234 1` -> no need to enter this branch; the tail is already tokenized as plain text
if control_word.0 == ControlWord::Unicode && tail.len() > 0 && tail.trim() == "" {

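// Handle special case for \u1234 followed only by trailing spaces: keep the spaces as plain text
// NOTE(review): as written, the condition below can never be true: `tail.trim()` cannot be
// both non-empty and made up solely of whitespace. The intended check is presumably
// "tail is non-empty but contains only whitespace", i.e. `!tail.is_empty() && tail.trim().is_empty()`.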
if control_word.0 == ControlWord::Unicode && !tail.trim().is_empty() && tail.trim().chars().all(|ch| ch.is_whitespace()) {
ret.push(Token::PlainText(tail));
}
return Ok(ret);

Ok(ret)
}
'*' => Ok(vec![Token::IgnorableDestination]),
_ => Ok(vec![]),
},
(Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), // Ignore the CRLF if it's not escaped
(Some('\n'), Some(_)) => recursive_tokenize!(&slice[1..]), // Ignore CRLF if not escaped
// Handle brackets
(Some('{'), None) => Ok(vec![Token::OpeningBracket]),
(Some('}'), None) => Ok(vec![Token::ClosingBracket]),
Expand All @@ -148,10 +146,11 @@ impl Lexer {
// Else, it's plain text
_ => {
let text = slice.trim();
if text == "" {
return Ok(vec![]);
if text.is_empty() {
Ok(vec![])
} else {
Ok(vec![Token::PlainText(slice)])
}
return Ok(vec![Token::PlainText(slice)]);
}
};
}
Expand Down
28 changes: 7 additions & 21 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,34 +6,20 @@ pub trait StrUtils {

impl StrUtils for str {
// Split the string at the first whitespace
// ex : split_first_whitespace("\b I'm a bold string") -> ("\b", "I'm a bold string")
fn split_first_whitespace(&self) -> (&str, &str) {
let mut first_whitespace_index = 0;

let len = self.len();
let bytes = self.as_bytes();
let mut i = 0;
// Faster than an iterator
while i < len {
let c = bytes[i] as char;
i += 1;

for (i, c) in self.char_indices() {
if c.is_whitespace() {
break;
} else {
first_whitespace_index += 1;
let first = &self[..i];
// +c.len_utf8() so that the slice starts after the whitespace
let second = &self[i + c.len_utf8()..];
return (first, second);
}
}
if first_whitespace_index > 0 && first_whitespace_index != self.len() {
return (&self[0..first_whitespace_index], &self[first_whitespace_index + 1..]);
} else {
return (self, "");
}
(self, "")
}

fn is_only_whitespace(&self) -> bool {
// TODO
false
self.chars().all(|c| c.is_whitespace())
}
}

Expand Down
Loading