diff --git a/html5ever/src/encoding.rs b/html5ever/src/encoding.rs
new file mode 100644
index 00000000..19f58459
--- /dev/null
+++ b/html5ever/src/encoding.rs
@@ -0,0 +1,196 @@
+// Copyright 2014-2025 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use crate::tendril::StrTendril;
+
+/// Extracts the encoding label from the `content` attribute value of a `meta` element.
+///
+/// Returns the label following the first `charset`, optional ASCII whitespace and `=` in
+/// `input`, or `None` if there is none. The label is returned verbatim: it is neither
+/// validated nor resolved to an encoding here.
+///
+/// <https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element>
+pub(crate) fn extract_a_character_encoding_from_a_meta_element(
+    input: StrTendril,
+) -> Option<StrTendril> {
+    let bytes = input.as_bytes();
+
+    // Step 1. Let position be a pointer into s, initially pointing at the start of the string.
+    let mut position = 0;
+    loop {
+        // Step 2. Loop: Find the first seven characters in s after position that are an ASCII
+        // case-insensitive match for the word "charset". If no such match is found, return nothing.
+        loop {
+            let candidate = bytes.get(position..position + "charset".len())?;
+            if candidate.eq_ignore_ascii_case(b"charset") {
+                break;
+            }
+
+            position += 1;
+        }
+        position += "charset".len();
+
+        // Step 3. Skip any ASCII whitespace that immediately follow the word "charset" (there might not be any).
+        position += bytes[position..]
+            .iter()
+            .take_while(|byte| byte.is_ascii_whitespace())
+            .count();
+
+        // Step 4. If the next character is not a U+003D EQUALS SIGN (=), then move position to point just before
+        // that next character, and jump back to the step labeled loop.
+        // NOTE: The input may end right after "charset" (and any whitespace). There is no next
+        // character then and no further match can follow, so return nothing instead of indexing
+        // past the end of the input.
+        if *bytes.get(position)? == b'=' {
+            break;
+        }
+    }
+    // Skip the "="
+    position += 1;
+
+    // Step 5. Skip any ASCII whitespace that immediately follow the equals sign (there might not be any).
+    position += bytes[position..]
+        .iter()
+        .take_while(|byte| byte.is_ascii_whitespace())
+        .count();
+
+    // Step 6. Process the next character as follows:
+    match bytes.get(position)? {
+        quote @ (b'"' | b'\'') => {
+            // Return the result of getting an encoding from the substring that is between this character
+            // and the next earliest occurrence of this character.
+            let length = bytes[position + 1..]
+                .iter()
+                .position(|byte| byte == quote)?;
+            Some(input.subtendril(position as u32 + 1, length as u32))
+        },
+        _ => {
+            // Return the result of getting an encoding from the substring that consists of this character
+            // up to but not including the first ASCII whitespace or U+003B SEMICOLON character (;),
+            // or the end of s, whichever comes first.
+            let length = bytes[position..]
+                .iter()
+                .position(|byte| byte.is_ascii_whitespace() || *byte == b';')
+                .unwrap_or(bytes.len() - position);
+            Some(input.subtendril(position as u32, length as u32))
+        },
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn meta_element_without_charset() {
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("foobar")),
+            None
+        );
+    }
+
+    #[test]
+    fn meta_element_with_capitalized_charset() {
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(
+                "cHarSet=utf8"
+            )),
+            Some(StrTendril::from_slice("utf8"))
+        );
+    }
+
+    #[test]
+    fn meta_element_with_no_equals_after_charset() {
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(
+                "charset utf8"
+            )),
+            None
+        );
+    }
+
+    #[test]
+    fn meta_element_ending_after_charset() {
+        // The input ends before an "=" can follow; this used to index out of bounds.
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("charset")),
+            None
+        );
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(
+                "foo; charset \t"
+            )),
+            None
+        );
+    }
+
+    #[test]
+    fn meta_element_with_whitespace_around_equals() {
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(
+                "charset \t=\tutf8"
+            )),
+            Some(StrTendril::from_slice("utf8"))
+        );
+    }
+
+    #[test]
+    fn meta_element_with_quoted_value() {
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(
+                "charset='utf8'"
+            )),
+            Some(StrTendril::from_slice("utf8"))
+        );
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(
+                "charset=\"utf8\""
+            )),
+            Some(StrTendril::from_slice("utf8"))
+        );
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(
+                "charset='utf8"
+            )),
+            None
+        );
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(
+                "charset=\"utf8"
+            )),
+            None
+        );
+    }
+
+    #[test]
+    fn meta_element_with_implicit_terminator() {
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(
+                "charset=utf8 foo"
+            )),
+            Some(StrTendril::from_slice("utf8"))
+        );
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(
+                "charset=utf8;foo"
+            )),
+            Some(StrTendril::from_slice("utf8"))
+        );
+    }
+
+    #[test]
+    fn meta_element_with_content_type() {
+        assert_eq!(
+            extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(
+                "text/html; charset=utf8"
+            )),
+            Some(StrTendril::from_slice("utf8"))
+        );
+    }
+}
diff --git a/html5ever/src/lib.rs b/html5ever/src/lib.rs
index e4200d6b..207a390f 100644
--- a/html5ever/src/lib.rs
+++ b/html5ever/src/lib.rs
@@ -22,6 +22,7 @@ mod util {
     pub(crate) mod str;
 }
 
+pub(crate) mod encoding;
 pub(crate) mod macros;
 
 pub mod driver;
diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs
index edc6afb9..55425e2a 100644
--- a/html5ever/src/tokenizer/interface.rs
+++ b/html5ever/src/tokenizer/interface.rs
@@ -7,6 +7,8 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
+use markup5ever::ns; + use crate::interface::Attribute; use crate::tendril::StrTendril; use crate::tokenizer::states; @@ -57,6 +59,13 @@ impl Tag { self_attrs == other_attrs } + + pub(crate) fn get_attribute(&self, name: &LocalName) -> Option { + self.attrs + .iter() + .find(|attribute| attribute.name.ns == *ns!() && attribute.name.local == *name) + .map(|attribute| attribute.value.clone()) + } } #[derive(PartialEq, Eq, Debug)] @@ -77,6 +86,15 @@ pub enum TokenSinkResult { Script(Handle), Plaintext, RawData(states::RawKind), + /// The document indicated that the given encoding should be used to parse it. + /// + /// HTML5-compatible implementations should parse the encoding label using the algorithm + /// described in . The label + /// has not been validated by html5ever. Invalid or unknown encodings can be ignored. + /// + /// If the decoder is confident that the current encoding is correct then this message + /// can safely be ignored. + EncodingIndicator(StrTendril), } /// Types which can receive tokens from the tokenizer. 
diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index eccc5690..b884c2fb 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -43,6 +43,7 @@ pub enum ProcessResult { Continue, Suspend, Script(Handle), + EncodingIndicator(StrTendril), } fn option_push(opt_str: &mut Option, c: char) { @@ -357,6 +358,9 @@ impl Tokenizer { ProcessResult::Continue => (), ProcessResult::Suspend => break, ProcessResult::Script(node) => return TokenizerResult::Script(node), + ProcessResult::EncodingIndicator(encoding) => { + return TokenizerResult::EncodingIndicator(encoding) + }, } } } else { @@ -365,6 +369,9 @@ impl Tokenizer { ProcessResult::Continue => (), ProcessResult::Suspend => break, ProcessResult::Script(node) => return TokenizerResult::Script(node), + ProcessResult::EncodingIndicator(encoding) => { + return TokenizerResult::EncodingIndicator(encoding) + }, } } } @@ -456,6 +463,9 @@ impl Tokenizer { self.state.set(states::RawData(kind)); ProcessResult::Continue }, + TokenSinkResult::EncodingIndicator(encoding) => { + ProcessResult::EncodingIndicator(encoding) + }, } } @@ -1726,7 +1736,7 @@ impl Tokenizer { match self.eof_step() { ProcessResult::Continue => (), ProcessResult::Suspend => break, - ProcessResult::Script(_) => unreachable!(), + ProcessResult::Script(_) | ProcessResult::EncodingIndicator(_) => unreachable!(), } } diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs index f0d89b92..182e9f1b 100644 --- a/html5ever/src/tree_builder/mod.rs +++ b/html5ever/src/tree_builder/mod.rs @@ -400,6 +400,9 @@ where assert!(more_tokens.is_empty()); return tokenizer::TokenSinkResult::RawData(k); }, + ProcessResult::EncodingIndicator(encoding) => { + return tokenizer::TokenSinkResult::EncodingIndicator(encoding) + }, } } } diff --git a/html5ever/src/tree_builder/rules.rs b/html5ever/src/tree_builder/rules.rs index e0ddff45..72d21852 100644 --- a/html5ever/src/tree_builder/rules.rs +++ 
b/html5ever/src/tree_builder/rules.rs @@ -9,6 +9,7 @@ // The tree builder rules, as a single, enormous nested match expression. +use crate::encoding::extract_a_character_encoding_from_a_meta_element; use crate::interface::Quirks; use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData}; use crate::tokenizer::TagKind::{EndTag, StartTag}; @@ -189,8 +190,32 @@ where Token::Tag(tag!()) => self.step(InsertionMode::InBody, token), Token::Tag(tag @ tag!( | | | | )) => { - // FIXME: handle and - self.insert_and_pop_element_for(tag); + self.insert_and_pop_element_for(tag.clone()); + + // Step 1. If the element has a charset attribute, and getting an encoding from its value + // results in an encoding, and the confidence is currently tentative, then change the encoding + // to the resulting encoding. + // NOTE: We don't verify the validity of the encoding here. If the embedder detects the + // encoding to be invalid then they can safely continue spinning the tokenizer. + if let Some(charset) = tag.get_attribute(&local_name!("charset")) { + return ProcessResult::EncodingIndicator(charset); + } + // Step 2. Otherwise, if the element has an http-equiv attribute whose value is an ASCII + // case-insensitive match for the string "Content-Type", and the element has a content + // attribute, and applying the algorithm for extracting a character encoding from a meta + // element to that attribute's value returns an encoding, and the confidence is currently + // tentative, then change the encoding to the extracted encoding. 
+ else if tag + .get_attribute(&local_name!("http-equiv")) + .is_some_and(|value| value.eq_ignore_ascii_case("content-type")) + { + if let Some(encoding) = tag + .get_attribute(&local_name!("content")) + .and_then(extract_a_character_encoding_from_a_meta_element) + { + return ProcessResult::EncodingIndicator(encoding); + } + } ProcessResult::DoneAckSelfClosing }, diff --git a/html5ever/src/tree_builder/types.rs b/html5ever/src/tree_builder/types.rs index 684d5b0b..c336b93b 100644 --- a/html5ever/src/tree_builder/types.rs +++ b/html5ever/src/tree_builder/types.rs @@ -70,6 +70,7 @@ pub(crate) enum ProcessResult { Script(Handle), ToPlaintext, ToRawData(RawKind), + EncodingIndicator(StrTendril), } pub(crate) enum FormatEntry { diff --git a/markup5ever/interface/mod.rs b/markup5ever/interface/mod.rs index 92e6abf5..247daf35 100644 --- a/markup5ever/interface/mod.rs +++ b/markup5ever/interface/mod.rs @@ -65,6 +65,15 @@ impl fmt::Debug for ExpandedName<'_> { pub enum TokenizerResult { Done, Script(Handle), + /// The document indicated that the given encoding should be used to parse it. + /// + /// HTML5-compatible implementations should parse the encoding label using the algorithm + /// described in . The label + /// has not been validated by html5ever. Invalid or unknown encodings can be ignored. + /// + /// If you are confident that the current encoding is correct then you can safely + /// ignore this message. + EncodingIndicator(StrTendril), } /// Helper to quickly create an expanded name.