diff --git a/html5ever/src/encoding.rs b/html5ever/src/encoding.rs
new file mode 100644
index 00000000..19f58459
--- /dev/null
+++ b/html5ever/src/encoding.rs
@@ -0,0 +1,173 @@
+// Copyright 2014-2025 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
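+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your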
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use crate::tendril::StrTendril;
+
+/// Extracts the character encoding label from the value of a `<meta>` element's `content`
+/// attribute.
+///
+/// Implements
+/// <https://html.spec.whatwg.org/multipage/#algorithm-for-extracting-a-character-encoding-from-a-meta-element>.
+///
+/// Returns the raw label following the first `charset` keyword that is followed by `=`
+/// (ASCII whitespace is allowed on either side of the `=`). Returns `None` if there is no
+/// such keyword, if the input ends before a value starts, or if a quoted value is never
+/// closed.
+///
+/// Unlike the specification text, the label is *not* resolved to an encoding here: the
+/// substring is returned as-is, so it may be empty or name an unknown encoding.
+pub(crate) fn extract_a_character_encoding_from_a_meta_element(
+    input: StrTendril,
+) -> Option<StrTendril> {
+    let bytes = input.as_bytes();
+
+    // Step 1. Let position be a pointer into s, initially pointing at the start of the string.
+    let mut position = 0;
+    loop {
+        // Step 2. Loop: Find the first seven characters in s after position that are an ASCII
+        // case-insensitive match for the word "charset". If no such match is found, return nothing.
+        loop {
+            // `get` yields `None` once fewer than seven bytes remain, which ends the search.
+            let candidate = bytes.get(position..position + "charset".len())?;
+            if candidate.eq_ignore_ascii_case(b"charset") {
+                break;
+            }
+
+            position += 1;
+        }
+        position += "charset".len();
+
+        // Step 3. Skip any ASCII whitespace that immediately follow the word "charset" (there might not be any).
+        position += bytes[position..]
+            .iter()
+            .take_while(|byte| byte.is_ascii_whitespace())
+            .count();
+
+        // Step 4. If the next character is not a U+003D EQUALS SIGN (=), then move position to point just before
+        // that next character, and jump back to the step labeled loop.
+        //
+        // The input may end right after "charset" (and any trailing whitespace). In that case there
+        // is no next character and no later match is possible, so return nothing instead of
+        // indexing past the end of the input.
+        if *bytes.get(position)? == b'=' {
+            break;
+        }
+    }
+    // Skip the "="
+    position += 1;
+
+    // Step 5. Skip any ASCII whitespace that immediately follow the equals sign (there might not be any).
+    position += bytes[position..]
+        .iter()
+        .take_while(|byte| byte.is_ascii_whitespace())
+        .count();
+
+    // Step 6. Process the next character as follows:
+    // (If the input ends here there is no value, so return nothing.)
+    match bytes.get(position)? {
+        quote @ (b'"' | b'\'') => {
+            // Return the substring that is between this character and the next earliest
+            // occurrence of this character. An unterminated quote yields nothing.
+            let length = bytes[position + 1..]
+                .iter()
+                .position(|byte| byte == quote)?;
+            Some(input.subtendril(position as u32 + 1, length as u32))
+        },
+        _ => {
+            // Return the substring that consists of this character up to but not including the
+            // first ASCII whitespace or U+003B SEMICOLON character (;), or the end of s,
+            // whichever comes first.
+            let length = bytes[position..]
+                .iter()
+                .position(|byte| byte.is_ascii_whitespace() || *byte == b';')
+                .unwrap_or(bytes.len() - position);
+            Some(input.subtendril(position as u32, length as u32))
+        },
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Runs the extraction algorithm on `content`, as if it were a `content` attribute value.
+    fn extract(content: &str) -> Option<StrTendril> {
+        extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(content))
+    }
+
+    /// Builds the expected result for a successfully extracted label.
+    fn label(expected: &str) -> Option<StrTendril> {
+        Some(StrTendril::from_slice(expected))
+    }
+
+    #[test]
+    fn meta_element_without_charset() {
+        assert_eq!(extract("foobar"), None);
+    }
+
+    #[test]
+    fn meta_element_with_capitalized_charset() {
+        assert_eq!(extract("cHarSet=utf8"), label("utf8"));
+    }
+
+    #[test]
+    fn meta_element_with_no_equals_after_charset() {
+        assert_eq!(extract("charset utf8"), None);
+    }
+
+    #[test]
+    fn meta_element_with_whitespace_around_equals() {
+        assert_eq!(extract("charset \t=\tutf8"), label("utf8"));
+    }
+
+    #[test]
+    fn meta_element_with_quoted_value() {
+        // Properly closed single and double quotes yield the text between them.
+        assert_eq!(extract("charset='utf8'"), label("utf8"));
+        assert_eq!(extract("charset=\"utf8\""), label("utf8"));
+
+        // An opening quote without a matching closing quote yields nothing.
+        assert_eq!(extract("charset='utf8"), None);
+        assert_eq!(extract("charset=\"utf8"), None);
+    }
+
+    #[test]
+    fn meta_element_with_implicit_terminator() {
+        // Unquoted values end at the first ASCII whitespace or semicolon.
+        assert_eq!(extract("charset=utf8 foo"), label("utf8"));
+        assert_eq!(extract("charset=utf8;foo"), label("utf8"));
+    }
+
+    #[test]
+    fn meta_element_with_content_type() {
+        assert_eq!(extract("text/html; charset=utf8"), label("utf8"));
+    }
+}
diff --git a/html5ever/src/lib.rs b/html5ever/src/lib.rs
index e4200d6b..207a390f 100644
--- a/html5ever/src/lib.rs
+++ b/html5ever/src/lib.rs
@@ -22,6 +22,7 @@ mod util {
pub(crate) mod str;
}
+pub(crate) mod encoding;
pub(crate) mod macros;
pub mod driver;
diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs
index edc6afb9..55425e2a 100644
--- a/html5ever/src/tokenizer/interface.rs
+++ b/html5ever/src/tokenizer/interface.rs
@@ -7,6 +7,8 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.
+use markup5ever::ns;
+
use crate::interface::Attribute;
use crate::tendril::StrTendril;
use crate::tokenizer::states;
@@ -57,6 +59,13 @@ impl Tag {
self_attrs == other_attrs
}
+
+    /// Returns a copy of the value of the first attribute on this tag whose namespace is
+    /// `ns!()` and whose local name equals `name`, or `None` if no attribute matches.
+    pub(crate) fn get_attribute(&self, name: &LocalName) -> Option<StrTendril> {
+        self.attrs.iter().find_map(|candidate| {
+            let is_match = candidate.name.ns == *ns!() && candidate.name.local == *name;
+            is_match.then(|| candidate.value.clone())
+        })
+    }
}
#[derive(PartialEq, Eq, Debug)]
@@ -77,6 +86,15 @@ pub enum TokenSinkResult {
Script(Handle),
Plaintext,
RawData(states::RawKind),
+ /// The document indicated that the given encoding should be used to parse it.
+ ///
+ /// HTML5-compatible implementations should parse the encoding label using the algorithm
+ /// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
+ /// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
+ ///
+ /// If the decoder is confident that the current encoding is correct then this message
+ /// can safely be ignored.
+ EncodingIndicator(StrTendril),
}
/// Types which can receive tokens from the tokenizer.
diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs
index eccc5690..b884c2fb 100644
--- a/html5ever/src/tokenizer/mod.rs
+++ b/html5ever/src/tokenizer/mod.rs
@@ -43,6 +43,7 @@ pub enum ProcessResult {
Continue,
Suspend,
Script(Handle),
+ EncodingIndicator(StrTendril),
}
fn option_push(opt_str: &mut Option, c: char) {
@@ -357,6 +358,9 @@ impl Tokenizer {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(node) => return TokenizerResult::Script(node),
+ ProcessResult::EncodingIndicator(encoding) => {
+ return TokenizerResult::EncodingIndicator(encoding)
+ },
}
}
} else {
@@ -365,6 +369,9 @@ impl Tokenizer {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(node) => return TokenizerResult::Script(node),
+ ProcessResult::EncodingIndicator(encoding) => {
+ return TokenizerResult::EncodingIndicator(encoding)
+ },
}
}
}
@@ -456,6 +463,9 @@ impl Tokenizer {
self.state.set(states::RawData(kind));
ProcessResult::Continue
},
+ TokenSinkResult::EncodingIndicator(encoding) => {
+ ProcessResult::EncodingIndicator(encoding)
+ },
}
}
@@ -1726,7 +1736,7 @@ impl Tokenizer {
match self.eof_step() {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
- ProcessResult::Script(_) => unreachable!(),
+ ProcessResult::Script(_) | ProcessResult::EncodingIndicator(_) => unreachable!(),
}
}
diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs
index f0d89b92..182e9f1b 100644
--- a/html5ever/src/tree_builder/mod.rs
+++ b/html5ever/src/tree_builder/mod.rs
@@ -400,6 +400,9 @@ where
assert!(more_tokens.is_empty());
return tokenizer::TokenSinkResult::RawData(k);
},
+ ProcessResult::EncodingIndicator(encoding) => {
+ return tokenizer::TokenSinkResult::EncodingIndicator(encoding)
+ },
}
}
}
diff --git a/html5ever/src/tree_builder/rules.rs b/html5ever/src/tree_builder/rules.rs
index e0ddff45..72d21852 100644
--- a/html5ever/src/tree_builder/rules.rs
+++ b/html5ever/src/tree_builder/rules.rs
@@ -9,6 +9,7 @@
// The tree builder rules, as a single, enormous nested match expression.
+use crate::encoding::extract_a_character_encoding_from_a_meta_element;
use crate::interface::Quirks;
use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
use crate::tokenizer::TagKind::{EndTag, StartTag};
@@ -189,8 +190,32 @@ where
Token::Tag(tag!()) => self.step(InsertionMode::InBody, token),
Token::Tag(tag @ tag!( | | | | )) => {
- // FIXME: handle and
- self.insert_and_pop_element_for(tag);
+ self.insert_and_pop_element_for(tag.clone());
+
+ // Step 1. If the element has a charset attribute, and getting an encoding from its value
+ // results in an encoding, and the confidence is currently tentative, then change the encoding
+ // to the resulting encoding.
+ // NOTE: We don't verify the validity of the encoding here. If the embedder detects the
+ // encoding to be invalid then they can safely continue spinning the tokenizer.
+ if let Some(charset) = tag.get_attribute(&local_name!("charset")) {
+ return ProcessResult::EncodingIndicator(charset);
+ }
+ // Step 2. Otherwise, if the element has an http-equiv attribute whose value is an ASCII
+ // case-insensitive match for the string "Content-Type", and the element has a content
+ // attribute, and applying the algorithm for extracting a character encoding from a meta
+ // element to that attribute's value returns an encoding, and the confidence is currently
+ // tentative, then change the encoding to the extracted encoding.
+ else if tag
+ .get_attribute(&local_name!("http-equiv"))
+ .is_some_and(|value| value.eq_ignore_ascii_case("content-type"))
+ {
+ if let Some(encoding) = tag
+ .get_attribute(&local_name!("content"))
+ .and_then(extract_a_character_encoding_from_a_meta_element)
+ {
+ return ProcessResult::EncodingIndicator(encoding);
+ }
+ }
ProcessResult::DoneAckSelfClosing
},
diff --git a/html5ever/src/tree_builder/types.rs b/html5ever/src/tree_builder/types.rs
index 684d5b0b..c336b93b 100644
--- a/html5ever/src/tree_builder/types.rs
+++ b/html5ever/src/tree_builder/types.rs
@@ -70,6 +70,7 @@ pub(crate) enum ProcessResult {
Script(Handle),
ToPlaintext,
ToRawData(RawKind),
+ EncodingIndicator(StrTendril),
}
pub(crate) enum FormatEntry {
diff --git a/markup5ever/interface/mod.rs b/markup5ever/interface/mod.rs
index 92e6abf5..247daf35 100644
--- a/markup5ever/interface/mod.rs
+++ b/markup5ever/interface/mod.rs
@@ -65,6 +65,15 @@ impl fmt::Debug for ExpandedName<'_> {
pub enum TokenizerResult {
Done,
Script(Handle),
+ /// The document indicated that the given encoding should be used to parse it.
+ ///
+ /// HTML5-compatible implementations should parse the encoding label using the algorithm
+ /// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
+ /// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
+ ///
+ /// If you are confident that the current encoding is correct then you can safely
+ /// ignore this message.
+ EncodingIndicator(StrTendril),
}
/// Helper to quickly create an expanded name.