Propagate encoding hints from the document to the caller

simonwuelker · simonwuelker · commit 6a6ed6498c25 · 2025-12-20T16:24:17.000+01:00
Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
diff --git a/html5ever/src/encoding.rs b/html5ever/src/encoding.rs
@@ -0,0 +1,173 @@
+// Copyright 2014-2025 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use crate::tendril::StrTendril;
+
+/// Extracts the (unvalidated) encoding label from the value of a `<meta>` `content` attribute.
+///
+/// `input` is the attribute value, for example `text/html; charset=utf-8`. The result is the text
+/// following the first `charset` keyword that is directly followed by `=` (ASCII whitespace around
+/// the `=` is allowed). Returns `None` if no such keyword exists, if nothing follows the `=`, or
+/// if the value starts with a quote that is never closed. Never panics on truncated input.
+///
+/// <https://html.spec.whatwg.org/multipage/#algorithm-for-extracting-a-character-encoding-from-a-meta-element>
+pub(crate) fn extract_a_character_encoding_from_a_meta_element(
+    input: StrTendril,
+) -> Option<StrTendril> {
+    // Scanning bytes instead of characters is fine: every byte we search for is ASCII, and ASCII
+    // bytes never occur inside a multi-byte UTF-8 sequence, so all offsets used for slicing below
+    // fall on character boundaries.
+    let bytes = input.as_bytes();
+
+    // Returns the index of the first byte at or after `from` that is not ASCII whitespace.
+    let skip_whitespace = |from: usize| {
+        from + bytes[from..]
+            .iter()
+            .take_while(|byte| byte.is_ascii_whitespace())
+            .count()
+    };
+
+    // Steps 1-4. Walk through every ASCII case-insensitive occurrence of "charset" until one is
+    // followed (after optional ASCII whitespace) by "=". Running out of input returns nothing.
+    let mut position = 0;
+    loop {
+        let candidate = bytes.get(position..position + "charset".len())?;
+        if !candidate.eq_ignore_ascii_case(b"charset") {
+            position += 1;
+            continue;
+        }
+        position = skip_whitespace(position + "charset".len());
+
+        // The input may end right here (e.g. "charset" or "charset  "), where plain indexing
+        // would panic; `get` turns that case into "return nothing" instead.
+        if *bytes.get(position)? == b'=' {
+            break;
+        }
+    }
+
+    // Step 5. Skip the "=" itself and any ASCII whitespace that immediately follows it.
+    position = skip_whitespace(position + 1);
+
+    // Step 6. Process the next character; if there is none, return nothing.
+    match *bytes.get(position)? {
+        quote @ (b'"' | b'\'') => {
+            // Quoted value: everything up to the matching quote. An unmatched quote yields nothing.
+            let start = position + 1;
+            let length = bytes[start..].iter().position(|&byte| byte == quote)?;
+            Some(input.subtendril(start as u32, length as u32))
+        },
+        _ => {
+            // Unquoted value: runs up to (but not including) the first ASCII whitespace or ";",
+            // or to the end of the input, whichever comes first.
+            let rest = &bytes[position..];
+            let length = rest
+                .iter()
+                .position(|byte| byte.is_ascii_whitespace() || *byte == b';')
+                .unwrap_or(rest.len());
+            Some(input.subtendril(position as u32, length as u32))
+        },
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Runs the extraction algorithm on `content`, treated as a `content` attribute value.
+    fn extract(content: &str) -> Option<StrTendril> {
+        extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(content))
+    }
+
+    /// Builds the expected result for a successfully extracted `label`.
+    fn found(label: &str) -> Option<StrTendril> {
+        Some(StrTendril::from_slice(label))
+    }
+
+    /// A value that never mentions "charset" yields nothing.
+    #[test]
+    fn meta_element_without_charset() {
+        assert_eq!(extract("foobar"), None);
+    }
+
+    /// The "charset" keyword is matched ASCII case-insensitively.
+    #[test]
+    fn meta_element_with_capitalized_charset() {
+        assert_eq!(extract("cHarSet=utf8"), found("utf8"));
+    }
+
+    /// "charset" that is not followed by "=" does not count as a match.
+    #[test]
+    fn meta_element_with_no_equals_after_charset() {
+        assert_eq!(extract("charset utf8"), None);
+    }
+
+    /// ASCII whitespace on either side of "=" is skipped.
+    #[test]
+    fn meta_element_with_whitespace_around_equals() {
+        assert_eq!(extract("charset \t=\tutf8"), found("utf8"));
+    }
+
+    /// Quoted values need a matching closing quote; unmatched quotes yield nothing.
+    #[test]
+    fn meta_element_with_quoted_value() {
+        assert_eq!(extract("charset='utf8'"), found("utf8"));
+        assert_eq!(extract("charset=\"utf8\""), found("utf8"));
+        assert_eq!(extract("charset='utf8"), None);
+        assert_eq!(extract("charset=\"utf8"), None);
+    }
+
+    /// Unquoted values end at the first ASCII whitespace or ";".
+    #[test]
+    fn meta_element_with_implicit_terminator() {
+        assert_eq!(extract("charset=utf8 foo"), found("utf8"));
+        assert_eq!(extract("charset=utf8;foo"), found("utf8"));
+    }
+
+    /// "charset" is found even when it is preceded by a MIME type.
+    #[test]
+    fn meta_element_with_content_type() {
+        assert_eq!(extract("text/html; charset=utf8"), found("utf8"));
+    }
+}
diff --git a/html5ever/src/lib.rs b/html5ever/src/lib.rs
@@ -22,6 +22,7 @@ mod util {
     pub(crate) mod str;
 }
 
+pub(crate) mod encoding;
 pub(crate) mod macros;
 
 pub mod driver;
diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs
@@ -7,6 +7,8 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use markup5ever::ns;
+
 use crate::interface::Attribute;
 use crate::tendril::StrTendril;
 use crate::tokenizer::states;
@@ -57,6 +59,13 @@ impl Tag {
 
         self_attrs == other_attrs
     }
+
+    /// Returns a clone of the value of the first attribute named `name` whose namespace is `ns!()`.
+    pub(crate) fn get_attribute(&self, name: &LocalName) -> Option<StrTendril> {
+        self.attrs.iter().find_map(|attr| {
+            (attr.name.ns == *ns!() && attr.name.local == *name).then(|| attr.value.clone())
+        })
+    }
 }
 
 #[derive(PartialEq, Eq, Debug)]
@@ -77,6 +86,15 @@ pub enum TokenSinkResult<Handle> {
     Script(Handle),
     Plaintext,
     RawData(states::RawKind),
+    /// The document indicated that the given encoding should be used to parse it.
+    ///
+    /// HTML5-compatible implementations should parse the encoding label using the algorithm
+    /// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
+    /// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
+    ///
+    /// If the decoder is confident that the current encoding is correct then this message
+    /// can safely be ignored.
+    EncodingIndicator(StrTendril),
 }
 
 /// Types which can receive tokens from the tokenizer.
diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs
@@ -43,6 +43,7 @@ pub enum ProcessResult<Handle> {
     Continue,
     Suspend,
     Script(Handle),
+    EncodingIndicator(StrTendril),
 }
 
 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -357,6 +358,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     ProcessResult::Continue => (),
                     ProcessResult::Suspend => break,
                     ProcessResult::Script(node) => return TokenizerResult::Script(node),
+                    ProcessResult::EncodingIndicator(encoding) => {
+                        return TokenizerResult::EncodingIndicator(encoding)
+                    },
                 }
             }
         } else {
@@ -365,6 +369,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     ProcessResult::Continue => (),
                     ProcessResult::Suspend => break,
                     ProcessResult::Script(node) => return TokenizerResult::Script(node),
+                    ProcessResult::EncodingIndicator(encoding) => {
+                        return TokenizerResult::EncodingIndicator(encoding)
+                    },
                 }
             }
         }
@@ -456,6 +463,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 self.state.set(states::RawData(kind));
                 ProcessResult::Continue
             },
+            TokenSinkResult::EncodingIndicator(encoding) => {
+                ProcessResult::EncodingIndicator(encoding)
+            },
         }
     }
 
@@ -1726,7 +1736,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             match self.eof_step() {
                 ProcessResult::Continue => (),
                 ProcessResult::Suspend => break,
-                ProcessResult::Script(_) => unreachable!(),
+                ProcessResult::Script(_) | ProcessResult::EncodingIndicator(_) => unreachable!(),
             }
         }
 
diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs
@@ -400,6 +400,9 @@ where
                     assert!(more_tokens.is_empty());
                     return tokenizer::TokenSinkResult::RawData(k);
                 },
+                ProcessResult::EncodingIndicator(encoding) => {
+                    return tokenizer::TokenSinkResult::EncodingIndicator(encoding)
+                },
             }
         }
     }
diff --git a/html5ever/src/tree_builder/rules.rs b/html5ever/src/tree_builder/rules.rs
@@ -9,6 +9,7 @@
 
 // The tree builder rules, as a single, enormous nested match expression.
 
+use crate::encoding::extract_a_character_encoding_from_a_meta_element;
 use crate::interface::Quirks;
 use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
 use crate::tokenizer::TagKind::{EndTag, StartTag};
@@ -189,8 +190,32 @@ where
                     Token::Tag(tag!(<html>)) => self.step(InsertionMode::InBody, token),
 
                     Token::Tag(tag @ tag!(<base> | <basefont> | <bgsound> | <link> | <meta>)) => {
-                        // FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
-                        self.insert_and_pop_element_for(tag);
+                        self.insert_and_pop_element_for(tag.clone());
+
+                        // Step 1. If the element has a charset attribute, and getting an encoding from its value
+                        // results in an encoding, and the confidence is currently tentative, then change the encoding
+                        // to the resulting encoding.
+                        // NOTE: We don't verify the validity of the encoding here. If the embedder detects the
+                        // encoding to be invalid then they can safely continue spinning the tokenizer.
+                        if let Some(charset) = tag.get_attribute(&local_name!("charset")) {
+                            return ProcessResult::EncodingIndicator(charset);
+                        }
+                        // Step 2. Otherwise, if the element has an http-equiv attribute whose value is an ASCII
+                        // case-insensitive match for the string "Content-Type", and the element has a content
+                        // attribute, and applying the algorithm for extracting a character encoding from a meta
+                        // element to that attribute's value returns an encoding, and the confidence is currently
+                        // tentative, then change the encoding to the extracted encoding.
+                        else if tag
+                            .get_attribute(&local_name!("http-equiv"))
+                            .is_some_and(|value| value.eq_ignore_ascii_case("content-type"))
+                        {
+                            if let Some(encoding) = tag
+                                .get_attribute(&local_name!("content"))
+                                .and_then(extract_a_character_encoding_from_a_meta_element)
+                            {
+                                return ProcessResult::EncodingIndicator(encoding);
+                            }
+                        }
                         ProcessResult::DoneAckSelfClosing
                     },
 
diff --git a/html5ever/src/tree_builder/types.rs b/html5ever/src/tree_builder/types.rs
@@ -70,6 +70,7 @@ pub(crate) enum ProcessResult<Handle> {
     Script(Handle),
     ToPlaintext,
     ToRawData(RawKind),
+    EncodingIndicator(StrTendril),
 }
 
 pub(crate) enum FormatEntry<Handle> {
diff --git a/markup5ever/interface/mod.rs b/markup5ever/interface/mod.rs
@@ -65,6 +65,15 @@ impl fmt::Debug for ExpandedName<'_> {
 pub enum TokenizerResult<Handle> {
     Done,
     Script(Handle),
+    /// The document indicated that the given encoding should be used to parse it.
+    ///
+    /// HTML5-compatible implementations should parse the encoding label using the algorithm
+    /// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
+    /// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
+    ///
+    /// If you are confident that the current encoding is correct then you can safely
+    /// ignore this message.
+    EncodingIndicator(StrTendril),
 }
 
 /// Helper to quickly create an expanded name.

Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@ mod util {`
`22`	`22`	`pub(crate) mod str;`
`23`	`23`	`}`
`24`	`24`
	`25`	`+pub(crate) mod encoding;`
`25`	`26`	`pub(crate) mod macros;`
`26`	`27`
`27`	`28`	`pub mod driver;`
Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,7 @@ pub enum ProcessResult<Handle> {`
`43`	`43`	`Continue,`
`44`	`44`	`Suspend,`
`45`	`45`	`Script(Handle),`
	`46`	`+ EncodingIndicator(StrTendril),`
`46`	`47`	`}`
`47`	`48`
`48`	`49`	`fn option_push(opt_str: &mut Option<StrTendril>, c: char) {`
`@@ -357,6 +358,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {`
`357`	`358`	`ProcessResult::Continue => (),`
`358`	`359`	`ProcessResult::Suspend => break,`
`359`	`360`	`ProcessResult::Script(node) => return TokenizerResult::Script(node),`
	`361`	`+ ProcessResult::EncodingIndicator(encoding) => {`
	`362`	`+ return TokenizerResult::EncodingIndicator(encoding)`
	`363`	`+ },`
`360`	`364`	`}`
`361`	`365`	`}`
`362`	`366`	`} else {`
`@@ -365,6 +369,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {`
`365`	`369`	`ProcessResult::Continue => (),`
`366`	`370`	`ProcessResult::Suspend => break,`
`367`	`371`	`ProcessResult::Script(node) => return TokenizerResult::Script(node),`
	`372`	`+ ProcessResult::EncodingIndicator(encoding) => {`
	`373`	`+ return TokenizerResult::EncodingIndicator(encoding)`
	`374`	`+ },`
`368`	`375`	`}`
`369`	`376`	`}`
`370`	`377`	`}`
`@@ -456,6 +463,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {`
`456`	`463`	`self.state.set(states::RawData(kind));`
`457`	`464`	`ProcessResult::Continue`
`458`	`465`	`},`
	`466`	`+ TokenSinkResult::EncodingIndicator(encoding) => {`
	`467`	`+ ProcessResult::EncodingIndicator(encoding)`
	`468`	`+ },`
`459`	`469`	`}`
`460`	`470`	`}`
`461`	`471`
`@@ -1726,7 +1736,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {`
`1726`	`1736`	`match self.eof_step() {`
`1727`	`1737`	`ProcessResult::Continue => (),`
`1728`	`1738`	`ProcessResult::Suspend => break,`
`1729`		`- ProcessResult::Script(_) => unreachable!(),`
	`1739`	`+ ProcessResult::Script(_) \| ProcessResult::EncodingIndicator(_) => unreachable!(),`
`1730`	`1740`	`}`
`1731`	`1741`	`}`
`1732`	`1742`
Original file line number	Diff line number	Diff line change
`@@ -400,6 +400,9 @@ where`
`400`	`400`	`assert!(more_tokens.is_empty());`
`401`	`401`	`return tokenizer::TokenSinkResult::RawData(k);`
`402`	`402`	`},`
	`403`	`+ ProcessResult::EncodingIndicator(encoding) => {`
	`404`	`+ return tokenizer::TokenSinkResult::EncodingIndicator(encoding)`
	`405`	`+ },`
`403`	`406`	`}`
`404`	`407`	`}`
`405`	`408`	`}`
Original file line number	Diff line number	Diff line change
`@@ -70,6 +70,7 @@ pub(crate) enum ProcessResult<Handle> {`
`70`	`70`	`Script(Handle),`
`71`	`71`	`ToPlaintext,`
`72`	`72`	`ToRawData(RawKind),`
	`73`	`+ EncodingIndicator(StrTendril),`
`73`	`74`	`}`
`74`	`75`
`75`	`76`	`pub(crate) enum FormatEntry<Handle> {`