Propagate encoding hints from the document to the caller

simonwuelker · simonwuelker · commit e6f2cf2b3815 · 2025-12-20T15:12:48.000+01:00
Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
diff --git a/html5ever/src/encoding.rs b/html5ever/src/encoding.rs
@@ -0,0 +1,112 @@
+// Copyright 2014-2025 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use crate::tendril::StrTendril;
+
+/// Extracts the character encoding label from the value of a `<meta>` element's `content` attribute.
+///
+/// Implements
+/// <https://html.spec.whatwg.org/multipage/#algorithm-for-extracting-a-character-encoding-from-a-meta-element>
+/// up to, but not including, "getting an encoding": the label is returned exactly as it appears
+/// in `input` and is not validated here.
+///
+/// Returns `None` if `input` contains no `charset` that is followed by `=` and a value, or if a
+/// quoted value is missing its closing quote. Truncated input (for example a bare `charset`)
+/// also yields `None` instead of panicking.
+pub(crate) fn extract_a_character_encoding_from_a_meta_element(input: StrTendril) -> Option<StrTendril> {
+    // The algorithm only looks for ASCII characters, so it can operate on the raw bytes.
+    let bytes = input.as_bytes();
+
+    // Step 1. Let position be a pointer into s, initially pointing at the start of the string.
+    let mut position = 0;
+    loop {
+        // Step 2. Loop: Find the first seven characters in s after position that are an ASCII
+        // case-insensitive match for the word "charset". If no such match is found, return nothing.
+        loop {
+            let candidate = bytes.get(position..position + "charset".len())?;
+            if candidate.eq_ignore_ascii_case(b"charset") {
+                break;
+            }
+
+            position += 1;
+        }
+        position += "charset".len();
+
+        // Step 3. Skip any ASCII whitespace that immediately follow the word "charset" (there might not be any).
+        let remaining = &bytes[position..];
+        position += remaining.len() - remaining.trim_ascii_start().len();
+
+        // Step 4. If the next character is not a U+003D EQUALS SIGN (=), then move position to point just before
+        // that next character, and jump back to the step labeled loop.
+        // NOTE: If the input ends here there is no next character. Jumping back to the loop could not
+        // find another match either, so we return nothing right away instead of indexing out of bounds.
+        if *bytes.get(position)? == b'=' {
+            break;
+        }
+    }
+    // Skip the "="
+    position += 1;
+
+    // Step 5. Skip any ASCII whitespace that immediately follow the equals sign (there might not be any).
+    let remaining = &bytes[position..];
+    position += remaining.len() - remaining.trim_ascii_start().len();
+
+    // Step 6. Process the next character as follows:
+    match bytes.get(position)? {
+        quote @ (b'"' | b'\'') => {
+            // Return the result of getting an encoding from the substring that is between this character
+            // and the next earliest occurrence of this character.
+            // An unterminated quote means there is no encoding to return.
+            let length = bytes[position + 1..].iter().position(|byte| byte == quote)?;
+            Some(input.subtendril(position as u32 + 1, length as u32))
+        },
+        _ => {
+            // Return the result of getting an encoding from the substring that consists of this character
+            // up to but not including the first ASCII whitespace or U+003B SEMICOLON character (;),
+            // or the end of s, whichever comes first.
+            let length = bytes[position..]
+                .iter()
+                .position(|byte| byte.is_ascii_whitespace() || *byte == b';')
+                .unwrap_or(bytes.len() - position);
+            Some(input.subtendril(position as u32, length as u32))
+        },
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn meta_element_without_charset() {
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("foobar")), None);
+    }
+
+    #[test]
+    fn meta_element_with_capitalized_charset() {
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("cHarSet=utf8")), Some(StrTendril::from_slice("utf8")));
+    }
+
+    #[test]
+    fn meta_element_with_no_equals_after_charset() {
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("charset utf8")), None);
+    }
+
+    #[test]
+    fn meta_element_with_whitespace_around_equals() {
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("charset \t=\tutf8")), Some(StrTendril::from_slice("utf8")));
+    }
+
+    #[test]
+    fn meta_element_with_quoted_value() {
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("charset='utf8'")), Some(StrTendril::from_slice("utf8")));
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("charset=\"utf8\"")), Some(StrTendril::from_slice("utf8")));
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("charset='utf8")), None);
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("charset=\"utf8")), None);
+    }
+
+    #[test]
+    fn meta_element_with_implicit_terminator() {
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("charset=utf8 foo")), Some(StrTendril::from_slice("utf8")));
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("charset=utf8;foo")), Some(StrTendril::from_slice("utf8")));
+    }
+
+    #[test]
+    fn meta_element_with_content_type() {
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("text/html; charset=utf8")), Some(StrTendril::from_slice("utf8")));
+    }
+
+    // Regression test: input that ends right after "charset" (and optional whitespace) or right
+    // after the equals sign must yield `None` rather than index past the end of the string.
+    #[test]
+    fn meta_element_with_truncated_input() {
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("charset")), None);
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("text/html; charset \t")), None);
+        assert_eq!(extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice("charset=")), None);
+    }
+}
diff --git a/html5ever/src/lib.rs b/html5ever/src/lib.rs
@@ -23,6 +23,7 @@ mod util {
 }
 
 pub(crate) mod macros;
+pub(crate) mod encoding;
 
 pub mod driver;
 pub mod serialize;
diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs
@@ -7,6 +7,8 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use markup5ever::ns;
+
 use crate::interface::Attribute;
 use crate::tendril::StrTendril;
 use crate::tokenizer::states;
@@ -57,6 +59,10 @@ impl Tag {
 
         self_attrs == other_attrs
     }
+
+    /// Looks up an attribute of this tag by local name.
+    ///
+    /// Only attributes whose namespace equals `ns!()` are considered. Returns a clone of the
+    /// value of the first such attribute whose local name equals `name`, or `None` if there
+    /// is no match.
+    pub(crate) fn get_attribute(&self, name: &LocalName) -> Option<StrTendril> {
+        self.attrs.iter().find_map(|candidate| {
+            let is_match = candidate.name.ns == *ns!() && candidate.name.local == *name;
+            is_match.then(|| candidate.value.clone())
+        })
+    }
 }
 
 #[derive(PartialEq, Eq, Debug)]
@@ -77,6 +83,15 @@ pub enum TokenSinkResult<Handle> {
     Script(Handle),
     Plaintext,
     RawData(states::RawKind),
+    /// The document indicated that the given encoding should be used to parse it.
+    ///
+    /// HTML5-compatible implementations should parse the encoding label using the algorithm
+    /// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
+    /// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
+    ///
+    /// If the decoder is confident that the current encoding is correct then this message
+    /// can safely be ignored.
+    EncodingIndicator(StrTendril),
 }
 
 /// Types which can receive tokens from the tokenizer.
diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs
@@ -43,6 +43,7 @@ pub enum ProcessResult<Handle> {
     Continue,
     Suspend,
     Script(Handle),
+    EncodingIndicator(StrTendril),
 }
 
 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -357,6 +358,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     ProcessResult::Continue => (),
                     ProcessResult::Suspend => break,
                     ProcessResult::Script(node) => return TokenizerResult::Script(node),
+                    ProcessResult::EncodingIndicator(encoding) => return TokenizerResult::EncodingIndicator(encoding),
                 }
             }
         } else {
@@ -365,6 +367,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     ProcessResult::Continue => (),
                     ProcessResult::Suspend => break,
                     ProcessResult::Script(node) => return TokenizerResult::Script(node),
+                    ProcessResult::EncodingIndicator(encoding) => return TokenizerResult::EncodingIndicator(encoding),
                 }
             }
         }
@@ -456,6 +459,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 self.state.set(states::RawData(kind));
                 ProcessResult::Continue
             },
+            TokenSinkResult::EncodingIndicator(encoding) => ProcessResult::EncodingIndicator(encoding),
         }
     }
 
@@ -1726,7 +1730,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             match self.eof_step() {
                 ProcessResult::Continue => (),
                 ProcessResult::Suspend => break,
-                ProcessResult::Script(_) => unreachable!(),
+                ProcessResult::Script(_) | ProcessResult::EncodingIndicator(_) => unreachable!(),
             }
         }
 
diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs
@@ -400,6 +400,7 @@ where
                     assert!(more_tokens.is_empty());
                     return tokenizer::TokenSinkResult::RawData(k);
                 },
+                ProcessResult::EncodingIndicator(encoding) => return tokenizer::TokenSinkResult::EncodingIndicator(encoding)
             }
         }
     }
diff --git a/html5ever/src/tree_builder/rules.rs b/html5ever/src/tree_builder/rules.rs
@@ -9,6 +9,7 @@
 
 // The tree builder rules, as a single, enormous nested match expression.
 
+use crate::encoding::extract_a_character_encoding_from_a_meta_element;
 use crate::interface::Quirks;
 use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
 use crate::tokenizer::TagKind::{EndTag, StartTag};
@@ -189,8 +190,26 @@ where
                     Token::Tag(tag!(<html>)) => self.step(InsertionMode::InBody, token),
 
                     Token::Tag(tag @ tag!(<base> | <basefont> | <bgsound> | <link> | <meta>)) => {
-                        // FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
-                        self.insert_and_pop_element_for(tag);
+                        self.insert_and_pop_element_for(tag.clone());
+
+                        // Step 1. If the element has a charset attribute, and getting an encoding from its value
+                        // results in an encoding, and the confidence is currently tentative, then change the encoding
+                        // to the resulting encoding.
+                        // NOTE: We don't verify the validity of the encoding here. If the embedder detects the
+                        // encoding to be invalid then they can safely continue spinning the tokenizer.
+                        if let Some(charset) = tag.get_attribute(&local_name!("charset")) {
+                            return ProcessResult::EncodingIndicator(charset);
+                        }
+                        // Step 2. Otherwise, if the element has an http-equiv attribute whose value is an ASCII
+                        // case-insensitive match for the string "Content-Type", and the element has a content
+                        // attribute, and applying the algorithm for extracting a character encoding from a meta
+                        // element to that attribute's value returns an encoding, and the confidence is currently
+                        // tentative, then change the encoding to the extracted encoding.
+                        else if tag.get_attribute(&local_name!("http-equiv")).is_some_and(|value| value.eq_ignore_ascii_case("content-type")) {
+                            if let Some(encoding) = tag.get_attribute(&local_name!("content")).and_then(extract_a_character_encoding_from_a_meta_element) {
+                                return ProcessResult::EncodingIndicator(encoding);
+                            }
+                        }
                         ProcessResult::DoneAckSelfClosing
                     },
 
diff --git a/html5ever/src/tree_builder/types.rs b/html5ever/src/tree_builder/types.rs
@@ -70,6 +70,7 @@ pub(crate) enum ProcessResult<Handle> {
     Script(Handle),
     ToPlaintext,
     ToRawData(RawKind),
+    EncodingIndicator(StrTendril)
 }
 
 pub(crate) enum FormatEntry<Handle> {
diff --git a/markup5ever/interface/mod.rs b/markup5ever/interface/mod.rs
@@ -65,6 +65,15 @@ impl fmt::Debug for ExpandedName<'_> {
 pub enum TokenizerResult<Handle> {
     Done,
     Script(Handle),
+    /// The document indicated that the given encoding should be used to parse it.
+    ///
+    /// HTML5-compatible implementations should parse the encoding label using the algorithm
+    /// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
+    /// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
+    ///
+    /// If you are confident that the current encoding is correct then you can safely
+    /// ignore this message.
+    EncodingIndicator(StrTendril)
 }
 
 /// Helper to quickly create an expanded name.

Original file line number	Diff line number	Diff line change
`@@ -23,6 +23,7 @@ mod util {`
`23`	`23`	`}`
`24`	`24`
`25`	`25`	`pub(crate) mod macros;`
	`26`	`+pub(crate) mod encoding;`
`26`	`27`
`27`	`28`	`pub mod driver;`
`28`	`29`	`pub mod serialize;`
Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,7 @@ pub enum ProcessResult<Handle> {`
`43`	`43`	`Continue,`
`44`	`44`	`Suspend,`
`45`	`45`	`Script(Handle),`
	`46`	`+ EncodingIndicator(StrTendril),`
`46`	`47`	`}`
`47`	`48`
`48`	`49`	`fn option_push(opt_str: &mut Option<StrTendril>, c: char) {`
`@@ -357,6 +358,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {`
`357`	`358`	`ProcessResult::Continue => (),`
`358`	`359`	`ProcessResult::Suspend => break,`
`359`	`360`	`ProcessResult::Script(node) => return TokenizerResult::Script(node),`
	`361`	`+ ProcessResult::EncodingIndicator(encoding) => return TokenizerResult::EncodingIndicator(encoding),`
`360`	`362`	`}`
`361`	`363`	`}`
`362`	`364`	`} else {`
`@@ -365,6 +367,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {`
`365`	`367`	`ProcessResult::Continue => (),`
`366`	`368`	`ProcessResult::Suspend => break,`
`367`	`369`	`ProcessResult::Script(node) => return TokenizerResult::Script(node),`
	`370`	`+ ProcessResult::EncodingIndicator(encoding) => return TokenizerResult::EncodingIndicator(encoding),`
`368`	`371`	`}`
`369`	`372`	`}`
`370`	`373`	`}`
`@@ -456,6 +459,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {`
`456`	`459`	`self.state.set(states::RawData(kind));`
`457`	`460`	`ProcessResult::Continue`
`458`	`461`	`},`
	`462`	`+ TokenSinkResult::EncodingIndicator(encoding) => ProcessResult::EncodingIndicator(encoding),`
`459`	`463`	`}`
`460`	`464`	`}`
`461`	`465`
`@@ -1726,7 +1730,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {`
`1726`	`1730`	`match self.eof_step() {`
`1727`	`1731`	`ProcessResult::Continue => (),`
`1728`	`1732`	`ProcessResult::Suspend => break,`
`1729`		`- ProcessResult::Script(_) => unreachable!(),`
	`1733`	`+ ProcessResult::Script(_) \| ProcessResult::EncodingIndicator(_) => unreachable!(),`
`1730`	`1734`	`}`
`1731`	`1735`	`}`
`1732`	`1736`
Original file line number	Diff line number	Diff line change
`@@ -400,6 +400,7 @@ where`
`400`	`400`	`assert!(more_tokens.is_empty());`
`401`	`401`	`return tokenizer::TokenSinkResult::RawData(k);`
`402`	`402`	`},`
	`403`	`+ ProcessResult::EncodingIndicator(encoding) => return tokenizer::TokenSinkResult::EncodingIndicator(encoding)`
`403`	`404`	`}`
`404`	`405`	`}`
`405`	`406`	`}`
Original file line number	Diff line number	Diff line change
`@@ -70,6 +70,7 @@ pub(crate) enum ProcessResult<Handle> {`
`70`	`70`	`Script(Handle),`
`71`	`71`	`ToPlaintext,`
`72`	`72`	`ToRawData(RawKind),`
	`73`	`+ EncodingIndicator(StrTendril)`
`73`	`74`	`}`
`74`	`75`
`75`	`76`	`pub(crate) enum FormatEntry<Handle> {`