servo · simonwuelker · Dec 16, 2025
diff --git a/html5ever/src/encoding.rs b/html5ever/src/encoding.rs
@@ -0,0 +1,173 @@
+// Copyright 2014-2025 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use crate::tendril::StrTendril;
+
+/// Extracts the raw `charset` label from a string such as a `<meta>` element's `content` value.
+///
+/// Follows <https://html.spec.whatwg.org/multipage/#algorithm-for-extracting-a-character-encoding-from-a-meta-element>,
+/// but returns the label text unvalidated instead of resolving it to an encoding.
+///
+/// Returns `None` when no `charset` followed by `=` is found, when nothing follows the `=`,
+/// or when a quoted value has no closing quote. Input that ends early never panics.
+pub(crate) fn extract_a_character_encoding_from_a_meta_element(
+    input: StrTendril,
+) -> Option<StrTendril> {
+    const CHARSET: &[u8] = b"charset";
+    let bytes = input.as_bytes();
+    // Advances `from` (which must not exceed `bytes.len()`) past any run of ASCII whitespace.
+    let skip_whitespace = |from: usize| {
+        let run = bytes[from..].iter().take_while(|byte| byte.is_ascii_whitespace());
+        from + run.count()
+    };
+
+    // Step 1. Let position be a pointer into s, initially pointing at the start of the string.
+    let mut position = 0;
+    loop {
+        // Step 2. Loop: Find the first seven characters in s after position that are an ASCII
+        // case-insensitive match for the word "charset". If no such match is found, return nothing.
+        position += bytes
+            .get(position..)?
+            .windows(CHARSET.len())
+            .position(|candidate| candidate.eq_ignore_ascii_case(CHARSET))?;
+        position += CHARSET.len();
+
+        // Step 3. Skip any ASCII whitespace that immediately follow the word "charset" (there might not be any).
+        position = skip_whitespace(position);
+
+        // Step 4. If the next character is not a U+003D EQUALS SIGN (=), then move position to point just before
+        // that next character, and jump back to the step labeled loop.
+        // If s ends here, `?` returns nothing instead of indexing out of bounds.
+        if *bytes.get(position)? == b'=' {
+            break;
+        }
+    }
+    // Skip the "="
+    position += 1;
+
+    // Step 5. Skip any ASCII whitespace that immediately follow the equals sign (there might not be any).
+    position = skip_whitespace(position);
+
+    // Step 6. Process the next character as follows:
+    let length = match *bytes.get(position)? {
+        quote @ (b'"' | b'\'') => {
+            // Return the result of getting an encoding from the substring that is between this character
+            // and the next earliest occurrence of this character (nothing if there is none).
+            position += 1;
+            bytes[position..].iter().position(|&byte| byte == quote)?
+        },
+        _ => {
+            // Return the result of getting an encoding from the substring that consists of this character
+            // up to but not including the first ASCII whitespace or U+003B SEMICOLON character (;),
+            // or the end of s, whichever comes first.
+            bytes[position..]
+                .iter()
+                .position(|byte| byte.is_ascii_whitespace() || *byte == b';')
+                .unwrap_or(bytes.len() - position)
+        },
+    };
+    Some(input.subtendril(position as u32, length as u32))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Runs the extraction algorithm on `content`.
+    fn extract(content: &str) -> Option<StrTendril> {
+        extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(content))
+    }
+
+    /// Builds the expected successful result for the label `text`.
+    fn label(text: &str) -> Option<StrTendril> {
+        Some(StrTendril::from_slice(text))
+    }
+
+    /// Input that never mentions "charset" yields nothing.
+    #[test]
+    fn meta_element_without_charset() {
+        assert_eq!(extract("foobar"), None);
+    }
+
+    /// The "charset" keyword is matched ASCII case-insensitively.
+    #[test]
+    fn meta_element_with_capitalized_charset() {
+        assert_eq!(extract("cHarSet=utf8"), label("utf8"));
+    }
+
+    /// "charset" that is not followed by "=" is not a match.
+    #[test]
+    fn meta_element_with_no_equals_after_charset() {
+        assert_eq!(extract("charset utf8"), None);
+    }
+
+    /// ASCII whitespace on either side of "=" is skipped.
+    #[test]
+    fn meta_element_with_whitespace_around_equals() {
+        assert_eq!(extract("charset \t=\tutf8"), label("utf8"));
+    }
+
+    /// Quoted values end at the matching quote; an unterminated quote yields nothing.
+    #[test]
+    fn meta_element_with_quoted_value() {
+        assert_eq!(extract("charset='utf8'"), label("utf8"));
+        assert_eq!(extract("charset=\"utf8\""), label("utf8"));
+        assert_eq!(extract("charset='utf8"), None);
+        assert_eq!(extract("charset=\"utf8"), None);
+    }
+
+    /// Unquoted values end at ASCII whitespace or ";".
+    #[test]
+    fn meta_element_with_implicit_terminator() {
+        assert_eq!(extract("charset=utf8 foo"), label("utf8"));
+        assert_eq!(extract("charset=utf8;foo"), label("utf8"));
+    }
+
+    /// "charset" may be preceded by other content.
+    #[test]
+    fn meta_element_with_content_type() {
+        assert_eq!(extract("text/html; charset=utf8"), label("utf8"));
+    }
+}
diff --git a/html5ever/src/lib.rs b/html5ever/src/lib.rs
@@ -22,6 +22,7 @@ mod util {
     pub(crate) mod str;
 }
 
+pub(crate) mod encoding;
 pub(crate) mod macros;
 
 pub mod driver;

diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs
@@ -7,6 +7,8 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
+use markup5ever::ns;
+
 use crate::interface::Attribute;
 use crate::tendril::StrTendril;
 use crate::tokenizer::states;
@@ -57,6 +59,13 @@ impl Tag {
 
         self_attrs == other_attrs
     }
+
+    /// Returns a clone of the value of the first attribute in `attrs` whose local name is
+    /// `name` and whose namespace equals `ns!()` (presumably the null namespace — confirm),
+    /// or `None` if no attribute matches.
+    pub(crate) fn get_attribute(&self, name: &LocalName) -> Option<StrTendril> {
+        self.attrs.iter().find_map(|candidate| {
+            let matches = candidate.name.ns == *ns!() && candidate.name.local == *name;
+            matches.then(|| candidate.value.clone())
+        })
+    }
 }
 
 #[derive(PartialEq, Eq, Debug)]
@@ -77,6 +86,15 @@ pub enum TokenSinkResult<Handle> {
     Script(Handle),
     Plaintext,
     RawData(states::RawKind),
+    /// The document indicated that the given encoding should be used to parse it.
+    ///
+    /// HTML5-compatible implementations should parse the encoding label using the algorithm
+    /// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
+    /// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
+    ///
+    /// If the decoder is confident that the current encoding is correct then this message
+    /// can safely be ignored.
+    EncodingIndicator(StrTendril),
 }
 
 /// Types which can receive tokens from the tokenizer.

diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs
@@ -43,6 +43,7 @@ pub enum ProcessResult<Handle> {
     Continue,
     Suspend,
     Script(Handle),
+    EncodingIndicator(StrTendril),
 }
 
 fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -357,6 +358,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     ProcessResult::Continue => (),
                     ProcessResult::Suspend => break,
                     ProcessResult::Script(node) => return TokenizerResult::Script(node),
+                    ProcessResult::EncodingIndicator(encoding) => {
+                        return TokenizerResult::EncodingIndicator(encoding)
+                    },
                 }
             }
         } else {
@@ -365,6 +369,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                     ProcessResult::Continue => (),
                     ProcessResult::Suspend => break,
                     ProcessResult::Script(node) => return TokenizerResult::Script(node),
+                    ProcessResult::EncodingIndicator(encoding) => {
+                        return TokenizerResult::EncodingIndicator(encoding)
+                    },
                 }
             }
         }
@@ -456,6 +463,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
                 self.state.set(states::RawData(kind));
                 ProcessResult::Continue
             },
+            TokenSinkResult::EncodingIndicator(encoding) => {
+                ProcessResult::EncodingIndicator(encoding)
+            },
         }
     }
 
@@ -1726,7 +1736,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
             match self.eof_step() {
                 ProcessResult::Continue => (),
                 ProcessResult::Suspend => break,
-                ProcessResult::Script(_) => unreachable!(),
+                ProcessResult::Script(_) | ProcessResult::EncodingIndicator(_) => unreachable!(),
             }
         }
 

diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs
@@ -400,6 +400,9 @@ where
                     assert!(more_tokens.is_empty());
                     return tokenizer::TokenSinkResult::RawData(k);
                 },
+                ProcessResult::EncodingIndicator(encoding) => {
+                    return tokenizer::TokenSinkResult::EncodingIndicator(encoding)
+                },
             }
         }
     }

diff --git a/html5ever/src/tree_builder/rules.rs b/html5ever/src/tree_builder/rules.rs
@@ -9,6 +9,7 @@
 
 // The tree builder rules, as a single, enormous nested match expression.
 
+use crate::encoding::extract_a_character_encoding_from_a_meta_element;
 use crate::interface::Quirks;
 use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
 use crate::tokenizer::TagKind::{EndTag, StartTag};
@@ -189,8 +190,32 @@ where
                     Token::Tag(tag!(<html>)) => self.step(InsertionMode::InBody, token),
 
                     Token::Tag(tag @ tag!(<base> | <basefont> | <bgsound> | <link> | <meta>)) => {
-                        // FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
-                        self.insert_and_pop_element_for(tag);
+                        // Only <meta> can declare the document's encoding. The other elements matched by
+                        // this arm are just inserted and popped; a `charset` attribute on e.g. <link>
+                        // must not be reported as an encoding indicator.
+                        if tag.name != local_name!("meta") {
+                            self.insert_and_pop_element_for(tag);
+                            return ProcessResult::DoneAckSelfClosing;
+                        }
+                        self.insert_and_pop_element_for(tag.clone());
+
+                        // Step 1. If the element has a charset attribute, and getting an encoding from its value
+                        // results in an encoding, and the confidence is currently tentative, then change the encoding
+                        // to the resulting encoding.
+                        // NOTE: We don't verify the validity of the encoding here. If the embedder detects the
+                        // encoding to be invalid then they can safely continue spinning the tokenizer.
+                        if let Some(charset) = tag.get_attribute(&local_name!("charset")) {
+                            return ProcessResult::EncodingIndicator(charset);
+                        }
+                        // Step 2. Otherwise, if the element has an http-equiv attribute whose value is an ASCII
+                        // case-insensitive match for the string "Content-Type", and the element has a content
+                        // attribute, and applying the algorithm for extracting a character encoding from a meta
+                        // element to that attribute's value returns an encoding, and the confidence is currently
+                        // tentative, then change the encoding to the extracted encoding.
+                        else if tag
+                            .get_attribute(&local_name!("http-equiv"))
+                            .is_some_and(|value| value.eq_ignore_ascii_case("content-type"))
+                        {
+                            if let Some(encoding) = tag
+                                .get_attribute(&local_name!("content"))
+                                .and_then(extract_a_character_encoding_from_a_meta_element)
+                            {
+                                return ProcessResult::EncodingIndicator(encoding);
+                            }
+                        }
                         ProcessResult::DoneAckSelfClosing
                     },
 

diff --git a/html5ever/src/tree_builder/types.rs b/html5ever/src/tree_builder/types.rs
@@ -70,6 +70,7 @@ pub(crate) enum ProcessResult<Handle> {
     Script(Handle),
     ToPlaintext,
     ToRawData(RawKind),
+    EncodingIndicator(StrTendril),
 }
 
 pub(crate) enum FormatEntry<Handle> {

diff --git a/markup5ever/interface/mod.rs b/markup5ever/interface/mod.rs
@@ -65,6 +65,15 @@ impl fmt::Debug for ExpandedName<'_> {
 pub enum TokenizerResult<Handle> {
     Done,
     Script(Handle),
+    /// The document indicated that the given encoding should be used to parse it.
+    ///
+    /// HTML5-compatible implementations should parse the encoding label using the algorithm
+    /// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
+    /// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
+    ///
+    /// If you are confident that the current encoding is correct then you can safely
+    /// ignore this message.
+    EncodingIndicator(StrTendril),
 }
 
 /// Helper to quickly create an expanded name.