Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 173 additions & 0 deletions html5ever/src/encoding.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
// Copyright 2014-2025 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use crate::tendril::StrTendril;

/// Extracts a character encoding label from a string such as the value of a
/// `<meta http-equiv="Content-Type" content="...">` attribute.
///
/// Implements
/// <https://html.spec.whatwg.org/multipage/#algorithm-for-extracting-a-character-encoding-from-a-meta-element>.
///
/// The returned tendril is the raw label found after `charset=` (a subtendril of `input`).
/// This function does not resolve or validate the label against any list of known
/// encodings; that is left to the caller.
///
/// Returns `None` if `input` contains no `charset` keyword followed by `=`, if nothing
/// follows the `=`, or if the value starts with a quote that is never closed.
pub(crate) fn extract_a_character_encoding_from_a_meta_element(
    input: StrTendril,
) -> Option<StrTendril> {
    const CHARSET: &[u8] = b"charset";

    /// Number of ASCII whitespace bytes at the start of `bytes`.
    fn leading_ascii_whitespace(bytes: &[u8]) -> usize {
        bytes
            .iter()
            .take_while(|byte| byte.is_ascii_whitespace())
            .count()
    }

    let bytes = input.as_bytes();

    // Step 1. Let position be a pointer into s, initially pointing at the start of the string.
    let mut position = 0;
    loop {
        // Step 2. Loop: Find the first seven characters in s after position that are an ASCII
        // case-insensitive match for the word "charset". If no such match is found, return nothing.
        let offset = bytes
            .get(position..)?
            .windows(CHARSET.len())
            .position(|window| window.eq_ignore_ascii_case(CHARSET))?;
        position += offset + CHARSET.len();

        // Step 3. Skip any ASCII whitespace that immediately follows the word "charset"
        // (there might not be any).
        position += leading_ascii_whitespace(&bytes[position..]);

        // Step 4. If the next character is not a U+003D EQUALS SIGN (=), then move position to point
        // just before that next character, and jump back to the step labeled loop.
        //
        // The input may end right after "charset" (and optional whitespace). In that case there is
        // no next character and no further match can be found, so return nothing instead of
        // indexing past the end of the input.
        if *bytes.get(position)? == b'=' {
            break;
        }
    }
    // Skip the "=".
    position += 1;

    // Step 5. Skip any ASCII whitespace that immediately follows the equals sign
    // (there might not be any).
    position += leading_ascii_whitespace(&bytes[position..]);

    // Step 6. Process the next character as follows:
    let (start, length) = match *bytes.get(position)? {
        quote @ (b'"' | b'\'') => {
            // Return the substring that is between this character and the next earliest
            // occurrence of this character. An unmatched quote yields nothing.
            let start = position + 1;
            let length = bytes[start..].iter().position(|&byte| byte == quote)?;
            (start, length)
        },
        _ => {
            // Return the substring that consists of this character up to but not including the
            // first ASCII whitespace or U+003B SEMICOLON character (;), or the end of s,
            // whichever comes first.
            let length = bytes[position..]
                .iter()
                .position(|byte| byte.is_ascii_whitespace() || *byte == b';')
                .unwrap_or(bytes.len() - position);
            (position, length)
        },
    };

    // Both boundaries sit next to ASCII bytes (or the end of the input), so the slice is
    // always on UTF-8 character boundaries. `subtendril` takes `u32` offsets.
    Some(input.subtendril(start as u32, length as u32))
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the extraction algorithm on the given attribute value.
    fn extract(content: &str) -> Option<StrTendril> {
        extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(content))
    }

    /// Builds the expected result for a successfully extracted label.
    fn found(label: &str) -> Option<StrTendril> {
        Some(StrTendril::from_slice(label))
    }

    #[test]
    fn meta_element_without_charset() {
        // No "charset" keyword anywhere in the input.
        assert_eq!(extract("foobar"), None);
    }

    #[test]
    fn meta_element_with_capitalized_charset() {
        // The keyword is matched ASCII case-insensitively.
        assert_eq!(extract("cHarSet=utf8"), found("utf8"));
    }

    #[test]
    fn meta_element_with_no_equals_after_charset() {
        // "charset" must be followed by "=" to count.
        assert_eq!(extract("charset utf8"), None);
    }

    #[test]
    fn meta_element_with_whitespace_around_equals() {
        // Whitespace on either side of "=" is skipped.
        assert_eq!(extract("charset \t=\tutf8"), found("utf8"));
    }

    #[test]
    fn meta_element_with_quoted_value() {
        // Properly quoted values yield the text between the quotes.
        assert_eq!(extract("charset='utf8'"), found("utf8"));
        assert_eq!(extract("charset=\"utf8\""), found("utf8"));

        // An unterminated quote yields nothing.
        assert_eq!(extract("charset='utf8"), None);
        assert_eq!(extract("charset=\"utf8"), None);
    }

    #[test]
    fn meta_element_with_implicit_terminator() {
        // Unquoted values end at the first whitespace or semicolon.
        assert_eq!(extract("charset=utf8 foo"), found("utf8"));
        assert_eq!(extract("charset=utf8;foo"), found("utf8"));
    }

    #[test]
    fn meta_element_with_content_type() {
        // Typical `content` attribute of a Content-Type pragma.
        assert_eq!(extract("text/html; charset=utf8"), found("utf8"));
    }
}
1 change: 1 addition & 0 deletions html5ever/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ mod util {
pub(crate) mod str;
}

pub(crate) mod encoding;
pub(crate) mod macros;

pub mod driver;
Expand Down
18 changes: 18 additions & 0 deletions html5ever/src/tokenizer/interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use markup5ever::ns;

use crate::interface::Attribute;
use crate::tendril::StrTendril;
use crate::tokenizer::states;
Expand Down Expand Up @@ -57,6 +59,13 @@ impl Tag {

self_attrs == other_attrs
}

/// Returns a clone of the value of the first attribute on this tag whose namespace
/// equals `ns!()` and whose local name equals `name`, or `None` if there is no such
/// attribute.
pub(crate) fn get_attribute(&self, name: &LocalName) -> Option<StrTendril> {
    for attribute in self.attrs.iter() {
        let qualified_name = &attribute.name;
        if qualified_name.ns == *ns!() && qualified_name.local == *name {
            return Some(attribute.value.clone());
        }
    }
    None
}
}

#[derive(PartialEq, Eq, Debug)]
Expand All @@ -77,6 +86,15 @@ pub enum TokenSinkResult<Handle> {
Script(Handle),
Plaintext,
RawData(states::RawKind),
/// The document indicated that the given encoding should be used to parse it.
///
/// HTML5-compatible implementations should parse the encoding label using the algorithm
/// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
/// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
///
/// If the decoder is confident that the current encoding is correct then this message
/// can safely be ignored.
EncodingIndicator(StrTendril),
}

/// Types which can receive tokens from the tokenizer.
Expand Down
12 changes: 11 additions & 1 deletion html5ever/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ pub enum ProcessResult<Handle> {
Continue,
Suspend,
Script(Handle),
EncodingIndicator(StrTendril),
}

fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
Expand Down Expand Up @@ -357,6 +358,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(node) => return TokenizerResult::Script(node),
ProcessResult::EncodingIndicator(encoding) => {
return TokenizerResult::EncodingIndicator(encoding)
},
}
}
} else {
Expand All @@ -365,6 +369,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(node) => return TokenizerResult::Script(node),
ProcessResult::EncodingIndicator(encoding) => {
return TokenizerResult::EncodingIndicator(encoding)
},
}
}
}
Expand Down Expand Up @@ -456,6 +463,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
self.state.set(states::RawData(kind));
ProcessResult::Continue
},
TokenSinkResult::EncodingIndicator(encoding) => {
ProcessResult::EncodingIndicator(encoding)
},
}
}

Expand Down Expand Up @@ -1726,7 +1736,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
match self.eof_step() {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(_) => unreachable!(),
ProcessResult::Script(_) | ProcessResult::EncodingIndicator(_) => unreachable!(),
}
}

Expand Down
3 changes: 3 additions & 0 deletions html5ever/src/tree_builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,9 @@ where
assert!(more_tokens.is_empty());
return tokenizer::TokenSinkResult::RawData(k);
},
ProcessResult::EncodingIndicator(encoding) => {
return tokenizer::TokenSinkResult::EncodingIndicator(encoding)
},
}
}
}
Expand Down
29 changes: 27 additions & 2 deletions html5ever/src/tree_builder/rules.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

// The tree builder rules, as a single, enormous nested match expression.

use crate::encoding::extract_a_character_encoding_from_a_meta_element;
use crate::interface::Quirks;
use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
use crate::tokenizer::TagKind::{EndTag, StartTag};
Expand Down Expand Up @@ -189,8 +190,32 @@ where
Token::Tag(tag!(<html>)) => self.step(InsertionMode::InBody, token),

Token::Tag(tag @ tag!(<base> | <basefont> | <bgsound> | <link> | <meta>)) => {
// FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
self.insert_and_pop_element_for(tag);
self.insert_and_pop_element_for(tag.clone());

// Step 1. If the element has a charset attribute, and getting an encoding from its value
// results in an encoding, and the confidence is currently tentative, then change the encoding
// to the resulting encoding.
// NOTE: We don't verify the validity of the encoding here. If the embedder detects the
// encoding to be invalid then they can safely continue spinning the tokenizer.
if let Some(charset) = tag.get_attribute(&local_name!("charset")) {
return ProcessResult::EncodingIndicator(charset);
}
// Step 2. Otherwise, if the element has an http-equiv attribute whose value is an ASCII
// case-insensitive match for the string "Content-Type", and the element has a content
// attribute, and applying the algorithm for extracting a character encoding from a meta
// element to that attribute's value returns an encoding, and the confidence is currently
// tentative, then change the encoding to the extracted encoding.
else if tag
.get_attribute(&local_name!("http-equiv"))
.is_some_and(|value| value.eq_ignore_ascii_case("content-type"))
{
if let Some(encoding) = tag
.get_attribute(&local_name!("content"))
.and_then(extract_a_character_encoding_from_a_meta_element)
{
return ProcessResult::EncodingIndicator(encoding);
}
}
ProcessResult::DoneAckSelfClosing
},

Expand Down
1 change: 1 addition & 0 deletions html5ever/src/tree_builder/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ pub(crate) enum ProcessResult<Handle> {
Script(Handle),
ToPlaintext,
ToRawData(RawKind),
EncodingIndicator(StrTendril),
}

pub(crate) enum FormatEntry<Handle> {
Expand Down
9 changes: 9 additions & 0 deletions markup5ever/interface/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,15 @@ impl fmt::Debug for ExpandedName<'_> {
pub enum TokenizerResult<Handle> {
Done,
Script(Handle),
/// The document indicated that the given encoding should be used to parse it.
///
/// HTML5-compatible implementations should parse the encoding label using the algorithm
/// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
/// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
///
/// If you are confident that the current encoding is correct then you can safely
/// ignore this message.
EncodingIndicator(StrTendril),
}

/// Helper to quickly create an expanded name.
Expand Down