Skip to content

Commit e6f2cf2

Browse files
committed
Propagate encoding hints from the document to the caller
Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
1 parent 795caf4 commit e6f2cf2

File tree

8 files changed

+165
-3
lines changed

8 files changed

+165
-3
lines changed

html5ever/src/encoding.rs

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
// Copyright 2014-2025 The html5ever Project Developers. See the
2+
// COPYRIGHT file at the top-level directory of this distribution.
3+
//
4+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7+
// option. This file may not be copied, modified, or distributed
8+
// except according to those terms.
9+
10+
use crate::tendril::StrTendril;
11+
12+
/// <https://html.spec.whatwg.org/multipage/#algorithm-for-extracting-a-character-encoding-from-a-meta-element>
13+
pub(crate) fn extract_a_character_encoding_from_a_meta_element(input: StrTendril) -> Option<StrTendril> {
14+
// Step 1. Let position be a pointer into s, initially pointing at the start of the string.
15+
let mut position = 0;
16+
loop {
17+
// Step 2. Loop: Find the first seven characters in s after position that are an ASCII
18+
// case-insensitive match for the word "charset". If no such match is found, return nothing.
19+
loop {
20+
let candidate = input.as_bytes().get(position..position + "charset".len())?;
21+
if candidate.eq_ignore_ascii_case(b"charset") {
22+
break;
23+
}
24+
25+
position += 1;
26+
}
27+
position += "charset".len();
28+
29+
// Step 3. Skip any ASCII whitespace that immediately follow the word "charset" (there might not be any).
30+
let remaining = &input.as_bytes()[position..];
31+
position += remaining.len() - remaining.trim_ascii_start().len();
32+
33+
// Step 4. If the next character is not a U+003D EQUALS SIGN (=), then move position to point just before
34+
// that next character, and jump back to the step labeled loop.
35+
if input.as_bytes()[position] == b'=' {
36+
break;
37+
}
38+
}
39+
// Skip the "="
40+
position += 1;
41+
42+
// Step 5. Skip any ASCII whitespace that immediately follow the equals sign (there might not be any).
43+
let remaining = &input.as_bytes()[position..];
44+
position += remaining.len() - remaining.trim_ascii_start().len();
45+
46+
// Step 6. Process the next character as follows:
47+
match input.as_bytes().get(position)? {
48+
quote @ (b'"' | b'\'') => {
49+
// Return the result of getting an encoding from the substring that is between this character
50+
// and the next earliest occurrence of this character.
51+
let Some(length) = input.as_bytes()[position + 1..].iter().position(|byte| byte == quote) else {
52+
return None;
53+
};
54+
Some(input.subtendril(position as u32 + 1, length as u32))
55+
} ,
56+
_ => {
57+
// Return the result of getting an encoding from the substring that consists of this character
58+
// up to but not including the first ASCII whitespace or U+003B SEMICOLON character (;),
59+
// or the end of s, whichever comes first.
60+
let length = input.as_bytes()[position..].iter().position(|byte| byte.is_ascii_whitespace() || *byte == b';');
61+
if let Some(length) = length {
62+
Some(input.subtendril(position as u32, length as u32))
63+
} else {
64+
Some(input.subtendril(position as u32, (input.len() - position) as u32))
65+
}
66+
}
67+
}
68+
}
69+
70+
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the extraction algorithm on `content` and checks that it produces `expected`.
    #[track_caller]
    fn assert_extracts(content: &str, expected: Option<&str>) {
        let actual = extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(content));
        let expected = expected.map(|label| StrTendril::from_slice(label));
        assert_eq!(actual, expected);
    }

    #[test]
    fn meta_element_without_charset() {
        assert_extracts("foobar", None);
    }

    #[test]
    fn meta_element_with_capitalized_charset() {
        assert_extracts("cHarSet=utf8", Some("utf8"));
    }

    #[test]
    fn meta_element_with_no_equals_after_charset() {
        assert_extracts("charset utf8", None);
    }

    #[test]
    fn meta_element_with_whitespace_around_equals() {
        assert_extracts("charset \t=\tutf8", Some("utf8"));
    }

    #[test]
    fn meta_element_with_quoted_value() {
        // Properly terminated quotes of either kind delimit the label.
        assert_extracts("charset='utf8'", Some("utf8"));
        assert_extracts("charset=\"utf8\"", Some("utf8"));

        // An opening quote without its closing counterpart yields nothing.
        assert_extracts("charset='utf8", None);
        assert_extracts("charset=\"utf8", None);
    }

    #[test]
    fn meta_element_with_implicit_terminator() {
        // Unquoted labels end at the first whitespace or semicolon.
        assert_extracts("charset=utf8 foo", Some("utf8"));
        assert_extracts("charset=utf8;foo", Some("utf8"));
    }

    #[test]
    fn meta_element_with_content_type() {
        assert_extracts("text/html; charset=utf8", Some("utf8"));
    }
}

html5ever/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ mod util {
2323
}
2424

2525
pub(crate) mod macros;
26+
pub(crate) mod encoding;
2627

2728
pub mod driver;
2829
pub mod serialize;

html5ever/src/tokenizer/interface.rs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
// option. This file may not be copied, modified, or distributed
88
// except according to those terms.
99

10+
use markup5ever::ns;
11+
1012
use crate::interface::Attribute;
1113
use crate::tendril::StrTendril;
1214
use crate::tokenizer::states;
@@ -57,6 +59,10 @@ impl Tag {
5759

5860
self_attrs == other_attrs
5961
}
62+
63+
pub(crate) fn get_attribute(&self, name: &LocalName) -> Option<StrTendril> {
64+
self.attrs.iter().find(|attribute| attribute.name.ns == *ns!() && attribute.name.local == *name).map(|attribute| attribute.value.clone())
65+
}
6066
}
6167

6268
#[derive(PartialEq, Eq, Debug)]
@@ -77,6 +83,15 @@ pub enum TokenSinkResult<Handle> {
7783
Script(Handle),
7884
Plaintext,
7985
RawData(states::RawKind),
86+
/// The document indicated that the given encoding should be used to parse it.
87+
///
88+
/// HTML5-compatible implementations should parse the encoding label using the algorithm
89+
/// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
90+
/// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
91+
///
92+
/// If the decoder is confident that the current encoding is correct then this message
93+
/// can safely be ignored.
94+
EncodingIndicator(StrTendril),
8095
}
8196

8297
/// Types which can receive tokens from the tokenizer.

html5ever/src/tokenizer/mod.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ pub enum ProcessResult<Handle> {
4343
Continue,
4444
Suspend,
4545
Script(Handle),
46+
EncodingIndicator(StrTendril),
4647
}
4748

4849
fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -357,6 +358,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
357358
ProcessResult::Continue => (),
358359
ProcessResult::Suspend => break,
359360
ProcessResult::Script(node) => return TokenizerResult::Script(node),
361+
ProcessResult::EncodingIndicator(encoding) => return TokenizerResult::EncodingIndicator(encoding),
360362
}
361363
}
362364
} else {
@@ -365,6 +367,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
365367
ProcessResult::Continue => (),
366368
ProcessResult::Suspend => break,
367369
ProcessResult::Script(node) => return TokenizerResult::Script(node),
370+
ProcessResult::EncodingIndicator(encoding) => return TokenizerResult::EncodingIndicator(encoding),
368371
}
369372
}
370373
}
@@ -456,6 +459,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
456459
self.state.set(states::RawData(kind));
457460
ProcessResult::Continue
458461
},
462+
TokenSinkResult::EncodingIndicator(encoding) => ProcessResult::EncodingIndicator(encoding),
459463
}
460464
}
461465

@@ -1726,7 +1730,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
17261730
match self.eof_step() {
17271731
ProcessResult::Continue => (),
17281732
ProcessResult::Suspend => break,
1729-
ProcessResult::Script(_) => unreachable!(),
1733+
ProcessResult::Script(_) | ProcessResult::EncodingIndicator(_) => unreachable!(),
17301734
}
17311735
}
17321736

html5ever/src/tree_builder/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,7 @@ where
400400
assert!(more_tokens.is_empty());
401401
return tokenizer::TokenSinkResult::RawData(k);
402402
},
403+
ProcessResult::EncodingIndicator(encoding) => return tokenizer::TokenSinkResult::EncodingIndicator(encoding)
403404
}
404405
}
405406
}

html5ever/src/tree_builder/rules.rs

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
// The tree builder rules, as a single, enormous nested match expression.
1111

12+
use crate::encoding::extract_a_character_encoding_from_a_meta_element;
1213
use crate::interface::Quirks;
1314
use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
1415
use crate::tokenizer::TagKind::{EndTag, StartTag};
@@ -189,8 +190,26 @@ where
189190
Token::Tag(tag!(<html>)) => self.step(InsertionMode::InBody, token),
190191

191192
Token::Tag(tag @ tag!(<base> | <basefont> | <bgsound> | <link>)) => {
    // These elements never carry encoding information. In particular, the legacy
    // `charset` attribute of `<link>` must not be mistaken for an encoding declaration.
    self.insert_and_pop_element_for(tag);
    ProcessResult::DoneAckSelfClosing
},

Token::Tag(tag @ tag!(<meta>)) => {
    // Step 1. If the element has a charset attribute, and getting an encoding from its value
    // results in an encoding, and the confidence is currently tentative, then change the encoding
    // to the resulting encoding.
    // NOTE: We don't verify the validity of the encoding here. If the embedder detects the
    // encoding to be invalid then they can safely continue spinning the tokenizer.
    let encoding = tag.get_attribute(&local_name!("charset")).or_else(|| {
        // Step 2. Otherwise, if the element has an http-equiv attribute whose value is an ASCII
        // case-insensitive match for the string "Content-Type", and the element has a content
        // attribute, and applying the algorithm for extracting a character encoding from a meta
        // element to that attribute's value returns an encoding, and the confidence is currently
        // tentative, then change the encoding to the extracted encoding.
        tag.get_attribute(&local_name!("http-equiv"))
            .filter(|value| value.eq_ignore_ascii_case("content-type"))?;
        tag.get_attribute(&local_name!("content"))
            .and_then(extract_a_character_encoding_from_a_meta_element)
    });

    // The attributes were read up front so that the tag can be moved into the tree
    // without cloning it.
    self.insert_and_pop_element_for(tag);

    if let Some(encoding) = encoding {
        // NOTE(review): returning here skips `DoneAckSelfClosing` - confirm that the
        // self-closing flag of `<meta ... />` is still acknowledged on this path.
        return ProcessResult::EncodingIndicator(encoding);
    }
    ProcessResult::DoneAckSelfClosing
},
196215

html5ever/src/tree_builder/types.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ pub(crate) enum ProcessResult<Handle> {
7070
Script(Handle),
7171
ToPlaintext,
7272
ToRawData(RawKind),
73+
EncodingIndicator(StrTendril)
7374
}
7475

7576
pub(crate) enum FormatEntry<Handle> {

markup5ever/interface/mod.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,15 @@ impl fmt::Debug for ExpandedName<'_> {
6565
pub enum TokenizerResult<Handle> {
6666
Done,
6767
Script(Handle),
68+
/// The document indicated that the given encoding should be used to parse it.
69+
///
70+
/// HTML5-compatible implementations should parse the encoding label using the algorithm
71+
/// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
72+
/// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
73+
///
74+
/// If you are confident that the current encoding is correct then you can safely
75+
/// ignore this message.
76+
EncodingIndicator(StrTendril)
6877
}
6978

7079
/// Helper to quickly create an expanded name.

0 commit comments

Comments
 (0)