Skip to content

Commit 6a6ed64

Browse files
committed
Propagate encoding hints from the document to the caller
Signed-off-by: Simon Wülker <simon.wuelker@arcor.de>
1 parent 795caf4 commit 6a6ed64

File tree

8 files changed

+243
-3
lines changed

8 files changed

+243
-3
lines changed

html5ever/src/encoding.rs

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
// Copyright 2014-2025 The html5ever Project Developers. See the
2+
// COPYRIGHT file at the top-level directory of this distribution.
3+
//
4+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7+
// option. This file may not be copied, modified, or distributed
8+
// except according to those terms.
9+
10+
use crate::tendril::StrTendril;
11+
12+
/// <https://html.spec.whatwg.org/multipage/#algorithm-for-extracting-a-character-encoding-from-a-meta-element>
13+
pub(crate) fn extract_a_character_encoding_from_a_meta_element(
14+
input: StrTendril,
15+
) -> Option<StrTendril> {
16+
// Step 1. Let position be a pointer into s, initially pointing at the start of the string.
17+
let mut position = 0;
18+
loop {
19+
// Step 2. Loop: Find the first seven characters in s after position that are an ASCII
20+
// case-insensitive match for the word "charset". If no such match is found, return nothing.
21+
loop {
22+
let candidate = input.as_bytes().get(position..position + "charset".len())?;
23+
if candidate.eq_ignore_ascii_case(b"charset") {
24+
break;
25+
}
26+
27+
position += 1;
28+
}
29+
position += "charset".len();
30+
31+
// Step 3. Skip any ASCII whitespace that immediately follow the word "charset" (there might not be any).
32+
position += input.as_bytes()[position..]
33+
.iter()
34+
.take_while(|byte| byte.is_ascii_whitespace())
35+
.count();
36+
37+
// Step 4. If the next character is not a U+003D EQUALS SIGN (=), then move position to point just before
38+
// that next character, and jump back to the step labeled loop.
39+
if input.as_bytes()[position] == b'=' {
40+
break;
41+
}
42+
}
43+
// Skip the "="
44+
position += 1;
45+
46+
// Step 5. Skip any ASCII whitespace that immediately follow the equals sign (there might not be any).
47+
position += input.as_bytes()[position..]
48+
.iter()
49+
.take_while(|byte| byte.is_ascii_whitespace())
50+
.count();
51+
52+
// Step 6. Process the next character as follows:
53+
match input.as_bytes().get(position)? {
54+
quote @ (b'"' | b'\'') => {
55+
// Return the result of getting an encoding from the substring that is between this character
56+
// and the next earliest occurrence of this character.
57+
let length = input.as_bytes()[position + 1..]
58+
.iter()
59+
.position(|byte| byte == quote)?;
60+
Some(input.subtendril(position as u32 + 1, length as u32))
61+
},
62+
_ => {
63+
// Return the result of getting an encoding from the substring that consists of this character
64+
// up to but not including the first ASCII whitespace or U+003B SEMICOLON character (;),
65+
// or the end of s, whichever comes first.
66+
let length = input.as_bytes()[position..]
67+
.iter()
68+
.position(|byte| byte.is_ascii_whitespace() || *byte == b';');
69+
if let Some(length) = length {
70+
Some(input.subtendril(position as u32, length as u32))
71+
} else {
72+
Some(input.subtendril(position as u32, (input.len() - position) as u32))
73+
}
74+
},
75+
}
76+
}
77+
78+
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the extraction algorithm on `content`, wrapping it in a tendril first.
    fn extract(content: &str) -> Option<StrTendril> {
        extract_a_character_encoding_from_a_meta_element(StrTendril::from_slice(content))
    }

    /// The result every successful case in this module is expected to produce.
    fn utf8() -> Option<StrTendril> {
        Some(StrTendril::from_slice("utf8"))
    }

    #[test]
    fn meta_element_without_charset() {
        assert_eq!(extract("foobar"), None);
    }

    #[test]
    fn meta_element_with_capitalized_charset() {
        // The "charset" keyword is matched ASCII case-insensitively.
        assert_eq!(extract("cHarSet=utf8"), utf8());
    }

    #[test]
    fn meta_element_with_no_equals_after_charset() {
        assert_eq!(extract("charset utf8"), None);
    }

    #[test]
    fn meta_element_with_whitespace_around_equals() {
        assert_eq!(extract("charset \t=\tutf8"), utf8());
    }

    #[test]
    fn meta_element_with_quoted_value() {
        // Properly terminated quotes of either kind delimit the label.
        assert_eq!(extract("charset='utf8'"), utf8());
        assert_eq!(extract("charset=\"utf8\""), utf8());

        // A quote that is never closed yields no encoding at all.
        assert_eq!(extract("charset='utf8"), None);
        assert_eq!(extract("charset=\"utf8"), None);
    }

    #[test]
    fn meta_element_with_implicit_terminator() {
        // Unquoted labels end at the first whitespace or semicolon.
        assert_eq!(extract("charset=utf8 foo"), utf8());
        assert_eq!(extract("charset=utf8;foo"), utf8());
    }

    #[test]
    fn meta_element_with_content_type() {
        assert_eq!(extract("text/html; charset=utf8"), utf8());
    }
}

html5ever/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ mod util {
2222
pub(crate) mod str;
2323
}
2424

25+
pub(crate) mod encoding;
2526
pub(crate) mod macros;
2627

2728
pub mod driver;

html5ever/src/tokenizer/interface.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
// option. This file may not be copied, modified, or distributed
88
// except according to those terms.
99

10+
use markup5ever::ns;
11+
1012
use crate::interface::Attribute;
1113
use crate::tendril::StrTendril;
1214
use crate::tokenizer::states;
@@ -57,6 +59,13 @@ impl Tag {
5759

5860
self_attrs == other_attrs
5961
}
62+
63+
pub(crate) fn get_attribute(&self, name: &LocalName) -> Option<StrTendril> {
64+
self.attrs
65+
.iter()
66+
.find(|attribute| attribute.name.ns == *ns!() && attribute.name.local == *name)
67+
.map(|attribute| attribute.value.clone())
68+
}
6069
}
6170

6271
#[derive(PartialEq, Eq, Debug)]
@@ -77,6 +86,15 @@ pub enum TokenSinkResult<Handle> {
7786
Script(Handle),
7887
Plaintext,
7988
RawData(states::RawKind),
89+
/// The document indicated that the given encoding should be used to parse it.
90+
///
91+
/// HTML5-compatible implementations should parse the encoding label using the algorithm
92+
/// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
93+
/// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
94+
///
95+
/// If the decoder is confident that the current encoding is correct then this message
96+
/// can safely be ignored.
97+
EncodingIndicator(StrTendril),
8098
}
8199

82100
/// Types which can receive tokens from the tokenizer.

html5ever/src/tokenizer/mod.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ pub enum ProcessResult<Handle> {
4343
Continue,
4444
Suspend,
4545
Script(Handle),
46+
EncodingIndicator(StrTendril),
4647
}
4748

4849
fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
@@ -357,6 +358,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
357358
ProcessResult::Continue => (),
358359
ProcessResult::Suspend => break,
359360
ProcessResult::Script(node) => return TokenizerResult::Script(node),
361+
ProcessResult::EncodingIndicator(encoding) => {
362+
return TokenizerResult::EncodingIndicator(encoding)
363+
},
360364
}
361365
}
362366
} else {
@@ -365,6 +369,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
365369
ProcessResult::Continue => (),
366370
ProcessResult::Suspend => break,
367371
ProcessResult::Script(node) => return TokenizerResult::Script(node),
372+
ProcessResult::EncodingIndicator(encoding) => {
373+
return TokenizerResult::EncodingIndicator(encoding)
374+
},
368375
}
369376
}
370377
}
@@ -456,6 +463,9 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
456463
self.state.set(states::RawData(kind));
457464
ProcessResult::Continue
458465
},
466+
TokenSinkResult::EncodingIndicator(encoding) => {
467+
ProcessResult::EncodingIndicator(encoding)
468+
},
459469
}
460470
}
461471

@@ -1726,7 +1736,7 @@ impl<Sink: TokenSink> Tokenizer<Sink> {
17261736
match self.eof_step() {
17271737
ProcessResult::Continue => (),
17281738
ProcessResult::Suspend => break,
1729-
ProcessResult::Script(_) => unreachable!(),
1739+
ProcessResult::Script(_) | ProcessResult::EncodingIndicator(_) => unreachable!(),
17301740
}
17311741
}
17321742

html5ever/src/tree_builder/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,9 @@ where
400400
assert!(more_tokens.is_empty());
401401
return tokenizer::TokenSinkResult::RawData(k);
402402
},
403+
ProcessResult::EncodingIndicator(encoding) => {
404+
return tokenizer::TokenSinkResult::EncodingIndicator(encoding)
405+
},
403406
}
404407
}
405408
}

html5ever/src/tree_builder/rules.rs

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
// The tree builder rules, as a single, enormous nested match expression.
1111

12+
use crate::encoding::extract_a_character_encoding_from_a_meta_element;
1213
use crate::interface::Quirks;
1314
use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
1415
use crate::tokenizer::TagKind::{EndTag, StartTag};
@@ -189,8 +190,32 @@ where
189190
Token::Tag(tag!(<html>)) => self.step(InsertionMode::InBody, token),
190191

191192
Token::Tag(tag @ tag!(<base> | <basefont> | <bgsound> | <link> | <meta>)) => {
192-
// FIXME: handle <meta charset=...> and <meta http-equiv="Content-Type">
193-
self.insert_and_pop_element_for(tag);
193+
self.insert_and_pop_element_for(tag.clone());
194+
195+
// Step 1. If the element has a charset attribute, and getting an encoding from its value
196+
// results in an encoding, and the confidence is currently tentative, then change the encoding
197+
// to the resulting encoding.
198+
// NOTE: We don't verify the validity of the encoding here. If the embedder detects the
199+
// encoding to be invalid then they can safely continue spinning the tokenizer.
200+
if let Some(charset) = tag.get_attribute(&local_name!("charset")) {
201+
return ProcessResult::EncodingIndicator(charset);
202+
}
203+
// Step 2. Otherwise, if the element has an http-equiv attribute whose value is an ASCII
204+
// case-insensitive match for the string "Content-Type", and the element has a content
205+
// attribute, and applying the algorithm for extracting a character encoding from a meta
206+
// element to that attribute's value returns an encoding, and the confidence is currently
207+
// tentative, then change the encoding to the extracted encoding.
208+
else if tag
209+
.get_attribute(&local_name!("http-equiv"))
210+
.is_some_and(|value| value.eq_ignore_ascii_case("content-type"))
211+
{
212+
if let Some(encoding) = tag
213+
.get_attribute(&local_name!("content"))
214+
.and_then(extract_a_character_encoding_from_a_meta_element)
215+
{
216+
return ProcessResult::EncodingIndicator(encoding);
217+
}
218+
}
194219
ProcessResult::DoneAckSelfClosing
195220
},
196221

html5ever/src/tree_builder/types.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ pub(crate) enum ProcessResult<Handle> {
7070
Script(Handle),
7171
ToPlaintext,
7272
ToRawData(RawKind),
73+
EncodingIndicator(StrTendril),
7374
}
7475

7576
pub(crate) enum FormatEntry<Handle> {

markup5ever/interface/mod.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,15 @@ impl fmt::Debug for ExpandedName<'_> {
6565
pub enum TokenizerResult<Handle> {
6666
Done,
6767
Script(Handle),
68+
/// The document indicated that the given encoding should be used to parse it.
69+
///
70+
/// HTML5-compatible implementations should parse the encoding label using the algorithm
71+
/// described in <https://encoding.spec.whatwg.org/#concept-encoding-get>. The label
72+
/// has not been validated by html5ever. Invalid or unknown encodings can be ignored.
73+
///
74+
/// If you are confident that the current encoding is correct then you can safely
75+
/// ignore this message.
76+
EncodingIndicator(StrTendril),
6877
}
6978

7079
/// Helper to quickly create an expanded name.

0 commit comments

Comments
 (0)