Skip to content

Commit 773467d

Browse files
committed
Avoid truncating URLs in unquoted hrefs
1 parent 604f644 commit 773467d

File tree

4 files changed

+89
-38
lines changed

4 files changed

+89
-38
lines changed

src/parser/base.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,11 @@ impl<'a> Parser<'a> {
8787
self.stream.slice(start, start + end)
8888
}
8989

90-
fn read_to4(&mut self, needle: [u8; 4]) -> &'a [u8] {
90+
fn read_to3(&mut self, needle: [u8; 3]) -> &'a [u8] {
9191
let start = self.stream.idx;
9292
let bytes = &self.stream.data()[start..];
9393

94-
let end = simd::find4(bytes, needle).unwrap_or_else(|| self.stream.len() - start);
94+
let end = simd::find3(bytes, needle).unwrap_or_else(|| self.stream.len() - start);
9595

9696
self.stream.idx += end;
9797
self.stream.slice(start, start + end)
@@ -118,8 +118,7 @@ impl<'a> Parser<'a> {
118118

119119
// If we do not find any characters that are not identifiers
120120
// then we are probably at the end of the stream
121-
let end = simd::search_non_ident(bytes)
122-
.unwrap_or_else(|| self.stream.len() - start);
121+
let end = simd::search_non_ident(bytes).unwrap_or_else(|| self.stream.len() - start);
123122

124123
self.stream.idx += end;
125124
Some(self.stream.slice(start, start + end))
@@ -163,7 +162,7 @@ impl<'a> Parser<'a> {
163162
let value = if let Some(quote) = self.stream.expect_oneof_and_skip(&[b'"', b'\'']) {
164163
self.read_to(quote)
165164
} else {
166-
self.read_to4([b' ', b'\n', b'/', b'>'])
165+
self.read_to3([b' ', b'\n', b'>'])
167166
};
168167

169168
Some((name, Some(value)))
@@ -219,10 +218,12 @@ impl<'a> Parser<'a> {
219218
self.stream.advance();
220219

221220
let closing_tag_name = self.read_to(b'>');
222-
221+
223222
self.stream.expect_and_skip_cond(b'>');
224223

225-
let closing_tag_matches_parent = self.stack.last()
224+
let closing_tag_matches_parent = self
225+
.stack
226+
.last()
226227
.and_then(|last_handle| last_handle.get(self))
227228
.and_then(|last_item| last_item.as_tag())
228229
.map_or(false, |last_tag| last_tag.name() == closing_tag_name);

src/simd/mod.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,11 @@ pub fn search_non_ident(haystack: &[u8]) -> Option<usize> {
3737
)
3838
}
3939

40-
/// Searches for the first occurence in `haystack`
40+
/// Searches for the first occurrence of any byte of `needle` in `haystack`
4141
#[inline]
42-
pub fn find4(haystack: &[u8], needle: [u8; 4]) -> Option<usize> {
42+
pub fn find3(haystack: &[u8], needle: [u8; 3]) -> Option<usize> {
4343
decide!(
44-
nightly::find4(haystack, needle),
44+
nightly::find3(haystack, needle),
4545
stable::find_multi(haystack, needle)
4646
)
4747
}

src/simd/nightly.rs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
use std::{ptr, simd::*, simd::cmp::{SimdPartialEq, SimdPartialOrd}};
1+
use std::{
2+
ptr,
3+
simd::cmp::{SimdPartialEq, SimdPartialOrd},
4+
simd::*,
5+
};
26

37
use crate::simd::fallback;
48

@@ -36,11 +40,11 @@ pub fn find(haystack: &[u8], needle: u8) -> Option<usize> {
3640
fallback::find(&haystack[i..], needle).map(|x| i + x)
3741
}
3842

39-
/// Optimized function for finding one of 4 bytes in `haystack`
40-
pub fn find4(haystack: &[u8], needle: [u8; 4]) -> Option<usize> {
43+
/// Optimized function for finding one of 3 bytes in `haystack`
44+
pub fn find3(haystack: &[u8], needle: [u8; 3]) -> Option<usize> {
4145
#[inline(never)]
4246
#[cold]
43-
fn unlikely_find(haystack: &[u8], needle: [u8; 4]) -> Option<usize> {
47+
fn unlikely_find(haystack: &[u8], needle: [u8; 3]) -> Option<usize> {
4448
fallback::find_multi(haystack, needle)
4549
}
4650

@@ -54,7 +58,6 @@ pub fn find4(haystack: &[u8], needle: [u8; 4]) -> Option<usize> {
5458
let needle16a = u8x16::splat(needle[0]);
5559
let needle16b = u8x16::splat(needle[1]);
5660
let needle16c = u8x16::splat(needle[2]);
57-
let needle16d = u8x16::splat(needle[3]);
5861

5962
while i <= len - 16 {
6063
let mut bytes = [0; 16];
@@ -65,8 +68,7 @@ pub fn find4(haystack: &[u8], needle: [u8; 4]) -> Option<usize> {
6568
let eq1 = bytes.simd_eq(needle16a);
6669
let eq2 = bytes.simd_eq(needle16b);
6770
let eq3 = bytes.simd_eq(needle16c);
68-
let eq4 = bytes.simd_eq(needle16d);
69-
let or = (eq1 | eq2 | eq3 | eq4).to_int();
71+
let or = (eq1 | eq2 | eq3).to_int();
7072
let num = unsafe { std::mem::transmute::<i8x16, u128>(or) };
7173
if num != 0 {
7274
return Some(i + (num.trailing_zeros() >> 3) as usize);

src/tests.rs

Lines changed: 69 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -320,27 +320,25 @@ mod simd {
320320
}
321321

322322
#[test]
323-
fn string_search_4() {
324-
const NEEDLE: [u8; 4] = [b'a', b'b', b'c', b'd'];
325-
326-
assert_eq!(crate::simd::find4(b"e", NEEDLE), None);
327-
assert_eq!(crate::simd::find4(b"a", NEEDLE), Some(0));
328-
assert_eq!(crate::simd::find4(b"ea", NEEDLE), Some(1));
329-
assert_eq!(crate::simd::find4(b"ef", NEEDLE), None);
330-
assert_eq!(crate::simd::find4(b"ef a", NEEDLE), Some(3));
331-
assert_eq!(crate::simd::find4(b"ef g", NEEDLE), None);
332-
assert_eq!(crate::simd::find4(b"ef ghijk", NEEDLE), None);
333-
assert_eq!(crate::simd::find4(b"ef ghijkl", NEEDLE), None);
334-
assert_eq!(crate::simd::find4(b"ef ghijkla", NEEDLE), Some(9));
335-
assert_eq!(crate::simd::find4(b"ef ghiajklm", NEEDLE), Some(6));
336-
assert_eq!(crate::simd::find4(b"ef ghibjklm", NEEDLE), Some(6));
337-
assert_eq!(crate::simd::find4(b"ef ghicjklm", NEEDLE), Some(6));
338-
assert_eq!(crate::simd::find4(b"ef ghidjklm", NEEDLE), Some(6));
339-
assert_eq!(crate::simd::find4(b"ef ghijklmnopqrstua", NEEDLE), Some(18));
340-
assert_eq!(crate::simd::find4(b"ef ghijklmnopqrstub", NEEDLE), Some(18));
341-
assert_eq!(crate::simd::find4(b"ef ghijklmnopqrstuc", NEEDLE), Some(18));
342-
assert_eq!(crate::simd::find4(b"ef ghijklmnopqrstud", NEEDLE), Some(18));
343-
assert_eq!(crate::simd::find4(b"ef ghijklmnopqrstu", NEEDLE), None);
323+
fn string_search_3() {
324+
const NEEDLE: [u8; 3] = [b'a', b'b', b'c'];
325+
326+
assert_eq!(crate::simd::find3(b"e", NEEDLE), None);
327+
assert_eq!(crate::simd::find3(b"a", NEEDLE), Some(0));
328+
assert_eq!(crate::simd::find3(b"ea", NEEDLE), Some(1));
329+
assert_eq!(crate::simd::find3(b"ef", NEEDLE), None);
330+
assert_eq!(crate::simd::find3(b"ef a", NEEDLE), Some(3));
331+
assert_eq!(crate::simd::find3(b"ef g", NEEDLE), None);
332+
assert_eq!(crate::simd::find3(b"ef ghijk", NEEDLE), None);
333+
assert_eq!(crate::simd::find3(b"ef ghijkl", NEEDLE), None);
334+
assert_eq!(crate::simd::find3(b"ef ghijkla", NEEDLE), Some(9));
335+
assert_eq!(crate::simd::find3(b"ef ghiajklm", NEEDLE), Some(6));
336+
assert_eq!(crate::simd::find3(b"ef ghibjklm", NEEDLE), Some(6));
337+
assert_eq!(crate::simd::find3(b"ef ghicjklm", NEEDLE), Some(6));
338+
assert_eq!(crate::simd::find3(b"ef ghijklmnopqrstua", NEEDLE), Some(18));
339+
assert_eq!(crate::simd::find3(b"ef ghijklmnopqrstub", NEEDLE), Some(18));
340+
assert_eq!(crate::simd::find3(b"ef ghijklmnopqrstuc", NEEDLE), Some(18));
341+
assert_eq!(crate::simd::find3(b"ef ghijklmnopqrstu", NEEDLE), None);
344342
}
345343

346344
#[test]
@@ -510,6 +508,56 @@ fn unquoted() {
510508
);
511509
}
512510

511+
#[test]
512+
fn unquoted_href() {
513+
// https://github.com/y21/tl/issues/12
514+
let input = r#"
515+
<a id=u54423 href=https://www.google.com>Hello World</a>
516+
"#;
517+
518+
let dom = parse(input, ParserOptions::default()).unwrap();
519+
let parser = dom.parser();
520+
let element = dom.get_element_by_id("u54423");
521+
522+
assert_eq!(
523+
element.and_then(|x| x.get(parser).map(|x| x
524+
.as_tag()
525+
.unwrap()
526+
.attributes()
527+
.get("href")
528+
.flatten()
529+
.unwrap()
530+
.try_as_utf8_str()
531+
.unwrap()
532+
.to_string())),
533+
Some("https://www.google.com".into())
534+
);
535+
}
536+
537+
#[test]
538+
fn unquoted_self_closing() {
539+
// https://github.com/y21/tl/issues/12
540+
let input = r#"
541+
<a id=u54423 />
542+
"#;
543+
544+
let dom = parse(input, ParserOptions::default()).unwrap();
545+
let element = dom.get_element_by_id("u54423");
546+
547+
assert!(element.is_some());
548+
549+
// According to MDN, if there's no space between an unquoted attribute value and the closing `/>`,
550+
// the slash is treated as part of the attribute value.
551+
let input = r#"
552+
<a id=u54423/>
553+
"#;
554+
555+
let dom = parse(input, ParserOptions::default()).unwrap();
556+
let element = dom.get_element_by_id("u54423/");
557+
558+
assert!(element.is_some());
559+
}
560+
513561
mod query_selector {
514562
use super::*;
515563
#[test]

0 commit comments

Comments
 (0)