Skip to content

Commit 3042990

Browse files
authored
Merge pull request #106 from jtojnar/encode
Fix character decoding regression when `title` precedes `meta[charset]`
2 parents 7413a38 + 8b89d70 commit 3042990

File tree

2 files changed

+27
-35
lines changed

2 files changed

+27
-35
lines changed

src/Readability.php

Lines changed: 7 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1422,7 +1422,7 @@ private function loadHtml(): void
14221422
unset($tidy);
14231423
}
14241424

1425-
$this->html = self::ensureMetaCharset((string) $this->html);
1425+
$this->html = self::entitizeNonAscii((string) $this->html);
14261426

14271427
if ('html5lib' === $this->parser || 'html5' === $this->parser) {
14281428
$this->dom = (new HTML5())->loadHTML($this->html);
@@ -1512,43 +1512,19 @@ private function isNodeVisible(\DOMElement $node): bool
15121512
}
15131513

15141514
/**
1515-
* Tries to insert `meta[charset]` tag into the proper place in the passed HTML document.
1515+
* Converts non-ASCII UTF-8 characters to numeric HTML entities, so that the markup is pure ASCII and decodes identically whatever charset the parser assumes.
15161516
*
15171517
* `DOMDocument::loadHTML` will parse HTML documents as ISO-8859-1 if there is no `meta[charset]` tag.
15181518
* This means that UTF-8-encoded HTML fragments such as those coming from JSON-LD `articleBody` field would be parsed with incorrect encoding.
1519-
* Unfortunately, we cannot just put the tag at the start of the HTML fragment, since that would cause parser to auto-insert a `html` element, losing the attributes of the original `html` tag.
15201519
*
15211520
* @param string $html UTF-8 encoded document
15221521
*/
1523-
private static function ensureMetaCharset(string $html): string
1522+
private static function entitizeNonAscii(string $html): string
15241523
{
1525-
$charsetTag = '<meta charset="utf-8">';
1524+
$convmap = [
1525+
0x80, 0x10FFFF, 0, 0x1FFFFF,
1526+
];
15261527

1527-
// Only look at first 1024 bytes since, according to HTML5 specification,
1528-
// that’s where <meta> elements declaring a character encoding must be located.
1529-
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta#charset
1530-
$start = substr($html, 0, 1000);
1531-
1532-
if (1 === preg_match('/<meta[^>]+charset/i', $start)) {
1533-
// <meta> tag is already present, no need for modification.
1534-
return $html;
1535-
}
1536-
1537-
if (1 === preg_match('/<head[^>]*>/i', $start)) {
1538-
// <head> tag was located, <meta> tags go there.
1539-
$html = preg_replace('/<head[^>]*>/i', '$0' . $charsetTag, $html, 1);
1540-
1541-
return $html;
1542-
}
1543-
1544-
if (1 === preg_match('/<html[^>]*>/i', $start)) {
1545-
// <html> tag was located, let’s put it inside and have parser create <head>.
1546-
$html = preg_replace('/<html[^>]*>/i', '$0' . $charsetTag, $html, 1);
1547-
1548-
return $html;
1549-
}
1550-
1551-
// Fallback – just plop the <meta> at the start of the fragment.
1552-
return $charsetTag . $html;
1528+
return mb_encode_numericentity($html, $convmap, 'utf8', true);
15531529
}
15541530
}

tests/ReadabilityTest.php

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -529,28 +529,42 @@ public function testVisibleNode(string $content, bool $shouldBeVisible): void
529529
}
530530
}
531531

532+
// https://github.com/wallabag/wallabag/issues/8158
533+
public function testCharsetAfterTitle(): void
534+
{
535+
$readability = $this->getReadability('<!DOCTYPE html><html lang="et"><head><title>Tõde ja õigus I</title> <meta charset="utf-8"></head><body><p>See oli läinud aastasaja kolmanda veerandi lõpul. Päike lähenes silmapiirile, seistes sedavõrd madalas, et enam ei ulatunud valgustama ei mäkke ronivat hobust, kes puutelgedega vankrit vedas, ei vankril istuvat noort naist ega ka ligi kolmekümnelist meest, kes kõndis vankri kõrval.</p></body></html>', 'https://et.wikisource.org/wiki/T%C3%B5de_ja_%C3%B5igus_I/I');
536+
$readability->convertLinksToFootnotes = true;
537+
$res = $readability->init();
538+
539+
$this->assertTrue($res);
540+
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
541+
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getTitle());
542+
$this->assertSame('Tõde ja õigus I', $readability->getTitle()->getInnerHtml());
543+
$this->assertStringContainsString('Päike lähenes', $readability->getContent()->getInnerHtml());
544+
}
545+
532546
/**
533547
* @return array<string, array{0: string, 1: string, 2?: bool}>
534548
*/
535549
public function dataForHtmlLang(): array
536550
{
537551
return [
538552
'meta' => [
539-
'<html lang="fr"><head><meta charset="utf-8"></head><body><article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article></body></html>',
553+
'<html lang="fr"><head><meta charset="utf-8"></head><body><article>' . str_repeat('<p>Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.</p>', 7) . '</article></body></html>',
540554
'fr',
541555
],
542556
'head' => [
543-
'<html lang="fr"><head><title>Foo</title></head><body><article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article></body></html>',
557+
'<html lang="fr"><head><title>Foo</title></head><body><article>' . str_repeat('<p>Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.</p>', 7) . '</article></body></html>',
544558
'fr',
545559
],
546560
'headless' => [
547-
'<html lang="fr"><body><article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article></body></html>',
561+
'<html lang="fr"><body><article>' . str_repeat('<p>Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.</p>', 7) . '</article></body></html>',
548562
'fr',
549563
// tidy would add <head> tag.
550564
false,
551565
],
552566
'fragment' => [
553-
'<article>' . str_repeat('<p>This is the awesome content :)</p>', 7) . '</article>',
567+
'<article>' . str_repeat('<p>Tous les êtres humains naissent libres et égaux en dignité et en droits. Ils sont doués de raison et de conscience et doivent agir les uns envers les autres dans un esprit de fraternité.</p>', 7) . '</article>',
554568
'',
555569
// tidy would add <html>.
556570
false,
@@ -569,6 +583,8 @@ public function testHtmlLang(string $html, string $lang, bool $useTidy = true):
569583
$this->assertTrue($res);
570584
$this->assertInstanceOf(\DOMDocument::class, $readability->dom);
571585
$this->assertSame($lang, $readability->dom->documentElement->getAttribute('lang'));
586+
$this->assertInstanceOf('Readability\JSLikeHTMLElement', $readability->getContent());
587+
$this->assertStringContainsString('êtres', $readability->getContent()->getInnerHtml());
572588
}
573589

574590
private function getReadability(string $html, ?string $url = null, string $parser = 'libxml', bool $useTidy = true): Readability

0 commit comments

Comments
 (0)