From 25d764b7c44987a5c3c336b100883d15ca68f75c Mon Sep 17 00:00:00 2001
From: Mattias Geniar <mattias@geniar.be>
Date: Fri, 26 Feb 2021 12:26:21 +0100
Subject: [PATCH 1/2] Bugfix: prevent infinite loops when a CrawlProfile
 prevents crawling of a particular URL, it should remove that URL from the
 queue

---
NOTE(review): this patch file was reconstructed from a whitespace/tag-stripped
copy. The author email and the stripped test-file preamble (the `<?php` tag and
`use` imports) are best-effort reconstructions -- verify against the original
patch before applying. Text in this section is ignored by `git am`.

 src/Crawler.php | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Crawler.php b/src/Crawler.php
index cdd714d..6ad67b6 100644
--- a/src/Crawler.php
+++ b/src/Crawler.php
@@ -493,6 +493,8 @@ protected function getCrawlRequests(): Generator
                 $this->crawlProfile->shouldCrawl($crawlUrl->url) === false ||
                 $this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)
             ) {
+                $this->crawlQueue->markAsProcessed($crawlUrl);
+
                 continue;
             }
 

From 3527cf93f30fa3a3af9ca57367c386d9a81c8917 Mon Sep 17 00:00:00 2001
From: Mattias Geniar <mattias@geniar.be>
Date: Thu, 23 Oct 2025 22:07:06 +0200
Subject: [PATCH 2/2] When fetching robots.txt, use the same User-Agent as
 defined by the user

---
 src/Crawler.php                      | 10 ++++++-
 tests/CrawlerRobotsUserAgentTest.php | 40 ++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 tests/CrawlerRobotsUserAgentTest.php

diff --git a/src/Crawler.php b/src/Crawler.php
index b68c33a..c4dd6e3 100644
--- a/src/Crawler.php
+++ b/src/Crawler.php
@@ -566,7 +566,15 @@ protected function startCrawlingQueue(): void
 
     protected function createRobotsTxt(UriInterface $uri): RobotsTxt
     {
-        return RobotsTxt::create($uri->withPath('/robots.txt'));
+        try {
+            $robotsUrl = (string) $uri->withPath('/robots.txt');
+            $response = $this->client->get($robotsUrl);
+            $content = (string) $response->getBody();
+
+            return new RobotsTxt($content);
+        } catch (\Exception $exception) {
+            return new RobotsTxt('');
+        }
     }
 
     protected function getCrawlRequests(): Generator
diff --git a/tests/CrawlerRobotsUserAgentTest.php b/tests/CrawlerRobotsUserAgentTest.php
new file mode 100644
index 0000000..36e4161
--- /dev/null
+++ b/tests/CrawlerRobotsUserAgentTest.php
@@ -0,0 +1,40 @@
+<?php
+
+use GuzzleHttp\Client;
+use GuzzleHttp\HandlerStack;
+use GuzzleHttp\Middleware;
+use GuzzleHttp\Psr7\Response;
+use Spatie\Crawler\Crawler;
+
+beforeEach(function () {
+    $this->mockHandler = new \GuzzleHttp\Handler\MockHandler([
+        new Response(200, [], "User-agent: *\nDisallow: /admin"),
+        new Response(200, [], 'Home'),
+    ]);
+
+    $this->crawledUrls = [];
+    $this->history = Middleware::history($this->crawledUrls);
+
+    $this->handlerStack = HandlerStack::create($this->mockHandler);
+    $this->handlerStack->push($this->history);
+});
+
+it('should send the correct user agent header when fetching robots.txt', function () {
+    $client = new Client(['handler' => $this->handlerStack]);
+    $crawler = new Crawler($client);
+    $crawler->respectRobots()->startCrawling('http://example.com');
+
+    expect($this->crawledUrls)->toHaveCount(2);
+    expect((string) $this->crawledUrls[0]['request']->getUri())->toBe('http://example.com/robots.txt');
+    expect($this->crawledUrls[0]['request']->getHeader('User-Agent'))->toBe(['GuzzleHttp/7']);
+});
+
+it('should send the custom user agent header when fetching robots.txt', function () {
+    $client = new Client(['handler' => $this->handlerStack]);
+    $crawler = new Crawler($client);
+    $crawler->respectRobots()->setUserAgent('CustomBot/2.0')->startCrawling('http://example.com');
+
+    expect($this->crawledUrls)->toHaveCount(2);
+    expect((string) $this->crawledUrls[0]['request']->getUri())->toBe('http://example.com/robots.txt');
+    expect($this->crawledUrls[0]['request']->getHeader('User-Agent'))->toBe(['CustomBot/2.0']);
+});