diff --git a/src/Crawler.php b/src/Crawler.php
index b68c33a..c4dd6e3 100644
--- a/src/Crawler.php
+++ b/src/Crawler.php
@@ -566,7 +566,15 @@ protected function startCrawlingQueue(): void
 
     protected function createRobotsTxt(UriInterface $uri): RobotsTxt
     {
-        return RobotsTxt::create($uri->withPath('/robots.txt'));
+        try {
+            $robotsUrl = (string) $uri->withPath('/robots.txt');
+            $response = $this->client->get($robotsUrl);
+            $content = (string) $response->getBody();
+
+            return new RobotsTxt($content);
+        } catch (\Exception $exception) {
+            return new RobotsTxt('');
+        }
     }
 
     protected function getCrawlRequests(): Generator
diff --git a/tests/CrawlerRobotsUserAgentTest.php b/tests/CrawlerRobotsUserAgentTest.php
new file mode 100644
index 0000000..36e4161
--- /dev/null
+++ b/tests/CrawlerRobotsUserAgentTest.php
@@ -0,0 +1,40 @@
+<?php
+
+use GuzzleHttp\Client;
+use GuzzleHttp\HandlerStack;
+use GuzzleHttp\Middleware;
+use GuzzleHttp\Psr7\Response;
+use Spatie\Crawler\Crawler;
+
+beforeEach(function () {
+    $this->mockHandler = new \GuzzleHttp\Handler\MockHandler([
+        new Response(200, [], "User-agent: *\nDisallow: /admin"),
+        new Response(200, [], 'Home'),
+    ]);
+
+    $this->crawledUrls = [];
+    $this->history = Middleware::history($this->crawledUrls);
+
+    $this->handlerStack = HandlerStack::create($this->mockHandler);
+    $this->handlerStack->push($this->history);
+});
+
+it('should send the correct user agent header when fetching robots.txt', function () {
+    $client = new Client(['handler' => $this->handlerStack]);
+    $crawler = new Crawler($client);
+    $crawler->respectRobots()->startCrawling('http://example.com');
+
+    expect($this->crawledUrls)->toHaveCount(2);
+    expect((string) $this->crawledUrls[0]['request']->getUri())->toBe('http://example.com/robots.txt');
+    expect($this->crawledUrls[0]['request']->getHeader('User-Agent'))->toBe(['GuzzleHttp/7']);
+});
+
+it('should send the custom user agent header when fetching robots.txt', function () {
+    $client = new Client(['handler' => $this->handlerStack]);
+    $crawler = new Crawler($client);
+    $crawler->respectRobots()->setUserAgent('CustomBot/2.0')->startCrawling('http://example.com');
+
+    expect($this->crawledUrls)->toHaveCount(2);
+    expect((string) $this->crawledUrls[0]['request']->getUri())->toBe('http://example.com/robots.txt');
+    expect($this->crawledUrls[0]['request']->getHeader('User-Agent'))->toBe(['CustomBot/2.0']);
+});