From 25d764b7c44987a5c3c336b100883d15ca68f75c Mon Sep 17 00:00:00 2001
From: Mattias Geniar <mattias@geniar.be>
Date: Fri, 26 Feb 2021 12:26:21 +0100
Subject: [PATCH 1/2] Bugfix: prevent infinite loops when a CrawlProfile
 prevents crawling of a particular URL, it should remove that URL from the
 queue

---
NOTE(review): this patch file was reconstructed from a whitespace/tag-stripped
copy. The author email and the stripped test-file preamble (the `<?php` tag and
`use` imports) are best-effort reconstructions -- verify against the original
patch before applying. Text in this section is ignored by `git am`.

 src/Crawler.php | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Crawler.php b/src/Crawler.php
index cdd714d..6ad67b6 100644
--- a/src/Crawler.php
+++ b/src/Crawler.php
@@ -493,6 +493,8 @@ protected function getCrawlRequests(): Generator
                 $this->crawlProfile->shouldCrawl($crawlUrl->url) === false ||
                 $this->crawlQueue->hasAlreadyBeenProcessed($crawlUrl)
             ) {
+                $this->crawlQueue->markAsProcessed($crawlUrl);
+
                 continue;
             }
 

From 3527cf93f30fa3a3af9ca57367c386d9a81c8917 Mon Sep 17 00:00:00 2001
From: Mattias Geniar <mattias@geniar.be>
Date: Thu, 23 Oct 2025 22:07:06 +0200
Subject: [PATCH 2/2] When fetching robots.txt, use the same User-Agent as
 defined by the user

---
 src/Crawler.php                      | 10 ++++++-
 tests/CrawlerRobotsUserAgentTest.php | 40 ++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 tests/CrawlerRobotsUserAgentTest.php

diff --git a/src/Crawler.php b/src/Crawler.php
index b68c33a..c4dd6e3 100644
--- a/src/Crawler.php
+++ b/src/Crawler.php
@@ -566,7 +566,15 @@ protected function startCrawlingQueue(): void
 
     protected function createRobotsTxt(UriInterface $uri): RobotsTxt
     {
-        return RobotsTxt::create($uri->withPath('/robots.txt'));
+        try {
+            $robotsUrl = (string) $uri->withPath('/robots.txt');
+            $response = $this->client->get($robotsUrl);
+            $content = (string) $response->getBody();
+
+            return new RobotsTxt($content);
+        } catch (\Exception $exception) {
+            return new RobotsTxt('');
+        }
     }
 
     protected function getCrawlRequests(): Generator
diff --git a/tests/CrawlerRobotsUserAgentTest.php b/tests/CrawlerRobotsUserAgentTest.php
new file mode 100644
index 0000000..36e4161
--- /dev/null
+++ b/tests/CrawlerRobotsUserAgentTest.php
@@ -0,0 +1,40 @@
+<?php
+
+use GuzzleHttp\Client;
+use GuzzleHttp\HandlerStack;
+use GuzzleHttp\Middleware;
+use GuzzleHttp\Psr7\Response;
+use Spatie\Crawler\Crawler;
+
+beforeEach(function () {
+    $this->mockHandler = new \GuzzleHttp\Handler\MockHandler([
+        new Response(200, [], "User-agent: *\nDisallow: /admin"),
+        new Response(200, [], 'Home'),
+    ]);
+
+    $this->crawledUrls = [];
+    $this->history = Middleware::history($this->crawledUrls);
+
+    $this->handlerStack = HandlerStack::create($this->mockHandler);
+    $this->handlerStack->push($this->history);
+});
+
+it('should send the correct user agent header when fetching robots.txt', function () {
+    $client = new Client(['handler' => $this->handlerStack]);
+    $crawler = new Crawler($client);
+    $crawler->respectRobots()->startCrawling('http://example.com');
+
+    expect($this->crawledUrls)->toHaveCount(2);
+    expect((string) $this->crawledUrls[0]['request']->getUri())->toBe('http://example.com/robots.txt');
+    expect($this->crawledUrls[0]['request']->getHeader('User-Agent'))->toBe(['GuzzleHttp/7']);
+});
+
+it('should send the custom user agent header when fetching robots.txt', function () {
+    $client = new Client(['handler' => $this->handlerStack]);
+    $crawler = new Crawler($client);
+    $crawler->respectRobots()->setUserAgent('CustomBot/2.0')->startCrawling('http://example.com');
+
+    expect($this->crawledUrls)->toHaveCount(2);
+    expect((string) $this->crawledUrls[0]['request']->getUri())->toBe('http://example.com/robots.txt');
+    expect($this->crawledUrls[0]['request']->getHeader('User-Agent'))->toBe(['CustomBot/2.0']);
+});