adobe
diff --git a/‎packages/spacecat-shared-utils/src/bot-blocker-detect/bot-blocker-detect.js‎
Lines changed: 129 additions & 1 deletion b/‎packages/spacecat-shared-utils/src/bot-blocker-detect/bot-blocker-detect.js‎
Lines changed: 129 additions & 1 deletion
diff --git a/‎packages/spacecat-shared-utils/src/index.js‎
Lines changed: 6 additions & 1 deletion b/‎packages/spacecat-shared-utils/src/index.js‎
Lines changed: 6 additions & 1 deletion
@@ -30,7 +30,70 @@ const CONFIDENCE_MEDIUM = 0.95;
 const CONFIDENCE_ABSOLUTE = 1.0;
 const DEFAULT_TIMEOUT = 5000;
 
-function analyzeResponse(response) {
+/**
+ * SpaceCat bot identification constants
+ */
+export const SPACECAT_BOT_USER_AGENT = 'Spacecat/1.0';
+
+/**
+ * SpaceCat bot IPs by environment
+ */
+export const SPACECAT_BOT_IPS = {
+  production: [
+    '3.218.16.42',
+    '52.55.82.37',
+    '54.172.145.38',
+  ],
+  development: [
+    '44.218.57.115',
+    '54.87.205.187',
+  ],
+};
+
+/**
+ * HTML patterns for detecting challenge pages
+ */
+const CHALLENGE_PATTERNS = {
+  cloudflare: [
+    /Checking your browser/i,
+    /Just a moment\.\.\./i,
+    /Verifying you are human/i,
+    /Please wait.*CloudFlare/i,
+    /cf-turnstile/i,
+    /challenge-platform/i,
+    /cf-chl-widget/i, // Cloudflare challenge widget
+    /ray\s*id.*cloudflare/i, // Cloudflare Ray ID in error pages
+    /__cf_chl_tk/i, // Cloudflare challenge token
+    /cloudflare.*security/i,
+    /attention required.*cloudflare/i,
+  ],
+  imperva: [
+    /_Incapsula_Resource/i,
+    /Incapsula incident ID/i,
+    /incap_ses/i, // Imperva session cookie
+    /visid_incap/i, // Imperva visitor ID
+  ],
+  akamai: [
+    /Access Denied.*Akamai/i,
+    /Reference.*Akamai/i,
+  ],
+  general: [
+    /captcha/i,
+    /human verification/i,
+    /recaptcha/i,
+    /hcaptcha/i,
+    /datadome/i,
+    /dd-request-id/i,
+  ],
+};
+
+/**
+ * Analyzes response for bot protection indicators
+ * @param {Object} response - Response object with status and headers
+ * @param {string} [html] - Optional HTML content for deeper analysis
+ * @returns {Object} Detection result
+ */
+function analyzeResponse(response, html = null) {
   const { status, headers } = response;
 
   // Check for CDN/blocker infrastructure presence (lazy evaluation for performance)
@@ -45,6 +108,12 @@ function analyzeResponse(response) {
     || headers.get('x-amz-cf-pop')
     || headers.get('via')?.includes('CloudFront');
 
+  // Check HTML content for challenge page patterns (if HTML provided)
+  const htmlHasChallenge = (patterns) => {
+    if (!html) return false;
+    return patterns.some((pattern) => pattern.test(html));
+  };
+
   // Active blocking (403 status with known blocker)
   if (status === 403 && hasCloudflare()) {
     return {
@@ -88,6 +157,16 @@ function analyzeResponse(response) {
 
   // Success with known infrastructure present (infrastructure detected but allowing requests)
   if (status === 200 && hasCloudflare()) {
+    // Check if HTML contains challenge page (even though status is 200)
+    if (htmlHasChallenge(CHALLENGE_PATTERNS.cloudflare)) {
+      return {
+        crawlable: false,
+        type: 'cloudflare',
+        confidence: CONFIDENCE_HIGH,
+        reason: 'Challenge page detected despite 200 status',
+      };
+    }
+
     return {
       crawlable: true,
       type: 'cloudflare-allowed',
@@ -96,6 +175,14 @@ function analyzeResponse(response) {
   }
 
   if (status === 200 && hasImperva()) {
+    if (htmlHasChallenge(CHALLENGE_PATTERNS.imperva)) {
+      return {
+        crawlable: false,
+        type: 'imperva',
+        confidence: CONFIDENCE_HIGH,
+        reason: 'Challenge page detected despite 200 status',
+      };
+    }
     return {
       crawlable: true,
       type: 'imperva-allowed',
@@ -104,6 +191,14 @@ function analyzeResponse(response) {
   }
 
   if (status === 200 && hasAkamai()) {
+    if (htmlHasChallenge(CHALLENGE_PATTERNS.akamai)) {
+      return {
+        crawlable: false,
+        type: 'akamai',
+        confidence: CONFIDENCE_HIGH,
+        reason: 'Challenge page detected despite 200 status',
+      };
+    }
     return {
       crawlable: true,
       type: 'akamai-allowed',
@@ -129,6 +224,15 @@ function analyzeResponse(response) {
 
   // Success with no known infrastructure
   if (status === 200) {
+    // Still check for generic challenge patterns
+    if (htmlHasChallenge(CHALLENGE_PATTERNS.general)) {
+      return {
+        crawlable: false,
+        type: 'unknown',
+        confidence: 0.7,
+        reason: 'Generic challenge patterns detected',
+      };
+    }
     return {
       crawlable: true,
       type: 'none',
@@ -207,3 +311,27 @@ export async function detectBotBlocker({ baseUrl, timeout = DEFAULT_TIMEOUT }) {
     return analyzeError(error);
   }
 }
+
+/**
+ * Analyzes already-fetched response data for bot protection.
+ * Used by content scraper to analyze Puppeteer results without making another request.
+ *
+ * @param {Object} data - Response data to analyze
+ * @param {number} data.status - HTTP status code
+ * @param {Object} data.headers - Response headers (plain object or Headers object)
+ * @param {string} [data.html] - Optional HTML content for challenge page detection
+ * @returns {Object} Detection result (same format as detectBotBlocker)
+ */
+export function analyzeBotProtection({ status, headers, html }) {
+  // Convert headers to Headers object if plain object
+  const headersObj = headers instanceof Headers
+    ? headers
+    : new Headers(Object.entries(headers || {}));
+
+  const mockResponse = {
+    status,
+    headers: headersObj,
+  };
+
+  return analyzeResponse(mockResponse, html);
+}
@@ -110,7 +110,12 @@ export * as llmoConfig from './llmo-config.js';
 export * as schemas from './schemas.js';
 
 export { detectLocale } from './locale-detect/locale-detect.js';
-export { detectBotBlocker } from './bot-blocker-detect/bot-blocker-detect.js';
+export {
+  detectBotBlocker,
+  analyzeBotProtection,
+  SPACECAT_BOT_USER_AGENT,
+  SPACECAT_BOT_IPS,
+} from './bot-blocker-detect/bot-blocker-detect.js';
 export { prettifyLogForwardingConfig } from './cdn-helpers.js';
 
 export {