Skip to content

Commit 8d580a6

Browse files
Additional checks and methods on bot protection
1 parent 5229dd9 commit 8d580a6

File tree

4 files changed

+395
-3
lines changed

4 files changed

+395
-3
lines changed

packages/spacecat-shared-utils/src/bot-blocker-detect/bot-blocker-detect.js

Lines changed: 129 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,70 @@ const CONFIDENCE_MEDIUM = 0.95;
3030
const CONFIDENCE_ABSOLUTE = 1.0;
3131
const DEFAULT_TIMEOUT = 5000;
3232

33-
function analyzeResponse(response) {
33+
/**
 * User-Agent string the SpaceCat bot identifies itself with.
 */
export const SPACECAT_BOT_USER_AGENT = 'Spacecat/1.0';

/**
 * SpaceCat bot IP addresses, keyed by environment name
 * (`production`, `development`).
 *
 * The object and each address list are frozen: this constant is exported
 * and shared by every importer, so an accidental `push`/`sort`/assignment
 * in one consumer must not be able to change the list seen by the others.
 * Copy a list (`[...SPACECAT_BOT_IPS.production]`) before modifying it.
 */
export const SPACECAT_BOT_IPS = Object.freeze({
  production: Object.freeze([
    '3.218.16.42',
    '52.55.82.37',
    '54.172.145.38',
  ]),
  development: Object.freeze([
    '44.218.57.115',
    '54.87.205.187',
  ]),
});
52+
53+
/**
54+
* HTML patterns for detecting challenge pages
55+
*/
56+
const CHALLENGE_PATTERNS = {
57+
cloudflare: [
58+
/Checking your browser/i,
59+
/Just a moment\.\.\./i,
60+
/Verifying you are human/i,
61+
/Please wait.*CloudFlare/i,
62+
/cf-turnstile/i,
63+
/challenge-platform/i,
64+
/cf-chl-widget/i, // Cloudflare challenge widget
65+
/ray\s*id.*cloudflare/i, // Cloudflare Ray ID in error pages
66+
/__cf_chl_tk/i, // Cloudflare challenge token
67+
/cloudflare.*security/i,
68+
/attention required.*cloudflare/i,
69+
],
70+
imperva: [
71+
/_Incapsula_Resource/i,
72+
/Incapsula incident ID/i,
73+
/incap_ses/i, // Imperva session cookie
74+
/visid_incap/i, // Imperva visitor ID
75+
],
76+
akamai: [
77+
/Access Denied.*Akamai/i,
78+
/Reference.*Akamai/i,
79+
],
80+
general: [
81+
/captcha/i,
82+
/human verification/i,
83+
/recaptcha/i,
84+
/hcaptcha/i,
85+
/datadome/i,
86+
/dd-request-id/i,
87+
],
88+
};
89+
90+
/**
91+
* Analyzes response for bot protection indicators
92+
* @param {Object} response - Response object with status and headers
93+
* @param {string} [html] - Optional HTML content for deeper analysis
94+
* @returns {Object} Detection result
95+
*/
96+
function analyzeResponse(response, html = null) {
3497
const { status, headers } = response;
3598

3699
// Check for CDN/blocker infrastructure presence (lazy evaluation for performance)
@@ -45,6 +108,12 @@ function analyzeResponse(response) {
45108
|| headers.get('x-amz-cf-pop')
46109
|| headers.get('via')?.includes('CloudFront');
47110

111+
// Check HTML content for challenge page patterns (if HTML provided)
112+
const htmlHasChallenge = (patterns) => {
113+
if (!html) return false;
114+
return patterns.some((pattern) => pattern.test(html));
115+
};
116+
48117
// Active blocking (403 status with known blocker)
49118
if (status === 403 && hasCloudflare()) {
50119
return {
@@ -88,6 +157,16 @@ function analyzeResponse(response) {
88157

89158
// Success with known infrastructure present (infrastructure detected but allowing requests)
90159
if (status === 200 && hasCloudflare()) {
160+
// Check if HTML contains challenge page (even though status is 200)
161+
if (htmlHasChallenge(CHALLENGE_PATTERNS.cloudflare)) {
162+
return {
163+
crawlable: false,
164+
type: 'cloudflare',
165+
confidence: CONFIDENCE_HIGH,
166+
reason: 'Challenge page detected despite 200 status',
167+
};
168+
}
169+
91170
return {
92171
crawlable: true,
93172
type: 'cloudflare-allowed',
@@ -96,6 +175,14 @@ function analyzeResponse(response) {
96175
}
97176

98177
if (status === 200 && hasImperva()) {
178+
if (htmlHasChallenge(CHALLENGE_PATTERNS.imperva)) {
179+
return {
180+
crawlable: false,
181+
type: 'imperva',
182+
confidence: CONFIDENCE_HIGH,
183+
reason: 'Challenge page detected despite 200 status',
184+
};
185+
}
99186
return {
100187
crawlable: true,
101188
type: 'imperva-allowed',
@@ -104,6 +191,14 @@ function analyzeResponse(response) {
104191
}
105192

106193
if (status === 200 && hasAkamai()) {
194+
if (htmlHasChallenge(CHALLENGE_PATTERNS.akamai)) {
195+
return {
196+
crawlable: false,
197+
type: 'akamai',
198+
confidence: CONFIDENCE_HIGH,
199+
reason: 'Challenge page detected despite 200 status',
200+
};
201+
}
107202
return {
108203
crawlable: true,
109204
type: 'akamai-allowed',
@@ -129,6 +224,15 @@ function analyzeResponse(response) {
129224

130225
// Success with no known infrastructure
131226
if (status === 200) {
227+
// Still check for generic challenge patterns
228+
if (htmlHasChallenge(CHALLENGE_PATTERNS.general)) {
229+
return {
230+
crawlable: false,
231+
type: 'unknown',
232+
confidence: 0.7,
233+
reason: 'Generic challenge patterns detected',
234+
};
235+
}
132236
return {
133237
crawlable: true,
134238
type: 'none',
@@ -207,3 +311,27 @@ export async function detectBotBlocker({ baseUrl, timeout = DEFAULT_TIMEOUT }) {
207311
return analyzeError(error);
208312
}
209313
}
314+
315+
/**
316+
* Analyzes already-fetched response data for bot protection.
317+
* Used by content scraper to analyze Puppeteer results without making another request.
318+
*
319+
* @param {Object} data - Response data to analyze
320+
* @param {number} data.status - HTTP status code
321+
* @param {Object} data.headers - Response headers (plain object or Headers object)
322+
* @param {string} [data.html] - Optional HTML content for challenge page detection
323+
* @returns {Object} Detection result (same format as detectBotBlocker)
324+
*/
325+
export function analyzeBotProtection({ status, headers, html }) {
326+
// Convert headers to Headers object if plain object
327+
const headersObj = headers instanceof Headers
328+
? headers
329+
: new Headers(Object.entries(headers || {}));
330+
331+
const mockResponse = {
332+
status,
333+
headers: headersObj,
334+
};
335+
336+
return analyzeResponse(mockResponse, html);
337+
}

packages/spacecat-shared-utils/src/index.js

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,12 @@ export * as llmoConfig from './llmo-config.js';
110110
export * as schemas from './schemas.js';
111111

112112
export { detectLocale } from './locale-detect/locale-detect.js';
113-
export { detectBotBlocker } from './bot-blocker-detect/bot-blocker-detect.js';
113+
export {
114+
detectBotBlocker,
115+
analyzeBotProtection,
116+
SPACECAT_BOT_USER_AGENT,
117+
SPACECAT_BOT_IPS,
118+
} from './bot-blocker-detect/bot-blocker-detect.js';
114119
export { prettifyLogForwardingConfig } from './cdn-helpers.js';
115120

116121
export {

0 commit comments

Comments
 (0)