@@ -30,7 +30,70 @@ const CONFIDENCE_MEDIUM = 0.95;
3030const CONFIDENCE_ABSOLUTE = 1.0 ;
3131const DEFAULT_TIMEOUT = 5000 ;
3232
33- function analyzeResponse ( response ) {
/**
 * SpaceCat bot identification: the product token the bot identifies itself with.
 * NOTE(review): presumably sent as the `User-Agent` request header — confirm against callers.
 */
export const SPACECAT_BOT_USER_AGENT = 'Spacecat/1.0';
37+
/**
 * SpaceCat bot IP addresses, grouped by environment.
 *
 * The object and both lists are frozen: this is shared, exported configuration,
 * and an importer mutating it would silently change the list for every other
 * consumer in the process. Copy (`[...SPACECAT_BOT_IPS.production]`) if a
 * modified list is needed.
 *
 * @type {Readonly<{production: ReadonlyArray<string>, development: ReadonlyArray<string>}>}
 */
export const SPACECAT_BOT_IPS = Object.freeze({
  production: Object.freeze([
    '3.218.16.42',
    '52.55.82.37',
    '54.172.145.38',
  ]),
  development: Object.freeze([
    '44.218.57.115',
    '54.87.205.187',
  ]),
});
52+
/**
 * HTML patterns for detecting challenge pages, grouped by protection vendor.
 *
 * Each entry is a case-insensitive regular expression tested against the raw
 * HTML of a response. None of the patterns use the `g` or `y` flag, so
 * `RegExp.prototype.test` keeps no state between calls and the literals can be
 * shared safely.
 *
 * Note: `.` does not match line terminators, so the two-part patterns
 * (`foo.*bar`) only match when both parts appear on the same line of the HTML.
 */
const CHALLENGE_PATTERNS = {
  cloudflare: [
    /Checking your browser/i,
    /Just a moment\.\.\./i,
    /Verifying you are human/i,
    /Please wait.*CloudFlare/i,
    /cf-turnstile/i,
    /challenge-platform/i,
    /cf-chl-widget/i, // Cloudflare challenge widget
    /ray\s*id.*cloudflare/i, // Cloudflare Ray ID in error pages
    /__cf_chl_tk/i, // Cloudflare challenge token
    /cloudflare.*security/i,
    /attention required.*cloudflare/i,
  ],
  imperva: [
    /_Incapsula_Resource/i,
    /Incapsula incident ID/i,
    /incap_ses/i, // Imperva session cookie
    /visid_incap/i, // Imperva visitor ID
  ],
  akamai: [
    /Access Denied.*Akamai/i,
    /Reference.*Akamai/i,
  ],
  general: [
    /captcha/i, // also matches the more specific recaptcha/hcaptcha entries below
    /human verification/i,
    /recaptcha/i,
    /hcaptcha/i,
    /datadome/i,
    /dd-request-id/i,
  ],
};
89+
90+ /**
91+ * Analyzes response for bot protection indicators
92+ * @param {Object } response - Response object with status and headers
93+ * @param {string } [html] - Optional HTML content for deeper analysis
94+ * @returns {Object } Detection result
95+ */
96+ function analyzeResponse ( response , html = null ) {
3497 const { status, headers } = response ;
3598
3699 // Check for CDN/blocker infrastructure presence (lazy evaluation for performance)
@@ -45,6 +108,12 @@ function analyzeResponse(response) {
45108 || headers . get ( 'x-amz-cf-pop' )
46109 || headers . get ( 'via' ) ?. includes ( 'CloudFront' ) ;
47110
111+ // Check HTML content for challenge page patterns (if HTML provided)
112+ const htmlHasChallenge = ( patterns ) => {
113+ if ( ! html ) return false ;
114+ return patterns . some ( ( pattern ) => pattern . test ( html ) ) ;
115+ } ;
116+
48117 // Active blocking (403 status with known blocker)
49118 if ( status === 403 && hasCloudflare ( ) ) {
50119 return {
@@ -88,6 +157,16 @@ function analyzeResponse(response) {
88157
89158 // Success with known infrastructure present (infrastructure detected but allowing requests)
90159 if ( status === 200 && hasCloudflare ( ) ) {
160+ // Check if HTML contains challenge page (even though status is 200)
161+ if ( htmlHasChallenge ( CHALLENGE_PATTERNS . cloudflare ) ) {
162+ return {
163+ crawlable : false ,
164+ type : 'cloudflare' ,
165+ confidence : CONFIDENCE_HIGH ,
166+ reason : 'Challenge page detected despite 200 status' ,
167+ } ;
168+ }
169+
91170 return {
92171 crawlable : true ,
93172 type : 'cloudflare-allowed' ,
@@ -96,6 +175,14 @@ function analyzeResponse(response) {
96175 }
97176
98177 if ( status === 200 && hasImperva ( ) ) {
178+ if ( htmlHasChallenge ( CHALLENGE_PATTERNS . imperva ) ) {
179+ return {
180+ crawlable : false ,
181+ type : 'imperva' ,
182+ confidence : CONFIDENCE_HIGH ,
183+ reason : 'Challenge page detected despite 200 status' ,
184+ } ;
185+ }
99186 return {
100187 crawlable : true ,
101188 type : 'imperva-allowed' ,
@@ -104,6 +191,14 @@ function analyzeResponse(response) {
104191 }
105192
106193 if ( status === 200 && hasAkamai ( ) ) {
194+ if ( htmlHasChallenge ( CHALLENGE_PATTERNS . akamai ) ) {
195+ return {
196+ crawlable : false ,
197+ type : 'akamai' ,
198+ confidence : CONFIDENCE_HIGH ,
199+ reason : 'Challenge page detected despite 200 status' ,
200+ } ;
201+ }
107202 return {
108203 crawlable : true ,
109204 type : 'akamai-allowed' ,
@@ -129,6 +224,15 @@ function analyzeResponse(response) {
129224
130225 // Success with no known infrastructure
131226 if ( status === 200 ) {
227+ // Still check for generic challenge patterns
228+ if ( htmlHasChallenge ( CHALLENGE_PATTERNS . general ) ) {
229+ return {
230+ crawlable : false ,
231+ type : 'unknown' ,
232+ confidence : 0.7 ,
233+ reason : 'Generic challenge patterns detected' ,
234+ } ;
235+ }
132236 return {
133237 crawlable : true ,
134238 type : 'none' ,
@@ -207,3 +311,27 @@ export async function detectBotBlocker({ baseUrl, timeout = DEFAULT_TIMEOUT }) {
207311 return analyzeError ( error ) ;
208312 }
209313}
314+
/**
 * Converts caller-supplied headers into a `Headers` instance without throwing.
 *
 * A `Headers` instance is returned as-is. For a plain object, entries are
 * appended one by one so that a single unusable entry cannot abort the analysis:
 * - `null`/`undefined` values are skipped (the `Headers` constructor would store
 *   them as the truthy strings "null"/"undefined", faking header presence);
 * - CR/LF inside a value is folded into ", " because `Headers` rejects values
 *   containing line breaks (NOTE(review): some browser-automation APIs appear to
 *   report multi-value headers newline-joined — confirm for the scraper in use);
 * - entries whose name is still rejected by `Headers` are skipped.
 *
 * @param {Headers|Object} [headers] - Headers instance or plain name/value object
 * @returns {Headers} Headers instance suitable for `headers.get(...)` lookups
 */
function normalizeHeadersForAnalysis(headers) {
  if (headers instanceof Headers) {
    return headers;
  }

  const normalized = new Headers();
  for (const [name, value] of Object.entries(headers ?? {})) {
    if (value != null) {
      try {
        normalized.append(name, String(value).replace(/[\r\n]+/g, ', '));
      } catch {
        // Deliberate best-effort: an invalid header name/value (e.g. an HTTP/2
        // pseudo-header such as ":status") is irrelevant to bot detection and
        // must not make the whole analysis throw.
      }
    }
  }
  return normalized;
}

/**
 * Analyzes already-fetched response data for bot protection.
 * Used by content scraper to analyze Puppeteer results without making another request.
 *
 * @param {Object} data - Response data to analyze
 * @param {number} data.status - HTTP status code
 * @param {Object} data.headers - Response headers (plain object or Headers object)
 * @param {string} [data.html] - Optional HTML content for challenge page detection
 * @returns {Object} Detection result (same format as detectBotBlocker)
 */
export function analyzeBotProtection({ status, headers, html }) {
  // analyzeResponse only reads `status` and `headers`, so a minimal
  // response-shaped object is enough.
  const mockResponse = {
    status,
    headers: normalizeHeadersForAnalysis(headers),
  };

  return analyzeResponse(mockResponse, html);
}
0 commit comments