`, `__REACT_DEVTOOLS_`}, "vue": {`

`, `__VUE__`}, "next": {`

`, `__NEXT_DATA__`},
    }

    for framework, patterns := range frameworks {
        for _, pattern := range patterns {
            if strings.Contains(html, pattern) {
                // Found framework marker, but is content actually empty?
                stripped := stripScriptsAndStyles(html)
                if len(stripped) < 200 {
                    return true, framework // SPA with no content
                }
            }
        }
    }

    return false, ""
}
```

**Why check stripped length?**

- Framework markers alone don't mean SPA
- Many sites use React but also server-render content
- Only flag as SPA if content is actually empty (<200 chars)

**When SPA detected**:

- `has_spa = true` heuristic
- Suggests `playwright` fetcher (needs JS execution)
- Probe body NOT reused (useless without JS)

---

## Heuristic Extraction

### From URL

```go
func ExtractFromURL(rawURL string) []Heuristic {
    var heuristics []Heuristic

    parsed, err := url.Parse(rawURL)
    if err != nil {
        return heuristics
    }

    // Domain (always extract)
    domain := strings.TrimPrefix(parsed.Host, "www.")
    heuristics = append(heuristics, Heuristic{Type: "domain", Value: domain})

    // Suffix (if present)
    if ext := filepath.Ext(parsed.Path); ext != "" {
        heuristics = append(heuristics, Heuristic{Type: "suffix", Value: ext})
    }

    // Path patterns
    path := parsed.Path
    if strings.Contains(path, "/cdn/") {
        heuristics = append(heuristics, Heuristic{Type: "contains_cdn", Value: "true"})
    }
    if strings.Contains(path, "/static/") {
        heuristics = append(heuristics, Heuristic{Type: "contains_static", Value: "true"})
    }
    if strings.Contains(path, "/assets/") {
        heuristics = append(heuristics, Heuristic{Type: "contains_assets", Value: "true"})
    }
    if strings.Contains(path, "/api/") {
        heuristics = append(heuristics, Heuristic{Type: "contains_api", Value: "true"})
    }

    return heuristics
}
```

### From Response

```go
func ExtractPostFetchHeuristics(html string, headers http.Header, statusCode int) []Heuristic {
    var heuristics []Heuristic

    // Status code
    heuristics = append(heuristics, Heuristic{
        Type:  fmt.Sprintf("status_%d", statusCode),
        Value: "true",
    })

    // Server header
    if server := headers.Get("Server"); server != "" {
        serverLower := strings.ToLower(server)
        if strings.Contains(serverLower, "cloudflare") {
            heuristics = append(heuristics, Heuristic{Type: "server_cloudflare", Value: "true"})
        }
        if strings.Contains(serverLower, "nginx") {
            heuristics = append(heuristics, Heuristic{Type: "server_nginx", Value: "true"})
        }
    }

    // Content analysis
    if DetectCaptcha(html) {
        heuristics = append(heuristics, Heuristic{Type: "has_captcha", Value: "true"})
    }

    isSPA, _ := DetectSPA(html)
    if isSPA {
        heuristics = append(heuristics, Heuristic{Type: "has_spa", Value: "true"})
    }

    stripped := stripScriptsAndStyles(html)
    if len(stripped) < 200 {
        heuristics = append(heuristics, Heuristic{Type: "empty_body", Value: "true"})
    }

    return heuristics
}
```

---

## Adding New Heuristics

**No database changes needed.** Just add extraction logic:

```go
// Example: Video platform detection
if strings.Contains(url, "youtube.com") || strings.Contains(url, "vimeo.com") {
    heuristics = append(heuristics, Heuristic{
        Type:  "video_platform",
        Value: "true",
    })
}

// Example: Path depth (deep = often dynamic)
pathDepth := strings.Count(parsedURL.Path, "/")
if pathDepth > 5 {
    heuristics = append(heuristics, Heuristic{
        Type:  "deep_path",
        Value: "true",
    })
}

// Example: Query parameter detection
if len(parsedURL.RawQuery) > 50 {
    heuristics = append(heuristics, Heuristic{
        Type:  "complex_query",
        Value: "true",
    })
}
```

**Why this is powerful**:

- Query immediately finds attempts with new heuristic type
- No migration, no schema change
- System starts learning new pattern immediately
- Can A/B test heuristic effectiveness

---

## ValidationResult Struct

```go
type ValidationResult struct {
    IsBlocked bool   // Captcha or 403/429 detected
    BlockType string // "blocked_captcha", "blocked_403"
    HasSPA    bool   // SPA framework detected with empty content
    Framework string // "react", "vue", "next"
    IsEmpty   bool   // <200 chars after stripping scripts
}

func Validate(html string, headers http.Header,
statusCode int) *ValidationResult {
    result := &ValidationResult{}

    // Block detection
    if DetectCaptcha(html) {
        result.IsBlocked = true
        result.BlockType = "blocked_captcha"
    } else if Detect403(statusCode) {
        result.IsBlocked = true
        result.BlockType = "blocked_403"
    }

    // SPA detection
    result.HasSPA, result.Framework = DetectSPA(html)

    // Empty content detection
    stripped := stripScriptsAndStyles(html)
    result.IsEmpty = len(stripped) < 200

    return result
}
```

---

## Importance Score (Analytics)

### Natural Frequency Weighting (Default)

By default, heuristics get importance through query frequency:

- Domain appears in EVERY attempt for that domain → high influence
- Suffix appears in fewer attempts → lower influence

This happens automatically — no explicit scoring needed.

### Explicit Scoring (Advanced)

For debugging and optimization, estimate each heuristic's information gain — approximated here as the absolute difference between its conditional success rate and the overall baseline. Run the baseline query first and bind its result to `:baseline` in the second query:

```sql
-- How much does knowing "has_captcha=true" improve prediction?
-- Compare P(success) vs P(success | has_captcha=true)

-- Overall success rate (baseline)
SELECT AVG(CASE WHEN success THEN 1.0 ELSE 0.0 END) as baseline
FROM fetcher_attempts;

-- Success rate given each heuristic
SELECT
    ah.heuristic_type,
    ah.heuristic_value,
    AVG(CASE WHEN fa.success THEN 1.0 ELSE 0.0 END) as conditional_rate,
    COUNT(*) as sample_size,
    ABS(AVG(CASE WHEN fa.success THEN 1.0 ELSE 0.0 END) - :baseline) as info_gain
FROM fetcher_attempts fa
JOIN attempt_heuristics ah ON fa.id = ah.attempt
GROUP BY ah.heuristic_type, ah.heuristic_value
HAVING COUNT(*) > 10
ORDER BY info_gain DESC;
```

**Use cases**:

- Identify most predictive heuristics (has_captcha likely high)
- Debug misclassifications (which heuristics dominated?)
- Prune low-value heuristics from queries

**When to run**: Weekly analytics job (not on hot path)