← 返回首页
🧠

LLM辅助网页爬取

📂 llm ⏱ 6 min 1004 words

---
title: "LLM辅助网页爬取"
description: "探讨大语言模型如何增强网页爬取能力,包括智能解析、内容提取、反爬策略处理和数据清洗"
tags: ["网页爬取", "数据提取", "智能解析", "反爬策略", "LLM应用"]
category: "llm"
icon: "🧠"
---

LLM辅助网页爬取

网页爬取是获取网络数据的重要手段,但传统爬虫面临页面结构变化、反爬机制和内容理解等挑战。大语言模型(LLM)的引入为网页爬取带来了革命性改进,通过智能解析、语义理解和自适应策略,显著提升了爬取效率和数据质量。

智能页面解析

传统爬虫依赖固定的选择器或XPath路径,当页面结构变化时容易失效。LLM能够理解页面语义,自适应地提取所需内容。

import json
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup

class IntelligentWebScraper:
    """Web scraper that hands page analysis and content extraction to an LLM.

    ``llm_client`` must expose ``generate(prompt, temperature=...)``; the
    value it returns is used as the model's reply text.
    """

    def __init__(self, llm_client):
        """Store the LLM client and open a session with a browser-like User-Agent."""
        self.llm_client = llm_client
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

    def scrape_with_intelligence(self, url: str,
                                extraction_prompt: str) -> Dict[str, Any]:
        """Fetch ``url``, analyze its structure and extract the requested data.

        Args:
            url: Page to download (10 second timeout).
            extraction_prompt: Natural-language description of what to extract.

        Returns:
            A dict with the keys ``url``, ``page_analysis``, ``extracted_data``
            and ``status`` (the HTTP status code). Non-2xx responses are not
            raised; callers are expected to inspect ``status``.
        """
        response = self.session.get(url, timeout=10)
        html_content = response.text

        page_analysis = self._analyze_page_structure(html_content, url)
        extracted_data = self._extract_content(html_content, extraction_prompt)

        return {
            "url": url,
            "page_analysis": page_analysis,
            "extracted_data": extracted_data,
            "status": response.status_code
        }

    def _analyze_page_structure(self, html: str, url: str) -> Dict[str, Any]:
        """Collect basic page metadata and ask the LLM to describe the page.

        Returns:
            A dict with ``title``, ``description``, ``has_main_content`` and
            ``analysis`` (the LLM reply).
        """
        soup = BeautifulSoup(html, 'html.parser')

        # ``.string`` is None when <title> is empty or has nested children,
        # so fall back to the placeholder in that case as well.
        title_tag = soup.title
        if title_tag is not None and title_tag.string is not None:
            title = title_tag.string
        else:
            title = "无标题"

        # A <meta name="description"> tag without a ``content`` attribute
        # would raise KeyError with item access; ``.get`` returns None instead.
        meta_desc = soup.find("meta", {"name": "description"})
        meta_content = meta_desc.get("content") if meta_desc is not None else None
        description = meta_content if meta_content is not None else "无描述"

        # First matching container wins: <main>, then <article>, then div.content.
        main_content = soup.find("main") or soup.find("article") or soup.find("div", class_="content")

        prompt = f"""分析以下网页结构:

URL: {url}
标题: {title}
描述: {description}
主要内容区域标签: {main_content.name if main_content else '未找到'}

HTML结构预览:
{html[:2000]}

请分析:
1. 页面类型(文章、产品列表、搜索结果等)
2. 主要内容区域
3. 导航结构
4. 反爬措施检测
5. 推荐的提取策略
"""

        analysis = self.llm_client.generate(prompt, temperature=0.2)

        return {
            "title": title,
            "description": description,
            "has_main_content": main_content is not None,
            "analysis": analysis
        }

    def _extract_content(self, html: str, extraction_prompt: str) -> Any:
        """Strip boilerplate from ``html`` and ask the LLM for structured data.

        Returns:
            The parsed JSON value when the reply is valid JSON, otherwise
            ``{"raw_text": <reply>}``.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Drop elements that carry no page content before flattening to text.
        for element in soup(["script", "style", "nav", "footer"]):
            element.decompose()

        clean_text = soup.get_text(separator=' ', strip=True)

        prompt = f"""从以下网页内容中提取信息:

提取需求:{extraction_prompt}

网页内容(前3000字符):
{clean_text[:3000]}

请提供结构化的提取结果,使用JSON格式。
"""

        result = self.llm_client.generate(prompt, temperature=0.3)
        return self._parse_json_response(result)

    @staticmethod
    def _parse_json_response(reply: str) -> Any:
        """Parse an LLM reply as JSON, tolerating a surrounding Markdown code fence.

        Returns:
            The decoded JSON value, or ``{"raw_text": reply}`` (the untouched
            reply) when it cannot be decoded.
        """
        text = reply.strip()
        if len(text) >= 6 and text.startswith("```") and text.endswith("```"):
            # The opening fence line may carry a language tag such as "json";
            # discard that whole line together with the closing fence.
            first_newline = text.find("\n")
            if first_newline != -1:
                text = text[first_newline + 1:-3].strip()

        try:
            return json.loads(text)
        except json.JSONDecodeError:
            return {"raw_text": reply}

    def scrape_multiple_pages(self, urls: List[str],
                             extraction_prompt: str) -> List[Dict]:
        """Scrape every URL in order, recording failures instead of aborting.

        Returns:
            One entry per URL: the ``scrape_with_intelligence`` result, or
            ``{"url": url, "error": <message>}`` when that call raised.
        """
        results = []
        for url in urls:
            # Best-effort batch: one bad page must not lose the other results,
            # so any failure is reported and stored alongside the successes.
            try:
                result = self.scrape_with_intelligence(url, extraction_prompt)
                results.append(result)
            except Exception as e:
                print(f"爬取 {url} 失败: {e}")
                results.append({"url": url, "error": str(e)})

        return results

# Usage example
# NOTE(review): `llm_client` is not defined anywhere in the visible source;
# the reader must supply an object with a `generate(prompt, temperature=...)` method.
scraper = IntelligentWebScraper(llm_client)

# Scrape an e-commerce product page
result = scraper.scrape_with_intelligence(
    "https://example.com/products",
    "提取所有产品的名称、价格、评分和描述"
)

print(f"页面标题: {result['page_analysis']['title']}")
print(f"提取数据: {result['extracted_data']}")

智能页面解析的关键优势在于能够适应页面结构变化,理解页面语义,并根据提取需求动态调整策略。

内容理解与提取

LLM不仅能提取结构化数据,还能理解内容的含义,进行更深层次的信息提取。

class ContentUnderstandingScraper:
    """Scraper that asks an LLM to interpret page content, not only extract it.

    ``llm_client`` must expose ``generate(prompt, temperature=...)``.
    """

    # Seconds to wait on the server. Without an explicit timeout
    # ``requests.get`` can block indefinitely on an unresponsive host.
    REQUEST_TIMEOUT = 10

    def __init__(self, llm_client):
        """Store the LLM client used for all analysis calls."""
        self.llm_client = llm_client

    def extract_with_understanding(self, url: str,
                                  understanding_prompt: str) -> Dict[str, Any]:
        """Download ``url`` and have the LLM analyze its main content.

        Returns:
            A dict with ``url``, ``content_preview`` (first 500 characters of
            the extracted text) and ``understanding`` (see
            ``_understand_content``).
        """
        content = self._fetch_main_content(url)
        understanding = self._understand_content(content, understanding_prompt)

        return {
            "url": url,
            "content_preview": content[:500],
            "understanding": understanding
        }

    def _fetch_main_content(self, url: str) -> str:
        """Download ``url`` and return the text of its main content area."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')
        return self._extract_main_content(soup)

    def _extract_main_content(self, soup: "BeautifulSoup") -> str:
        """Return the page's main text with non-content elements removed.

        Note: this mutates ``soup`` by decomposing the removed elements.
        """
        for element in soup(["script", "style", "nav", "footer", "aside"]):
            element.decompose()

        # Prefer a semantic container; fall back to the whole <body>.
        main = soup.find("main") or soup.find("article")
        if main:
            return main.get_text(separator=' ', strip=True)

        body = soup.find("body")
        return body.get_text(separator=' ', strip=True) if body else ""

    def _understand_content(self, content: str, prompt: str) -> Dict[str, Any]:
        """Ask the LLM to analyze the first 2000 characters of ``content``.

        Returns:
            ``{"analysis": <LLM reply>}``.
        """
        full_prompt = f"""分析以下网页内容:

{content[:2000]}

分析需求:{prompt}

请提供:
1. 内容摘要
2. 关键观点和论点
3. 事实和数据
4. 情感倾向
5. 相关主题和关键词
6. 可信度评估
"""

        analysis = self.llm_client.generate(full_prompt, temperature=0.3)

        return {"analysis": analysis}

    def compare_pages(self, urls: List[str],
                     comparison_criteria: str) -> Dict[str, Any]:
        """Download every URL and have the LLM compare their main content.

        Only the first 1000 characters of each page are sent to the model.

        Returns:
            ``{"comparison": <LLM reply>}``.
        """
        pages_content = [
            {"url": url, "content": self._fetch_main_content(url)[:1000]}
            for url in urls
        ]

        prompt = f"""比较以下网页的内容:

比较标准:{comparison_criteria}

页面内容:
{json.dumps(pages_content, indent=2, ensure_ascii=False)}

请提供:
1. 内容相似性分析
2. 关键差异
3. 各页面的优势
4. 综合评价
5. 推荐(如果适用)
"""

        comparison = self.llm_client.generate(prompt, temperature=0.4)

        return {"comparison": comparison}

# Usage example
# NOTE(review): `llm_client` is not defined anywhere in the visible source;
# the reader must supply an object with a `generate(prompt, temperature=...)` method.
understanding_scraper = ContentUnderstandingScraper(llm_client)

# Interpret a news article
result = understanding_scraper.extract_with_understanding(
    "https://example.com/news/article1",
    "提取文章的核心观点和论据"
)

内容理解爬虫的关键优势在于能够理解内容的深层含义,进行观点提取、情感分析和可信度评估,而不仅仅是简单的数据提取。

反爬策略处理

现代网站通常部署各种反爬措施,LLM能够帮助爬虫理解和应对这些策略。

class AntiDetectionScraper:
    """Scraper that inspects a site's anti-crawling measures with LLM help.

    ``llm_client`` must expose ``generate(prompt, temperature=...)``.
    """

    def __init__(self, llm_client):
        """Store the LLM client and open a reusable HTTP session."""
        self.llm_client = llm_client
        self.session = requests.Session()

    def analyze_anti_crawling(self, url: str) -> Dict[str, Any]:
        """Fetch ``url`` once and report which anti-crawling measures it shows.

        Returns:
            On success, a dict with ``url``, ``status_code``, ``headers``,
            ``detection`` and ``bypass_strategy``. On any failure, a dict with
            only ``url`` and ``error``; this method does not raise.
        """
        try:
            response = self.session.get(url, timeout=10)
            headers = dict(response.headers)

            detection_result = self._detect_measures(headers, response.text)
            bypass_strategy = self._generate_bypass_strategy(detection_result)

            return {
                "url": url,
                "status_code": response.status_code,
                "headers": headers,
                "detection": detection_result,
                "bypass_strategy": bypass_strategy
            }
        except Exception as e:
            # Deliberate catch-all: the caller gets an error record, not an exception.
            return {"url": url, "error": str(e)}

    def _detect_measures(self, headers: Dict, html: str) -> Dict[str, Any]:
        """Run substring heuristics over the response and add an LLM assessment.

        Returns:
            Boolean flags ``rate_limiting``, ``captcha``, ``cloudflare``,
            ``bot_detection`` and ``javascript_challenge``, plus
            ``llm_analysis`` (the LLM reply).
        """
        # HTTP header names are case-insensitive, so every check runs against
        # lowercased text; computed once instead of once per flag.
        header_text = str(headers).lower()
        html_text = html.lower()

        measures = {
            "rate_limiting": "x-ratelimit" in header_text,
            # "captcha" also matches "recaptcha", so one test covers both.
            "captcha": "captcha" in html_text,
            "cloudflare": "cf-" in header_text,
            "bot_detection": "bot" in header_text,
            "javascript_challenge": "challenge" in html_text
        }

        # Let the LLM look for measures the fixed heuristics above do not cover.
        prompt = f"""分析以下HTTP响应中的反爬措施:

响应头:
{json.dumps(dict(headers), indent=2)}

HTML预览(前1000字符):
{html[:1000]}

请识别:
1. 已知的反爬技术
2. 潜在的检测机制
3. 建议的绕过方法
4. 风险评估
"""

        llm_analysis = self.llm_client.generate(prompt, temperature=0.2)

        return {**measures, "llm_analysis": llm_analysis}

    def _generate_bypass_strategy(self, detection: Dict[str, Any]) -> str:
        """Ask the LLM for a handling strategy based on the detection result."""
        prompt = f"""基于以下反爬检测结果,生成绕过策略:

检测结果:
{json.dumps(detection, indent=2, ensure_ascii=False)}

请提供:
1. 推荐的请求头设置
2. 请求频率建议
3. 代理和IP轮换策略
4. JavaScript执行需求
5. Cookie和会话管理
6. 法律和道德考虑
"""

        return self.llm_client.generate(prompt, temperature=0.3)

    def adaptive_scrape(self, url: str, max_retries: int = 3) -> Dict[str, Any]:
        """Try to fetch ``url`` up to ``max_retries`` times.

        Each attempt first re-runs ``analyze_anti_crawling`` and then issues
        the real request. Attempts are made back to back with no delay.

        Returns:
            ``{"success": True, "content": <first 2000 chars>, "attempts": n}``
            on the first HTTP 200, otherwise
            ``{"success": False, "error": ...}`` once the retries run out.
        """
        for attempt in range(max_retries):
            try:
                analysis = self.analyze_anti_crawling(url)

                if analysis.get("detection", {}).get("captcha"):
                    print(f"尝试 {attempt + 1}: 检测到验证码,调整策略")
                    # Placeholder: a real implementation has to handle the CAPTCHA here.

                response = self.session.get(url, timeout=10)

                if response.status_code == 200:
                    return {
                        "success": True,
                        "content": response.text[:2000],
                        "attempts": attempt + 1
                    }
                else:
                    print(f"尝试 {attempt + 1}: 状态码 {response.status_code}")

            except Exception as e:
                print(f"尝试 {attempt + 1} 失败: {e}")

        return {"success": False, "error": "达到最大重试次数"}

# Usage example
# NOTE(review): `llm_client` is not defined anywhere in the visible source;
# the reader must supply an object with a `generate(prompt, temperature=...)` method.
anti_detection = AntiDetectionScraper(llm_client)

# Analyze the site's anti-crawling measures
analysis = anti_detection.analyze_anti_crawling("https://example.com")

# analyze_anti_crawling() returns only {"url", "error"} when the request or
# the analysis fails, so check for that before reading the success-only keys.
if "error" in analysis:
    print(f"分析失败: {analysis['error']}")
else:
    print(f"反爬措施: {analysis['detection']}")
    print(f"绕过策略: {analysis['bypass_strategy'][:200]}...")

反爬策略处理的关键优势在于能够自动识别和应对各种反爬措施,生成个性化的绕过策略,并在遇到问题时自适应调整。

数据清洗与验证

爬取的数据通常包含噪声和不一致性,LLM能够智能清洗和验证数据。

class SmartDataCleaner:
    """Clean and validate scraped records using string normalization and an LLM.

    ``llm_client`` must expose ``generate(prompt, temperature=...)``.
    """

    def __init__(self, llm_client):
        """Store the LLM client used for cleaning and validation."""
        self.llm_client = llm_client

    def clean_scraped_data(self, data: List[Dict],
                          cleaning_rules: Optional[Dict[str, Any]] = None) -> List[Dict]:
        """Clean every record: whitespace normalization, then an LLM pass.

        Args:
            data: Records to clean; the input dicts are not modified.
            cleaning_rules: Optional rules forwarded to the LLM prompt.

        Returns:
            One cleaned dict per input record, in the same order.
        """
        return [
            self._llm_enhanced_cleaning(self._basic_cleaning(item), cleaning_rules)
            for item in data
        ]

    def _basic_cleaning(self, item: Dict) -> Dict:
        """Return a copy of ``item`` with whitespace normalized in string values.

        Leading/trailing whitespace is removed and every internal run of
        whitespace (spaces, tabs, newlines) becomes one space. Non-string
        values are copied unchanged.
        """
        # split() with no argument splits on whitespace runs and drops empty
        # leading/trailing pieces, so join() yields the normalized string.
        return {
            key: " ".join(value.split()) if isinstance(value, str) else value
            for key, value in item.items()
        }

    def _llm_enhanced_cleaning(self, item: Dict,
                              rules: Optional[Dict[str, Any]] = None) -> Dict:
        """Ask the LLM to clean ``item``.

        Returns:
            The LLM reply parsed as a JSON object when that is possible;
            otherwise ``item`` plus a ``cleaning_suggestions`` key holding the
            raw reply.
        """
        prompt = f"""清洗以下数据:

原始数据:
{json.dumps(item, indent=2, ensure_ascii=False)}

清洗规则:
{json.dumps(rules, indent=2, ensure_ascii=False) if rules else "标准清洗"}

请提供:
1. 清洗后的数据
2. 应用的清洗操作
3. 数据质量评估
4. 建议的进一步处理
"""

        cleaning_result = self.llm_client.generate(prompt, temperature=0.1)

        try:
            parsed = json.loads(cleaning_result)
        except json.JSONDecodeError:
            parsed = None

        # Valid JSON that is not an object (a list, string or number) cannot
        # stand in for the record, so it is treated like an unparseable reply.
        if isinstance(parsed, dict):
            return parsed
        return {**item, "cleaning_suggestions": cleaning_result}

    def validate_data(self, data: List[Dict],
                     validation_schema: Dict[str, Any]) -> Dict[str, Any]:
        """Validate every record against ``validation_schema`` and summarize.

        Returns:
            A dict with ``total_items``, ``valid_items``, ``invalid_items``,
            ``validation_rate`` (0 for empty input) and ``details`` (the
            per-item results from ``_validate_item``).
        """
        validation_results = [
            self._validate_item(item, validation_schema) for item in data
        ]

        valid_count = sum(1 for v in validation_results if v["is_valid"])
        invalid_count = len(validation_results) - valid_count

        return {
            "total_items": len(data),
            "valid_items": valid_count,
            "invalid_items": invalid_count,
            "validation_rate": valid_count / len(data) if data else 0,
            "details": validation_results
        }

    def _validate_item(self, item: Dict, schema: Dict) -> Dict[str, Any]:
        """Ask the LLM to validate one record.

        Returns:
            A dict with ``item``, ``validation_result`` (the LLM reply) and
            ``is_valid``.
        """
        prompt = f"""验证以下数据项:

数据:
{json.dumps(item, indent=2, ensure_ascii=False)}

验证规则:
{json.dumps(schema, indent=2, ensure_ascii=False)}

请验证:
1. 字段完整性
2. 数据类型
3. 值范围
4. 格式要求
5. 业务规则
"""

        validation_result = self.llm_client.generate(prompt, temperature=0.1)

        # Simplified verdict: the reply must contain "有效" and must not contain "无效".
        return {
            "item": item,
            "validation_result": validation_result,
            "is_valid": "有效" in validation_result and "无效" not in validation_result
        }

# Usage example
# NOTE(review): `llm_client` is not defined anywhere in the visible source;
# the reader must supply an object with a `generate(prompt, temperature=...)` method.
cleaner = SmartDataCleaner(llm_client)

# Clean the scraped records
raw_data = [
    {"title": "  产品A  ", "price": "¥199.00", "description": "优质产品\n性能卓越"},
    {"title": "产品B", "price": "价格面议", "description": "详情请咨询"}
]

cleaned_data = cleaner.clean_scraped_data(raw_data)
print(f"清洗完成,处理了 {len(cleaned_data)} 条数据")

# Validate the cleaned records
# The schema is serialized into the LLM prompt by validate_data(); the visible
# code does not evaluate the "pattern" regex itself.
validation_schema = {
    "title": {"required": True, "max_length": 100},
    "price": {"required": True, "pattern": r"^\d+(\.\d{2})?$"}
}

validation_result = cleaner.validate_data(cleaned_data, validation_schema)
print(f"验证通过率: {validation_result['validation_rate']:.2%}")

数据清洗与验证是确保爬取数据质量的关键步骤。LLM能够理解数据含义,进行智能清洗,并根据业务规则验证数据的有效性。结合传统清洗方法和LLM增强清洗,可以显著提高数据质量,为后续分析和应用提供可靠的数据基础。