LLM辅助网页爬取
---
title: "LLM辅助网页爬取"
description: "探讨大语言模型如何增强网页爬取能力,包括智能解析、内容提取、反爬策略处理和数据清洗"
tags: ["网页爬取", "数据提取", "智能解析", "反爬策略", "LLM应用"]
category: "llm"
icon: "🧠"
---
# LLM辅助网页爬取
网页爬取是获取网络数据的重要手段,但传统爬虫面临页面结构变化、反爬机制和内容理解等挑战。大语言模型(LLM)的引入为网页爬取带来了革命性改进,通过智能解析、语义理解和自适应策略,显著提升了爬取效率和数据质量。
## 智能页面解析
传统爬虫依赖固定的选择器或XPath路径,当页面结构变化时容易失效。LLM能够理解页面语义,自适应地提取所需内容。
import json
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
class IntelligentWebScraper:
    """Scraper that hands page analysis and content extraction to an LLM.

    The injected ``llm_client`` must expose
    ``generate(prompt, temperature=...)`` and return the model reply as text.
    HTTP traffic goes through one ``requests.Session`` carrying a
    browser-like ``User-Agent`` header.
    """

    def __init__(self, llm_client):
        """Store the LLM client and prepare the shared HTTP session."""
        self.llm_client = llm_client
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })

    def scrape_with_intelligence(self, url: str,
                                 extraction_prompt: str) -> Dict[str, Any]:
        """Fetch ``url``, analyse its structure and extract the requested data.

        Args:
            url: Page to download (10 second timeout).
            extraction_prompt: Natural-language description of what to extract.

        Returns:
            A dict with the keys ``url``, ``page_analysis``,
            ``extracted_data`` and ``status`` (the HTTP status code).

        Raises:
            requests.RequestException: If the download fails or times out.
        """
        response = self.session.get(url, timeout=10)
        html_content = response.text

        page_analysis = self._analyze_page_structure(html_content, url)
        extracted_data = self._extract_content(html_content, extraction_prompt)

        return {
            "url": url,
            "page_analysis": page_analysis,
            "extracted_data": extracted_data,
            "status": response.status_code
        }

    def _analyze_page_structure(self, html: str, url: str) -> Dict[str, Any]:
        """Summarise the page layout and ask the LLM to classify it.

        Returns:
            A dict with ``title``, ``description``, ``has_main_content`` and
            the raw LLM reply under ``analysis``.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # `.string` is None for an empty <title> or one with nested markup,
        # so fall back to the placeholder in that case as well.
        title_tag = soup.title
        title = title_tag.string if title_tag and title_tag.string else "无标题"

        # A <meta name="description"> without a `content` attribute must not
        # raise KeyError; treat it like a missing description.
        meta_desc = soup.find("meta", {"name": "description"})
        description = meta_desc.get("content", "无描述") if meta_desc else "无描述"

        # First match wins: <main>, then <article>, then <div class="content">.
        main_content = (
            soup.find("main")
            or soup.find("article")
            or soup.find("div", class_="content")
        )

        prompt = f"""分析以下网页结构:
URL: {url}
标题: {title}
描述: {description}
主要内容区域标签: {main_content.name if main_content else '未找到'}
HTML结构预览:
{html[:2000]}
请分析:
1. 页面类型(文章、产品列表、搜索结果等)
2. 主要内容区域
3. 导航结构
4. 反爬措施检测
5. 推荐的提取策略
"""
        analysis = self.llm_client.generate(prompt, temperature=0.2)

        return {
            "title": title,
            "description": description,
            "has_main_content": main_content is not None,
            "analysis": analysis
        }

    def _extract_content(self, html: str, extraction_prompt: str) -> Any:
        """Strip boilerplate from ``html`` and let the LLM extract data.

        Only the first 3000 characters of the visible text are sent.

        Returns:
            The parsed JSON value when the reply is valid JSON, otherwise
            ``{"raw_text": <reply>}``.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Drop elements that carry no page content before taking the text.
        for element in soup(["script", "style", "nav", "footer"]):
            element.decompose()
        clean_text = soup.get_text(separator=' ', strip=True)

        prompt = f"""从以下网页内容中提取信息:
提取需求:{extraction_prompt}
网页内容(前3000字符):
{clean_text[:3000]}
请提供结构化的提取结果,使用JSON格式。
"""
        result = self.llm_client.generate(prompt, temperature=0.3)
        return self._parse_llm_json(result)

    @staticmethod
    def _parse_llm_json(raw: str) -> Any:
        """Parse an LLM reply as JSON, tolerating a Markdown code fence.

        Models often wrap JSON in a fenced block (three backticks, optionally
        followed by ``json``). The fence is removed before parsing so such
        replies are not degraded to ``raw_text``.

        Returns:
            The decoded JSON value, or ``{"raw_text": raw}`` when the reply
            is not valid JSON.
        """
        text = raw.strip()
        if text.startswith("```"):
            # Discard the opening fence line, then a trailing closing fence.
            newline_at = text.find("\n")
            text = text[newline_at + 1:] if newline_at != -1 else ""
            text = text.rstrip()
            if text.endswith("```"):
                text = text[:-3]
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            return {"raw_text": raw}

    def scrape_multiple_pages(self, urls: List[str],
                              extraction_prompt: str) -> List[Dict]:
        """Scrape every URL in order, recording failures instead of raising.

        Returns:
            One entry per URL: the ``scrape_with_intelligence`` result, or
            ``{"url": ..., "error": ...}`` if that page failed.
        """
        results = []
        for url in urls:
            # Best-effort batch: one bad page must not abort the whole run.
            try:
                result = self.scrape_with_intelligence(url, extraction_prompt)
            except Exception as e:
                print(f"爬取 {url} 失败: {e}")
                results.append({"url": url, "error": str(e)})
            else:
                results.append(result)
        return results
# Usage example: scrape an e-commerce product page.
# `llm_client` must already exist; it is not defined in this snippet.
scraper = IntelligentWebScraper(llm_client)
result = scraper.scrape_with_intelligence(
    url="https://example.com/products",
    extraction_prompt="提取所有产品的名称、价格、评分和描述",
)
# Show the detected page title, then the data the LLM extracted.
print(f"页面标题: {result['page_analysis']['title']}")
print(f"提取数据: {result['extracted_data']}")
智能页面解析的关键优势在于能够适应页面结构变化,理解页面语义,并根据提取需求动态调整策略。
## 内容理解与提取
LLM不仅能提取结构化数据,还能理解内容的含义,进行更深层次的信息提取。
class ContentUnderstandingScraper:
    """Scraper that asks an LLM to interpret a page's main text.

    The injected ``llm_client`` must expose
    ``generate(prompt, temperature=...)`` and return the model reply as text.
    """

    # Seconds to wait for a page before giving up. Without a timeout
    # `requests.get` can block indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 10

    def __init__(self, llm_client):
        """Store the LLM client used for all analysis calls."""
        self.llm_client = llm_client

    def extract_with_understanding(self, url: str,
                                   understanding_prompt: str) -> Dict[str, Any]:
        """Download ``url`` and have the LLM analyse its main content.

        Returns:
            A dict with ``url``, ``content_preview`` (first 500 characters of
            the extracted text) and ``understanding``.

        Raises:
            requests.RequestException: If the download fails or times out.
        """
        content = self._fetch_main_content(url)
        understanding = self._understand_content(content, understanding_prompt)

        return {
            "url": url,
            "content_preview": content[:500],
            "understanding": understanding
        }

    def _fetch_main_content(self, url: str) -> str:
        """Download ``url`` and return the text of its main content area."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')
        return self._extract_main_content(soup)

    def _extract_main_content(self, soup: "BeautifulSoup") -> str:
        """Return the visible text of the page's main content area.

        Non-content elements are removed from ``soup`` in place. The text
        comes from ``<main>``, else ``<article>``, else ``<body>``; an empty
        string is returned when none of them exists.
        """
        for element in soup(["script", "style", "nav", "footer", "aside"]):
            element.decompose()

        main = soup.find("main") or soup.find("article")
        if main:
            return main.get_text(separator=' ', strip=True)

        # Fall back to the whole body when no semantic container exists.
        body = soup.find("body")
        return body.get_text(separator=' ', strip=True) if body else ""

    def _understand_content(self, content: str, prompt: str) -> Dict[str, Any]:
        """Ask the LLM to analyse the first 2000 characters of ``content``.

        Returns:
            ``{"analysis": <LLM reply>}``.
        """
        full_prompt = f"""分析以下网页内容:
{content[:2000]}
分析需求:{prompt}
请提供:
1. 内容摘要
2. 关键观点和论点
3. 事实和数据
4. 情感倾向
5. 相关主题和关键词
6. 可信度评估
"""
        analysis = self.llm_client.generate(full_prompt, temperature=0.3)
        return {"analysis": analysis}

    def compare_pages(self, urls: List[str],
                      comparison_criteria: str) -> Dict[str, Any]:
        """Download several pages and have the LLM compare them.

        Only the first 1000 characters of each page's main text are sent.

        Returns:
            ``{"comparison": <LLM reply>}``.

        Raises:
            requests.RequestException: If any download fails or times out.
        """
        pages_content = [
            {"url": url, "content": self._fetch_main_content(url)[:1000]}
            for url in urls
        ]

        prompt = f"""比较以下网页的内容:
比较标准:{comparison_criteria}
页面内容:
{json.dumps(pages_content, indent=2, ensure_ascii=False)}
请提供:
1. 内容相似性分析
2. 关键差异
3. 各页面的优势
4. 综合评价
5. 推荐(如果适用)
"""
        comparison = self.llm_client.generate(prompt, temperature=0.4)
        return {"comparison": comparison}
# Usage example: interpret a news article.
# `llm_client` must already exist; it is not defined in this snippet.
understanding_scraper = ContentUnderstandingScraper(llm_client)
result = understanding_scraper.extract_with_understanding(
    url="https://example.com/news/article1",
    understanding_prompt="提取文章的核心观点和论据",
)
内容理解爬虫的关键优势在于能够理解内容的深层含义,进行观点提取、情感分析和可信度评估,而不仅仅是简单的数据提取。
## 反爬策略处理
现代网站通常部署各种反爬措施,LLM能够帮助爬虫理解和应对这些策略。
class AntiDetectionScraper:
    """Scraper that inspects responses for anti-crawling measures.

    Simple string heuristics are combined with an LLM analysis. The injected
    ``llm_client`` must expose ``generate(prompt, temperature=...)`` and
    return the model reply as text.
    """

    def __init__(self, llm_client):
        """Store the LLM client and create the shared HTTP session."""
        self.llm_client = llm_client
        self.session = requests.Session()

    def analyze_anti_crawling(self, url: str) -> Dict[str, Any]:
        """Fetch ``url`` and report the anti-crawling measures detected.

        Returns:
            On success, a dict with ``url``, ``status_code``, ``headers``,
            ``detection`` and ``bypass_strategy``. On any failure,
            ``{"url": ..., "error": ...}``; callers must check for ``error``
            before reading the other keys.
        """
        try:
            response = self.session.get(url, timeout=10)
            headers = dict(response.headers)

            detection_result = self._detect_measures(headers, response.text)
            bypass_strategy = self._generate_bypass_strategy(detection_result)

            return {
                "url": url,
                "status_code": response.status_code,
                "headers": headers,
                "detection": detection_result,
                "bypass_strategy": bypass_strategy
            }
        except Exception as e:
            # Deliberately best-effort: report the failure instead of raising.
            return {"url": url, "error": str(e)}

    def _detect_measures(self, headers: Dict, html: str) -> Dict[str, Any]:
        """Flag common anti-crawling signals in the headers and HTML.

        Returns:
            Boolean flags ``rate_limiting``, ``captcha``, ``cloudflare``,
            ``bot_detection`` and ``javascript_challenge``, plus the raw LLM
            reply under ``llm_analysis``.
        """
        # HTTP header names are case-insensitive, so every check runs on a
        # lowercased copy (servers may send e.g. `x-ratelimit-remaining`).
        headers_text = str(headers).lower()
        html_text = html.lower()

        measures = {
            "rate_limiting": "x-ratelimit" in headers_text,
            "captcha": "captcha" in html_text or "recaptcha" in html_text,
            "cloudflare": "cf-" in headers_text,
            "bot_detection": "bot" in headers_text,
            "javascript_challenge": "challenge" in html_text
        }

        # Let the LLM look for measures the heuristics above do not cover.
        prompt = f"""分析以下HTTP响应中的反爬措施:
响应头:
{json.dumps(dict(headers), indent=2)}
HTML预览(前1000字符):
{html[:1000]}
请识别:
1. 已知的反爬技术
2. 潜在的检测机制
3. 建议的绕过方法
4. 风险评估
"""
        llm_analysis = self.llm_client.generate(prompt, temperature=0.2)
        return {**measures, "llm_analysis": llm_analysis}

    def _generate_bypass_strategy(self, detection: Dict[str, Any]) -> str:
        """Ask the LLM for a strategy based on the detection result.

        Returns:
            The raw LLM reply.
        """
        prompt = f"""基于以下反爬检测结果,生成绕过策略:
检测结果:
{json.dumps(detection, indent=2, ensure_ascii=False)}
请提供:
1. 推荐的请求头设置
2. 请求频率建议
3. 代理和IP轮换策略
4. JavaScript执行需求
5. Cookie和会话管理
6. 法律和道德考虑
"""
        return self.llm_client.generate(prompt, temperature=0.3)

    def adaptive_scrape(self, url: str, max_retries: int = 3) -> Dict[str, Any]:
        """Try to fetch ``url`` up to ``max_retries`` times.

        Every attempt first runs ``analyze_anti_crawling`` (one request plus
        two LLM calls) and then issues a second GET for the content.

        Returns:
            ``{"success": True, "content": <first 2000 chars>, "attempts": n}``
            on the first HTTP 200, otherwise
            ``{"success": False, "error": ...}`` once all attempts are used.
        """
        for attempt in range(max_retries):
            try:
                analysis = self.analyze_anti_crawling(url)

                if analysis.get("detection", {}).get("captcha"):
                    print(f"尝试 {attempt + 1}: 检测到验证码,调整策略")
                    # A real deployment would have to handle the CAPTCHA here.

                response = self.session.get(url, timeout=10)
                if response.status_code == 200:
                    return {
                        "success": True,
                        "content": response.text[:2000],
                        "attempts": attempt + 1
                    }
                print(f"尝试 {attempt + 1}: 状态码 {response.status_code}")
            except Exception as e:
                print(f"尝试 {attempt + 1} 失败: {e}")

        return {"success": False, "error": "达到最大重试次数"}
# Usage example: inspect a site's anti-crawling measures.
# `llm_client` must already exist; it is not defined in this snippet.
anti_detection = AntiDetectionScraper(llm_client)
analysis = anti_detection.analyze_anti_crawling("https://example.com")
# analyze_anti_crawling returns {"url", "error"} when the request or the
# analysis fails, so `detection` / `bypass_strategy` may be absent.
if "error" in analysis:
    print(f"分析失败: {analysis['error']}")
else:
    print(f"反爬措施: {analysis['detection']}")
    print(f"绕过策略: {analysis['bypass_strategy'][:200]}...")
反爬策略处理的关键优势在于能够自动识别和应对各种反爬措施,生成个性化的绕过策略,并在遇到问题时自适应调整。
## 数据清洗与验证
爬取的数据通常包含噪声和不一致性,LLM能够智能清洗和验证数据。
class SmartDataCleaner:
    """Cleans and validates scraped records with rules plus an LLM.

    The injected ``llm_client`` must expose
    ``generate(prompt, temperature=...)`` and return the model reply as text.
    """

    def __init__(self, llm_client):
        """Store the LLM client used for cleaning and validation."""
        self.llm_client = llm_client

    def clean_scraped_data(self, data: List[Dict],
                           cleaning_rules: Optional[Dict[str, Any]] = None) -> List[Dict]:
        """Clean every record: whitespace normalisation, then an LLM pass.

        Args:
            data: Scraped records.
            cleaning_rules: Optional rules passed to the LLM; when omitted
                the prompt asks for standard cleaning.

        Returns:
            One cleaned record per input record, in the same order.
        """
        return [
            self._llm_enhanced_cleaning(self._basic_cleaning(item), cleaning_rules)
            for item in data
        ]

    def _basic_cleaning(self, item: Dict) -> Dict:
        """Return a copy of ``item`` with string values whitespace-normalised.

        Leading and trailing whitespace is removed and every internal run of
        whitespace (including newlines and tabs) becomes a single space.
        Non-string values are copied unchanged.
        """
        cleaned = {}
        for key, value in item.items():
            if isinstance(value, str):
                # split() with no argument splits on whitespace runs and drops
                # the ends, so one join does the strip and the collapse.
                value = " ".join(value.split())
            cleaned[key] = value
        return cleaned

    def _llm_enhanced_cleaning(self, item: Dict,
                               rules: Optional[Dict[str, Any]] = None) -> Dict:
        """Ask the LLM to clean ``item`` according to ``rules``.

        Returns:
            The LLM reply when it is a JSON object. Otherwise the original
            item with the raw reply under ``cleaning_suggestions``.
        """
        prompt = f"""清洗以下数据:
原始数据:
{json.dumps(item, indent=2, ensure_ascii=False)}
清洗规则:
{json.dumps(rules, indent=2, ensure_ascii=False) if rules else "标准清洗"}
请提供:
1. 清洗后的数据
2. 应用的清洗操作
3. 数据质量评估
4. 建议的进一步处理
"""
        cleaning_result = self.llm_client.generate(prompt, temperature=0.1)

        try:
            parsed = json.loads(cleaning_result)
        except json.JSONDecodeError:
            parsed = None

        # Only a JSON object can stand in for the record; a list, number or
        # string would break callers that expect a dict.
        if isinstance(parsed, dict):
            return parsed
        return {**item, "cleaning_suggestions": cleaning_result}

    def validate_data(self, data: List[Dict],
                      validation_schema: Dict[str, Any]) -> Dict[str, Any]:
        """Validate every record and summarise the outcome.

        Returns:
            A dict with ``total_items``, ``valid_items``, ``invalid_items``,
            ``validation_rate`` (0 for empty input) and per-record ``details``.
        """
        validation_results = [
            self._validate_item(item, validation_schema) for item in data
        ]

        valid_count = sum(1 for v in validation_results if v["is_valid"])
        invalid_count = len(validation_results) - valid_count

        return {
            "total_items": len(data),
            "valid_items": valid_count,
            "invalid_items": invalid_count,
            "validation_rate": valid_count / len(data) if data else 0,
            "details": validation_results
        }

    def _validate_item(self, item: Dict, schema: Dict) -> Dict[str, Any]:
        """Ask the LLM to validate one record against ``schema``.

        Returns:
            A dict with ``item``, the raw reply under ``validation_result``
            and ``is_valid``.
        """
        prompt = f"""验证以下数据项:
数据:
{json.dumps(item, indent=2, ensure_ascii=False)}
验证规则:
{json.dumps(schema, indent=2, ensure_ascii=False)}
请验证:
1. 字段完整性
2. 数据类型
3. 值范围
4. 格式要求
5. 业务规则
"""
        validation_result = self.llm_client.generate(prompt, temperature=0.1)

        # Simplified verdict: valid only if the reply contains "有效" and
        # does not contain "无效".
        return {
            "item": item,
            "validation_result": validation_result,
            "is_valid": "有效" in validation_result and "无效" not in validation_result
        }
# Usage example: clean two scraped product records, then validate them.
# `llm_client` must already exist; it is not defined in this snippet.
cleaner = SmartDataCleaner(llm_client)

# Sample input with stray whitespace, an embedded newline and a
# non-numeric price.
raw_data = [
    {
        "title": " 产品A ",
        "price": "¥199.00",
        "description": "优质产品\n性能卓越",
    },
    {
        "title": "产品B",
        "price": "价格面议",
        "description": "详情请咨询",
    },
]
cleaned_data = cleaner.clean_scraped_data(raw_data)
print(f"清洗完成,处理了 {len(cleaned_data)} 条数据")

# The schema is sent to the LLM as validation rules.
validation_schema = {
    "title": {"required": True, "max_length": 100},
    "price": {"required": True, "pattern": r"^\d+(\.\d{2})?$"},
}
validation_result = cleaner.validate_data(cleaned_data, validation_schema)
print(f"验证通过率: {validation_result['validation_rate']:.2%}")
数据清洗与验证是确保爬取数据质量的关键步骤。LLM能够理解数据含义,进行智能清洗,并根据业务规则验证数据的有效性。结合传统清洗方法和LLM增强清洗,可以显著提高数据质量,为后续分析和应用提供可靠的数据基础。