"""
Xiaohongshu (RED) search crawler.

Automatically searches a keyword and extracts note images.
"""
|
||
|
||
import asyncio
|
||
import argparse
|
||
import os
|
||
import sys
|
||
from pathlib import Path
|
||
from playwright.async_api import async_playwright, Page
|
||
|
||
|
||
def get_default_chrome_user_data_dir():
    """Locate the default browser user-data directory on Windows.

    Probes Chrome first, then Edge, under %LOCALAPPDATA%.

    Returns:
        The profile directory as a string, or None when the environment
        variable is missing or neither browser profile exists.
    """
    base = os.environ.get('LOCALAPPDATA')
    if not base:
        return None

    # Known per-browser profile locations, in preference order.
    candidates = (
        Path(base) / "Google" / "Chrome" / "User Data",
        Path(base) / "Microsoft" / "Edge" / "User Data",
    )
    for profile_dir in candidates:
        if profile_dir.exists():
            return str(profile_dir)

    return None
|
||
|
||
|
||
def show_login_notification():
    """Alert the user that a Xiaohongshu login is required.

    Tries a native Windows message box first; on any failure (e.g. not
    running on Windows, so ctypes.windll is unavailable) falls back to a
    console banner.
    """
    try:
        import ctypes

        # 0x40 = MB_ICONINFORMATION, 0x0 = MB_OK
        ctypes.windll.user32.MessageBoxW(
            0,
            "检测到未登录状态\n\n请在浏览器中完成小红书登录,然后关闭浏览器窗口继续",
            "需要登录小红书",
            0x40 | 0x0
        )
    except Exception:
        banner = "=" * 50
        print("\n" + banner)
        print("⚠️ 请在浏览器中登录小红书...")
        print(banner + "\n")
|
||
|
||
|
||
async def check_login(page: "Page", timeout=5000):
    """Check Xiaohongshu login state by probing several indicators.

    Args:
        page: Playwright page already navigated to xiaohongshu.com.
        timeout: Kept for interface compatibility; per-selector probes use
            a short fixed 2 s timeout so the whole scan stays quick.
            NOTE(review): the original never used this parameter either.

    Returns:
        True when any login indicator (a selector match or a marker string
        in the page HTML) is found, False otherwise — including on any
        unexpected error.
    """
    try:
        # Several possible login indicators (site markup changes often).
        selectors = [
            ".user.side-bar-component",   # user avatar nav item
            "a[href*='/user/profile']",   # link to own profile page
            "[src*='sns-avatar']",        # Xiaohongshu avatar image
            "[class*='user']",            # any class containing "user"
            ".avatar",                    # avatar
            "[class*='avatar']",          # any class containing "avatar"
            "img[src*='avatar']",         # avatar <img>
            ".user-nickname",             # nickname
            "text=创作中心",              # "Creator Center" button
            "text=业务合作",              # "Business cooperation" button
            ".publish-btn",               # publish button
            ".global-nav"                 # global nav bar
        ]

        for selector in selectors:
            try:
                await page.wait_for_selector(selector, timeout=2000)
                print(f"✅ 检测到登录元素: {selector}")
                return True
            except Exception:
                # BUGFIX: was a bare `except:`, which also swallows
                # asyncio.CancelledError and breaks task cancellation.
                continue

        # Nothing matched — fall back to scanning the raw page HTML.
        html = await page.content()
        login_indicators = ['退出登录', '个人主页', '创作中心', '业务合作', 'sns-avatar', '/user/profile/']
        for indicator in login_indicators:
            if indicator in html:
                print(f"✅ 通过页面内容检测到已登录: {indicator}")
                return True

        return False
    except Exception:
        # Treat any unexpected failure (page closed, navigation torn down)
        # as "not logged in" rather than crashing the caller.
        return False
|
||
|
||
|
||
async def wait_for_login(page: "Page", timeout_seconds: float = 300):
    """Block until the user completes login in the browser, or time out.

    Shows a notification, then polls check_login() every 2 seconds for up
    to `timeout_seconds` (default 300 s = the 5 minutes mentioned in the
    printed message).

    Args:
        page: Playwright page showing the Xiaohongshu site.
        timeout_seconds: Maximum time to wait for a successful login.

    Returns:
        True once logged in, False on timeout.
    """
    show_login_notification()
    print("⏳ 等待登录中...(最多 5 分钟)")
    print("💡 提示: 请确保在浏览器中完成登录,然后点击确定按钮继续")

    # BUGFIX: use time.monotonic() instead of time.time() — wall-clock
    # jumps (NTP sync, DST) must not shorten or extend the login window.
    import time
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        if await check_login(page, timeout=3000):
            print("✅ 登录成功!")
            return True
        await asyncio.sleep(2)

    print("❌ 登录超时")
    return False
|
||
|
||
|
||
async def crawl_xiaohongshu(keyword: str, count: int = 20, save_dir: str = None):
    """Crawl Xiaohongshu search results for `keyword`.

    Opens a visible browser (reusing the system Chrome/Edge profile when
    found, so existing login cookies carry over), runs the search, scrolls
    to lazy-load more results, then extracts up to `count` image URLs and
    note titles. Optionally saves them to text files.

    Args:
        keyword: Search term.
        count: Maximum number of images/titles to collect.
        save_dir: Optional directory; when given, links are written to
            "<keyword>_links.txt" and titles to "<keyword>_titles.txt".

    Returns:
        (results, titles): list of image URLs and list of title strings.

    Raises:
        Exception: when the user does not complete login in time.
    """
    user_data_dir = get_default_chrome_user_data_dir()

    # Build the search URL.
    # NOTE(review): type=51 presumably filters the result type — confirm.
    search_url = f"https://www.xiaohongshu.com/search_result?keyword={keyword}&type=51"

    async with async_playwright() as p:
        if user_data_dir:
            print(f"📁 使用系统浏览器")
            # Persistent context reuses the system profile so existing
            # cookies (login state) are available.
            browser = await p.chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                headless=False,
                args=["--disable-blink-features=AutomationControlled"]
            )
        else:
            print("⚠️ 未找到系统浏览器,使用临时配置")
            browser = await p.chromium.launch(headless=False)

        try:
            page = await browser.new_page()

            print(f"🔍 搜索: {keyword}")
            await page.goto(search_url, wait_until="networkidle", timeout=60000)

            # Give client-side rendering a moment to settle.
            await asyncio.sleep(3)

            # Search results are login-gated; authenticate if needed.
            is_logged_in = await check_login(page)
            if not is_logged_in:
                print("🔐 需要登录小红书")
                success = await wait_for_login(page)
                if not success:
                    raise Exception("未完成登录")
                # Reload the search now that the session is authenticated.
                await page.goto(search_url, wait_until="networkidle", timeout=60000)
                await asyncio.sleep(3)

            # Scroll to trigger lazy-loading of more results.
            print("📜 加载更多内容...")
            for _ in range(5):
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await asyncio.sleep(2)

            # Extract images.
            print("🖼️ 提取图片中...")

            # Several selector variants — the site's class names change often.
            img_elements = await page.query_selector_all(
                '.note-item img, .cover-img img, .img img, [class*="cover"] img, .item img'
            )

            results = []
            for i, img in enumerate(img_elements[:count]):
                try:
                    src = await img.get_attribute('src')
                    if src and ('http' in src or '//' in src):
                        # Protocol-relative URL -> absolute.
                        if src.startswith('//'):
                            src = 'https:' + src
                        results.append(src)
                        print(f" [{i+1}] {src[:80]}...")
                except Exception:
                    # Skip detached/stale elements; best-effort extraction.
                    continue

            # Extract titles.
            title_elements = await page.query_selector_all(
                '.note-item .title, .title-content, [class*="title"]'
            )
            titles = []
            for title in title_elements[:count]:
                try:
                    text = await title.inner_text()
                    if text:
                        titles.append(text.strip())
                except Exception:
                    # BUGFIX: was a bare `except:` (swallows CancelledError).
                    pass

            # Report results.
            print(f"\n{'='*50}")
            print(f"📊 爬取结果: 共 {len(results)} 张图片")
            print(f"{'='*50}")

            # Persist to files when requested.
            if save_dir:
                save_path = Path(save_dir)
                save_path.mkdir(parents=True, exist_ok=True)

                # Image links.
                links_file = save_path / f"{keyword}_links.txt"
                with open(links_file, "w", encoding="utf-8") as f:
                    f.write(f"# 小红书搜索: {keyword}\n")
                    f.write(f"# 图片数量: {len(results)}\n\n")
                    for i, src in enumerate(results):
                        f.write(f"{i+1}. {src}\n")
                print(f"💾 已保存链接到: {links_file}")

                # Titles.
                if titles:
                    titles_file = save_path / f"{keyword}_titles.txt"
                    with open(titles_file, "w", encoding="utf-8") as f:
                        f.write(f"# 小红书搜索: {keyword}\n\n")
                        for i, t in enumerate(titles):
                            f.write(f"{i+1}. {t}\n")
                    # BUGFIX: this print referenced titles_file even when no
                    # titles were found (NameError); moved inside `if titles:`.
                    print(f"💾 已保存标题到: {titles_file}")

            return results, titles

        finally:
            await browser.close()
|
||
|
||
|
||
def main():
    """CLI entry point: parse arguments, run the crawl, emit a JSON summary."""
    import json

    arg_parser = argparse.ArgumentParser(description="小红书搜索爬取")
    arg_parser.add_argument("keyword", help="搜索关键词")
    arg_parser.add_argument("--count", type=int, default=20, help="爬取数量")
    arg_parser.add_argument("--save", help="保存目录")
    opts = arg_parser.parse_args()

    images, note_titles = asyncio.run(
        crawl_xiaohongshu(
            keyword=opts.keyword,
            count=opts.count,
            save_dir=opts.save
        )
    )

    # Structured JSON on stdout so a calling process can parse the result.
    summary = {
        "keyword": opts.keyword,
        "images": images,
        "titles": note_titles,
        "count": len(images)
    }
    print(json.dumps(summary, ensure_ascii=False))


if __name__ == "__main__":
    main()
|