248 lines
8.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
小红书搜索爬取脚本
自动搜索关键词并提取笔记图片
"""
import argparse
import asyncio
import os
import sys
import time
from pathlib import Path
from urllib.parse import quote

from playwright.async_api import async_playwright, Page
def get_default_chrome_user_data_dir():
    """Locate the default Chrome/Edge user-data directory on Windows.

    Checks Chrome first, then Edge, under %LOCALAPPDATA%.

    Returns:
        The first existing profile directory as a string, or None when
        LOCALAPPDATA is unset/empty or neither browser directory exists.
    """
    base = os.environ.get('LOCALAPPDATA')
    if not base:
        return None
    candidates = (
        Path(base) / "Google" / "Chrome" / "User Data",
        Path(base) / "Microsoft" / "Edge" / "User Data",
    )
    for candidate in candidates:
        if candidate.exists():
            return str(candidate)
    return None
def show_login_notification():
    """Alert the user that a Xiaohongshu login is required.

    Attempts a native Windows message box via ctypes; on any failure
    (non-Windows platform, missing windll) it falls back to printing a
    console banner instead.
    """
    try:
        import ctypes
        # 0x40 = MB_ICONINFORMATION, 0x0 = MB_OK.
        ctypes.windll.user32.MessageBoxW(
            0,
            "检测到未登录状态\n\n请在浏览器中完成小红书登录,然后关闭浏览器窗口继续",
            "需要登录小红书",
            0x40 | 0x0
        )
    except Exception:
        banner = "=" * 50
        print("\n" + banner)
        print("⚠️ 请在浏览器中登录小红书...")
        print(banner + "\n")
async def check_login(page: "Page", timeout=5000):
    """Check the Xiaohongshu login state of the given page.

    First probes a series of DOM selectors that are only present for
    logged-in users, then falls back to scanning the raw page HTML for
    known logged-in markers.

    Args:
        page: Playwright page currently on xiaohongshu.com.
        timeout: overall time budget in milliseconds for the selector
            probes (previously accepted but ignored — now honored; the
            HTML fallback still runs regardless).

    Returns:
        True if any login indicator is found, False otherwise.
    """
    try:
        # Candidate login indicators (updated from user-provided HTML).
        selectors = [
            ".user.side-bar-component",   # user avatar nav item
            "a[href*='/user/profile']",   # personal profile link
            "[src*='sns-avatar']",        # Xiaohongshu avatar image
            "[class*='user']",            # any class containing "user"
            ".avatar",                    # avatar
            "[class*='avatar']",          # any class containing "avatar"
            "img[src*='avatar']",         # avatar image
            ".user-nickname",             # nickname
            "text=创作中心",              # "creator center" button
            "text=业务合作",              # "business cooperation" button
            ".publish-btn",               # publish button
            ".global-nav"                 # global navigation
        ]
        # Spread the overall `timeout` budget over the probes, keeping the
        # original 2s cap per selector. monotonic() is immune to wall-clock
        # adjustments.
        deadline = time.monotonic() + timeout / 1000.0
        for selector in selectors:
            remaining_ms = (deadline - time.monotonic()) * 1000.0
            if remaining_ms <= 0:
                break
            try:
                await page.wait_for_selector(
                    selector, timeout=min(2000, remaining_ms)
                )
                print(f"✅ 检测到登录元素: {selector}")
                return True
            except Exception:
                # Selector not found within its slice of the budget.
                continue
        # Fallback: look for logged-in markers in the raw HTML.
        html = await page.content()
        login_indicators = ['退出登录', '个人主页', '创作中心', '业务合作', 'sns-avatar', '/user/profile/']
        for indicator in login_indicators:
            if indicator in html:
                print(f"✅ 通过页面内容检测到已登录: {indicator}")
                return True
        return False
    except Exception:
        # Narrowed from a bare `except:` so asyncio.CancelledError and
        # KeyboardInterrupt are no longer swallowed; any page error is
        # treated as "not logged in".
        return False
async def wait_for_login(page: "Page"):
    """Block until the user completes the Xiaohongshu login in the browser.

    Shows a login notification, then polls the page for login indicators
    every 2 seconds for up to 5 minutes.

    Args:
        page: Playwright page on xiaohongshu.com.

    Returns:
        True once login is detected, False if the 5-minute window expires.
    """
    show_login_notification()
    print("⏳ 等待登录中...(最多 5 分钟)")
    print("💡 提示: 请确保在浏览器中完成登录,然后点击确定按钮继续")
    # Use a monotonic deadline: time.time() (the previous implementation)
    # can jump backwards/forwards with system clock adjustments and break
    # the 5-minute window.
    deadline = time.monotonic() + 300
    while time.monotonic() < deadline:
        if await check_login(page, timeout=3000):
            print("✅ 登录成功!")
            return True
        await asyncio.sleep(2)
    print("❌ 登录超时")
    return False
async def crawl_xiaohongshu(keyword: str, count: int = 20, save_dir: str = None):
    """Search Xiaohongshu for `keyword` and scrape note images and titles.

    Launches a visible browser (reusing the system Chrome/Edge profile
    when available so an existing login session is picked up), performs
    the search, waits for the user to log in if necessary, scrolls to
    trigger lazy loading, then extracts image URLs and note titles.

    Args:
        keyword: search term.
        count: maximum number of images/titles to collect.
        save_dir: optional directory; when given, results are written to
            "<keyword>_links.txt" and (if any) "<keyword>_titles.txt".

    Returns:
        (results, titles): list of image URLs and list of title strings.

    Raises:
        Exception: when login is required but not completed in time.
    """
    user_data_dir = get_default_chrome_user_data_dir()
    # Percent-encode the keyword so spaces and reserved characters survive
    # the query string (previously interpolated raw).
    search_url = f"https://www.xiaohongshu.com/search_result?keyword={quote(keyword)}&type=51"
    async with async_playwright() as p:
        if user_data_dir:
            print(f"📁 使用系统浏览器")
            browser = await p.chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                headless=False,
                args=["--disable-blink-features=AutomationControlled"]
            )
        else:
            print("⚠️ 未找到系统浏览器,使用临时配置")
            browser = await p.chromium.launch(headless=False)
        try:
            page = await browser.new_page()
            print(f"🔍 搜索: {keyword}")
            await page.goto(search_url, wait_until="networkidle", timeout=60000)
            # Let dynamically rendered content settle.
            await asyncio.sleep(3)
            # Search results are gated behind login; prompt if needed.
            is_logged_in = await check_login(page)
            if not is_logged_in:
                print("🔐 需要登录小红书")
                success = await wait_for_login(page)
                if not success:
                    raise Exception("未完成登录")
                # Reload the search now that the session is valid.
                await page.goto(search_url, wait_until="networkidle", timeout=60000)
                await asyncio.sleep(3)
            # Scroll to trigger lazy-loading of additional results.
            print("📜 加载更多内容...")
            for _ in range(5):
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await asyncio.sleep(2)
            # Extract images -- several selectors because the card layout varies.
            print("🖼️ 提取图片中...")
            img_elements = await page.query_selector_all(
                '.note-item img, .cover-img img, .img img, [class*="cover"] img, .item img'
            )
            results = []
            for i, img in enumerate(img_elements[:count]):
                try:
                    src = await img.get_attribute('src')
                    if src and ('http' in src or '//' in src):
                        # Protocol-relative URL -> absolute.
                        if src.startswith('//'):
                            src = 'https:' + src
                        results.append(src)
                        print(f" [{i+1}] {src[:80]}...")
                except Exception:
                    # A detached/stale element is not fatal; skip it.
                    continue
            # Extract note titles.
            title_elements = await page.query_selector_all(
                '.note-item .title, .title-content, [class*="title"]'
            )
            titles = []
            for title in title_elements[:count]:
                try:
                    text = await title.inner_text()
                    if text:
                        titles.append(text.strip())
                except Exception:
                    pass
            # Summarize results.
            print(f"\n{'='*50}")
            print(f"📊 爬取结果: 共 {len(results)} 张图片")
            print(f"{'='*50}")
            # Optionally persist results to disk.
            if save_dir:
                save_path = Path(save_dir)
                save_path.mkdir(parents=True, exist_ok=True)
                links_file = save_path / f"{keyword}_links.txt"
                with open(links_file, "w", encoding="utf-8") as f:
                    f.write(f"# 小红书搜索: {keyword}\n")
                    f.write(f"# 图片数量: {len(results)}\n\n")
                    for i, src in enumerate(results):
                        f.write(f"{i+1}. {src}\n")
                print(f"💾 已保存链接到: {links_file}")
                if titles:
                    titles_file = save_path / f"{keyword}_titles.txt"
                    with open(titles_file, "w", encoding="utf-8") as f:
                        f.write(f"# 小红书搜索: {keyword}\n\n")
                        for i, t in enumerate(titles):
                            f.write(f"{i+1}. {t}\n")
                    # BUG FIX: this print previously ran unconditionally and
                    # raised UnboundLocalError when no titles were collected.
                    print(f"💾 已保存标题到: {titles_file}")
            return results, titles
        finally:
            await browser.close()
def main():
    """CLI entry point: parse arguments, run the crawl, emit JSON on stdout."""
    import json

    parser = argparse.ArgumentParser(description="小红书搜索爬取")
    parser.add_argument("keyword", help="搜索关键词")
    parser.add_argument("--count", type=int, default=20, help="爬取数量")
    parser.add_argument("--save", help="保存目录")
    args = parser.parse_args()

    images, titles = asyncio.run(crawl_xiaohongshu(
        keyword=args.keyword,
        count=args.count,
        save_dir=args.save
    ))

    # Structured JSON so a calling process can parse the result.
    payload = {
        "keyword": args.keyword,
        "images": images,
        "titles": titles,
        "count": len(images)
    }
    print(json.dumps(payload, ensure_ascii=False))


if __name__ == "__main__":
    main()