From e0f3dc223c24cc125d059a2a9bb2c979f9acc38f Mon Sep 17 00:00:00 2001
From: hjjjj <1311711287@qq.com>
Date: Thu, 26 Feb 2026 18:15:34 +0800
Subject: [PATCH] Update skill: browser-session-crawler by Anonymous

---
 browser-session-crawler/SKILL.md         | 144 ++++++++++
 browser-session-crawler/scripts/crawl.py | 252 ++++++++++++++++++
 .../scripts/example_zhihu.py             |  85 ++++++
 .../scripts/xiaohongshu.py               | 247 +++++++++++++++++
 4 files changed, 728 insertions(+)
 create mode 100644 browser-session-crawler/SKILL.md
 create mode 100644 browser-session-crawler/scripts/crawl.py
 create mode 100644 browser-session-crawler/scripts/example_zhihu.py
 create mode 100644 browser-session-crawler/scripts/xiaohongshu.py

diff --git a/browser-session-crawler/SKILL.md b/browser-session-crawler/SKILL.md
new file mode 100644
index 0000000..004f680
--- /dev/null
+++ b/browser-session-crawler/SKILL.md
@@ -0,0 +1,144 @@
---
name: browser-session-crawler
description: Crawl websites using your logged-in Chrome/Edge browser session. Automatically reuses existing login state; if you are not logged in, it shows a popup reminding you to log in, then continues automatically. Ideal for sites requiring authentication (social media, communities, admin panels, etc.).
compatibility: Requires Python 3.8+. Dependencies: playwright (pip install playwright && playwright install chromium)
---

# Browser Session Crawler

Crawl websites using your system's logged-in Chrome/Edge browser session.

## Core Features

- **🔐 Automatic Session Reuse** - Uses the Chrome/Edge user data directory, so there is no need to log in again
- **⏳ Login Reminder** - Detects the unauthenticated state, shows a popup reminder, and continues after you log in
- **🌐 Real Browser Environment** - Runs non-headless, which is less likely to trigger anti-bot detection
- **📱 Pre-built Crawlers** - Ready-to-use scripts for Xiaohongshu (Redbook), Zhihu, and more

## Installation

```bash
pip install playwright
playwright install chromium
```

## Quick Start

### Xiaohongshu Crawler (Recommended)

```bash
# Search for beach beauty photos
python scripts/xiaohongshu.py "beach beauty" --count 20

# Search for any keyword
python scripts/xiaohongshu.py "your keyword"
```

**Parameters:**

| Parameter | Required | Description |
|-----------|----------|-------------|
| `keyword` | ✅ | Search keyword |
| `--count` | No | Number of items to crawl (default: 20) |
| `--save` | No | Directory to save extracted image links and titles |

**Examples:**

```bash
# Crawl 50 results and save the extracted links and titles to the imgs folder
python scripts/xiaohongshu.py "beach beauty" --count 50 --save imgs
```

### Generic Crawler

```bash
python scripts/crawl.py "target_url" --logged-indicator "login_indicator" --selector "css_selector"
```

| Parameter | Required | Description |
|-----------|----------|-------------|
| `target_url` | ✅ | Target page URL |
| `--logged-indicator` | ✅ | CSS selector that appears only after login |
| `--login-url` | No | Login page URL to open when not logged in |
| `--selector` | No | CSS selector for elements to extract |
| `--wait` | No | Seconds to wait after page load (default: 3) |
| `--scroll` | No | Scroll the page to trigger lazy loading |
| `--max-length` | No | Maximum character count for output |
| `--save` | No | Save output to a file |

## Pre-built Scripts

| Script | Function | Example |
|--------|----------|---------|
| `xiaohongshu.py` | Xiaohongshu search crawler | `python scripts/xiaohongshu.py "food"` |
| `crawl.py` | Generic webpage crawler | `python scripts/crawl.py "url" --logged-indicator "..."` |
| `example_zhihu.py` | Zhihu crawler example | - |
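
## Output Format

Each script prints a single-line JSON object as the final line of its standard output, so a caller can parse the result programmatically. A minimal sketch of consuming it from another process (the target URL and selector below are illustrative assumptions, not tested values):

```python
import json
import subprocess

# Run the generic crawler and capture its stdout
proc = subprocess.run(
    [
        "python", "scripts/crawl.py",
        "https://example.com",                 # hypothetical target
        "--logged-indicator", ".user-avatar",  # hypothetical login marker
    ],
    capture_output=True, text=True, check=True,
)

# Progress messages come first; the JSON object is the last line
data = json.loads(proc.stdout.strip().splitlines()[-1])
print(data["url"], len(data["content"]))
```
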
## Common Site Configurations

### Xiaohongshu (Redbook)

```bash
# Search page crawling (automatically extracts images)
python scripts/xiaohongshu.py "beach beauty"

# Generic method
python scripts/crawl.py "https://www.xiaohongshu.com/search_result?keyword=beauty" --logged-indicator ".user-avatar" --selector ".note-item"
```

### Zhihu

```bash
python scripts/crawl.py "https://www.zhihu.com/topic/19550517/hot" --logged-indicator ".AppHeader-profile" --selector ".List-item" --scroll
```

### Weibo

```bash
python scripts/crawl.py "https://weibo.com/hot/search" --logged-indicator ".user-name" --selector ".list_pub" --scroll
```

## Login Detection

The `--logged-indicator` selector is used to detect the login state:
- Element found → logged in, proceed with crawling
- Timeout (not found) → show login reminder → continue after login

**Common Login Indicators:**

| Site | Selector |
|------|----------|
| Xiaohongshu | `.user-avatar`, `.profile-avatar`, `.user-name` |
| Zhihu | `.AppHeader-profile`, `.UserAvatar` |
| LinkedIn | `.global-nav__me-wrapper` |
| Weibo | `.user-name`, `.m-text-cut` |

## Workflow

```
1. Detect the system browser user data directory
   ↓
2. Launch Chromium (reusing the logged-in session)
   ↓
3. Navigate to the target page
   ↓
4. Check login status
   ├─ Logged in → continue
   └─ Not logged in → show login reminder → wait for login
   ↓
5. Crawl
   ↓
6. Save results
```

## Troubleshooting

| Issue | Solution |
|-------|----------|
| Browser launch failed | Close Chrome/Edge first; a running browser locks the user data directory |
| Login detection failed | Point `--logged-indicator` at a selector that actually appears after login |
| Empty content | Increase `--wait 5` or add `--scroll` |
| Page stuck | Try `--headless` mode (login may not work there) |

diff --git a/browser-session-crawler/scripts/crawl.py b/browser-session-crawler/scripts/crawl.py
new file mode 100644
index 0000000..26bf58d
--- /dev/null
+++ b/browser-session-crawler/scripts/crawl.py
@@ -0,0 +1,252 @@
"""
Browser Session Crawler - generic crawling module.
Crawls pages by reusing the login state of the system browser.
"""

import asyncio
import argparse
import json
import os
from pathlib import Path
from typing import Optional, Callable, Any
from playwright.async_api import async_playwright, Page


def get_default_chrome_user_data_dir() -> Optional[str]:
    """Locate the default browser user data directory on Windows."""
    local_app_data = os.environ.get('LOCALAPPDATA')
    if not local_app_data:
        return None

    # Try Chrome first
    chrome_path = Path(local_app_data) / "Google" / "Chrome" / "User Data"
    if chrome_path.exists():
        return str(chrome_path)

    # Fall back to Edge
    edge_path = Path(local_app_data) / "Microsoft" / "Edge" / "User Data"
    if edge_path.exists():
        return str(edge_path)

    return None


def show_login_notification():
    """Show a Windows popup reminding the user to log in."""
    try:
        import ctypes
        ctypes.windll.user32.MessageBoxW(
            0,
            "You are not logged in.\n\nPlease finish logging in in the browser, then close this dialog to continue.",
            "Login required",
            0x40 | 0x0  # MB_ICONINFORMATION | MB_OK
        )
    except Exception:
        print("\n" + "=" * 50)
        print("⚠️ Please finish logging in in the browser...")
        print("=" * 50 + "\n")


async def check_login_status(page: Page, indicator: str, timeout: int = 3000) -> bool:
    """Return True if the login indicator appears within the timeout."""
    try:
        await page.wait_for_selector(indicator, timeout=timeout)
        return True
    except Exception:
        return False
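
# Note: check_login_status blocks for the full timeout when the user is NOT
# logged in, so a shorter timeout speeds up the cold-start path. A usage
# sketch (the selector here is an illustrative assumption):
#
#   if not await check_login_status(page, ".user-avatar", timeout=2000):
#       await wait_for_login(page, ".user-avatar")
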
async def wait_for_login(page: Page, indicator: str, login_url: Optional[str] = None):
    """Wait for the user to complete login in the browser."""
    if login_url:
        await page.goto(login_url)

    show_login_notification()

    print("Waiting for login... (up to 5 minutes)")
    try:
        await page.wait_for_selector(indicator, timeout=300000)
        print("✅ Login successful!")
        return True
    except Exception:
        print("❌ Login timed out")
        return False


async def crawl_with_session(
    target_url: str,
    logged_in_indicator: str,
    crawl_function: Callable[[Page], Any],
    login_url: Optional[str] = None,
    use_system_profile: bool = True,
    headless: bool = False
) -> Any:
    """Crawl a page while reusing the system browser session."""

    user_data_dir = None

    if use_system_profile:
        user_data_dir = get_default_chrome_user_data_dir()
        if user_data_dir:
            print(f"📁 Using system browser profile: {user_data_dir}")
        else:
            print("⚠️ System browser profile not found; using a temporary profile")

    async with async_playwright() as p:
        launch_options = {
            "headless": headless,
            "args": [
                "--disable-blink-features=AutomationControlled",
                "--disable-web-security",
            ]
        }

        if user_data_dir:
            browser = await p.chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                **launch_options
            )
        else:
            browser = await p.chromium.launch(**launch_options)

        try:
            page = await browser.new_page()

            print(f"🌐 Visiting: {target_url}")
            await page.goto(target_url, wait_until="networkidle")

            # Check login status
            is_logged_in = await check_login_status(page, logged_in_indicator)

            if not is_logged_in:
                print("🔐 Login required")
                success = await wait_for_login(page, logged_in_indicator, login_url)
                if not success:
                    raise Exception("User did not complete login")

                # Reload the target page with the fresh session
                await page.goto(target_url, wait_until="networkidle")

            # Run the caller-supplied crawl function
            print("🔍 Crawling...")
            result = await crawl_function(page)
            return result

        finally:
            await browser.close()
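
# Usage sketch for crawl_with_session, reusing the Zhihu selectors listed in
# SKILL.md (live selectors drift; verify them before relying on this):
#
#   async def grab_items(page):
#       els = await page.query_selector_all(".List-item")
#       return [await el.inner_text() for el in els]
#
#   items = asyncio.run(crawl_with_session(
#       target_url="https://www.zhihu.com/topic/19550517/hot",
#       logged_in_indicator=".AppHeader-profile",
#       crawl_function=grab_items,
#   ))
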
async def simple_crawl(
    target_url: str,
    logged_in_indicator: str,
    selector: Optional[str] = None,
    login_url: Optional[str] = None,
    wait: int = 3,
    scroll: bool = False,
    max_length: int = 0,
    save_path: Optional[str] = None,
    headless: bool = False
) -> str:
    """Simple crawl: extract page content as plain text."""

    user_data_dir = get_default_chrome_user_data_dir()

    async with async_playwright() as p:
        if user_data_dir:
            browser = await p.chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                headless=headless,
                args=["--disable-blink-features=AutomationControlled"]
            )
        else:
            browser = await p.chromium.launch(headless=headless)

        try:
            page = await browser.new_page()

            print(f"🌐 Visiting: {target_url}")
            await page.goto(target_url, wait_until="networkidle")

            # Extra settle time after load
            if wait > 0:
                await asyncio.sleep(wait)

            # Scroll to trigger lazy loading
            if scroll:
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await asyncio.sleep(2)

            # Check login status
            is_logged_in = await check_login_status(page, logged_in_indicator)

            if not is_logged_in:
                print("🔐 Login required")
                success = await wait_for_login(page, logged_in_indicator, login_url)
                if not success:
                    raise Exception("User did not complete login")
                await page.goto(target_url, wait_until="networkidle")

            # Extract content
            if selector:
                print(f"📦 Extracting with selector: {selector}")
                elements = await page.query_selector_all(selector)
                results = []
                for el in elements:
                    text = await el.inner_text()
                    results.append(text.strip())
                output = "\n\n---\n\n".join(results)
            else:
                print("📄 Extracting full page text...")
                output = await page.evaluate("document.body.innerText")

            # Truncate if requested
            if max_length > 0 and len(output) > max_length:
                output = output[:max_length] + "\n\n... (content truncated)"

            # Save to a file if requested
            if save_path:
                with open(save_path, "w", encoding="utf-8") as f:
                    f.write(output)
                print(f"💾 Saved to: {save_path}")

            return output

        finally:
            await browser.close()


def main():
    parser = argparse.ArgumentParser(description="Crawl a webpage using the system browser session")
    parser.add_argument("target_url", help="Target URL")
    parser.add_argument("--logged-indicator", required=True, help="Selector that appears only after login")
    parser.add_argument("--login-url", help="Login page URL")
    parser.add_argument("--selector", help="CSS selector (extract matching elements only)")
    parser.add_argument("--wait", type=int, default=3, help="Seconds to wait after page load")
    parser.add_argument("--scroll", action="store_true", help="Scroll the page to trigger lazy loading")
    parser.add_argument("--max-length", type=int, default=0, help="Maximum character count")
    parser.add_argument("--save", help="Save output to a file")
    parser.add_argument("--headless", action="store_true", help="Headless mode (login may not work)")

    args = parser.parse_args()

    result = asyncio.run(simple_crawl(
        target_url=args.target_url,
        logged_in_indicator=args.logged_indicator,
        selector=args.selector,
        login_url=args.login_url,
        wait=args.wait,
        scroll=args.scroll,
        max_length=args.max_length,
        save_path=args.save,
        headless=args.headless
    ))

    # Emit structured JSON for callers to parse
    output = {
        "url": args.target_url,
        "content": result,
    }
    print(json.dumps(output, ensure_ascii=False))


if __name__ == "__main__":
    main()

diff --git a/browser-session-crawler/scripts/example_zhihu.py b/browser-session-crawler/scripts/example_zhihu.py
new file mode 100644
index 0000000..9175432
--- /dev/null
+++ b/browser-session-crawler/scripts/example_zhihu.py
@@ -0,0 +1,85 @@
"""
Zhihu crawling example.
Demonstrates how to use browser-session-crawler to fetch content that requires login.
"""

import asyncio
import json
import os
import sys

from playwright.async_api import Page

# Make sibling modules importable when run as a script
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from crawl import crawl_with_session, get_default_chrome_user_data_dir


async def crawl_zhihu_feed(page: Page):
    """Scrape the Zhihu home feed."""

    # Wait for content to load
    await page.wait_for_selector(".ContentItem", timeout=15000)

    # Collect feed items
    items = await page.query_selector_all(".ContentItem")

    results = []
    for item in items[:20]:  # first 20 items
        try:
            # Title
            title_elem = await item.query_selector(".ContentItem-title")
            title = await title_elem.inner_text() if title_elem else ""

            # Excerpt
            content_elem = await item.query_selector(".RichContent-inner")
            content = await content_elem.inner_text() if content_elem else ""

            # Author
            author_elem = await item.query_selector(".AuthorInfo-name")
            author = await author_elem.inner_text() if author_elem else "Anonymous"

            if title:
                results.append({
                    "title": title.strip(),
                    "content": (content.strip()[:200] + "...") if len(content) > 200 else content.strip(),
                    "author": author.strip()
                })
        except Exception:
            continue

    return results
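
# Adapting the feed scraper to another site is mostly a matter of swapping
# selectors. A sketch using the Weibo selectors from SKILL.md (illustrative;
# verify them in devtools first):
#
#   async def crawl_weibo_hot(page):
#       await page.wait_for_selector(".list_pub", timeout=15000)
#       cards = await page.query_selector_all(".list_pub")
#       return [(await c.inner_text()).strip() for c in cards[:20]]
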
async def main():
    print("=" * 50)
    print("🧪 Zhihu crawling example")
    print("=" * 50)

    # Check for a browser profile
    user_data_dir = get_default_chrome_user_data_dir()
    if user_data_dir:
        print(f"✅ Found browser profile: {user_data_dir}")
    else:
        print("⚠️ No browser profile found")

    # Run the crawl
    result = await crawl_with_session(
        target_url="https://www.zhihu.com/",
        logged_in_indicator=".AppHeader-profile",  # element Zhihu shows after login
        crawl_function=crawl_zhihu_feed,
        # login_url="https://www.zhihu.com/signin",  # optional: login page
    )

    # Emit structured JSON for callers to parse
    output = {
        "source": "zhihu",
        "items": result,
        "count": len(result)
    }
    print(json.dumps(output, ensure_ascii=False))

    return result


if __name__ == "__main__":
    asyncio.run(main())

diff --git a/browser-session-crawler/scripts/xiaohongshu.py b/browser-session-crawler/scripts/xiaohongshu.py
new file mode 100644
index 0000000..1d32e8c
--- /dev/null
+++ b/browser-session-crawler/scripts/xiaohongshu.py
@@ -0,0 +1,247 @@
"""
Xiaohongshu (Redbook) search crawler.
Searches for a keyword and extracts note images.
"""

import asyncio
import argparse
import json
import os
import time
from pathlib import Path
from urllib.parse import quote
from playwright.async_api import async_playwright, Page


def get_default_chrome_user_data_dir():
    """Locate the default browser user data directory on Windows."""
    local_app_data = os.environ.get('LOCALAPPDATA')
    if not local_app_data:
        return None

    chrome_path = Path(local_app_data) / "Google" / "Chrome" / "User Data"
    if chrome_path.exists():
        return str(chrome_path)

    edge_path = Path(local_app_data) / "Microsoft" / "Edge" / "User Data"
    if edge_path.exists():
        return str(edge_path)

    return None


def show_login_notification():
    """Show a login reminder."""
    try:
        import ctypes
        ctypes.windll.user32.MessageBoxW(
            0,
            "You are not logged in.\n\nPlease log in to Xiaohongshu in the browser, then close this dialog to continue.",
            "Xiaohongshu login required",
            0x40 | 0x0  # MB_ICONINFORMATION | MB_OK
        )
    except Exception:
        print("\n" + "=" * 50)
        print("⚠️ Please log in to Xiaohongshu in the browser...")
        print("=" * 50 + "\n")


async def check_login(page: Page, timeout=5000):
    """Check Xiaohongshu login status against several possible indicators."""
    try:
        # Try multiple login indicators (updated from user-provided page HTML)
        selectors = [
            ".user.side-bar-component",   # user avatar nav item
            "a[href*='/user/profile']",   # profile page link
            "[src*='sns-avatar']",        # Xiaohongshu avatar image
            "[class*='user']",            # classes containing "user"
            ".avatar",                    # avatar
            "[class*='avatar']",          # classes containing "avatar"
            "img[src*='avatar']",         # avatar images
            ".user-nickname",             # username
            "text=创作中心",               # "Creator Center" button
            "text=业务合作",               # "Business Cooperation" button
            ".publish-btn",               # publish button
            ".global-nav"                 # global navigation
        ]

        for selector in selectors:
            try:
                await page.wait_for_selector(selector, timeout=2000)
                print(f"✅ Found login indicator: {selector}")
                return True
            except Exception:
                continue

        # Fall back to scanning the page HTML for strings that only appear
        # when logged in (the Chinese literals must stay as-is; they match
        # on-site text: "log out", "profile", "Creator Center", etc.)
        html = await page.content()
        login_indicators = ['退出登录', '个人主页', '创作中心', '业务合作', 'sns-avatar', '/user/profile/']
        for indicator in login_indicators:
            if indicator in html:
                print(f"✅ Detected login via page content: {indicator}")
                return True

        return False
    except Exception:
        return False


async def wait_for_login(page: Page):
    """Poll until the user completes login (up to 5 minutes)."""
    show_login_notification()
    print("⏳ Waiting for login... (up to 5 minutes)")
    print("💡 Tip: finish logging in in the browser, then dismiss the dialog to continue")

    # Keep polling after the dialog is dismissed
    start_time = time.time()
    while time.time() - start_time < 300:
        if await check_login(page, timeout=3000):
            print("✅ Login successful!")
            return True
        await asyncio.sleep(2)

    print("❌ Login timed out")
    return False
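
# The crawler below saves image *links*; downloading the files is left to the
# caller. A hedged sketch using urllib (Xiaohongshu CDNs may require a Referer
# header; the header value here is an assumption):
#
#   from urllib.request import Request, urlopen
#
#   def download(url, dest):
#       req = Request(url, headers={"Referer": "https://www.xiaohongshu.com/"})
#       with urlopen(req) as resp, open(dest, "wb") as f:
#           f.write(resp.read())
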
f"https://www.xiaohongshu.com/search_result?keyword={keyword}&type=51" + + async with async_playwright() as p: + if user_data_dir: + print(f"📁 使用系统浏览器") + browser = await p.chromium.launch_persistent_context( + user_data_dir=user_data_dir, + headless=False, + args=["--disable-blink-features=AutomationControlled"] + ) + else: + print("⚠️ 未找到系统浏览器,使用临时配置") + browser = await p.chromium.launch(headless=False) + + try: + page = await browser.new_page() + + print(f"🔍 搜索: {keyword}") + await page.goto(search_url, wait_until="networkidle", timeout=60000) + + # 等待加载 + await asyncio.sleep(3) + + # 检查登录 + is_logged_in = await check_login(page) + if not is_logged_in: + print("🔐 需要登录小红书") + success = await wait_for_login(page) + if not success: + raise Exception("未完成登录") + await page.goto(search_url, wait_until="networkidle", timeout=60000) + await asyncio.sleep(3) + + # 滚动加载更多 + print("📜 加载更多内容...") + for _ in range(5): + await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await asyncio.sleep(2) + + # 提取图片 + print("🖼️ 提取图片中...") + + # 小红书图片选择器 - 多种可能的选择器 + img_elements = await page.query_selector_all( + '.note-item img, .cover-img img, .img img, [class*="cover"] img, .item img' + ) + + results = [] + for i, img in enumerate(img_elements[:count]): + try: + src = await img.get_attribute('src') + if src and ('http' in src or '//' in src): + # 处理相对路径 + if src.startswith('//'): + src = 'https:' + src + results.append(src) + print(f" [{i+1}] {src[:80]}...") + except Exception as e: + continue + + # 提取标题 + title_elements = await page.query_selector_all( + '.note-item .title, .title-content, [class*="title"]' + ) + titles = [] + for title in title_elements[:count]: + try: + text = await title.inner_text() + if text: + titles.append(text.strip()) + except: + pass + + # 输出结果 + print(f"\n{'='*50}") + print(f"📊 爬取结果: 共 {len(results)} 张图片") + print(f"{'='*50}") + + # 保存到文件 + if save_dir: + save_path = Path(save_dir) + save_path.mkdir(parents=True, exist_ok=True) + + # 保存图片链接 + links_file = save_path / f"{keyword}_links.txt" + with open(links_file, "w", encoding="utf-8") as f: + f.write(f"# 小红书搜索: {keyword}\n") + f.write(f"# 图片数量: {len(results)}\n\n") + for i, src in enumerate(results): + f.write(f"{i+1}. {src}\n") + + # 保存标题 + if titles: + titles_file = save_path / f"{keyword}_titles.txt" + with open(titles_file, "w", encoding="utf-8") as f: + f.write(f"# 小红书搜索: {keyword}\n\n") + for i, t in enumerate(titles): + f.write(f"{i+1}. {t}\n") + + print(f"💾 已保存链接到: {links_file}") + print(f"💾 已保存标题到: {titles_file}") + + return results, titles + + finally: + await browser.close() + + +def main(): + import json + + parser = argparse.ArgumentParser(description="小红书搜索爬取") + parser.add_argument("keyword", help="搜索关键词") + parser.add_argument("--count", type=int, default=20, help="爬取数量") + parser.add_argument("--save", help="保存目录") + + args = parser.parse_args() + + results, titles = asyncio.run(crawl_xiaohongshu( + keyword=args.keyword, + count=args.count, + save_dir=args.save + )) + + # 输出结构化 JSON 供调用方解析 + output = { + "keyword": args.keyword, + "images": results, + "titles": titles, + "count": len(results) + } + print(json.dumps(output, ensure_ascii=False)) + + +if __name__ == "__main__": + main()