""" 知乎爬取示例 演示如何使用 browser-session-crawler 爬取需要登录的内容 """ import asyncio import json from playwright.async_api import Page import sys import os # 添加脚本目录到路径 sys.path.append(os.path.dirname(os.path.abspath(__file__))) from crawl import crawl_with_session, get_default_chrome_user_data_dir async def crawl_zhihu_feed(page: Page): """爬取知乎首页动态""" # 等待内容加载 await page.wait_for_selector(".ContentItem", timeout=15000) # 获取内容项 items = await page.query_selector_all(".ContentItem") results = [] for item in items[:20]: # 取前20条 try: # 标题 title_elem = await item.query_selector(".ContentItem-title") title = await title_elem.inner_text() if title_elem else "" # 摘要 content_elem = await item.query_selector(".RichContent-inner") content = await content_elem.inner_text() if content_elem else "" # 作者 author_elem = await item.query_selector(".AuthorInfo-name") author = await author_elem.inner_text() if author_elem else "匿名" if title: results.append({ "title": title.strip(), "content": content.strip()[:200] + "..." if len(content) > 200 else content.strip(), "author": author.strip() }) except Exception as e: continue return results async def main(): print("="*50) print("🧪 知乎爬取示例") print("="*50) # 检查浏览器 user_data_dir = get_default_chrome_user_data_dir() if user_data_dir: print(f"✅ 找到浏览器数据: {user_data_dir}") else: print("⚠️ 未找到浏览器数据") # 执行爬取 result = await crawl_with_session( target_url="https://www.zhihu.com/", logged_in_indicator=".AppHeader-profile", # 知乎登录后出现的元素 crawl_function=crawl_zhihu_feed, # login_url="https://www.zhihu.com/signin", # 可选:登录页 ) # 输出结构化 JSON 供调用方解析 output = { "source": "zhihu", "items": result, "count": len(result) } print(json.dumps(output, ensure_ascii=False)) return result if __name__ == "__main__": asyncio.run(main())