From b26852b95f05c5d50cf690160b315bbbd4cbb987 Mon Sep 17 00:00:00 2001 From: Qyir <13521889462@163.com> Date: Wed, 29 Oct 2025 18:49:19 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E5=A2=9E=E5=BC=BA=E4=BA=86?= =?UTF-8?q?=E8=8E=B7=E5=8F=96=E8=AF=84=E8=AE=BA=E5=86=85=E5=AE=B9=E6=9C=BA?= =?UTF-8?q?=E5=88=B6=EF=BC=8C=20=E7=8E=B0=E5=9C=A8=E6=8A=93=E5=8F=96?= =?UTF-8?q?=E8=AF=84=E8=AE=BA=E5=86=85=E5=AE=B9=E7=9A=84=E6=96=B9=E6=B3=95?= =?UTF-8?q?=E6=98=AF=E7=BD=91=E7=BB=9C=E6=97=A5=E5=BF=97=E6=8B=A6=E6=88=AA?= =?UTF-8?q?=E6=8A=96=E9=9F=B3API=E7=9A=84=E6=96=B9=E6=B3=95=EF=BC=8C=20?= =?UTF-8?q?=E8=BF=99=E4=B8=AA=E8=8E=B7=E5=8F=96=E7=9A=84=E8=AF=84=E8=AE=BA?= =?UTF-8?q?=E5=86=85=E5=AE=B9=E6=98=AF=E9=9A=8F=E6=9C=BA=E7=9A=84=EF=BC=8C?= =?UTF-8?q?=E4=BD=86=E6=98=AF=E9=87=8C=E9=9D=A2=E6=B7=BB=E5=8A=A0=E4=BA=86?= =?UTF-8?q?=E5=8E=BB=E9=87=8D=E7=9A=84=E6=9C=BA=E5=88=B6=EF=BC=8C=20?= =?UTF-8?q?=E9=92=88=E5=AF=B9=E4=BA=8E=E8=BF=87=E5=B0=91=E7=9A=84=E8=AF=84?= =?UTF-8?q?=E8=AE=BA=E5=92=8C=E6=97=A0=E8=AF=84=E8=AE=BA=E5=81=9A=E4=BA=86?= =?UTF-8?q?=E4=BC=98=E5=8C=96=EF=BC=8C=20=E6=B7=BB=E5=8A=A0=E7=9B=91?= =?UTF-8?q?=E6=8E=A7=E8=B6=85=E6=97=B6=E9=99=90=E5=88=B6=EF=BC=8C=E7=88=AC?= =?UTF-8?q?=E5=8F=96=E8=AF=84=E8=AE=BA=E5=86=85=E5=AE=B9=E4=B8=80=E6=9D=A1?= =?UTF-8?q?=E8=A7=86=E9=A2=91=E7=9B=91=E6=8E=A7=E8=B6=85=E6=97=B6=E6=97=B6?= =?UTF-8?q?=E9=97=B4=E4=B8=BA5=E5=B0=8F=E6=97=B6=EF=BC=88=E5=9B=A0?= =?UTF-8?q?=E4=B8=BA=E6=9C=89=E7=9A=84=E8=A7=86=E9=A2=91=E8=AF=84=E8=AE=BA?= =?UTF-8?q?=E6=95=B0=E9=87=8F=E5=A4=AA=E5=A4=A7=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../handlers/Rankings/rank_data_scraper.py | 390 +++++++++++++++++- 1 file changed, 384 insertions(+), 6 deletions(-) diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py index 289ce19..4444954 100644 --- a/backend/handlers/Rankings/rank_data_scraper.py +++ b/backend/handlers/Rankings/rank_data_scraper.py @@ -79,7 +79,6 @@ class DouyinPlayVVScraper: self.db = None self.collection = None self.image_cache = {} # 图片ID到TOS链接的缓存映射 {image_id: tos_url} - self.all_collected_comments = [] # 存储所有收集到的评论数据 # 实时存储相关属性 self.batch_id = str(uuid.uuid4()) # 每次运行的唯一标识 @@ -1770,6 +1769,12 @@ class DouyinPlayVVScraper: self.update_single_video_details(document_id, i, video_id, video_details, mix_name) else: logging.warning(f'[增量更新] 第 {i} 集视频详细数据获取失败: {mix_name}') + + # 添加随机延迟避免请求过快,模拟人类行为 + if i < len(episode_video_ids): # 不是最后一个视频时才延迟 + random_delay = self.anti_detection.get_human_like_delay() + logging.info(f'🕐 [增量更新] 视频间隔等待时间: {random_delay:.1f}秒') + time.sleep(random_delay) except Exception as e: logging.error(f'[增量更新] 处理第 {i} 集视频时出错: {mix_name} - {e}') @@ -2037,8 +2042,8 @@ class DouyinPlayVVScraper: # 提交滑动任务 scroll_future = executor.submit(self._async_scroll_task_with_state, max_scroll_attempts, scroll_delay, shared_state) - # 同时提交监控任务 - 监控任务会检测滑动任务状态 - monitor_future = executor.submit(self._async_monitor_task_with_state, video_id, collected_comment_ids, shared_state, 3600, + # 同时提交监控任务 - 监控任务会检测滑动任务状态(5小时超时) + monitor_future = executor.submit(self._async_monitor_task_with_state, video_id, collected_comment_ids, shared_state, 18000, document_id, episode_number, mix_name) # 等待两个任务完成 @@ -2049,10 +2054,90 @@ class DouyinPlayVVScraper: logging.info(f'评论滑动加载完成,共收集到 {len(all_comments)} 条评论') + # 针对评论较少的情况,执行补丁机制确保不遗漏评论 + # 当滑动次数较少(可能只滑动了2-3次就到底)但评论数量也较少时, + # 很可能存在页面上可见但未被网络日志捕获的评论 + + # 智能判断是否需要执行补丁机制 + should_apply_patch = False + + # 条件1: 评论数量较少(少于20条) + if len(all_comments) < 20: + should_apply_patch = True + logging.debug(f'评论数量较少({len(all_comments)}条),启用补丁机制') + + # 条件2: 检查页面上是否有更多可见的评论元素 + try: + visible_comment_count = self.driver.execute_script(""" + var selectors = [ + '[data-e2e="comment-item"]', + '[class*="comment-item"]', + '[class*="comment-content"]' + ]; + var totalCount = 0; + selectors.forEach(function(selector) { + var elements = document.querySelectorAll(selector); + elements.forEach(function(element) { + if (element.offsetParent !== null && element.textContent.trim().length > 2) { + totalCount++; + } + }); + }); + return totalCount; + """) + + if visible_comment_count > len(all_comments): + should_apply_patch = True + logging.debug(f'页面可见评论({visible_comment_count}条) > 已获取评论({len(all_comments)}条),启用补丁机制') + + except Exception as e: + logging.debug(f'检查页面可见评论数量失败: {e}') + # 如果检查失败,对于少量评论的情况仍然执行补丁 + if len(all_comments) < 10: + should_apply_patch = True + + patch_comments = [] + if should_apply_patch: + logging.info('执行评论补丁机制...') + patch_comments = self._extract_comments_patch(video_id) + else: + logging.debug('无需执行补丁机制') + + if patch_comments: + # 去重合并补丁评论 + existing_texts = {comment.get('text', '') for comment in all_comments} + new_patch_comments = [] + + for patch_comment in patch_comments: + if patch_comment.get('text', '') not in existing_texts: + new_patch_comments.append(patch_comment) + existing_texts.add(patch_comment.get('text', '')) + + if new_patch_comments: + all_comments.extend(new_patch_comments) + logging.info(f'补丁机制额外获取到 {len(new_patch_comments)} 条评论,总计 {len(all_comments)} 条评论') + + # 如果有新的评论且启用了实时保存,更新数据库 + if document_id and episode_number and new_patch_comments: + try: + self.update_video_comments_realtime(document_id, episode_number, new_patch_comments, mix_name) + logging.info(f'实时保存补丁评论到数据库: {len(new_patch_comments)} 条') + except Exception as e: + logging.warning(f'实时保存补丁评论失败: {e}') + else: + logging.debug('补丁机制未发现新的评论') + else: + logging.debug('补丁机制未获取到任何评论') + # 保存评论到文件 if all_comments: self.save_comments_to_file(all_comments, video_id) + # 添加随机停留时间,防止网页被爬取崩溃 + rest_time = random.uniform(10, 20) # 10-20秒随机停留 + logging.info(f'评论抓取完成,停留 {rest_time:.1f} 秒以保护网页稳定性...') + time.sleep(rest_time) + return all_comments except Exception as e: @@ -3099,8 +3184,10 @@ class DouyinPlayVVScraper: video_details['episode_number'] = i video_details_list.append(video_details) - # 添加延迟避免请求过快 - time.sleep(2) + # 添加随机延迟避免请求过快,模拟人类行为 + random_delay = self.anti_detection.get_human_like_delay() + logging.info(f'🕐 视频间隔等待时间: {random_delay:.1f}秒') + time.sleep(random_delay) # exit(0) except Exception as e: @@ -3132,6 +3219,297 @@ class DouyinPlayVVScraper: self.cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()} return self.cookies + def _extract_comments_patch(self, video_id: str) -> list: + """ + 评论抓取补丁机制 - 强制从页面元素中提取所有可见评论 + 针对评论较少但获取不全的情况,确保不遗漏任何页面上可见的评论 + """ + comments = [] + try: + logging.info(f'开始执行评论抓取补丁机制,视频ID: {video_id}') + + # 首先检查是否存在"抢首评"按钮,如果存在说明视频确实没有评论 + if self._check_first_comment_button(): + logging.info('检测到"抢首评"按钮,确认视频没有评论,跳过补丁机制') + return comments + + # 等待页面稳定 + time.sleep(1) + + # 滚动到评论区域确保评论完全加载 + self._scroll_to_comment_section() + time.sleep(1) + + # 针对少量评论优化的选择器列表 + comment_selectors = [ + # 抖音常用的评论选择器(优先级最高) + '[data-e2e="comment-item"]', + '[data-e2e="comment-list"] > div', + '[class*="comment-item"]', + '[class*="comment-content"]', + + # 针对少量评论的特殊选择器 + '[class*="comment"] [class*="content"]', + '[class*="Comment"] [class*="content"]', + '[class*="comment-text"]', + '[class*="user-comment"]', + + # 更通用的评论选择器 + 'div[class*="comment"]:not([class*="input"]):not([class*="button"]):not([class*="header"])', + 'li[class*="comment"]', + '[role="listitem"][class*="comment"]', + + # 备用选择器(用于兜底) + '.comment-list > div', + '.comment-container > div', + '.comment-wrapper > div', + + # 针对少量评论的深度选择器 + 'div[class*="comment"] p', + 'div[class*="comment"] span:not([class*="button"]):not([class*="icon"])', + '[data-e2e*="comment"] div[class*="text"]' + ] + + collected_texts = set() # 用于去重 + + for selector in comment_selectors: + try: + logging.debug(f'尝试选择器: {selector}') + elements = self.driver.find_elements("css selector", selector) + + if elements: + logging.debug(f'选择器 {selector} 找到 {len(elements)} 个元素') + + for element in elements: + try: + if not element.is_displayed(): + continue + + # 获取评论文本 + comment_text = element.text.strip() + + # 过滤无效文本 + if not comment_text or len(comment_text) < 2: + continue + + # 过滤系统文本和按钮文本 + skip_texts = [ + '回复', '点赞', '举报', '删除', '编辑', '查看更多', + '展开', '收起', '暂时没有更多评论', '加载中', + '发布', '取消', '确定', '登录', '注册', '关注' + ] + + # 检查是否为纯按钮文本 + if comment_text in skip_texts: + continue + + # 清理包含按钮文本的评论 + original_text = comment_text + for skip_text in skip_texts: + comment_text = comment_text.replace(skip_text, '').strip() + + # 如果清理后文本太短,跳过 + if len(comment_text) < 2: + continue + + # 去重检查 + if comment_text in collected_texts: + continue + + collected_texts.add(comment_text) + + # 创建评论对象 + comment_info = { + 'text': comment_text, + 'user_name': '', + 'digg_count': 0, + 'create_time': 0, + 'source': f'patch_{selector}' # 标记来源 + } + + comments.append(comment_info) + logging.debug(f'补丁提取评论: {comment_text[:30]}...') + + except Exception as e: + logging.debug(f'处理评论元素失败: {e}') + continue + + except Exception as e: + logging.debug(f'选择器 {selector} 执行失败: {e}') + continue + + # 使用JavaScript进一步搜索评论文本(针对少量评论优化) + try: + js_comments = self.driver.execute_script(""" + var comments = []; + var processedTexts = new Set(); + + // 针对少量评论的优化选择器 + var selectors = [ + 'div[class*="comment"]', + 'li[class*="comment"]', + '[data-e2e*="comment"]', + '[class*="comment-item"]', + '[class*="comment-content"]' + ]; + + selectors.forEach(function(selector) { + try { + var elements = document.querySelectorAll(selector); + elements.forEach(function(element) { + if (element.offsetParent === null) return; // 跳过不可见元素 + + var text = element.textContent.trim(); + if (text.length < 2) return; + + // 过滤按钮文本 + var skipTexts = ['回复', '点赞', '举报', '删除', '编辑', '查看更多', '展开', '收起', '暂时没有更多评论', '关注']; + var isButtonText = skipTexts.some(function(skipText) { + return text === skipText; + }); + + if (isButtonText || processedTexts.has(text)) return; + + // 清理文本 + var cleanText = text; + skipTexts.forEach(function(skipText) { + cleanText = cleanText.replace(new RegExp(skipText, 'g'), '').trim(); + }); + + if (cleanText.length >= 2 && !processedTexts.has(cleanText)) { + processedTexts.add(cleanText); + comments.push({ + text: cleanText, + user_name: '', + digg_count: 0, + create_time: 0, + source: 'patch_js' + }); + } + }); + } catch (e) { + console.log('JS选择器执行失败:', selector, e); + } + }); + + return comments; + """) + + if js_comments: + # 去重合并 + existing_texts = {comment['text'] for comment in comments} + for js_comment in js_comments: + if js_comment['text'] not in existing_texts: + comments.append(js_comment) + existing_texts.add(js_comment['text']) + + logging.debug(f'JavaScript补丁额外提取 {len(js_comments)} 条评论') + + except Exception as e: + logging.debug(f'JavaScript评论提取失败: {e}') + + if comments: + logging.info(f'评论抓取补丁机制成功提取 {len(comments)} 条评论') + else: + logging.warning('评论抓取补丁机制未找到任何评论') + + except Exception as e: + logging.error(f'评论抓取补丁机制执行失败: {e}') + + return comments + + def _check_first_comment_button(self) -> bool: + """ + 检测是否存在"抢首评"按钮,如果存在说明视频确实没有评论 + Returns: + bool: True表示检测到抢首评按钮(视频没有评论),False表示没有检测到 + """ + try: + # 常见的"抢首评"相关文本 + first_comment_indicators = [ + "抢首评", "首评" + ] + + logging.debug('检测"抢首评"按钮...') + + # 方法1: 通过文本内容检测 + for indicator in first_comment_indicators: + try: + # 使用XPath查找包含指定文本的元素 + xpath_selectors = [ + f"//*[contains(text(), '{indicator}')]", + f"//button[contains(text(), '{indicator}')]", + f"//div[contains(text(), '{indicator}')]", + f"//span[contains(text(), '{indicator}')]" + ] + + for xpath in xpath_selectors: + elements = self.driver.find_elements("xpath", xpath) + if elements: + for element in elements: + try: + if element.is_displayed() and element.text.strip(): + logging.debug(f'检测到抢首评相关文本: "{element.text.strip()}"') + return True + except Exception: + continue + except Exception: + continue + + # 方法2: 通过CSS选择器检测评论输入框的占位符文本 + try: + comment_input_selectors = [ + 'input[placeholder*="抢首评"]', + 'textarea[placeholder*="抢首评"]', + '[data-e2e="comment-input"]', + '[class*="comment-input"]' + ] + + for selector in comment_input_selectors: + elements = self.driver.find_elements("css selector", selector) + if elements: + for element in elements: + try: + placeholder = element.get_attribute('placeholder') or '' + if any(indicator in placeholder for indicator in first_comment_indicators): + logging.debug(f'检测到抢首评输入框占位符: "{placeholder}"') + return True + except Exception: + continue + except Exception: + pass + + # 方法3: 检测评论区域是否显示空状态 + try: + empty_comment_selectors = [ + '[class*="empty"]', + '[class*="no-comment"]', + '[class*="comment-empty"]', + '[data-e2e="comment-empty"]' + ] + + for selector in empty_comment_selectors: + elements = self.driver.find_elements("css selector", selector) + if elements: + for element in elements: + try: + if element.is_displayed(): + text = element.text.strip() + if any(indicator in text for indicator in first_comment_indicators): + logging.debug(f'检测到空评论状态: "{text}"') + return True + except Exception: + continue + except Exception: + pass + + logging.debug('未检测到抢首评按钮或相关标识') + return False + + except Exception as e: + logging.debug(f'检测抢首评按钮时出错: {e}') + return False + def run(self): try: self.setup_driver() @@ -3194,4 +3572,4 @@ if __name__ == '__main__': # 普通模式下不设置任何环境变量,所有函数都正常运行 print('=== Selenium+CDP 抖音play_vv抓取器 ===') scraper = DouyinPlayVVScraper(args.url, auto_continue=args.auto, duration_s=args.duration) - scraper.run() \ No newline at end of file + scraper.run()