From acad6baadeaae0d5c5aa6836a32e7129f62c3d5e Mon Sep 17 00:00:00 2001 From: Qyir <13521889462@163.com> Date: Thu, 30 Oct 2025 10:20:04 +0800 Subject: [PATCH] =?UTF-8?q?=E8=A1=A5=E4=B8=81=E6=9C=BA=E5=88=B6=E4=BC=98?= =?UTF-8?q?=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/config.py | 4 +- .../handlers/Rankings/rank_data_scraper.py | 372 +++++++++--------- 2 files changed, 179 insertions(+), 197 deletions(-) diff --git a/backend/config.py b/backend/config.py index ee501e4..1ba92a7 100644 --- a/backend/config.py +++ b/backend/config.py @@ -3,9 +3,9 @@ import importlib # 数据库配置 MONGO_URI = "mongodb://localhost:27017" -# MONGO_DB_NAME = "Rankings" +MONGO_DB_NAME = "Rankings" # MONGO_URI = "mongodb://mongouser:Jdei2243afN@172.16.0.6:27017,172.16.0.4:27017/test?replicaSet=cmgo-r6qkaern_0&authSource=admin" -MONGO_DB_NAME = "kemeng_media" +# MONGO_DB_NAME = "kemeng_media" # 应用配置 APP_ENV = os.getenv('APP_ENV', 'development') diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py index 4444954..6c7bf68 100644 --- a/backend/handlers/Rankings/rank_data_scraper.py +++ b/backend/handlers/Rankings/rank_data_scraper.py @@ -2017,6 +2017,11 @@ class DouyinPlayVVScraper: logging.info(f'🚀 定时器模式:跳过视频 {video_id} 的评论滑动加载') return [] + # 首先检查视频是否真的没有评论(检测"抢首评"按钮) + if self._check_first_comment_button(): + logging.info(f'检测到视频 {video_id} 没有评论(存在"抢首评"按钮),跳过评论抓取') + return [] + all_comments = [] collected_comment_ids = set() @@ -2059,42 +2064,43 @@ class DouyinPlayVVScraper: # 很可能存在页面上可见但未被网络日志捕获的评论 # 智能判断是否需要执行补丁机制 + # 只在评论数量真正过少时才启用补丁机制 should_apply_patch = False - # 条件1: 评论数量较少(少于20条) - if len(all_comments) < 20: + # 只有当评论数量少于10条时才启用补丁机制 + if len(all_comments) < 10: should_apply_patch = True - logging.debug(f'评论数量较少({len(all_comments)}条),启用补丁机制') + logging.debug(f'评论数量过少({len(all_comments)}条),启用补丁机制') - # 条件2: 检查页面上是否有更多可见的评论元素 - try: - visible_comment_count = self.driver.execute_script(""" - var selectors = [ - '[data-e2e="comment-item"]', - '[class*="comment-item"]', - '[class*="comment-content"]' - ]; - var totalCount = 0; - selectors.forEach(function(selector) { - var elements = document.querySelectorAll(selector); - elements.forEach(function(element) { - if (element.offsetParent !== null && element.textContent.trim().length > 2) { - totalCount++; - } + # 对于评论数量在10-50条之间的情况,检查是否可能遗漏了评论 + elif len(all_comments) <= 50: + try: + visible_comment_count = self.driver.execute_script(""" + var selectors = [ + '[data-e2e="comment-item"]', + '[class*="comment-item"]', + '[class*="comment-content"]' + ]; + var totalCount = 0; + selectors.forEach(function(selector) { + var elements = document.querySelectorAll(selector); + elements.forEach(function(element) { + if (element.offsetParent !== null && element.textContent.trim().length > 2) { + totalCount++; + } + }); }); - }); - return totalCount; - """) - - if visible_comment_count > len(all_comments): - should_apply_patch = True - logging.debug(f'页面可见评论({visible_comment_count}条) > 已获取评论({len(all_comments)}条),启用补丁机制') + return totalCount; + """) - except Exception as e: - logging.debug(f'检查页面可见评论数量失败: {e}') - # 如果检查失败,对于少量评论的情况仍然执行补丁 - if len(all_comments) < 10: - should_apply_patch = True + # 只有当页面可见评论数量明显大于已获取数量时才启用补丁 + if visible_comment_count > len(all_comments) * 2: + should_apply_patch = True + logging.debug(f'页面可见评论({visible_comment_count}条) >> 已获取评论({len(all_comments)}条),启用补丁机制') + + except Exception as e: + logging.debug(f'检查页面可见评论数量失败: {e}') + # 检查失败时不启用补丁机制 patch_comments = [] if should_apply_patch: @@ -3221,12 +3227,12 @@ class DouyinPlayVVScraper: def _extract_comments_patch(self, video_id: str) -> list: """ - 评论抓取补丁机制 - 强制从页面元素中提取所有可见评论 - 针对评论较少但获取不全的情况,确保不遗漏任何页面上可见的评论 + 评论补丁机制 - 更仔细地重新从网络日志获取评论 + 不再抓取页面元素,而是重新触发评论加载并从API获取 """ comments = [] try: - logging.info(f'开始执行评论抓取补丁机制,视频ID: {video_id}') + logging.info(f'启动补丁机制,重新仔细获取视频 {video_id} 的评论...') # 首先检查是否存在"抢首评"按钮,如果存在说明视频确实没有评论 if self._check_first_comment_button(): @@ -3234,189 +3240,165 @@ class DouyinPlayVVScraper: return comments # 等待页面稳定 - time.sleep(1) + time.sleep(2) # 滚动到评论区域确保评论完全加载 self._scroll_to_comment_section() time.sleep(1) - # 针对少量评论优化的选择器列表 - comment_selectors = [ - # 抖音常用的评论选择器(优先级最高) - '[data-e2e="comment-item"]', - '[data-e2e="comment-list"] > div', - '[class*="comment-item"]', - '[class*="comment-content"]', - - # 针对少量评论的特殊选择器 - '[class*="comment"] [class*="content"]', - '[class*="Comment"] [class*="content"]', - '[class*="comment-text"]', - '[class*="user-comment"]', - - # 更通用的评论选择器 - 'div[class*="comment"]:not([class*="input"]):not([class*="button"]):not([class*="header"])', - 'li[class*="comment"]', - '[role="listitem"][class*="comment"]', - - # 备用选择器(用于兜底) - '.comment-list > div', - '.comment-container > div', - '.comment-wrapper > div', - - # 针对少量评论的深度选择器 - 'div[class*="comment"] p', - 'div[class*="comment"] span:not([class*="button"]):not([class*="icon"])', - '[data-e2e*="comment"] div[class*="text"]' - ] + # 点击评论区域,触发评论加载 + try: + self._click_comment_area() + time.sleep(1) + except: + pass - collected_texts = set() # 用于去重 + # 清理旧的网络日志 + self.driver.get_log('performance') - for selector in comment_selectors: + # 轻微滚动,触发更多评论加载 + for i in range(3): + self.driver.execute_script("window.scrollBy(0, 200);") + time.sleep(0.5) + self.driver.execute_script("window.scrollBy(0, -100);") + time.sleep(0.5) + + # 等待网络请求完成 + time.sleep(3) + + # 重新从网络日志中提取评论(更仔细的方式) + patch_comments = self._extract_comments_from_network_logs_detailed(video_id) + + if patch_comments: + logging.info(f'补丁机制成功重新获取 {len(patch_comments)} 条评论') + comments.extend(patch_comments) + else: + logging.info('补丁机制未找到额外评论') + + except Exception as e: + logging.error(f'评论补丁机制执行失败: {e}') + + return comments + + def _extract_comments_from_network_logs_detailed(self, video_id: str) -> list: + """ + 更详细地从网络日志中提取评论数据(补丁机制专用) + Args: + video_id: 视频ID + Returns: + list: 评论数据列表 + """ + comments = [] + try: + # 获取网络请求日志 + logs = self.driver.get_log('performance') + + for entry in logs: try: - logging.debug(f'尝试选择器: {selector}') - elements = self.driver.find_elements("css selector", selector) - - if elements: - logging.debug(f'选择器 {selector} 找到 {len(elements)} 个元素') + log = json.loads(entry['message'])['message'] + if ( + 'Network.responseReceived' in log['method'] + and 'response' in log['params'] + and log['params']['response'] + and log['params']['response'].get('url') + ): + url = log['params']['response']['url'] - for element in elements: + # 检查是否是评论相关的API(更宽泛的匹配) + comment_api_patterns = [ + '/aweme/v1/web/comment/list/', + '/comment/list/', + '/comment/detail/', + '/reply/list/' + ] + + is_comment_api = any(pattern in url for pattern in comment_api_patterns) + + if is_comment_api and video_id in url: try: - if not element.is_displayed(): - continue + # 获取响应体 + response_body = self.driver.execute_cdp_cmd( + 'Network.getResponseBody', + {'requestId': log['params']['requestId']} + ) + + if response_body and 'body' in response_body: + data = json.loads(response_body['body']) - # 获取评论文本 - comment_text = element.text.strip() - - # 过滤无效文本 - if not comment_text or len(comment_text) < 2: - continue + # 尝试多种可能的评论数据结构 + api_comments = [] - # 过滤系统文本和按钮文本 - skip_texts = [ - '回复', '点赞', '举报', '删除', '编辑', '查看更多', - '展开', '收起', '暂时没有更多评论', '加载中', - '发布', '取消', '确定', '登录', '注册', '关注' - ] - - # 检查是否为纯按钮文本 - if comment_text in skip_texts: - continue - - # 清理包含按钮文本的评论 - original_text = comment_text - for skip_text in skip_texts: - comment_text = comment_text.replace(skip_text, '').strip() - - # 如果清理后文本太短,跳过 - if len(comment_text) < 2: - continue - - # 去重检查 - if comment_text in collected_texts: - continue + # 标准结构 + if 'comments' in data: + api_comments = data['comments'] + # 备用结构 + elif 'comment_list' in data: + api_comments = data['comment_list'] + elif 'data' in data and isinstance(data['data'], list): + api_comments = data['data'] + elif 'data' in data and 'comments' in data['data']: + api_comments = data['data']['comments'] - collected_texts.add(comment_text) - - # 创建评论对象 - comment_info = { - 'text': comment_text, - 'user_name': '', - 'digg_count': 0, - 'create_time': 0, - 'source': f'patch_{selector}' # 标记来源 - } - - comments.append(comment_info) - logging.debug(f'补丁提取评论: {comment_text[:30]}...') - + for comment in api_comments: + if isinstance(comment, dict): + comment_text = comment.get('text', '') or comment.get('content', '') + if comment_text and len(comment_text.strip()) > 0: + comment_info = { + 'text': comment_text.strip(), + 'user_name': comment.get('user', {}).get('nickname', '') if comment.get('user') else '', + 'digg_count': int(comment.get('digg_count', 0) or comment.get('like_count', 0)), + 'create_time': comment.get('create_time', 0) or comment.get('timestamp', 0), + 'source': 'patch_api' + } + comments.append(comment_info) + + # 记录API URL信息,用于调试 + if api_comments: + logging.debug(f'补丁机制从API获取到 {len(api_comments)} 条评论: {url}') + except Exception as e: - logging.debug(f'处理评论元素失败: {e}') + logging.debug(f'补丁机制处理响应体失败: {e}') continue except Exception as e: - logging.debug(f'选择器 {selector} 执行失败: {e}') + logging.debug(f'补丁机制处理日志条目失败: {e}') + continue + + except Exception as e: + logging.error(f'补丁机制从网络日志提取评论失败: {e}') + + return comments + + def _click_comment_area(self): + """ + 点击评论区域,触发评论加载 + """ + try: + # 尝试多种方式点击评论区域 + comment_selectors = [ + '[data-e2e="comment-list"]', + '[class*="comment-list"]', + '[class*="comment-container"]', + '[class*="comment-area"]', + '[class*="comment-section"]' + ] + + for selector in comment_selectors: + try: + elements = self.driver.find_elements("css selector", selector) + if elements and elements[0].is_displayed(): + self.driver.execute_script("arguments[0].click();", elements[0]) + logging.debug(f'成功点击评论区域: {selector}') + return + except: continue - # 使用JavaScript进一步搜索评论文本(针对少量评论优化) - try: - js_comments = self.driver.execute_script(""" - var comments = []; - var processedTexts = new Set(); - - // 针对少量评论的优化选择器 - var selectors = [ - 'div[class*="comment"]', - 'li[class*="comment"]', - '[data-e2e*="comment"]', - '[class*="comment-item"]', - '[class*="comment-content"]' - ]; - - selectors.forEach(function(selector) { - try { - var elements = document.querySelectorAll(selector); - elements.forEach(function(element) { - if (element.offsetParent === null) return; // 跳过不可见元素 - - var text = element.textContent.trim(); - if (text.length < 2) return; - - // 过滤按钮文本 - var skipTexts = ['回复', '点赞', '举报', '删除', '编辑', '查看更多', '展开', '收起', '暂时没有更多评论', '关注']; - var isButtonText = skipTexts.some(function(skipText) { - return text === skipText; - }); - - if (isButtonText || processedTexts.has(text)) return; - - // 清理文本 - var cleanText = text; - skipTexts.forEach(function(skipText) { - cleanText = cleanText.replace(new RegExp(skipText, 'g'), '').trim(); - }); - - if (cleanText.length >= 2 && !processedTexts.has(cleanText)) { - processedTexts.add(cleanText); - comments.push({ - text: cleanText, - user_name: '', - digg_count: 0, - create_time: 0, - source: 'patch_js' - }); - } - }); - } catch (e) { - console.log('JS选择器执行失败:', selector, e); - } - }); - - return comments; - """) - - if js_comments: - # 去重合并 - existing_texts = {comment['text'] for comment in comments} - for js_comment in js_comments: - if js_comment['text'] not in existing_texts: - comments.append(js_comment) - existing_texts.add(js_comment['text']) - - logging.debug(f'JavaScript补丁额外提取 {len(js_comments)} 条评论') - - except Exception as e: - logging.debug(f'JavaScript评论提取失败: {e}') + # 如果没有找到特定的评论区域,尝试点击页面中部 + self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.6);") + time.sleep(0.5) - if comments: - logging.info(f'评论抓取补丁机制成功提取 {len(comments)} 条评论') - else: - logging.warning('评论抓取补丁机制未找到任何评论') - except Exception as e: - logging.error(f'评论抓取补丁机制执行失败: {e}') - - return comments + logging.debug(f'点击评论区域失败: {e}') def _check_first_comment_button(self) -> bool: """