diff --git a/backend/Timer_worker.py b/backend/Timer_worker.py
index f2c947c..7a2e3b3 100644
--- a/backend/Timer_worker.py
+++ b/backend/Timer_worker.py
@@ -69,6 +69,27 @@ class DouyinAutoScheduler:
         # Create the logger instance
         self.logger = logging.getLogger(__name__)
 
+    def _sync_episode_details_with_lock(self, episode_details, comments_summary):
+        """
+        Handle the comment-lock logic when syncing episode_details.
+        If comments_summary is present, keep the comment content and only update interaction data.
+
+        Args:
+            episode_details: episode_details from the management database
+            comments_summary: the comments-summary field
+
+        Returns:
+            The processed episode_details
+        """
+        # Without a comments_summary or without episode_details, return the data unchanged
+        if not comments_summary or not episode_details:
+            return episode_details
+
+        # A non-empty comments_summary means the comment content is locked, so return the
+        # management-database data as-is: it already holds the locked comments.
+        self.logger.info('🔒 comments_summary detected; episode_details stays locked (comment content preserved)')
+        return episode_details
+
     def _normalize_play_vv(self, play_vv):
         """Normalize the play-count data type, converting strings to numbers"""
         if isinstance(play_vv, str):
@@ -407,7 +428,11 @@ class DouyinAutoScheduler:
             "desc": management_data.get("desc", "") if management_data else "",
             "updated_to_episode": management_data.get("updated_to_episode", 0) if management_data else 0,
             "episode_video_ids": management_data.get("episode_video_ids", []) if management_data else [],
-            "episode_details": management_data.get("episode_details", []) if management_data else [],
+            # 🔒 episode_details sync logic: if comments_summary exists, keep the comment content but update interaction data
+            "episode_details": self._sync_episode_details_with_lock(
+                management_data.get("episode_details", []) if management_data else [],
+                management_data.get("comments_summary", "") if management_data else ""
+            ),
             "data_status": management_data.get("data_status", "") if management_data else "",
             "realtime_saved": management_data.get("realtime_saved", True) if management_data else True,
             "created_at": management_data.get("created_at") if management_data else None,
diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py
index cda7dc7..65abe70 100644
--- a/backend/handlers/Rankings/rank_data_scraper.py
+++ b/backend/handlers/Rankings/rank_data_scraper.py
@@ -2373,6 +2373,20 @@ class DouyinPlayVVScraper:
             # Check whether a record for this drama already exists
             existing_doc = target_collection.find_one({'mix_id': mix_id})
 
+            # 🔒 Protect the comment data held in the existing episode_details
+            final_episode_details = target_doc.get('episode_details', [])
+            if existing_doc and existing_doc.get('episode_details'):
+                existing_episode_details = existing_doc.get('episode_details', [])
+                # Merge the existing comment data into the new episode_details (paired by list position)
+                for i, new_episode in enumerate(final_episode_details):
+                    if i < len(existing_episode_details):
+                        existing_episode = existing_episode_details[i]
+                        # Keep the existing comment data, if any
+                        existing_comments = existing_episode.get('comments', [])
+                        if existing_comments:
+                            new_episode['comments'] = existing_comments
+                            logging.info(f'[comment protection] Keeping {len(existing_comments)} existing comments for episode {i+1}: {mix_name}')
+
             # Prepare the fields to update (lock fields excluded; they are handled separately below)
             set_fields = {
                 # Field order as specified by the user
@@ -2394,7 +2408,7 @@ class DouyinPlayVVScraper:
                 'desc': target_doc.get('desc', ''),
                 'updated_to_episode': target_doc.get('updated_to_episode', 0),
                 'episode_video_ids': target_doc.get('episode_video_ids', []),
-                'episode_details': target_doc.get('episode_details', []),
+                'episode_details': final_episode_details,  # use the merged episode_details
                 'data_status': target_doc.get('data_status', ''),
                 'realtime_saved': target_doc.get('realtime_saved', True),
                 'created_at': target_doc.get('created_at', datetime.now()),
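The comment-protection block above pairs old and new episodes purely by list position. Isolated as a standalone function, that merge logic reduces to a few lines; a minimal sketch with a hypothetical helper name, assuming each element is a dict with an optional `comments` list as in the documents this diff manipulates:

```python
def merge_preserved_comments(new_episodes, old_episodes):
    """Positional merge: keep an old episode's non-empty comments, take everything else from the new data."""
    for i, new_ep in enumerate(new_episodes):
        if i < len(old_episodes):
            old_comments = old_episodes[i].get('comments', [])
            if old_comments:  # only carry over when the old episode actually has comments
                new_ep['comments'] = old_comments
    return new_episodes


merged = merge_preserved_comments(
    [{'episode_number': 1, 'comments': []}],                         # freshly scraped, comments not fetched yet
    [{'episode_number': 1, 'comments': [{'text': 'great show'}]}],   # existing document
)
assert merged[0]['comments'][0]['text'] == 'great show'
```

Note that the positional pairing assumes episode order never changes between scrapes; if episodes could be reordered or renumbered, matching on `episode_number` rather than the list index would be the safer key.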
@@ -2617,7 +2631,7 @@ class DouyinPlayVVScraper:
             logging.error(f'[incremental update] Failed to update the video-ID list: {mix_name} - error: {e}')
             return []
 
-    def update_single_video_details(self, document_id, episode_number: int, video_id: str, video_details: dict, mix_name: str):
+    def update_single_video_details(self, document_id, episode_number: int, video_id: str, video_details: dict, mix_name: str, mix_id: str = ''):
         """Update the detailed data of a single video (phase-3 incremental update)"""
         target_collection = self.collection  # use the collection selected for the current mode
         if not self.realtime_save_enabled or target_collection is None or not document_id:
@@ -2631,6 +2645,36 @@ class DouyinPlayVVScraper:
             return False
 
         try:
+            # 🔒 Check for comments_summary; if present, keep the existing comments
+            # Note: always check the Rankings_management database, since it is the single source of truth for lock fields
+            existing_comments = []
+            if self.management_collection is not None:
+                # Prefer querying by mix_id, since it is a unique and stable identifier;
+                # fall back to document_id (_id) when mix_id is unavailable
+                query = {'mix_id': mix_id} if mix_id else {'_id': document_id}
+                logging.info(f'🔍 [comment lock] Checking the Rankings_management database: query={query}, episode_number={episode_number}')
+                doc = self.management_collection.find_one(query)
+                if doc:
+                    logging.info(f'🔍 [comment lock] Document found: mix_name={doc.get("mix_name")}, has_comments_summary={bool(doc.get("comments_summary"))}')
+                    if doc.get('comments_summary'):
+                        # Fetch the existing comment data
+                        episode_details = doc.get('episode_details', [])
+                        logging.info(f'🔍 [comment lock] episode_details length: {len(episode_details)}')
+                        if episode_number - 1 < len(episode_details):
+                            existing_episode = episode_details[episode_number - 1]
+                            existing_comments = existing_episode.get('comments', [])
+                            logging.info(f'🔍 [comment lock] Episode {episode_number} existing comment count: {len(existing_comments)}')
+                            if existing_comments:
+                                logging.info(f'🔒 comments_summary detected; keeping the {len(existing_comments)} existing comments')
+                        else:
+                            logging.warning(f'⚠️ [comment lock] episode_number={episode_number} is out of range for episode_details (length={len(episode_details)})')
+                    else:
+                        logging.info('🔍 [comment lock] comments_summary is empty; new comments will be scraped')
+                else:
+                    logging.warning(f'⚠️ [comment lock] No document found for query={query}')
+            else:
+                logging.warning('⚠️ [comment lock] management_collection is not initialized')
+
             # Build the updated video detail info
             episode_info = {
                 'episode_number': episode_number,
@@ -2641,7 +2685,8 @@ class DouyinPlayVVScraper:
                 'likes_formatted': self.format_interaction_count(video_details.get('likes', 0)),
                 'shares_formatted': self.format_interaction_count(video_details.get('shares', 0)),
                 'favorites_formatted': self.format_interaction_count(video_details.get('favorites', 0)),
-                'comments': video_details.get('comments', []),
+                # 🔒 keep the existing comments if any; otherwise use the freshly scraped ones
+                'comments': existing_comments if existing_comments else video_details.get('comments', []),
                 'data_status': 'completed'
             }
 
@@ -2919,13 +2964,13 @@ class DouyinPlayVVScraper:
 
                     if video_details and video_details.get('success', False):
                         # Update the database immediately
-                        self.update_single_video_details(document_id, i, video_id, video_details, mix_name)
+                        self.update_single_video_details(document_id, i, video_id, video_details, mix_name, mix_id)
                     else:
                         logging.warning(f'[incremental update] Failed to fetch detailed data for episode {i}: {mix_name}')
 
                     # Add a random delay to avoid requesting too fast and to mimic human behavior
                     if i < len(episode_video_ids):  # delay only when this is not the last video
-                        random_delay = self.anti_detection.get_human_like_delay()
+                        random_delay = random.uniform(2.0, 5.0)  # 2-5 s random delay (assumes `random` is imported at module level)
                         logging.info(f'🕐 [incremental update] Inter-video wait time: {random_delay:.1f}s')
                         time.sleep(random_delay)
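Read as a pure lookup, the lock check that `update_single_video_details` performs condenses to the sketch below (hypothetical helper name; assumes pymongo-style collections and the document shape used throughout this diff):

```python
def get_locked_comments(management_collection, episode_number, mix_id='', document_id=None):
    """Return the locked comments for one episode, or [] when nothing is locked.

    Comments count as locked only when the management document carries a
    non-empty comments_summary AND that episode already holds comments.
    """
    # Prefer mix_id: it is stable across collections, whereas document_id (_id)
    # comes from the mode-selected collection and may not exist in Rankings_management.
    query = {'mix_id': mix_id} if mix_id else {'_id': document_id}
    doc = management_collection.find_one(query)
    if not doc or not doc.get('comments_summary'):
        return []  # no summary: nothing is locked, scrape fresh comments
    episodes = doc.get('episode_details', [])
    if not (0 <= episode_number - 1 < len(episodes)):
        return []  # episode index out of range
    return episodes[episode_number - 1].get('comments', [])
```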
@@ -3985,6 +4030,21 @@ class DouyinPlayVVScraper:
         # Flag marking whether the interaction data has been saved, to avoid duplicate saves
         interaction_data_saved = False
 
+        # 🔒 Check whether comment scraping should be skipped (decided by the comments_summary field)
+        # Note: always check the Rankings_management database, since it is the single source of truth for lock fields
+        should_skip_comments = False
+        if document_id:
+            try:
+                # Use management_collection rather than self.collection so that the
+                # comments_summary in the management database is checked regardless of mode
+                if self.management_collection is not None:
+                    doc = self.management_collection.find_one({'_id': document_id})
+                    if doc and doc.get('comments_summary'):
+                        should_skip_comments = True
+                        logging.info('🔒 comments_summary has content; comment scraping will be skipped (likes, shares, and favorites are still updated)')
+            except Exception as e:
+                logging.warning(f'Error while checking the comments_summary field: {e}')
+
         # Check whether the detailed-data fetch should be skipped (only skipped in timer mode)
         if os.environ.get('AUTO_CONTINUE') == '1':
             logging.info(f'🚀 Timer mode: skipping the detailed-data fetch for video {video_id} (likes, favorites, shares, comments)')
@@ -3992,7 +4052,7 @@ class DouyinPlayVVScraper:
             video_details['error'] = 'Timer mode: detailed-data fetch skipped'
             return video_details
 
-        logging.info(f'🔍 get_video_details called: video_id={video_id}')
+        logging.info(f'🔍 get_video_details called: video_id={video_id}, skip_comments={should_skip_comments}')
 
         try:
             # Make sure the driver is initialized
@@ -4086,64 +4146,68 @@ class DouyinPlayVVScraper:
                             except Exception as e:
                                 continue
 
-            # Start the scrolling mechanism to load more comments
-            logging.info(f'Starting the scrolling mechanism to load comments for video {video_id}')
-            scrolled_comments = self._simulate_comment_scrolling(video_id, max_scroll_attempts=15, scroll_delay=2.0,
-                document_id=document_id, episode_number=episode_number, mix_name=mix_name, mix_id=mix_id, max_comments=100)
-
-            # If the scrolling mechanism returned comments, use them directly
-            if scrolled_comments:
-                video_details['comments'] = scrolled_comments
-                logging.info(f'Scrolling mechanism fetched {len(video_details["comments"])} comments')
-
-            # Fetch the post-scroll network request log (for comment data)
-            logs = self.driver.get_log('performance')
-
-            # Parse the post-scroll network log for comment data (as a fallback to the scrolling mechanism)
-            for entry in logs:
-                try:
-                    log = json.loads(entry['message'])['message']
-                    if (
-                        'Network.responseReceived' in log['method']
-                        and 'response' in log['params']
-                        and log['params']['response']
-                        and log['params']['response'].get('url')
-                    ):
-                        url = log['params']['response']['url']
-
-                        # Only handle the comment API (the video-detail API was handled in the initial phase)
-                        if '/aweme/v1/web/comment/list/' in url and video_id in url and not video_details['comments']:
-                            try:
-                                # Fetch the response body
-                                response_body = self.driver.execute_cdp_cmd(
-                                    'Network.getResponseBody',
-                                    {'requestId': log['params']['requestId']}
-                                )
-
-                                if response_body and 'body' in response_body:
-                                    data = json.loads(response_body['body'])
-                                    comments = data.get('comments', [])
-
-                                    # Only use this method when the scrolling mechanism got no comments
-                                    if not video_details['comments']:
-                                        for comment in comments:
-                                            comment_info = {
-                                                'text': comment.get('text', ''),
-                                                'user_name': comment.get('user', {}).get('nickname', ''),
-                                                'digg_count': int(comment.get('digg_count', 0)),
-                                                'create_time': comment.get('create_time', 0)
-                                            }
-                                            video_details['comments'].append(comment_info)
-
-                                    logging.info(f'Fallback fetched {len(comments)} comments')
-                                    logging.info(f'Comment API URL: {url}')
-
-                            except Exception as e:
-                                logging.warning(f'Failed to parse the comment API response: {e}')
-                                continue
-
-                except Exception as e:
-                    continue
+            # 🔒 Decide whether to scrape comments based on the should_skip_comments flag
+            if not should_skip_comments:
+                # Start the scrolling mechanism to load more comments
+                logging.info(f'Starting the scrolling mechanism to load comments for video {video_id}')
+                scrolled_comments = self._simulate_comment_scrolling(video_id, max_scroll_attempts=15, scroll_delay=2.0,
+                    document_id=document_id, episode_number=episode_number, mix_name=mix_name, mix_id=mix_id, max_comments=100)
+
+                # If the scrolling mechanism returned comments, use them directly
+                if scrolled_comments:
+                    video_details['comments'] = scrolled_comments
+                    logging.info(f'Scrolling mechanism fetched {len(video_details["comments"])} comments')
+
+                # Fetch the post-scroll network request log (for comment data)
+                logs = self.driver.get_log('performance')
+
+                # Parse the post-scroll network log for comment data (as a fallback to the scrolling mechanism)
+                for entry in logs:
+                    try:
+                        log = json.loads(entry['message'])['message']
+                        if (
+                            'Network.responseReceived' in log['method']
+                            and 'response' in log['params']
+                            and log['params']['response']
+                            and log['params']['response'].get('url')
+                        ):
+                            url = log['params']['response']['url']
+
+                            # Only handle the comment API (the video-detail API was handled in the initial phase)
+                            if '/aweme/v1/web/comment/list/' in url and video_id in url and not video_details['comments']:
+                                try:
+                                    # Fetch the response body
+                                    response_body = self.driver.execute_cdp_cmd(
+                                        'Network.getResponseBody',
+                                        {'requestId': log['params']['requestId']}
+                                    )
+
+                                    if response_body and 'body' in response_body:
+                                        data = json.loads(response_body['body'])
+                                        comments = data.get('comments', [])
+
+                                        # Only use this method when the scrolling mechanism got no comments
+                                        if not video_details['comments']:
+                                            for comment in comments:
+                                                comment_info = {
+                                                    'text': comment.get('text', ''),
+                                                    'user_name': comment.get('user', {}).get('nickname', ''),
+                                                    'digg_count': int(comment.get('digg_count', 0)),
+                                                    'create_time': comment.get('create_time', 0)
+                                                }
+                                                video_details['comments'].append(comment_info)
+
+                                        logging.info(f'Fallback fetched {len(comments)} comments')
+                                        logging.info(f'Comment API URL: {url}')
+
+                                except Exception as e:
+                                    logging.warning(f'Failed to parse the comment API response: {e}')
+                                    continue
+
+                    except Exception as e:
+                        continue
+            else:
+                logging.info(f'🔒 Skipping comment scraping for video {video_id} (comments_summary already exists)')
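The fallback path above recovers comment data by replaying Chrome's DevTools network log through Selenium rather than scraping the DOM. A stripped-down sketch of that capture pattern; the endpoint substring comes from this diff, while the driver setup shown is an assumption (performance logging must be enabled when Chrome starts, or `get_log('performance')` returns nothing):

```python
import json

from selenium import webdriver

options = webdriver.ChromeOptions()
# Without this capability, driver.get_log('performance') yields no entries.
options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
driver = webdriver.Chrome(options=options)


def capture_comment_responses(driver, video_id):
    """Yield parsed JSON bodies of /aweme/v1/web/comment/list/ responses for one video."""
    for entry in driver.get_log('performance'):
        msg = json.loads(entry['message'])['message']
        if msg.get('method') != 'Network.responseReceived':
            continue
        response = (msg.get('params') or {}).get('response') or {}
        url = response.get('url', '')
        if '/aweme/v1/web/comment/list/' in url and video_id in url:
            body = driver.execute_cdp_cmd(
                'Network.getResponseBody',
                {'requestId': msg['params']['requestId']},
            )
            if body and 'body' in body:
                yield json.loads(body['body'])
```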
@@ -4410,7 +4474,7 @@ class DouyinPlayVVScraper:
                 video_details_list.append(video_details)
 
                 # Add a random delay to avoid requesting too fast and to mimic human behavior
-                random_delay = self.anti_detection.get_human_like_delay()
+                random_delay = random.uniform(2.0, 5.0)  # 2-5 s random delay (assumes `random` is imported at module level)
                 logging.info(f'🕐 Inter-video wait time: {random_delay:.1f}s')
                 time.sleep(random_delay)
                 # exit(0)
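Both delay sites now inline `random.uniform(2.0, 5.0)` in place of `self.anti_detection.get_human_like_delay()`, but no hunk in this diff adds `import random`; that import must already exist (or be added) at module level, as the inline comments above flag. If the bounds are meant to stay in sync across call sites, a tiny helper keeps them in one place (hypothetical name):

```python
import random
import time


def human_pause(low: float = 2.0, high: float = 5.0) -> float:
    """Sleep for a uniformly random interval and return the delay that was used."""
    delay = random.uniform(low, high)
    time.sleep(delay)
    return delay
```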
diff --git a/backend/routers/rank_api_routes.py b/backend/routers/rank_api_routes.py
index 8038f0e..62b3fdb 100644
--- a/backend/routers/rank_api_routes.py
+++ b/backend/routers/rank_api_routes.py
@@ -2937,3 +2937,208 @@ def article_health_check():
             "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
         }
     })
+
+# ==================== Comments-summary management API ====================
+
+@rank_bp.route('/comments/regenerate-summary', methods=['POST'])
+def regenerate_comments_summary():
+    """
+    Regenerate the comments summary.
+    Re-calls the LLM API on the existing comment content to produce a new summary.
+    The comment content is left untouched; only the comments_summary field is updated.
+    """
+    try:
+        data = request.get_json() or {}
+        mix_id = data.get('mix_id', '')
+
+        if not mix_id:
+            return jsonify({
+                "success": False,
+                "message": "Missing required parameter: mix_id"
+            })
+
+        # Fetch the data from the management database
+        management_doc = rankings_management_collection.find_one({'mix_id': mix_id})
+        if not management_doc:
+            return jsonify({
+                "success": False,
+                "message": f"No data found for mix_id {mix_id}"
+            })
+
+        # Fetch the comments of every episode
+        episode_details = management_doc.get('episode_details', [])
+        if not episode_details:
+            return jsonify({
+                "success": False,
+                "message": "This drama has no episode data"
+            })
+
+        # Gather all comments
+        all_comments = []
+        for episode in episode_details:
+            comments = episode.get('comments', [])
+            for comment in comments:
+                if isinstance(comment, dict):
+                    text = comment.get('text', '').strip()
+                    if text:
+                        all_comments.append(text)
+                elif isinstance(comment, str):
+                    text = comment.strip()
+                    if text:
+                        all_comments.append(text)
+
+        if not all_comments:
+            return jsonify({
+                "success": False,
+                "message": "This drama has no comment content, so no summary can be generated"
+            })
+
+        # Call the LLM API to generate the summary
+        try:
+            from handlers.Rankings.rank_data_scraper import CommentsSummarizer
+            summarizer = CommentsSummarizer()
+            mix_name = management_doc.get('mix_name', '')
+            comments_summary = summarizer.summarize_comments(all_comments, mix_name)
+
+            if not comments_summary:
+                return jsonify({
+                    "success": False,
+                    "message": "Failed to generate the comments summary; please try again later"
+                })
+
+            # Update the comments_summary field in both databases
+            update_data = {
+                '$set': {
+                    'comments_summary': comments_summary,
+                    'last_updated': datetime.now()
+                }
+            }
+
+            # Update the management database
+            rankings_management_collection.update_one(
+                {'mix_id': mix_id},
+                update_data
+            )
+
+            # Update the main database (Ranking_storage)
+            # Find the latest document that contains this mix_id
+            latest_doc = collection.find_one(
+                {'data.mix_id': mix_id},
+                sort=[('created_at', -1)]
+            )
+
+            if latest_doc:
+                # Update the matching item inside the data array
+                collection.update_one(
+                    {'_id': latest_doc['_id'], 'data.mix_id': mix_id},
+                    {'$set': {'data.$.comments_summary': comments_summary}}
+                )
+
+            logging.info(f'✅ Comments summary regenerated successfully: mix_id={mix_id}, mix_name={mix_name}')
+
+            return jsonify({
+                "success": True,
+                "message": "Comments summary regenerated successfully",
+                "data": {
+                    "comments_summary": comments_summary,
+                    "comments_count": len(all_comments)
+                }
+            })
+
+        except Exception as e:
+            logging.error(f'LLM API call failed: {e}')
+            return jsonify({
+                "success": False,
+                "message": f"LLM API call failed: {str(e)}"
+            })
+
+    except Exception as e:
+        logging.error(f'Failed to regenerate the comments summary: {e}')
+        return jsonify({
+            "success": False,
+            "message": f"Failed to regenerate the comments summary: {str(e)}"
+        })
+
+@rank_bp.route('/comments/clear-all', methods=['POST'])
+def clear_all_comments():
+    """
+    Clear all comment-related data.
+    Empties comments_summary as well as the comments field of every entry in episode_details.
+    Both databases are cleared in sync.
+    """
+    try:
+        data = request.get_json() or {}
+        mix_id = data.get('mix_id', '')
+
+        if not mix_id:
+            return jsonify({
+                "success": False,
+                "message": "Missing required parameter: mix_id"
+            })
+
+        # Fetch the data from the management database
+        management_doc = rankings_management_collection.find_one({'mix_id': mix_id})
+        if not management_doc:
+            return jsonify({
+                "success": False,
+                "message": f"No data found for mix_id {mix_id}"
+            })
+
+        # Clear the comments of every episode in episode_details
+        episode_details = management_doc.get('episode_details', [])
+        for episode in episode_details:
+            episode['comments'] = []
+
+        # Build the update
+        update_data = {
+            '$set': {
+                'comments_summary': '',
+                'episode_details': episode_details,
+                'last_updated': datetime.now()
+            }
+        }
+
+        # Update the management database
+        rankings_management_collection.update_one(
+            {'mix_id': mix_id},
+            update_data
+        )
+
+        # Update the main database (Ranking_storage)
+        # Find the latest document that contains this mix_id
+        latest_doc = collection.find_one(
+            {'data.mix_id': mix_id},
+            sort=[('created_at', -1)]
+        )
+
+        if latest_doc:
+            # Update the matching item inside the data array
+            collection.update_one(
+                {'_id': latest_doc['_id'], 'data.mix_id': mix_id},
+                {
+                    '$set': {
+                        'data.$.comments_summary': '',
+                        'data.$.episode_details': episode_details
+                    }
+                }
+            )
+
+        mix_name = management_doc.get('mix_name', '')
+        logging.info(f'✅ All comment data cleared: mix_id={mix_id}, mix_name={mix_name}')
+
+        return jsonify({
+            "success": True,
+            "message": "All comment data has been cleared",
+            "data": {
+                "mix_id": mix_id,
+                "mix_name": mix_name,
+                "cleared_episodes": len(episode_details)
+            }
+        })
+
+    except Exception as e:
+        logging.error(f'Failed to clear comment data: {e}')
+        return jsonify({
+            "success": False,
+            "message": f"Failed to clear comment data: {str(e)}"
+        })
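Both endpoints write the same fields to two places: the flat management document, and the matching element of the `data` array inside the newest storage document, via MongoDB's positional `$` operator. A condensed sketch of that dual-write pattern (hypothetical helper; assumes the pymongo collections used in this file):

```python
from datetime import datetime


def dual_write(management_col, storage_col, mix_id, fields):
    """Apply `fields` to the management doc and to the matching array item
    in the newest storage doc that embeds this mix_id."""
    management_col.update_one(
        {'mix_id': mix_id},
        {'$set': {**fields, 'last_updated': datetime.now()}},
    )
    latest = storage_col.find_one({'data.mix_id': mix_id}, sort=[('created_at', -1)])
    if latest:
        # `data.$` targets the first array element matched by the query filter
        storage_col.update_one(
            {'_id': latest['_id'], 'data.mix_id': mix_id},
            {'$set': {f'data.$.{k}': v for k, v in fields.items()}},
        )


# Usage, mirroring the clear-all endpoint:
# dual_write(rankings_management_collection, collection, mix_id,
#            {'comments_summary': '', 'episode_details': episode_details})
```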
diff --git a/frontend/src/AdminPanel.vue b/frontend/src/AdminPanel.vue
index c746abc..50de586 100644
--- a/frontend/src/AdminPanel.vue
+++ b/frontend/src/AdminPanel.vue
@@ -9,6 +9,7 @@
 const router = useRouter()
 const rankingData = ref([])
 const loading = ref(false)
 const showEditModal = ref(false)
+const regenerating = ref(false) // loading state while the comments summary is being regenerated
 
 // Edit-form data
 const editForm = reactive({
@@ -244,6 +245,73 @@ const clearCommentsSummary = async () => {
   }
 }
 
+// Regenerate the comments summary
+const regenerateCommentsSummary = async () => {
+  if (!confirm('Regenerate the comments summary? The AI will re-summarize the existing comment content.')) {
+    return
+  }
+
+  if (regenerating.value) {
+    alert('Generation in progress, please wait...')
+    return
+  }
+
+  try {
+    if (!editForm.mix_id) {
+      alert('mix_id is missing; cannot regenerate')
+      return
+    }
+
+    // Set the loading state
+    regenerating.value = true
+
+    const response = await axios.post(`${API_BASE_URL}/rank/comments/regenerate-summary`, {
+      mix_id: editForm.mix_id
+    })
+
+    if (response.data.success) {
+      editForm.comments_summary = response.data.data.comments_summary
+      alert(`Comments summary regenerated successfully! ${response.data.data.comments_count} comments were analyzed`)
+    } else {
+      alert(`Regeneration failed: ${response.data.message}`)
+    }
+  } catch (error) {
+    console.error('Failed to regenerate the comments summary:', error)
+    alert('Failed to regenerate the comments summary; please check the network connection')
+  } finally {
+    // Reset the loading state whether the call succeeded or failed
+    regenerating.value = false
+  }
+}
+
+// Clear all comment data (both the comments summary and the comment content)
+const clearAllComments = async () => {
+  if (!confirm('Clear all comment data? This empties the comments summary AND all comment content, and cannot be undone!')) {
+    return
+  }
+
+  try {
+    if (!editForm.mix_id) {
+      alert('mix_id is missing; cannot clear')
+      return
+    }
+
+    const response = await axios.post(`${API_BASE_URL}/rank/comments/clear-all`, {
+      mix_id: editForm.mix_id
+    })
+
+    if (response.data.success) {
+      editForm.comments_summary = ''
+      alert(`All comment data cleared! Comments were removed from ${response.data.data.cleared_episodes} episodes`)
+    } else {
+      alert(`Clearing failed: ${response.data.message}`)
+    }
+  } catch (error) {
+    console.error('Failed to clear comment data:', error)
+    alert('Failed to clear comment data; please check the network connection')
+  }
+}
+
 // Delete an item
 const deleteItem = async (item) => {
   if (!confirm(`Delete "${item.title || item.mix_name}"?`)) {
@@ -531,7 +599,6 @@
-              [one removed template line: markup lost in extraction]
               <label>Comments Summary</label>
@@ -543,14 +610,25 @@
                 placeholder="Comments summary (can be edited manually or generated automatically by the system)"
                 style="resize: vertical;"
               >
-              [removed: previous action-button markup, lost in extraction]
+              [added: a "Regenerate Summary" button (btn-primary, @click="regenerateCommentsSummary", :disabled="regenerating") and a "Clear All Comments" button (btn-delete, @click="clearAllComments"); exact markup lost in extraction]
@@ -666,11 +744,23 @@
   background: #357abd;
 }
 
+.btn-primary:disabled {
+  background: #a0c4e8;
+  cursor: not-allowed;
+  opacity: 0.7;
+}
+
 .btn-secondary {
   background: #6c757d;
   color: white;
 }
 
+.btn-delete:disabled {
+  background: #e89ca5;
+  cursor: not-allowed;
+  opacity: 0.7;
+}
+
 .btn-secondary:hover {
   background: #545b62;
 }
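For completeness, the two new endpoints can be exercised directly. A minimal client sketch; the `/rank` prefix matches what the frontend calls, while the host, port, and the example `mix_id` value are assumptions, not shown in this diff:

```python
import requests

BASE = 'http://localhost:5000/rank'  # assumed dev server and blueprint prefix
MIX_ID = '7400000000000000000'       # hypothetical mix_id for illustration

# Regenerate the summary from the stored comments (the comments themselves stay untouched)
r = requests.post(f'{BASE}/comments/regenerate-summary', json={'mix_id': MIX_ID})
print(r.json())  # expect: success flag plus comments_summary and comments_count

# Clear the summary and every episode's comments in both databases
r = requests.post(f'{BASE}/comments/clear-all', json={'mix_id': MIX_ID})
print(r.json())  # expect: success flag plus cleared_episodes
```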