diff --git a/.gitignore b/.gitignore index d758556..51d3498 100644 --- a/.gitignore +++ b/.gitignore @@ -67,4 +67,6 @@ yarn-error.log* # OS .DS_Store Thumbs.db + +# Figma 设计文件目录(无需纳入版本控制) .figma/ \ No newline at end of file diff --git a/backend/Timer_worker.py b/backend/Timer_worker.py index 9b9641b..f2c947c 100644 --- a/backend/Timer_worker.py +++ b/backend/Timer_worker.py @@ -66,6 +66,8 @@ def setup_logging(quiet_mode=False): class DouyinAutoScheduler: def __init__(self): self.is_running = False + # 创建logger实例 + self.logger = logging.getLogger(__name__) def _normalize_play_vv(self, play_vv): """标准化播放量数据类型,将字符串转换为数字""" @@ -78,29 +80,68 @@ class DouyinAutoScheduler: return 0 return play_vv - def _deduplicate_videos_by_mix_name(self, videos, include_rank=False): - """按短剧名称去重,保留播放量最高的记录""" - unique_data = {} - for video in videos: - mix_name = video.get("mix_name", "") - if mix_name: - # 标准化播放量数据类型 - play_vv = self._normalize_play_vv(video.get("play_vv", 0)) - - if mix_name not in unique_data or play_vv > unique_data[mix_name].get("play_vv", 0): - if include_rank: - # 用于昨天数据的格式 - unique_data[mix_name] = { - "play_vv": play_vv, - "video_id": str(video.get("_id", "")), - "rank": 0 # 稍后计算排名 - } - else: - # 用于今天数据的格式,直接更新原视频对象 - video["play_vv"] = play_vv - unique_data[mix_name] = video - - return unique_data + def check_browser_login_status(self): + """检查浏览器登录状态,如果没有登录则提示用户登录""" + try: + import os + script_dir = os.path.dirname(os.path.abspath(__file__)) + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent') + + + # 检查配置文件是否为空(可能未登录) + import glob + profile_files = glob.glob(os.path.join(profile_dir, "*")) + if len(profile_files) < 5: # 如果文件太少,可能未登录 + print("⚠️ 检测到定时器浏览器可能未登录") + print(" 请在浏览器中完成抖音登录,并导航到【我的】→【收藏】→【合集】页面") + print(" 完成后按回车键继续...") + input() + else: + print("✅ 定时器浏览器已配置,继续执行...") + + except Exception as e: + logging.warning(f"检查浏览器登录状态时出错: {e}") + print("⚠️ 检查浏览器状态失败,请确保浏览器已正确配置") + print(" 完成后按回车键继续...") + input() + + def _cleanup_chrome_processes(self): + """清理可能占用配置文件的Chrome进程""" + try: + import psutil + import os + + # 获取当前配置文件路径 + script_dir = os.path.dirname(os.path.abspath(__file__)) + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent') + + # 查找使用该配置文件的Chrome进程 + killed_processes = [] + for proc in psutil.process_iter(['pid', 'name', 'cmdline']): + try: + if proc.info['name'] and 'chrome' in proc.info['name'].lower(): + cmdline = proc.info['cmdline'] + if cmdline and any(profile_dir in arg for arg in cmdline): + proc.terminate() + killed_processes.append(proc.info['pid']) + logging.info(f'终止占用配置文件的Chrome进程: PID {proc.info["pid"]}') + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + continue + + # 等待进程终止 + if killed_processes: + import time + time.sleep(2) + + return len(killed_processes) > 0 + + except ImportError: + # 如果没有psutil,跳过清理以避免影响其他脚本实例 + logging.warning('psutil 不可用,跳过进程清理(避免全局终止 Chrome)') + return False + except Exception as e: + logging.warning(f'清理Chrome进程时出错: {e}') + return False def run_douyin_scraper(self): """执行抖音播放量抓取任务""" @@ -114,14 +155,14 @@ class DouyinAutoScheduler: scraper = DouyinPlayVVScraper( start_url="https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation", auto_continue=True, - duration_s=60 + duration_s=60 # 增加到60秒,给更多时间收集数据 ) - - print("📁 开始执行抓取任务...") + + print("开始执行抓取任务...") logging.info("📁 开始执行抓取任务...") scraper.run() - print("✅ 抖音播放量抓取任务执行成功") + print("抖音播放量抓取任务执行成功") logging.info("✅ 抖音播放量抓取任务执行成功") # 
数据抓取完成后,自动生成当日榜单 @@ -180,13 +221,25 @@ class DouyinAutoScheduler: today_videos_raw = list(douyin_collection.find({"batch_time": latest_batch_time}).sort("play_vv", -1)) logging.info(f"📊 最新批次数据数量: {len(today_videos_raw)}") - # 按短剧名称去重,每个短剧只保留播放量最高的一条 + # 按短剧ID去重,每个短剧只保留播放量最高的一条 + # 🚫 过滤掉空的或无效的mix_id和播放量为0的记录 unique_videos = {} for video in today_videos_raw: - mix_name = video.get("mix_name", "") - if mix_name and (mix_name not in unique_videos or video.get("play_vv", 0) > unique_videos[mix_name].get("play_vv", 0)): - unique_videos[mix_name] = video - + mix_id = video.get("mix_id", "").strip() + mix_name = video.get("mix_name", "").strip() + play_vv = video.get("play_vv", 0) + + # 过滤掉空的或无效的mix_id + if not mix_id or mix_id == "" or mix_id.lower() == "null": + continue + + # 注意:播放量为0的数据也会被保留,可能是新发布的短剧 + if play_vv <= 0: + logging.warning(f"⚠️ 发现播放量为0的数据: mix_name={mix_name}, play_vv={play_vv},仍会保留") + + if mix_id not in unique_videos or play_vv > unique_videos[mix_id].get("play_vv", 0): + unique_videos[mix_id] = video + today_videos = list(unique_videos.values()) logging.info(f"📊 今日数据去重后:{len(today_videos)} 个独特短剧(原始数据:{len(today_videos_raw)} 条)") @@ -213,16 +266,28 @@ class DouyinAutoScheduler: "batch_time": yesterday_batch_time }).sort("play_vv", -1)) - # 按短剧名称去重,每个短剧只保留播放量最高的一条 + # 按短剧ID去重,每个短剧只保留播放量最高的一条 + # 🚫 过滤掉空的或无效的mix_id unique_yesterday_videos = {} for video in yesterday_videos_raw: - mix_name = video.get("mix_name", "") - if mix_name and (mix_name not in unique_yesterday_videos or video.get("play_vv", 0) > unique_yesterday_videos[mix_name].get("play_vv", 0)): - unique_yesterday_videos[mix_name] = video - - # 将昨天的数据转换为字典,以短剧名称为键 - for mix_name, video in unique_yesterday_videos.items(): - yesterday_data[mix_name] = { + mix_id = video.get("mix_id", "").strip() + mix_name = video.get("mix_name", "").strip() + play_vv = video.get("play_vv", 0) + + # 过滤掉空的或无效的mix_id + if not mix_id or mix_id == "" or mix_id.lower() == "null": + continue + + # 注意:播放量为0的数据也会被保留,可能是新发布的短剧 + if play_vv <= 0: + logging.warning(f"⚠️ 昨天数据中发现播放量为0: mix_name={mix_name}, play_vv={play_vv},仍会保留") + + if mix_id not in unique_yesterday_videos or play_vv > unique_yesterday_videos[mix_id].get("play_vv", 0): + unique_yesterday_videos[mix_id] = video + + # 将昨天的数据转换为字典,以短剧ID为键 + for mix_id, video in unique_yesterday_videos.items(): + yesterday_data[mix_id] = { "rank": 0, # 原始数据没有排名,设为0 "play_vv": video.get("play_vv", 0), "video_id": str(video.get("_id", "")) @@ -244,23 +309,23 @@ class DouyinAutoScheduler: play_vv_change_rate = 0 is_new = True - mix_name = video.get("mix_name", "") - if mix_name in yesterday_data: + mix_id = video.get("mix_id", "") + if mix_id in yesterday_data: is_new = False - yesterday_play_vv = yesterday_data[mix_name]["play_vv"] - + yesterday_play_vv = yesterday_data[mix_id]["play_vv"] + # 计算播放量变化 play_vv_change = current_play_vv - yesterday_play_vv if yesterday_play_vv > 0: play_vv_change_rate = round((play_vv_change / yesterday_play_vv) * 100, 2) - + # 创建包含增长数据的视频项 video_with_growth = { "video": video, "play_vv_change": play_vv_change, "play_vv_change_rate": play_vv_change_rate, "is_new": is_new, - "yesterday_data": yesterday_data.get(mix_name, {}) + "yesterday_data": yesterday_data.get(mix_id, {}) } videos_with_growth.append(video_with_growth) @@ -277,29 +342,92 @@ class DouyinAutoScheduler: "data": [] } + # 获取Rankings_management集合用于补充详细信息 + rankings_management_collection = db['Rankings_management'] + # 生成排序后的榜单数据 - for i, item in enumerate(videos_with_growth, 1): + rank = 1 # 使用独立的排名计数器 + for item 
in videos_with_growth: video = item["video"] video_id = str(video.get("_id", "")) current_play_vv = video.get("play_vv", 0) - mix_name = video.get("mix_name", "") + mix_name = video.get("mix_name", "").strip() + + # 🚫 跳过无效数据:确保mix_name不为空 + # 注意:播放量为0的数据也会被保留,可能是新发布的短剧 + if not mix_name or mix_name == "" or mix_name.lower() == "null": + self.logger.warning(f"跳过空的mix_name记录,video_id: {video_id}") + continue + + if current_play_vv <= 0: + self.logger.warning(f"⚠️ 榜单中发现播放量为0的记录: mix_name={mix_name}, play_vv={current_play_vv},仍会保留") # 计算排名变化(基于昨天的排名) rank_change = 0 if not item["is_new"] and item["yesterday_data"]: yesterday_rank = item["yesterday_data"].get("rank", 0) - rank_change = yesterday_rank - i + rank_change = yesterday_rank - rank # 使用当前排名计数器 + # 🔍 从Rankings_management获取详细信息(按mix_id查询,因为管理数据库每个短剧只有一条记录) + mix_id = video.get("mix_id", "").strip() + management_data = None + + if mix_id: + # 直接按mix_id查询,不需要按日期查询 + management_data = rankings_management_collection.find_one({"mix_id": mix_id}) + if management_data: + logging.info(f"📋 从 Rankings_management 获取数据: {mix_name} (mix_id: {mix_id})") + else: + logging.warning(f"⚠️ 未找到管理数据: {mix_name} (mix_id: {mix_id})") + else: + logging.warning(f"⚠️ mix_id 为空: {mix_name}") + ranking_item = { - "rank": i, + # 🎯 核心榜单字段 + "rank": rank, # 使用排名计数器 "title": mix_name, + "mix_name": mix_name, "play_vv": current_play_vv, - "author": video.get("author", ""), + "series_author": video.get("series_author", ""), "video_id": video_id, "video_url": video.get("video_url", ""), "cover_image_url": video.get("cover_image_url", ""), "playcount_str": video.get("playcount", ""), - # 时间轴对比数据 + + # 📋 从Rankings_management获取的详细字段 + "batch_id": management_data.get("batch_id", "") if management_data else "", + "batch_time": management_data.get("batch_time") if management_data else None, + "item_sequence": management_data.get("item_sequence", 0) if management_data else 0, + "mix_id": video.get("mix_id", ""), # 直接从原始数据获取mix_id + "playcount": management_data.get("playcount", "") if management_data else "", + "request_id": management_data.get("request_id", "") if management_data else "", + "cover_image_url_original": management_data.get("cover_image_url_original", "") if management_data else "", + "cover_upload_success": management_data.get("cover_upload_success", True) if management_data else True, + "cover_backup_urls": management_data.get("cover_backup_urls", []) if management_data else [], + "desc": management_data.get("desc", "") if management_data else "", + "updated_to_episode": management_data.get("updated_to_episode", 0) if management_data else 0, + "episode_video_ids": management_data.get("episode_video_ids", []) if management_data else [], + "episode_details": management_data.get("episode_details", []) if management_data else [], + "data_status": management_data.get("data_status", "") if management_data else "", + "realtime_saved": management_data.get("realtime_saved", True) if management_data else True, + "created_at": management_data.get("created_at") if management_data else None, + "last_updated": management_data.get("last_updated") if management_data else None, + # 🎬 评论总结字段:直接从管理数据库获取(按mix_id查询) + "comments_summary": management_data.get("comments_summary", "") if management_data else "", + + # 🔑 分类字段:直接从管理数据库获取(按mix_id查询,每个短剧只有一条记录) + "Manufacturing_Field": management_data.get("Manufacturing_Field", "") if management_data else "", + "Copyright_field": management_data.get("Copyright_field", "") if management_data else "", + "classification_type": 
management_data.get("classification_type", "") if management_data else "", + "release_date": management_data.get("release_date", "") if management_data else "", + "Novel_IDs": management_data.get("Novel_IDs", []) if management_data else [], + "Anime_IDs": management_data.get("Anime_IDs", []) if management_data else [], + "Drama_IDs": management_data.get("Drama_IDs", []) if management_data else [], + + # 🔒 锁定状态:直接从管理数据库获取 + "field_lock_status": management_data.get("field_lock_status", {}) if management_data else {}, + + # 📊 时间轴对比数据(重要:包含播放量差值) "timeline_data": { "is_new": item["is_new"], "rank_change": rank_change, @@ -311,6 +439,7 @@ class DouyinAutoScheduler: } comprehensive_ranking["data"].append(ranking_item) + rank += 1 # 递增排名计数器 # 为每次计算添加唯一的时间戳,确保数据唯一性 current_timestamp = datetime.now() @@ -330,6 +459,29 @@ class DouyinAutoScheduler: logging.info(f"📝 创建了新的今日榜单数据(第{existing_count + 1}次计算,包含最新差值)") logging.info(f"🔖 计算ID: {comprehensive_ranking['calculation_id']}") + # 📊 检查数据完整性:统计从Rankings_management成功获取详细信息的项目数量 + total_items = len(comprehensive_ranking["data"]) + items_with_management_data = 0 + items_with_manufacturing = 0 + items_with_copyright = 0 + + for item in comprehensive_ranking["data"]: + # 检查是否从Rankings_management获取到了数据 + if item.get("batch_id") or item.get("desc") or item.get("Manufacturing_Field") or item.get("Copyright_field"): + items_with_management_data += 1 + if item.get("Manufacturing_Field"): + items_with_manufacturing += 1 + if item.get("Copyright_field"): + items_with_copyright += 1 + + print(f"数据完整性统计:") + print(f" 总项目数: {total_items}") + print(f" 从Rankings_management获取到详细信息: {items_with_management_data}") + print(f" 包含Manufacturing_Field: {items_with_manufacturing}") + print(f" 包含Copyright_field: {items_with_copyright}") + + logging.info(f"📊 数据完整性: 总{total_items}项,获取详细信息{items_with_management_data}项,Manufacturing_Field: {items_with_manufacturing},Copyright_field: {items_with_copyright}") + # 统计信息 new_count = sum(1 for item in comprehensive_ranking["data"] if item["timeline_data"]["is_new"]) print(f"✅ 时间轴对比榜单生成成功") @@ -358,13 +510,165 @@ class DouyinAutoScheduler: import traceback logging.error(f"详细错误信息: {traceback.format_exc()}") + def check_and_sync_missing_fields(self): + """实时检查并同步当天缺失字段""" + try: + from database import db + + # 只检查当天的数据 + today = date.today() + today_str = today.strftime('%Y-%m-%d') + + # 首先检查 Rankings_management 是否有当天的数据 + rankings_management_collection = db['Rankings_management'] + management_count = rankings_management_collection.count_documents({}) + + if management_count == 0: + # Rankings_management 没有数据,说明还没有抓取,直接返回 + return + + rankings_collection = db['Ranking_storage'] + key_fields = ['Manufacturing_Field', 'Copyright_field', 'desc', 'series_author'] + + # 检查今天是否有缺失字段的数据 + missing_conditions = [] + for field in key_fields: + missing_conditions.extend([ + {field: {"$exists": False}}, + {field: None}, + {field: ""} + ]) + + today_missing_count = rankings_collection.count_documents({ + "date": today_str, + "$or": missing_conditions + }) + + # 如果今天没有缺失数据,静默返回 + if today_missing_count == 0: + return + + logging.info(f"🔍 检测到今天有 {today_missing_count} 条缺失字段,Rankings_management有 {management_count} 条数据,开始实时同步...") + + # 只处理当天的数据 + dates_to_check = [today_str] + + total_missing = 0 + total_synced = 0 + + for check_date in dates_to_check: + # 查询该日期缺失字段的数据 + rankings_collection = db['Ranking_storage'] + + # 检查多个关键字段(包括新增的分类字段) + key_fields = ['Manufacturing_Field', 'Copyright_field', 'desc', 'series_author', 'Novel_IDs', 'Anime_IDs', 
'Drama_IDs'] + missing_conditions = [] + + for field in key_fields: + missing_conditions.extend([ + {field: {"$exists": False}}, + {field: None}, + {field: ""} + ]) + + missing_query = { + "date": check_date, + "$or": missing_conditions + } + + missing_count = rankings_collection.count_documents(missing_query) + + # 详细统计每个字段的缺失情况 + field_stats = {} + total_items = rankings_collection.count_documents({"date": check_date}) + + for field in key_fields: + missing_field_count = rankings_collection.count_documents({ + "date": check_date, + "$or": [ + {field: {"$exists": False}}, + {field: None}, + {field: ""} + ] + }) + field_stats[field] = { + "missing": missing_field_count, + "completion_rate": ((total_items - missing_field_count) / total_items * 100) if total_items > 0 else 0 + } + + if missing_count > 0: + logging.info(f"📅 今日({check_date}): 发现 {missing_count} 条记录缺失字段(总计 {total_items} 条)") + + # 输出详细的字段统计 + for field, stats in field_stats.items(): + if stats["missing"] > 0: + logging.info(f" - {field}: 缺失 {stats['missing']} 条 ({stats['completion_rate']:.1f}% 完整)") + + total_missing += missing_count + + # 尝试同步 + try: + from routers.rank_api_routes import sync_ranking_storage_fields + + # 使用改进的重试机制 + sync_result = sync_ranking_storage_fields( + target_date=check_date, + force_update=False, + max_retries=2, # 定期检查时重试2次 + retry_delay=15 # 15秒重试间隔 + ) + + if sync_result.get("success", False): + stats = sync_result.get("stats", {}) + synced = stats.get("updated_items", 0) + retry_count = stats.get("retry_count", 0) + pending_final = stats.get("pending_items_final", 0) + + total_synced += synced + if synced > 0: + logging.info(f"✅ 今日({check_date}): 成功同步 {synced} 条记录") + + if retry_count > 0: + logging.info(f"🔄 今日({check_date}): 使用了 {retry_count} 次重试") + + if pending_final > 0: + logging.warning(f"⚠️ 今日({check_date}): {pending_final} 条记录在 Rankings_management 中仍未找到") + else: + logging.warning(f"⚠️ 今日({check_date}): 同步失败 - {sync_result.get('message', '')}") + + except Exception as sync_error: + logging.error(f"💥 今日({check_date}): 同步过程出错 - {sync_error}") + else: + if total_items > 0: + logging.info(f"📅 {check_date}: 所有字段完整(总计 {total_items} 条记录)") + # 显示完整性统计 + for field, stats in field_stats.items(): + logging.info(f" - {field}: {stats['completion_rate']:.1f}% 完整") + else: + logging.info(f"📅 {check_date}: 无数据") + + if total_missing > 0: + logging.info(f"🔍 当天同步完成:发现 {total_missing} 条缺失记录,成功同步 {total_synced} 条") + print(f"🔍 当天字段同步:发现 {total_missing} 条缺失,同步 {total_synced} 条") + else: + # 当天没有缺失数据时,不输出日志(静默模式) + pass + + except Exception as e: + logging.error(f"💥 检查缺失字段时发生异常: {e}") + import traceback + logging.error(f"详细错误信息: {traceback.format_exc()}") def setup_schedule(self): """设置定时任务""" # 每小时的整点执行抖音播放量抓取 schedule.every().hour.at(":00").do(self.run_douyin_scraper) + + # 每1分钟检查一次缺失字段并尝试同步(实时同步) + schedule.every(1).minutes.do(self.check_and_sync_missing_fields) logging.info(f"⏰ 定时器已设置:每小时整点执行抖音播放量抓取") + logging.info(f"⏰ 定时器已设置:每1分钟检查缺失字段并同步(实时模式)") def show_next_run(self): """显示下次执行时间""" diff --git a/backend/app.py b/backend/app.py index 4189992..99e868f 100644 --- a/backend/app.py +++ b/backend/app.py @@ -1,9 +1,29 @@ -from flask import Flask, jsonify +from flask import Flask, jsonify, send_from_directory from flask_cors import CORS import logging import os app = Flask(__name__) + +# 配置静态文件目录为dist +# 说明:这里指向后端目录中的 dist(前端构建产物应复制或输出到此) +dist_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'dist')) +app.static_folder = dist_dir + +# 为 SPA 提供静态文件与回退到 index.html 的路由 +@app.route('/') +def 
serve_index(): + # 返回构建后的前端入口文件 + return send_from_directory(app.static_folder, 'index.html') + +@app.route('/<path:path>') +def serve_static_or_fallback(path): + # 如果请求的文件存在则直接返回,否则回退到 index.html(用于前端路由) + file_path = os.path.join(app.static_folder, path) + if os.path.isfile(file_path): + return send_from_directory(app.static_folder, path) + return send_from_directory(app.static_folder, 'index.html') + CORS(app) # 允许跨域访问 # 配置日志 @@ -22,13 +42,11 @@ logging.basicConfig( # 导入并注册蓝图 from routers.rank_api_routes import rank_bp -from routers.article_routes import article_bp app.register_blueprint(rank_bp) -app.register_blueprint(article_bp) if __name__ == '__main__': print("启动主程序服务...") - print("服务地址: http://localhost:5001") + print("服务地址: http://localhost:8443") - app.run(host='0.0.0.0', port=5001, debug=True) \ No newline at end of file + app.run(host='0.0.0.0', port=8443, debug=True) diff --git a/backend/config.py b/backend/config.py index 1ba92a7..9db8d58 100644 --- a/backend/config.py +++ b/backend/config.py @@ -52,6 +52,18 @@ API_CONFIG = { 'OSS_HOST': TOS_CONFIG['self_domain'] } +# DeepSeek API 配置(用于评论总结功能) +DEEPSEEK_CONFIG = { + 'api_key': 'sk-7b47e34bdcb549e6b00115a99b9b5c4c', # DeepSeek API密钥 + 'api_base': 'https://api.deepseek.com/v1', # API基础URL + 'model': 'deepseek-chat', # 使用的模型 + 'max_retries': 3, # 最大重试次数 + 'retry_delays': [2, 5, 10], # 重试延迟(秒) + 'batch_size': 800, # 每批评论数量 + 'max_tokens': 15000, # 每批最大token数 + 'summary_max_length': 200 # 最终总结最大字数 +} + def apply_timer_environment(): """应用定时器环境变量配置""" for key, value in TIMER_ENV_CONFIG.items(): diff --git a/backend/handlers/Rankings/episode_video_ids/video_ids_7472792051890554918.json b/backend/handlers/Rankings/episode_video_ids/video_ids_7472792051890554918.json index fe5b079..381ae09 100644 --- a/backend/handlers/Rankings/episode_video_ids/video_ids_7472792051890554918.json +++ b/backend/handlers/Rankings/episode_video_ids/video_ids_7472792051890554918.json @@ -19,9 +19,13 @@ { "video_id": "7471924777410645283", "episode_num": 0 + }, + { + "video_id": "7472791705268325641", + "episode_num": 0 } ], - "total_count": 5, - "last_update": "2025-10-22T09:55:36.943794", + "total_count": 6, + "last_update": "2025-11-06T17:43:54.929209", "mix_name": "《青蛇传》" } \ No newline at end of file diff --git a/backend/handlers/Rankings/episode_video_ids/video_ids_7486423445933951028.json b/backend/handlers/Rankings/episode_video_ids/video_ids_7486423445933951028.json index 5291f84..dbb2d4e 100644 --- a/backend/handlers/Rankings/episode_video_ids/video_ids_7486423445933951028.json +++ b/backend/handlers/Rankings/episode_video_ids/video_ids_7486423445933951028.json @@ -143,9 +143,21 @@ { "video_id": "7558378239337467174", "episode_num": 0 + }, + { + "video_id": "7567050545257516331", + "episode_num": 0 + }, + { + "video_id": "7568152326477942022", + "episode_num": 0 + }, + { + "video_id": "7569217928420183332", + "episode_num": 0 } ], - "total_count": 36, - "last_update": "2025-10-22T09:55:32.073567", + "total_count": 39, + "last_update": "2025-11-06T11:06:44.598400", "mix_name": "末世系列" } \ No newline at end of file diff --git a/backend/handlers/Rankings/episode_video_ids/video_ids_7529584800245254183.json b/backend/handlers/Rankings/episode_video_ids/video_ids_7529584800245254183.json index 2c3a6c6..e0127c7 100644 --- a/backend/handlers/Rankings/episode_video_ids/video_ids_7529584800245254183.json +++ b/backend/handlers/Rankings/episode_video_ids/video_ids_7529584800245254183.json @@ -47,9 +47,17 @@ { "video_id": "7548447317729234239", "episode_num": 0 + }, + 
{ + "video_id": "7568747381357808923", + "episode_num": 0 + }, + { + "video_id": "7568800392985791784", + "episode_num": 0 } ], - "total_count": 12, - "last_update": "2025-10-22T09:55:50.726907", + "total_count": 14, + "last_update": "2025-11-06T17:48:06.014161", "mix_name": "青云修仙传" } \ No newline at end of file diff --git a/backend/handlers/Rankings/episode_video_ids/video_ids_7561020459769153570.json b/backend/handlers/Rankings/episode_video_ids/video_ids_7561020459769153570.json index 2803b24..8abbf1e 100644 --- a/backend/handlers/Rankings/episode_video_ids/video_ids_7561020459769153570.json +++ b/backend/handlers/Rankings/episode_video_ids/video_ids_7561020459769153570.json @@ -107,9 +107,17 @@ { "video_id": "7560551213957500195", "episode_num": 0 + }, + { + "video_id": "7562056353343966464", + "episode_num": 0 + }, + { + "video_id": "7567981488823318927", + "episode_num": 0 } ], - "total_count": 27, - "last_update": "2025-10-22T09:56:16.947762", + "total_count": 29, + "last_update": "2025-11-06T17:15:32.747557", "mix_name": "绝境逆袭" } \ No newline at end of file diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py index 6c7bf68..cda7dc7 100644 --- a/backend/handlers/Rankings/rank_data_scraper.py +++ b/backend/handlers/Rankings/rank_data_scraper.py @@ -28,9 +28,11 @@ import base64 import uuid import sys import psutil +from typing import Dict, List, Optional, Set import random import threading import argparse +from typing import Dict, List, Optional, Set from concurrent.futures import ThreadPoolExecutor # 使用线程池实现异步滑动和监控 from selenium import webdriver @@ -52,6 +54,236 @@ from handlers.Rankings.tos_client import oss_client import config +# ==================== 评论总结器类 ==================== +class CommentsSummarizer: + """评论总结器 - 支持大量评论的分批处理和汇总""" + + def __init__(self): + self.api_key = config.DEEPSEEK_CONFIG['api_key'] + self.api_base = config.DEEPSEEK_CONFIG['api_base'] + self.model = config.DEEPSEEK_CONFIG['model'] + self.max_retries = config.DEEPSEEK_CONFIG['max_retries'] + self.retry_delays = config.DEEPSEEK_CONFIG['retry_delays'] + self.batch_size = config.DEEPSEEK_CONFIG['batch_size'] + self.max_tokens = config.DEEPSEEK_CONFIG['max_tokens'] + self.summary_max_length = config.DEEPSEEK_CONFIG['summary_max_length'] + self.logger = logging.getLogger(__name__) + + def _call_deepseek_api(self, messages: List[Dict], retry_count: int = 0) -> Optional[str]: + """调用 DeepSeek API""" + try: + headers = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {self.api_key}' + } + + data = { + 'model': self.model, + 'messages': messages, + 'temperature': 0.7, + 'max_tokens': 2000 + } + + response = requests.post( + f'{self.api_base}/chat/completions', + headers=headers, + json=data, + timeout=60 + ) + + if response.status_code == 200: + result = response.json() + content = result['choices'][0]['message']['content'] + self.logger.info(f"✅ DeepSeek API 调用成功") + return content.strip() + else: + self.logger.error(f"❌ DeepSeek API 返回错误: {response.status_code} - {response.text}") + + if retry_count < self.max_retries: + delay = self.retry_delays[retry_count] + self.logger.info(f"⏳ {delay}秒后进行第 {retry_count + 1} 次重试...") + time.sleep(delay) + return self._call_deepseek_api(messages, retry_count + 1) + + return None + + except Exception as e: + self.logger.error(f"❌ DeepSeek API 调用异常: {e}") + + if retry_count < self.max_retries: + delay = self.retry_delays[retry_count] + self.logger.info(f"⏳ {delay}秒后进行第 {retry_count + 1} 次重试...") + 
time.sleep(delay) + return self._call_deepseek_api(messages, retry_count + 1) + + return None + + def _estimate_comment_length(self, comment: str) -> int: + """估算评论的字符长度""" + return len(comment) + + def _split_comments_into_batches(self, comments: List[str]) -> List[List[str]]: + """将评论智能分批,根据评论长度动态调整每批数量""" + if not comments: + return [] + + batches = [] + current_batch = [] + current_length = 0 + + avg_length = sum(self._estimate_comment_length(c) for c in comments[:100]) / min(100, len(comments)) + + if avg_length < 50: + batch_size = 1000 + elif avg_length < 200: + batch_size = 600 + else: + batch_size = 400 + + self.logger.info(f"📊 评论平均长度: {avg_length:.0f} 字,批次大小: {batch_size}") + + for comment in comments: + comment_length = self._estimate_comment_length(comment) + + if len(current_batch) >= batch_size or (current_length + comment_length > self.max_tokens * 3): + if current_batch: + batches.append(current_batch) + current_batch = [] + current_length = 0 + + current_batch.append(comment) + current_length += comment_length + + if current_batch: + batches.append(current_batch) + + return batches + + def _generate_analysis_prompt(self, content: str, max_length: int = 200) -> str: + """生成通用的分析提示词""" + return f"""你是一位资深的用户反馈分析师,擅长从海量评论中提炼真实观点,用客观自然的语言准确传达用户的声音和整体评价趋势。 + +请基于以下内容,写一份真实客观的观众反馈分析: + +{content} + +分析要求: +1. 识别高频话题和关键观点(如剧情、演技、制作、节奏等维度) +2. 准确判断整体情感倾向,如实反映好评或差评的比例和强度 +3. 用自然的语言描述观众的真实感受,避免模板化和官方措辞 +4. 明确指出观众最在意的亮点和槽点 +5. 负面评价要委婉表达,使用"有待提升"、"存在改进空间"、"部分观众认为"等温和措辞 +6. 字数控制在{max_length}字以内,语言简洁有力 + +输出格式要求(严格遵守): +必须使用【】符号标注每个部分,格式示例: + +【核心观点】用户普遍识别出AI制作属性,对技术应用表示惊叹,同时对作品质量提出了一些看法 + +【用户关注焦点】 + 优点:AI人物颜值高、特效精美、制作成本低 + 待提升:部分观众认为角色表情和动作的自然度有待改进,剧情逻辑存在优化空间 + +【情感分布】观众意见较为分散,约65%的观众提出了改进建议 + +【核心看法】技术创新获得认可,制作细节方面仍有提升空间 + +格式规则: +- 使用【】符号标注每个分析维度的标题(标题可以自由命名,不限于示例) +- 每个【】标题后直接跟内容,不要换行 +- 每个部分结束后换行,再开始下一个【】部分 +- 可以根据实际评论内容灵活组织分析维度 +- 不要添加其他前缀或后缀 +- 严格按照【标题】内容的格式输出""" + + def _summarize_batch(self, comments: List[str], batch_num: int, total_batches: int) -> Optional[str]: + """总结一批评论""" + self.logger.info(f"📝 正在总结第 {batch_num}/{total_batches} 批评论(共 {len(comments)} 条)...") + + comments_text = "\n".join([f"{i+1}. 
{comment}" for i, comment in enumerate(comments)]) + content = f"用户评论:\n{comments_text}" + + prompt = self._generate_analysis_prompt(content, max_length=200) + messages = [{"role": "user", "content": prompt}] + + return self._call_deepseek_api(messages) + + def _merge_summaries(self, batch_summaries: List[str]) -> Optional[str]: + """合并所有批次总结为最终总结""" + self.logger.info(f"🔄 正在合并 {len(batch_summaries)} 个批次总结...") + + if len(batch_summaries) == 1: + return batch_summaries[0] + + summaries_text = "\n\n".join([f"批次{i+1}总结:\n{summary}" for i, summary in enumerate(batch_summaries)]) + content = f"多个批次的评论总结:\n\n{summaries_text}" + + prompt = self._generate_analysis_prompt(content, max_length=self.summary_max_length) + messages = [{"role": "user", "content": prompt}] + + return self._call_deepseek_api(messages) + + def summarize_comments(self, comments: List[str], drama_name: str = "") -> Optional[str]: + """总结评论(主入口)""" + if not comments: + self.logger.warning("⚠️ 评论列表为空,无法总结") + return None + + self.logger.info(f"🚀 开始总结评论:{drama_name}(共 {len(comments)} 条评论)") + + # 过滤空评论,处理字符串和字典两种格式 + valid_comments = [] + for c in comments: + if isinstance(c, dict): + text = c.get('text', '').strip() + if text: + valid_comments.append(text) + elif isinstance(c, str): + text = c.strip() + if text: + valid_comments.append(text) + + if not valid_comments: + self.logger.warning("⚠️ 没有有效评论,无法总结") + return None + + self.logger.info(f"📊 有效评论数量: {len(valid_comments)}") + + # 分批处理 + batches = self._split_comments_into_batches(valid_comments) + self.logger.info(f"📦 评论已分为 {len(batches)} 批") + + # 逐批总结 + batch_summaries = [] + failed_batches = [] + + for i, batch in enumerate(batches, 1): + summary = self._summarize_batch(batch, i, len(batches)) + if summary: + batch_summaries.append(summary) + else: + self.logger.error(f"❌ 第 {i} 批总结失败") + failed_batches.append(i) + + if not batch_summaries: + self.logger.error(f"❌ 所有批次总结都失败了") + return None + + if failed_batches: + self.logger.warning(f"⚠️ 以下批次总结失败: {failed_batches}") + + # 合并批次总结 + final_summary = self._merge_summaries(batch_summaries) + + if final_summary: + self.logger.info(f"✅ 评论总结完成:{drama_name}") + self.logger.info(f"📝 总结长度: {len(final_summary)} 字") + return final_summary + else: + self.logger.error(f"❌ 最终总结合并失败:{drama_name}") + return None + + # 配置日志 # 确保logs目录存在 script_dir = os.path.dirname(os.path.abspath(__file__)) @@ -68,6 +300,633 @@ logging.basicConfig( ) +class UnifiedDataCollector: + """统一数据收集器 - 解决数据重复和抓取不全问题""" + + def __init__(self, driver, duration_s: int = 60): + self.driver = driver + self.duration_s = duration_s + + # 统一数据存储 - 按mix_id去重 + self.collected_items: Dict[str, dict] = {} + + # 数据源统计 + self.source_stats = { + 'network': 0, + 'ssr': 0, + 'page': 0, + 'filtered': 0 + } + + # 已知请求ID集合,用于去重 + self.known_request_ids: Set[str] = set() + + # 目标关键词(收藏/合集/视频) + self.url_keywords = ['aweme', 'mix', 'collection', 'favorite', 'note', 'api'] + + # 是否在网络收集过程中周期性触发滚动加载(默认关闭以避免浪费时间) + self.enable_network_scroll: bool = False + + logging.info('统一数据收集器初始化完成') + + def collect_all_data(self) -> List[dict]: + """统一的数据收集入口 - 整合所有数据源""" + logging.info('开始统一数据收集') + + # 重置统计 + self.source_stats = {'network': 0, 'ssr': 0, 'page': 0, 'filtered': 0} + + # 按优先级收集数据 + self._collect_from_network() + self._collect_from_ssr() + self._collect_from_page() + + # 输出统计信息 + self._log_collection_stats() + + return list(self.collected_items.values()) + + def _collect_from_network(self): + """从网络API监控收集数据""" + logging.info('开始网络API数据收集') + start_time = time.time() + 
last_scroll_time = start_time + + while time.time() - start_time < self.duration_s: + try: + logs = self.driver.get_log('performance') + except Exception as e: + logging.warning(f'获取性能日志失败: {e}') + time.sleep(1) + continue + + for entry in logs: + try: + message = json.loads(entry['message'])['message'] + method = message.get('method') + params = message.get('params', {}) + + # 响应到达,尝试获取响应体 + if method == 'Network.responseReceived': + req_id = params.get('requestId') + url = params.get('response', {}).get('url', '') + type_ = params.get('type') # XHR, Fetch, Document + + if req_id and req_id not in self.known_request_ids: + self.known_request_ids.add(req_id) + + # 仅处理XHR/Fetch + if type_ in ('XHR', 'Fetch') and any(k in url for k in self.url_keywords): + try: + body_obj = self.driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': req_id}) + body_text = body_obj.get('body', '') + + # 可能是base64编码 + if body_obj.get('base64Encoded'): + try: + body_text = base64.b64decode(body_text).decode('utf-8', errors='ignore') + except Exception: + pass + + # 解析数据 + self._parse_and_add_item(body_text, url, req_id, 'network') + + except Exception: + # 某些响应不可获取或过大 + pass + except Exception: + continue + + # 在收集过程中定期触发数据加载(默认关闭) + if self.enable_network_scroll: + current_time = time.time() + if current_time - last_scroll_time > 15: # 降低频率:每15秒 + # 若检测到底部则不再滚动 + if not self._check_no_more_content(): + self._trigger_mini_scroll() + last_scroll_time = current_time + + time.sleep(0.8) + + logging.info(f'网络API数据收集完成,发现 {self.source_stats["network"]} 个有效项') + + def _trigger_mini_scroll(self): + """在数据收集过程中触发滚动加载数据 - 增强版滚动机制""" + try: + logging.info('开始触发滚动加载数据...') + + # 方式1:强力滚动策略 - 模拟真实用户行为 + try: + # 强力滚动:多次大幅度滚动确保触发懒加载 + for i in range(5): + # 计算滚动距离,递增以确保效果 + scroll_distance = 800 + (i * 300) + + # 执行强力滚动 + self.driver.execute_script(f""" + // 1. 强制滚动页面 + window.scrollBy(0, {scroll_distance}); + document.documentElement.scrollTop += {scroll_distance}; + document.body.scrollTop += {scroll_distance}; + + // 2. 滚动到页面底部(触发懒加载) + window.scrollTo(0, document.body.scrollHeight); + + // 3. 查找并滚动所有可能的容器 + const containers = document.querySelectorAll('[data-e2e="comment-list"], .comment-list, [class*="comment"], [class*="scroll"], [role="main"]'); + containers.forEach(container => {{ + if (container.scrollTop !== undefined) {{ + container.scrollTop = container.scrollHeight; + container.dispatchEvent(new Event('scroll', {{ bubbles: true }})); + }} + }}); + + // 4. 触发所有相关事件 + ['scroll', 'wheel', 'touchmove', 'resize'].forEach(eventType => {{ + window.dispatchEvent(new Event(eventType, {{ bubbles: true }})); + document.dispatchEvent(new Event(eventType, {{ bubbles: true }})); + }}); + + // 5. 
模拟用户交互 + document.body.click(); + + console.log('执行强力滚动:', {scroll_distance}, 'px'); + """) + + logging.info(f'第{i+1}次强力滚动,距离: {scroll_distance}px') + time.sleep(2) # 等待数据加载 + + # 检查是否有新数据加载 + current_height = self.driver.execute_script("return document.body.scrollHeight;") + logging.info(f'当前页面高度: {current_height}px') + + # 检查是否到达底部 + if self._check_no_more_content(): + logging.info('检测到页面底部,停止滚动') + break + + return + except Exception as e: + logging.debug(f'强力滚动失败: {e}') + + # 方式2:尝试滚动到特定元素 + try: + # 查找可能的加载更多按钮或元素 + load_more_selectors = [ + "[data-e2e='load-more']", + "[class*='load-more']", + "[class*='loadmore']", + "[class*='more']", + "button", + "[role='button']" + ] + + for selector in load_more_selectors: + try: + elements = self.driver.find_elements(By.CSS_SELECTOR, selector) + for element in elements: + if element.is_displayed(): + # 滚动到元素 + self.driver.execute_script("arguments[0].scrollIntoView();", element) + logging.info(f'滚动到元素: {selector}') + time.sleep(2) + # 尝试点击 + try: + element.click() + logging.info(f'点击加载更多按钮: {selector}') + time.sleep(3) + except: + pass + return + except: + continue + except Exception as e: + logging.debug(f'滚动到元素失败: {e}') + + # 方式3:渐进式滚动 + try: + current_position = self.driver.execute_script("return window.pageYOffset;") + page_height = self.driver.execute_script("return document.body.scrollHeight;") + window_height = self.driver.execute_script("return window.innerHeight;") + + logging.info(f'当前位置: {current_position}px, 页面高度: {page_height}px, 窗口高度: {window_height}px') + + # 如果页面高度很小,说明没有数据,需要触发加载 + if page_height < 2000: + # 多次滚动触发数据加载 + for i in range(5): + self.driver.execute_script(f"window.scrollTo(0, {500 * (i+1)});") + logging.info(f'渐进滚动 {i+1}: {500 * (i+1)}px') + time.sleep(2) + else: + # 正常滚动 + scroll_distance = min(1000, page_height - current_position - window_height) + if scroll_distance > 100: + new_position = current_position + scroll_distance + self.driver.execute_script(f'window.scrollTo(0, {new_position});') + logging.info(f'滚动到位置: {new_position}px') + time.sleep(2) + + return + except Exception as e: + logging.debug(f'渐进式滚动失败: {e}') + + # 方式4:检查是否已显示"暂时没有更多了" + if self._check_no_more_content(): + logging.info('已到达页面底部:暂时没有更多了') + return + + logging.info('滚动完成,等待数据加载...') + + except Exception as e: + logging.error(f'滚动触发失败: {e}') + + def _check_no_more_content(self) -> bool: + """检查是否已到达页面底部,没有更多内容""" + try: + # 检查多种可能的底部标识文本 + bottom_indicators = [ + "暂时没有更多了", + "没有更多内容", + "已加载全部", + "加载完毕" + ] + + for indicator in bottom_indicators: + try: + result = self.driver.execute_script(f""" + var elements = document.querySelectorAll('*'); + for (var i = 0; i < elements.length; i++) {{ + var text = elements[i].textContent || elements[i].innerText; + if (text.includes('{indicator}')) {{ + return true; + }} + }} + return false; + """) + if result: + logging.debug(f'检测到页面底部标识: "{indicator}"') + return True + except Exception: + continue + + return False + except Exception as e: + logging.debug(f'检查页面底部失败: {e}') + return False + + def _trigger_scroll_during_collection(self): + """在数据收集过程中触发数据加载 - 简化版,仅使用滚动""" + logging.info('在数据收集过程中触发滚动加载') + + try: + # 获取初始数据量 + initial_count = len(self.collected_items) + logging.info(f'滚动前数据量: {initial_count} 个短剧') + + # 仅使用强力滚动策略,不进行不必要的刷新和按钮点击 + self._trigger_mini_scroll() + + # 检查是否有新数据加载 + final_count = len(self.collected_items) + total_new = final_count - initial_count + logging.info(f'滚动加载完成: 初始 {initial_count} → 最终 {final_count} 个短剧 (总共新增: {total_new} 个)') + + except Exception as e: + 
logging.warning(f'滚动加载过程中出错: {e}') + + + def _collect_from_ssr(self): + """从SSR数据收集数据""" + logging.info('开始SSR数据收集') + + # 尝试直接从window对象获取 + keys = ['_SSR_HYDRATED_DATA', 'RENDER_DATA'] + for key in keys: + try: + data = self.driver.execute_script(f'return window.{key}') + if data: + text = json.dumps(data, ensure_ascii=False) + self._parse_and_add_item(text, f'page_{key}', None, 'ssr') + logging.info(f'从 {key} 中解析完成') + except Exception: + continue + + logging.info(f'SSR数据收集完成,发现 {self.source_stats["ssr"]} 个有效项') + + def _collect_from_page(self): + """从页面解析收集数据(兜底方案)""" + logging.info('开始页面数据收集(兜底方案)') + + try: + page_source = self.driver.page_source + self._parse_and_add_item(page_source, 'page_source', None, 'page') + + # 同时尝试识别statis结构中的play_vv + for m in re.findall(r'"statis"\s*:\s*\{[^}]*"play_vv"\s*:\s*(\d+)[^}]*\}', page_source): + try: + vv = int(m) + # 从页面源码中无法获取完整的合集信息,跳过这些不完整的数据 + logging.debug(f'从页面源码statis中发现播放量: {vv},但缺少完整信息,跳过') + except Exception: + pass + + except Exception: + pass + + logging.info(f'页面数据收集完成,发现 {self.source_stats["page"]} 个有效项') + + def _parse_and_add_item(self, text: str, source_url: str, request_id: str, source_type: str): + """解析文本数据并添加到统一存储""" + try: + # 尝试解析JSON数据 + if text.strip().startswith('{') or text.strip().startswith('['): + try: + data = json.loads(text) + self._extract_from_json_data(data, source_url, request_id, source_type) + return + except json.JSONDecodeError: + pass + + # 如果不是JSON,使用正则表达式查找 + self._extract_from_text_regex(text, source_url, request_id, source_type) + + except Exception as e: + logging.debug(f'解析 {source_type} 数据时出错: {e}') + + def _extract_from_json_data(self, data, source_url: str, request_id: str, source_type: str): + """从JSON数据中递归提取合集信息""" + def extract_mix_info(obj, path=""): + if isinstance(obj, dict): + # 检查是否包含有效的合集信息 + if self._is_valid_collection_data(obj): + item_data = self._build_item_data(obj, source_url, request_id, source_type) + if item_data: + self._add_item_with_validation(item_data, source_type) + + # 递归搜索子对象 + for key, value in obj.items(): + if isinstance(value, (dict, list)): + extract_mix_info(value, f"{path}.{key}" if path else key) + + elif isinstance(obj, list): + for i, item in enumerate(obj): + if isinstance(item, (dict, list)): + extract_mix_info(item, f"{path}[{i}]" if path else f"[{i}]") + + extract_mix_info(data) + + def _extract_from_text_regex(self, text: str, source_url: str, request_id: str, source_type: str): + """使用正则表达式从文本中提取信息""" + # 查找包含完整合集信息的JSON片段 + mix_pattern = r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)[^{}]*\}[^{}]*\}' + + for match in re.finditer(mix_pattern, text): + try: + mix_id = match.group(1) + mix_name = match.group(2) + vv = int(match.group(3)) + + # 构建基础数据 + item_data = { + 'mix_id': mix_id, + 'mix_name': mix_name, + 'play_vv': vv, + 'url': source_url, + 'request_id': request_id, + 'source_type': source_type, + 'timestamp': datetime.now().isoformat() + } + + # 验证并添加 + if self._validate_item(item_data): + self._add_item_with_validation(item_data, source_type) + + except Exception: + continue + + def _is_valid_collection_data(self, obj: dict) -> bool: + """检查是否为有效的收藏合集数据""" + # 必须有mix_id和statis字段 + if 'mix_id' not in obj or 'statis' not in obj: + return False + + # statis必须是字典且包含play_vv + statis = obj.get('statis', {}) + if not isinstance(statis, dict) or 'play_vv' not in statis: + return False + + # play_vv必须是有效数字 + play_vv = statis.get('play_vv') + if not isinstance(play_vv, (int, str)): + 
return False + + try: + vv = int(play_vv) + # 收藏合集的短剧播放量不可能为0 + if vv <= 0: + return False + except (ValueError, TypeError): + return False + + return True + + def _build_item_data(self, obj: dict, source_url: str, request_id: str, source_type: str) -> Optional[dict]: + """构建标准化的数据项""" + try: + mix_id = obj.get('mix_id', '') + mix_name = obj.get('mix_name', '') + + # 获取播放量(与_is_valid_collection_data方法保持一致) + play_vv = 0 + + # 方式1:从statis字段获取 + if 'statis' in obj and isinstance(obj['statis'], dict): + statis = obj['statis'] + if 'play_vv' in statis: + play_vv = statis['play_vv'] + + # 方式2:直接从对象中获取play_vv + if play_vv == 0 and 'play_vv' in obj: + play_vv = obj['play_vv'] + + # 方式3:从其他可能的字段获取 + if play_vv == 0: + for field in ['play_count', 'view_count', 'vv']: + if field in obj: + play_vv = obj[field] + break + + # 转换为整数 + if isinstance(play_vv, str) and play_vv.isdigit(): + play_vv = int(play_vv) + + # 数据验证 + if not mix_id or play_vv <= 0: + return None + + # 如果mix_name为空,使用mix_id作为名称 + if not mix_name or mix_name.strip() == "": + mix_name = f"短剧_{mix_id}" + logging.warning(f"⚠️ mix_name为空,使用mix_id作为名称: {mix_name}") + + # 构建合集链接 + video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else "" + + # 构建标准数据项 + item_data = { + 'mix_id': mix_id, + 'mix_name': mix_name, + 'play_vv': play_vv, + 'formatted': self._format_count(play_vv), + 'url': source_url, + 'request_id': request_id, + 'video_url': video_url, + 'source_type': source_type, + 'timestamp': datetime.now().isoformat() + } + + # 提取额外字段 + self._extract_additional_fields(obj, item_data) + + return item_data + + except Exception as e: + logging.debug(f'构建数据项失败: {e}') + return None + + def _extract_additional_fields(self, obj: dict, item_data: dict): + """提取额外的字段信息""" + # 提取合集封面图片URL + cover_image_url = "" + cover_image_backup_urls = [] + + # 查找封面图片字段 + for field in ['cover', 'cover_url', 'image', 'pic']: + if field in obj: + field_data = obj[field] + if isinstance(field_data, dict) and 'url_list' in field_data and field_data['url_list']: + cover_image_url = field_data['url_list'][0] + cover_image_backup_urls = field_data['url_list'][1:] if len(field_data['url_list']) > 1 else [] + break + elif isinstance(field_data, str): + cover_image_url = field_data + break + + item_data['cover_image_url'] = cover_image_url + item_data['cover_backup_urls'] = cover_image_backup_urls + + # 提取合集作者/影视工作室 + series_author = "" + for author_field in ['author', 'creator', 'user']: + if author_field in obj: + author_data = obj[author_field] + if isinstance(author_data, dict): + series_author = (author_data.get('nickname') or + author_data.get('unique_id') or + author_data.get('short_id') or + author_data.get('name') or '') + break + elif isinstance(author_data, str): + series_author = author_data + break + + item_data['series_author'] = series_author + + # 提取合集描述 + desc = "" + if 'desc' in obj and obj['desc']: + desc_value = str(obj['desc']).strip() + if desc_value: + desc = desc_value + + item_data['desc'] = desc + + # 提取合集总集数 + updated_to_episode = 0 + if 'statis' in obj and isinstance(obj['statis'], dict): + statis = obj['statis'] + if 'updated_to_episode' in statis: + try: + episodes = int(statis['updated_to_episode']) + if episodes > 0: + updated_to_episode = episodes + except ValueError: + pass + + item_data['updated_to_episode'] = updated_to_episode + + def _validate_item(self, item_data: dict) -> bool: + """验证数据项的有效性""" + # 基本字段验证 + mix_id = item_data.get('mix_id', '') + mix_name = item_data.get('mix_name', '') + play_vv = 
item_data.get('play_vv', 0) + + # 必须有mix_id和mix_name + if not mix_id or not mix_name: + return False + + # 播放量必须大于0(收藏合集的短剧不可能为0) + if play_vv <= 0: + return False + + # 排除占位名称 + if mix_name.startswith('短剧_') or '未知' in mix_name: + return False + + return True + + def _add_item_with_validation(self, item_data: dict, source_type: str): + """验证并添加数据项,包含实时去重""" + if not self._validate_item(item_data): + self.source_stats['filtered'] += 1 + return + + mix_id = item_data.get('mix_id') + + # 实时去重:保留播放量最大的版本 + if mix_id in self.collected_items: + existing = self.collected_items[mix_id] + current_play_vv = item_data.get('play_vv', 0) + existing_play_vv = existing.get('play_vv', 0) + + if current_play_vv > existing_play_vv: + # 当前数据更好,替换 + self.collected_items[mix_id] = item_data + logging.info(f'🔄 更新重复短剧: {item_data.get("mix_name")} (播放量: {existing_play_vv:,} → {current_play_vv:,})') + else: + # 已有数据更好,跳过 + logging.info(f'⏭️ 跳过重复短剧: {item_data.get("mix_name")} (当前: {current_play_vv:,}, 已有: {existing_play_vv:,})') + + # 记录去重统计 + logging.debug(f'去重统计: mix_id={mix_id}, 已有播放量={existing_play_vv:,}, 新播放量={current_play_vv:,}, 是否更新={current_play_vv > existing_play_vv}') + else: + # 新数据,直接添加 + self.collected_items[mix_id] = item_data + self.source_stats[source_type] += 1 + logging.info(f'✅ 添加新短剧: {item_data.get("mix_name")} - {item_data.get("play_vv", 0):,} 播放量') + + def _format_count(self, n: int) -> str: + """格式化数字显示""" + if n >= 100_000_000: + return f"{n/100_000_000:.1f}亿" + if n >= 10_000: + return f"{n/10_000:.1f}万" + return str(n) + + def _log_collection_stats(self): + """输出收集统计信息""" + logging.info('=' * 60) + logging.info('统一数据收集统计:') + logging.info(f' - 网络API: {self.source_stats["network"]} 个') + logging.info(f' - SSR数据: {self.source_stats["ssr"]} 个') + logging.info(f' - 页面解析: {self.source_stats["page"]} 个') + logging.info(f' - 过滤无效: {self.source_stats["filtered"]} 个') + logging.info(f' - 最终结果: {len(self.collected_items)} 个唯一短剧') + logging.info('=' * 60) + + class DouyinPlayVVScraper: def __init__(self, start_url: str = None, auto_continue: bool = False, duration_s: int = 60): self.start_url = start_url or "https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation" @@ -100,6 +959,22 @@ class DouyinPlayVVScraper: self._cleanup_chrome_cache_smart() self._setup_mongodb() self._load_image_cache() + + # 初始化评论总结器 + try: + # 检查配置是否存在 + if not hasattr(config, 'DEEPSEEK_CONFIG'): + logging.warning('⚠️ config.py 中未找到 DEEPSEEK_CONFIG 配置,将跳过评论总结功能') + self.comments_summarizer = None + else: + self.comments_summarizer = CommentsSummarizer() + logging.info('✅ 评论总结器初始化成功') + logging.info(f'📝 DeepSeek API 配置: model={config.DEEPSEEK_CONFIG.get("model")}, base={config.DEEPSEEK_CONFIG.get("api_base")}') + except Exception as e: + logging.warning(f'⚠️ 评论总结器初始化失败: {e},将跳过评论总结功能') + import traceback + logging.warning(f'详细错误: {traceback.format_exc()}') + self.comments_summarizer = None def _setup_mongodb(self): """设置MongoDB连接""" @@ -109,25 +984,41 @@ class DouyinPlayVVScraper: # 根据运行模式选择集合 is_timer_mode = os.environ.get('TIMER_MODE') == '1' - mongo_collection = 'Ranking_storage_list' if is_timer_mode else 'Rankings_list' + mongo_collection = 'Ranking_storage_list' if is_timer_mode else 'Rankings_management' self.collection = self.db[mongo_collection] - logging.info(f'MongoDB连接成功,使用数据库: {self.db.name},集合: {mongo_collection}') + # 新增:设置Rankings_management集合(每天替换的数据库) + self.management_collection = self.db['Rankings_management'] + + logging.info(f'MongoDB连接成功,使用数据库: {self.db.name}') + 
logging.info(f'主集合: {mongo_collection}(只增不删)') + logging.info(f'管理集合: Rankings_management(每天替换)') logging.info(f'当前运行模式: {"定时器模式" if is_timer_mode else "普通模式"}') except Exception as e: - logging.error(f'MongoDB连接失败: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': 'MongoDB连接设置' + } + logging.error(f'MongoDB连接失败: {error_details["error_type"]} - {error_details["error_message"]}') + logging.error(f'详细错误信息: {error_details["traceback"]}') + logging.error(f'错误上下文: {error_details["context"]}') self.db = None self.collection = None + self.management_collection = None def _load_image_cache(self): """从数据库加载已存在的图片ID到TOS链接的映射""" - if self.collection is None: + target_collection = self.collection # 使用根据模式选择的集合 + if target_collection is None: return try: # 查询所有有封面图片的记录 - cursor = self.collection.find( + cursor = target_collection.find( { 'cover_image_url_original': {'$exists': True, '$ne': ''}, 'cover_image_url': {'$exists': True, '$ne': ''} @@ -150,15 +1041,30 @@ class DouyinPlayVVScraper: logging.info(f'从数据库加载图片缓存: {cache_count} 个图片映射') except Exception as e: - logging.error(f'加载图片缓存失败: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': '从数据库加载图片缓存' + } + logging.error(f'加载图片缓存失败: {error_details["error_type"]} - {error_details["error_message"]}') + logging.error(f'详细错误信息: {error_details["traceback"]}') + logging.error(f'错误上下文: {error_details["context"]}') def _cleanup_old_profiles(self): """清理超过一天的旧临时Chrome配置文件""" try: script_dir = os.path.dirname(os.path.abspath(__file__)) - profile_base_dir = os.path.join(script_dir, 'config', 'chrome_profile') - if not os.path.exists(profile_base_dir): - return + # 清理两个配置目录的旧文件 + profile_dirs = [ + os.path.join(script_dir, 'config', 'chrome_profile_scraper'), + os.path.join(script_dir, 'config', 'chrome_profile_timer') + ] + + for profile_base_dir in profile_dirs: + if not os.path.exists(profile_base_dir): + continue current_time = time.time() one_day_ago = current_time - 24 * 60 * 60 # 24小时前 @@ -177,15 +1083,27 @@ class DouyinPlayVVScraper: # 如果无法解析时间戳,跳过 continue except Exception as e: - logging.warning(f'清理旧配置文件时出错: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': '清理超过一天的旧临时Chrome配置文件' + } + logging.warning(f'清理旧配置文件时出错: {error_details["error_type"]} - {error_details["error_message"]}') + logging.warning(f'详细错误信息: {error_details["traceback"]}') + logging.warning(f'错误上下文: {error_details["context"]}') def _cleanup_chrome_processes(self): """清理可能占用配置文件的Chrome进程""" try: - - # 获取当前配置文件路径 + # 获取当前配置文件路径(按模式隔离) script_dir = os.path.dirname(os.path.abspath(__file__)) - profile_dir = os.path.join(script_dir, 'config', 'chrome_profile', 'douyin_persistent') + is_timer_mode = os.environ.get('TIMER_MODE') == '1' + if is_timer_mode: + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent') + else: + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_scraper', 'douyin_persistent') # 查找使用该配置文件的Chrome进程 killed_processes = [] @@ -205,18 +1123,9 @@ class DouyinPlayVVScraper: time.sleep(2) return len(killed_processes) > 0 - except ImportError: - # 如果没有psutil,使用系统命令 - try: - result = subprocess.run(['taskkill', '/f', '/im', 'chrome.exe'], - capture_output=True, text=True, timeout=10) - if result.returncode == 
0: - logging.info('使用taskkill清理Chrome进程') - time.sleep(2) - return True - except Exception as e: - logging.warning(f'清理Chrome进程失败: {e}') + # 如果没有psutil,跳过清理以避免影响其他脚本实例 + logging.warning('psutil 不可用,跳过进程清理(避免全局终止 Chrome)') return False except Exception as e: logging.warning(f'清理Chrome进程时出错: {e}') @@ -239,14 +1148,20 @@ class DouyinPlayVVScraper: def _cleanup_chrome_cache_smart(self, size_threshold_mb=50): """智能清理Chrome配置文件缓存 - + Args: size_threshold_mb (int): 触发清理的大小阈值(MB),默认50MB """ try: script_dir = os.path.dirname(os.path.abspath(__file__)) - profile_dir = os.path.join(script_dir, 'config', 'chrome_profile', 'douyin_persistent') - + # 根据运行模式选择对应的配置目录 + is_timer_mode = os.environ.get('TIMER_MODE') == '1' + + if is_timer_mode: + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent') + else: + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_scraper', 'douyin_persistent') + if not os.path.exists(profile_dir): logging.info('Chrome配置文件目录不存在,跳过缓存清理') return False @@ -319,10 +1234,10 @@ class DouyinPlayVVScraper: def setup_driver(self): logging.info('初始化Chrome WebDriver (启用CDP网络日志)') - + # 清理可能占用配置文件的Chrome进程 self._cleanup_chrome_processes() - + chrome_options = Options() chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') @@ -334,9 +1249,20 @@ class DouyinPlayVVScraper: chrome_options.add_argument('--remote-debugging-port=0') chrome_options.add_argument('--start-maximized') chrome_options.add_argument('--lang=zh-CN') - # 使用固定的Chrome配置文件目录以保持登录状态 + + # 根据运行模式选择不同的Chrome配置文件目录 script_dir = os.path.dirname(os.path.abspath(__file__)) - profile_dir = os.path.join(script_dir, 'config', 'chrome_profile', 'douyin_persistent') + is_timer_mode = os.environ.get('TIMER_MODE') == '1' + + if is_timer_mode: + # 定时器模式使用独立的配置目录 + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent') + logging.info(f'[定时器模式] 使用独立Chrome配置文件: {profile_dir}') + else: + # 普通模式使用原有的配置目录 + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_scraper', 'douyin_persistent') + logging.info(f'[普通模式] 使用独立Chrome配置文件: {profile_dir}') + os.makedirs(profile_dir, exist_ok=True) chrome_options.add_argument(f'--user-data-dir={profile_dir}') logging.info(f'使用持久化Chrome配置文件: {profile_dir}') @@ -401,7 +1327,16 @@ class DouyinPlayVVScraper: else: logging.info(f'候选路径不存在: {p}') except Exception as e: - logging.warning(f'尝试使用 {p} 启动失败: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': f'尝试使用ChromeDriver路径: {p}' + } + logging.warning(f'尝试使用 {p} 启动失败: {error_details["error_type"]} - {error_details["error_message"]}') + logging.warning(f'详细错误信息: {error_details["traceback"]}') + logging.warning(f'错误上下文: {error_details["context"]}') if not driver_ready: # 最终回退:使用webdriver-manager(可能需要网络) @@ -411,7 +1346,17 @@ class DouyinPlayVVScraper: driver_ready = True logging.info('使用webdriver-manager成功启动ChromeDriver') except Exception as e: - raise RuntimeError('未能启动ChromeDriver。请手动下载匹配版本的chromedriver到项目根目录或PATH,或检查网络以允许webdriver-manager下载。错误: ' + str(e)) + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': '使用webdriver-manager启动ChromeDriver' + } + logging.error(f'webdriver-manager启动失败: {error_details["error_type"]} - {error_details["error_message"]}') + logging.error(f'详细错误信息: {error_details["traceback"]}') + 
logging.error(f'错误上下文: {error_details["context"]}') + raise RuntimeError(f'未能启动ChromeDriver。请手动下载匹配版本的chromedriver到项目根目录或PATH,或检查网络以允许webdriver-manager下载。错误类型: {error_details["error_type"]}, 错误信息: {error_details["error_message"]}') # 反检测 try: @@ -434,12 +1379,12 @@ class DouyinPlayVVScraper: def ensure_login(self): """确保用户已登录并导航到收藏合集页面""" logging.info("检测登录状态和页面位置...") - + # 首先检查是否已经登录并在正确页面 if self._check_login_and_page(): logging.info("检测到已登录且在收藏合集页面,跳过手动确认") return - + # 如果未登录或不在正确页面,进行手动登录流程 logging.info("请在弹出的浏览器中手动完成登录。") @@ -452,7 +1397,34 @@ class DouyinPlayVVScraper: time.sleep(3) # 等待页面加载 logging.info("自动模式:假设登录成功,继续执行...") except Exception as e: - logging.warning(f"自动模式导航失败: {e},继续执行...") + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': f'自动模式导航到起始URL: {self.start_url}' + } + logging.warning(f"自动模式导航失败: {error_details['error_type']} - {error_details['error_message']},继续执行...") + logging.warning(f'详细错误信息: {error_details["traceback"]}') + logging.warning(f'错误上下文: {error_details["context"]}') + return + + # 定时器模式下的登录检查 + is_timer_mode = os.environ.get('TIMER_MODE') == '1' + if is_timer_mode: + logging.info("定时器模式:检查浏览器登录状态...") + # 在定时器模式下,浏览器已经启动并导航到页面,现在检查登录状态 + if not self._check_login_and_page(): + logging.warning("定时器模式:检测到未登录状态,需要手动登录") + print("⚠️ 定时器浏览器未登录") + print(" 请在浏览器中完成抖音登录,并导航到【我的】→【收藏】→【合集】页面") + print(" 完成后按回车键继续...") + input() + # 重新检查登录状态 + if not self._check_login_and_page(): + logging.warning("定时器模式:登录确认后仍然未登录,继续执行...") + else: + logging.info("定时器模式:浏览器已登录,继续执行...") return logging.info("进入手动登录确认循环...") @@ -578,17 +1550,127 @@ class DouyinPlayVVScraper: return True # 改为假设已登录,避免卡住 def trigger_loading(self): - logging.info('触发数据加载:滚动 + 刷新') - # 滚动触发懒加载 - for i in range(8): - self.driver.execute_script(f'window.scrollTo(0, {i * 900});') - time.sleep(1.2) - # 刷新触发新请求 - self.driver.refresh() - time.sleep(4) - for i in range(6): - self.driver.execute_script(f'window.scrollTo(0, {i * 1200});') - time.sleep(1.3) + logging.info('触发数据加载:强力滚动直到"暂时没有更多了"') + + # 等待页面完全加载 + logging.info('等待页面完全加载...') + time.sleep(10) + + # 强力滚动策略 - 模拟真实用户行为,直到看到"暂时没有更多了" + max_scroll_attempts = 50 # 最大滚动尝试次数 + scroll_count = 0 + no_more_content_found = False + + while scroll_count < max_scroll_attempts and not no_more_content_found: + try: + scroll_count += 1 + logging.info(f'第{scroll_count}次强力滚动...') + + # 强力滚动:多次大幅度滚动确保触发懒加载 + scroll_distance = 800 + (scroll_count * 200) + + # 执行强力滚动JavaScript + self.driver.execute_script(f""" + // 1. 强制滚动页面 + window.scrollBy(0, {scroll_distance}); + document.documentElement.scrollTop += {scroll_distance}; + document.body.scrollTop += {scroll_distance}; + + // 2. 滚动到页面底部(触发懒加载) + window.scrollTo(0, document.body.scrollHeight); + + // 3. 查找并滚动所有可能的容器 + const containers = document.querySelectorAll('[data-e2e="comment-list"], .comment-list, [class*="comment"], [class*="scroll"], [role="main"], [class*="collection"], [class*="favorite"]'); + containers.forEach(container => {{ + if (container.scrollTop !== undefined) {{ + container.scrollTop = container.scrollHeight; + container.dispatchEvent(new Event('scroll', {{ bubbles: true }})); + }} + }}); + + // 4. 触发所有相关事件 + ['scroll', 'wheel', 'touchmove', 'resize'].forEach(eventType => {{ + window.dispatchEvent(new Event(eventType, {{ bubbles: true }})); + document.dispatchEvent(new Event(eventType, {{ bubbles: true }})); + }}); + + // 5. 
模拟用户交互 + document.body.click(); + + console.log('执行强力滚动:', {scroll_distance}, 'px'); + """) + + # 等待数据加载 + time.sleep(3) + + # 检查是否有新数据加载 + current_height = self.driver.execute_script("return document.body.scrollHeight;") + logging.info(f'当前页面高度: {current_height}px') + + # 检查是否到达底部 - 看到"暂时没有更多了" + no_more_content_found = self._check_no_more_content() + if no_more_content_found: + logging.info('✅ 检测到页面底部:"暂时没有更多了",停止滚动') + break + + # 检查页面高度是否不再增加(说明没有新内容加载) + if scroll_count > 5: + previous_height = current_height + time.sleep(2) + new_height = self.driver.execute_script("return document.body.scrollHeight;") + if new_height == previous_height: + logging.info('页面高度不再增加,可能已加载全部内容') + break + + except Exception as e: + logging.error(f'滚动过程中出错: {e}') + time.sleep(2) + + if no_more_content_found: + logging.info('🎉 成功滚动到页面底部,所有内容已加载完成') + else: + logging.info(f'达到最大滚动次数 {max_scroll_attempts},停止滚动') + + # 最终检查一次是否还有更多内容 + final_check = self._check_no_more_content() + if not final_check: + logging.info('⚠️ 最终检查:可能还有更多内容未加载') + + def _check_no_more_content(self) -> bool: + """检查是否已到达页面底部,没有更多内容""" + try: + # 检查多种可能的底部标识文本 + bottom_indicators = [ + "暂时没有更多了", + "没有更多内容", + "已加载全部", + "加载完毕", + "no more content", + "end of content" + ] + + for indicator in bottom_indicators: + try: + result = self.driver.execute_script(f""" + var elements = document.querySelectorAll('*'); + for (var i = 0; i < elements.length; i++) {{ + var text = elements[i].textContent || elements[i].innerText; + if (text.includes('{indicator}')) {{ + return true; + }} + }} + return false; + """) + if result: + logging.info(f'✅ 检测到页面底部标识: "{indicator}"') + return True + except Exception: + continue + + return False + except Exception as e: + logging.debug(f'检查页面底部失败: {e}') + return False def format_count(self, n: int) -> str: if n >= 100_000_000: @@ -651,452 +1733,131 @@ class DouyinPlayVVScraper: except Exception as e: logging.error(f'保存评论失败: {e}') - return None - - def parse_play_vv_from_text(self, text: str, source_url: str, request_id: str = None): - """解析文本中的play_vv、mix_name和watched_item信息""" - try: - # 尝试解析JSON数据 - if text.strip().startswith('{') or text.strip().startswith('['): - try: - data = json.loads(text) - self._extract_from_json_data(data, source_url, request_id) - return - except json.JSONDecodeError: - pass - - # 如果不是JSON,使用正则表达式查找 - self._extract_from_text_regex(text, source_url, request_id) - - except Exception as e: - logging.warning(f'解析文本数据时出错: {e}') + return None - def _extract_from_json_data(self, data, source_url: str, request_id: str = None): - """从JSON数据中递归提取合集信息""" - def extract_mix_info(obj, path=""): - if isinstance(obj, dict): - # 检查是否包含合集信息 - if 'mix_id' in obj and 'statis' in obj: - mix_id = obj.get('mix_id', '') - mix_name = obj.get('mix_name', '') - statis = obj.get('statis', {}) - - # 调试:输出包含mix_id的完整对象结构(仅输出前3个) - if len(self.play_vv_items) < 3: - logging.info(f"=== 调试:合集对象结构 ===") - logging.info(f"完整对象键: {list(obj.keys())}") - # 查找可能的视频相关字段和新增字段 - for key, value in obj.items(): - if 'aweme' in key.lower() or 'video' in key.lower() or 'item' in key.lower() or 'ids' in key.lower(): - logging.info(f"可能的视频字段 {key}: {type(value)} - {str(value)[:200]}") - # 检查新增字段相关的键 - elif any(keyword in key.lower() for keyword in ['author', 'creator', 'user', 'desc', 'description', 'total', 'count', 'episode']): - logging.info(f"可能的新字段 {key}: {type(value)} - {str(value)[:200]}") - - # 特别检查ids字段 - if 'ids' in obj: - ids_value = obj['ids'] - logging.info(f"ids字段详细信息: {type(ids_value)} - {ids_value}") - if 
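A condensed sketch of the two stop conditions used by trigger_loading and _check_no_more_content above: an end-of-list marker appearing in the page text, or a document height that stops growing between scroll passes. The marker strings come from the patch; the timing values and the innerText check are simplifying assumptions, not the exact element scan performed by the real code.

import time

END_MARKERS = ("暂时没有更多了", "没有更多内容", "已加载全部")

def scroll_until_exhausted(driver, max_rounds: int = 50, settle_s: float = 3.0) -> bool:
    """Scroll to the bottom repeatedly until an end-of-list marker is visible
    or the page height stops growing. Returns True when a marker was seen."""
    last_height = 0
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(settle_s)
        body_text = driver.execute_script("return document.body.innerText;")
        if any(marker in body_text for marker in END_MARKERS):
            return True
        height = driver.execute_script("return document.body.scrollHeight;")
        if height == last_height:
            break  # nothing new loaded since the previous round
        last_height = height
    return False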
isinstance(ids_value, list) and len(ids_value) > 0: - logging.info(f"ids列表长度: {len(ids_value)}") - logging.info(f"第一个ID: {ids_value[0]}") - if len(ids_value) > 1: - logging.info(f"第二个ID: {ids_value[1]}") - - if isinstance(statis, dict) and 'play_vv' in statis: - play_vv = statis.get('play_vv') - if isinstance(play_vv, (int, str)) and str(play_vv).isdigit(): - vv = int(play_vv) - # 构建合集链接 - video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else "" - - # 提取合集封面图片URL - 直接存储完整的图片链接 - cover_image_url = "" - cover_image_backup_urls = [] # 备用链接列表 - # 查找封面图片字段,优先获取完整的URL链接 - if 'cover' in obj: - cover = obj['cover'] - if isinstance(cover, dict) and 'url_list' in cover and cover['url_list']: - # 主链接 - cover_image_url = cover['url_list'][0] - # 备用链接 - cover_image_backup_urls = cover['url_list'][1:] if len(cover['url_list']) > 1 else [] - elif isinstance(cover, str): - cover_image_url = cover - elif 'cover_url' in obj: - cover_url = obj['cover_url'] - if isinstance(cover_url, dict) and 'url_list' in cover_url and cover_url['url_list']: - cover_image_url = cover_url['url_list'][0] - cover_image_backup_urls = cover_url['url_list'][1:] if len(cover_url['url_list']) > 1 else [] - elif isinstance(cover_url, str): - cover_image_url = cover_url - elif 'image' in obj: - image = obj['image'] - if isinstance(image, dict) and 'url_list' in image and image['url_list']: - cover_image_url = image['url_list'][0] - cover_image_backup_urls = image['url_list'][1:] if len(image['url_list']) > 1 else [] - elif isinstance(image, str): - cover_image_url = image - elif 'pic' in obj: - pic = obj['pic'] - if isinstance(pic, dict) and 'url_list' in pic and pic['url_list']: - cover_image_url = pic['url_list'][0] - cover_image_backup_urls = pic['url_list'][1:] if len(pic['url_list']) > 1 else [] - elif isinstance(pic, str): - cover_image_url = pic - - # 提取新增的三个字段 - series_author = "" - desc = "" - updated_to_episode = 0 - - # 提取合集作者/影视工作室 - if 'author' in obj: - author = obj['author'] - if isinstance(author, dict): - # 尝试多个可能的作者字段 - series_author = (author.get('nickname') or - author.get('unique_id') or - author.get('short_id') or - author.get('name') or '') - elif isinstance(author, str): - series_author = author - elif 'creator' in obj: - creator = obj['creator'] - if isinstance(creator, dict): - series_author = (creator.get('nickname') or - creator.get('unique_id') or - creator.get('name') or '') - elif isinstance(creator, str): - series_author = creator - elif 'user' in obj: - user = obj['user'] - if isinstance(user, dict): - series_author = (user.get('nickname') or - user.get('unique_id') or - user.get('name') or '') - elif isinstance(user, str): - series_author = user - - # 提取合集描述 - 扩展更多可能的字段 - description_fields = ['desc', 'share_info'] # 保持字段列表 - - # 先检查desc字段 - if 'desc' in obj and obj['desc']: - desc_value = str(obj['desc']).strip() - if desc_value: - desc = desc_value - logging.info(f"从desc提取到描述") - - # 如果desc中没有找到有效描述,检查share_info - if not desc and 'share_info' in obj and isinstance(obj['share_info'], dict): - share_desc = obj['share_info'].get('share_desc', '').strip() - if share_desc: - desc = share_desc - logging.info(f"从share_info.share_desc提取到描述") - - # 如果share_info中没有找到有效描述,继续检查desc字段 - if not desc: - for field in description_fields: - if field in obj and obj[field]: - desc_value = str(obj[field]).strip() - if desc_value: - desc = desc_value - logging.info(f"从{field}提取到描述") - break - - # 如果还没有找到描述,尝试从嵌套对象中查找desc字段 - if not desc: - def search_nested_desc(data, depth=0): - if depth > 3: # 
限制递归深度 - return None - - if isinstance(data, dict): - # 检查当前层级的desc字段 - if 'desc' in data and data['desc']: - desc_value = str(data['desc']).strip() - if 5 <= len(desc_value) <= 1000: - return desc_value - - # 递归检查嵌套对象 - for value in data.values(): - if isinstance(value, dict): - nested_result = search_nested_desc(value, depth + 1) - if nested_result: - return nested_result - return None - - desc = search_nested_desc(obj) - - - # 提取合集总集数 - 从statis字段中获取 - updated_to_episode = 0 # 初始化默认值 - if 'statis' in obj and isinstance(obj['statis'], dict): - statis = obj['statis'] - if 'updated_to_episode' in statis: - try: - episodes = int(statis['updated_to_episode']) - if episodes > 0: - updated_to_episode = episodes - logging.info(f"从statis.updated_to_episode提取到集数: {episodes}") - except ValueError: - logging.warning("updated_to_episode字段值无法转换为整数") - else: - logging.info("未找到statis字段或statis不是字典类型") - try: - episodes = int(obj['updated_to_episode']) - if episodes > 0: - updated_to_episode = episodes - logging.info(f"从updated_to_episode提取到集数: {episodes}") - except ValueError: - pass # 忽略无法转换为整数的情况 - - # 构建合集数据 - item_data = { - 'play_vv': vv, - 'formatted': self.format_count(vv), - 'url': source_url, - 'request_id': request_id, - 'mix_name': mix_name, - 'video_url': video_url, # 合集链接 - 'mix_id': mix_id, # 合集ID - 'cover_image_url': cover_image_url, # 合集封面图片主链接(完整URL) - 'cover_backup_urls': cover_image_backup_urls, # 封面图片备用链接列表 - 'series_author': series_author, # 合集作者/影视工作室 - 'desc': desc, # 合集描述 - 'updated_to_episode': updated_to_episode, # 合集总集数 - 'timestamp': datetime.now().isoformat() - } - - # 添加到列表(保持原有逻辑) - self.play_vv_items.append(item_data) - - # 实时保存到数据库 - if self.realtime_save_enabled: - self.save_single_item_realtime(item_data) - - logging.info(f'提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量') - if series_author: - logging.info(f' 作者: {series_author}') - if desc: - logging.info(f' 描述: {desc[:100]}{"..." 
if len(desc) > 100 else ""}') - if updated_to_episode > 0: - logging.info(f' 总集数: {updated_to_episode}') - - # 递归搜索子对象 - for key, value in obj.items(): - if isinstance(value, (dict, list)): - extract_mix_info(value, f"{path}.{key}" if path else key) - - elif isinstance(obj, list): - for i, item in enumerate(obj): - if isinstance(item, (dict, list)): - extract_mix_info(item, f"{path}[{i}]" if path else f"[{i}]") - - extract_mix_info(data) - - def _extract_from_text_regex(self, text: str, source_url: str, request_id: str = None): - """使用正则表达式从文本中提取信息""" - # 查找包含完整合集信息的JSON片段,包括statis中的updated_to_episode - mix_pattern = r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)[^{}]*"updated_to_episode"\s*:\s*(\d+)[^{}]*\}[^{}]*\}' - - for match in re.finditer(mix_pattern, text): - try: - mix_id = match.group(1) - mix_name = match.group(2) - vv = int(match.group(3)) - episodes = int(match.group(4)) - - # 构建合集链接 - video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else "" - - if episodes > 0: - logging.info(f"从statis.updated_to_episode提取到集数: {episodes}") - - # 构建合集数据 - item_data = { - 'play_vv': vv, - 'formatted': self.format_count(vv), - 'url': source_url, - 'request_id': request_id, - 'mix_name': mix_name, - 'video_url': video_url, # 合集链接 - 'mix_id': mix_id, # 合集ID - 'updated_to_episode': episodes if episodes > 0 else None, # 从statis.updated_to_episode提取的集数 - 'timestamp': datetime.now().isoformat() - } - - # 添加到列表(保持原有逻辑) - self.play_vv_items.append(item_data) - - # 实时保存到数据库 - if self.realtime_save_enabled: - self.save_single_item_realtime(item_data) - - logging.info(f'正则提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量') - except Exception: - continue - - # 兜底:查找单独的play_vv值 - for match in re.findall(r'"play_vv"\s*:\s*(\d+)', text): - try: - vv = int(match) - # 检查是否已经存在相同的play_vv - if not any(item['play_vv'] == vv for item in self.play_vv_items): - # 构建合集数据 - item_data = { - 'play_vv': vv, - 'formatted': self.format_count(vv), - 'url': source_url, - 'request_id': request_id, - 'mix_name': '', # 未知合集名称 - 'video_url': '', # 未知链接 - 'mix_id': '', # 未知mix_id - 'updated_to_episode': None, # 未知集数 - 'timestamp': datetime.now().isoformat() - } - - # 添加到列表(保持原有逻辑) - self.play_vv_items.append(item_data) - - # 实时保存到数据库(对于未知合集,可能不需要实时保存,但为了一致性还是保存) - if self.realtime_save_enabled: - self.save_single_item_realtime(item_data) - except Exception: - continue - - def collect_network_bodies(self, duration_s: int = None): - if duration_s is None: - duration_s = self.duration_s - logging.info(f'开始收集网络响应体,持续 {duration_s}s') - start = time.time() - known_request_ids = set() - - # 目标关键词(收藏/合集/视频) - url_keywords = ['aweme', 'mix', 'collection', 'favorite', 'note', 'api'] - - last_progress = 0 - while time.time() - start < duration_s: - try: - logs = self.driver.get_log('performance') - except Exception as e: - logging.warning(f'获取性能日志失败: {e}') - time.sleep(1) - continue - - for entry in logs: - try: - message = json.loads(entry['message'])['message'] - except Exception: - continue - - method = message.get('method') - params = message.get('params', {}) - - # 记录请求URL - if method == 'Network.requestWillBeSent': - req_id = params.get('requestId') - url = params.get('request', {}).get('url', '') - if any(k in url for k in url_keywords): - self.captured_responses.append({'requestId': req_id, 'url': url, 'type': 'request'}) - - # 响应到达,尝试获取响应体 - if method == 'Network.responseReceived': - req_id = params.get('requestId') - url = params.get('response', 
{}).get('url', '') - type_ = params.get('type') # XHR, Fetch, Document - if req_id and req_id not in known_request_ids: - known_request_ids.add(req_id) - # 仅处理XHR/Fetch - if type_ in ('XHR', 'Fetch') and any(k in url for k in url_keywords): - try: - body_obj = self.driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': req_id}) - body_text = body_obj.get('body', '') - # 可能是base64编码 - if body_obj.get('base64Encoded'): - try: - body_text = base64.b64decode(body_text).decode('utf-8', errors='ignore') - except Exception: - pass - - # 解析play_vv - self.parse_play_vv_from_text(body_text, url, req_id) - except Exception: - # 某些响应不可获取或过大 - pass - elapsed = int(time.time() - start) - if elapsed - last_progress >= 5: - last_progress = elapsed - logging.info(f'进度: {elapsed}/{duration_s}, 目标数量: {len(self.play_vv_items)}') - time.sleep(0.8) - - logging.info(f'网络收集完成,共发现 {len(self.play_vv_items)} 个目标') - - - def parse_ssr_data(self): - logging.info('尝试解析页面SSR数据') - # 尝试直接从window对象获取 - keys = ['_SSR_HYDRATED_DATA', 'RENDER_DATA'] - for key in keys: - try: - data = self.driver.execute_script(f'return window.{key}') - if data: - text = json.dumps(data, ensure_ascii=False) - self.parse_play_vv_from_text(text, f'page_{key}', None) - logging.info(f'从 {key} 中解析完成') - except Exception: - continue - - # 兜底:从page_source中正则查找 - try: - page_source = self.driver.page_source - self.parse_play_vv_from_text(page_source, 'page_source', None) - # 同时尝试识别statis结构中的play_vv - for m in re.findall(r'"statis"\s*:\s*\{[^}]*"play_vv"\s*:\s*(\d+)[^}]*\}', page_source): - try: - vv = int(m) - # 检查是否已经存在相同的play_vv - if not any(item['play_vv'] == vv for item in self.play_vv_items): - # 构建合集数据 - item_data = { - 'play_vv': vv, - 'formatted': self.format_count(vv), - 'url': 'page_source_statis', - 'request_id': None, - 'mix_name': '', # 从statis中无法获取合集名称 - 'video_url': '', # 从statis中无法获取链接 - 'timestamp': datetime.now().isoformat() - } - - # 添加到列表(保持原有逻辑) - self.play_vv_items.append(item_data) - - # 实时保存到数据库 - if self.realtime_save_enabled: - self.save_single_item_realtime(item_data) - except Exception: - pass - except Exception: - pass def dedupe(self): - # 去重按play_vv数值 - unique = [] - seen = set() + # 🔧 修复:按mix_id去重,保留播放量最大的那个 + # 原来的逻辑会导致播放量相同的不同短剧被误删 + unique_dict = {} # 使用字典存储,key是identifier,value是item + for item in self.play_vv_items: - vv = item['play_vv'] - if vv not in seen: - unique.append(item) - seen.add(vv) + mix_id = item.get('mix_id', '') + + # 如果没有mix_id,使用mix_name作为备用标识 + if not mix_id: + mix_name = item.get('mix_name', '') + identifier = f"name_{mix_name}" + else: + identifier = f"id_{mix_id}" + + # 如果是第一次遇到这个identifier,直接添加 + if identifier not in unique_dict: + unique_dict[identifier] = item + else: + # 如果已经存在,比较播放量,保留播放量大的 + existing_play_vv = unique_dict[identifier].get('play_vv', 0) + current_play_vv = item.get('play_vv', 0) + + if current_play_vv > existing_play_vv: + # 当前数据的播放量更大,替换 + logging.info(f'去重:发现重复短剧 {item.get("mix_name", "未知")},保留播放量更大的数据 ({existing_play_vv:,} → {current_play_vv:,})') + unique_dict[identifier] = item + else: + # 已有数据的播放量更大或相等,跳过当前数据 + logging.debug(f'去重:跳过重复的短剧 {item.get("mix_name", "未知")} (mix_id: {mix_id})') + + # 转换回列表 + unique = list(unique_dict.values()) + + removed_count = len(self.play_vv_items) - len(unique) + if removed_count > 0: + logging.info(f'去重完成:移除 {removed_count} 个重复项,保留 {len(unique)} 个唯一短剧') + else: + logging.info(f'去重完成:没有重复项,保留 {len(unique)} 个唯一短剧') + self.play_vv_items = unique def save_results(self): - if self.realtime_save_enabled and self.saved_items: - # 
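The rewritten dedupe() keeps one record per mix_id (falling back to mix_name when the id is missing) and prefers the record with the larger play_vv. A standalone sketch of that rule, convenient for unit-testing the behaviour outside the scraper; dedupe_by_mix is a hypothetical name.

def dedupe_by_mix(items):
    """Keep one item per mix_id (or mix_name when mix_id is empty),
    preferring the record with the higher play_vv."""
    best = {}
    for item in items:
        mix_id = item.get('mix_id', '')
        key = f"id_{mix_id}" if mix_id else f"name_{item.get('mix_name', '')}"
        if key not in best or item.get('play_vv', 0) > best[key].get('play_vv', 0):
            best[key] = item
    return list(best.values())

# Two captures of the same合集 with different play counts collapse to the larger one:
# dedupe_by_mix([{'mix_id': '1', 'play_vv': 10}, {'mix_id': '1', 'play_vv': 25}])
# -> [{'mix_id': '1', 'play_vv': 25}]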
实时保存模式:只更新排名和统计信息 - self.update_ranks_for_batch() - logging.info(f'[实时保存] 所有数据已通过实时保存功能保存到数据库,共 {len(self.saved_items)} 个合集') + if self.realtime_save_enabled: + # 🔧 修复:在数据收集完成后,统一进行实时保存 + logging.info(f'[实时保存] 开始保存 {len(self.play_vv_items)} 个合集的数据') logging.info(f'[实时保存] 批次ID: {self.batch_id}') + + # 先保存所有合集的基础信息(不获取详细内容) + for item_data in self.play_vv_items: + try: + logging.info(f'[实时保存] 保存合集基础信息: {item_data.get("mix_name", "未知")}') + self.save_collection_basic_info(item_data) + except Exception as e: + logging.error(f'[实时保存] 保存合集基础信息失败: {item_data.get("mix_name", "未知")} - {e}') + + # 更新排名 + try: + self.update_ranks_for_batch() + except Exception as e: + logging.error(f'[实时保存] 更新排名失败: {e}') + + # 然后逐个获取详细内容(如果需要) + logging.info(f'[实时保存] 基础信息保存完成,开始获取详细内容') + for item_data in self.play_vv_items: + try: + mix_id = item_data.get('mix_id', '') + mix_name = item_data.get('mix_name', '') + current_episode_count = item_data.get('updated_to_episode', 0) + + if mix_id and current_episode_count > 0: + # 查找已保存的文档ID + target_collection = self.collection + if target_collection is not None: + existing_doc = target_collection.find_one({'mix_id': mix_id}, {'_id': 1}) + if existing_doc: + document_id = existing_doc['_id'] + logging.info(f'[实时保存] 开始获取详细内容: {mix_name}') + + # 获取视频ID列表 + episode_video_ids = self.update_collection_video_ids( + document_id, mix_id, mix_name, current_episode_count + ) + + # 获取视频详细数据 + if episode_video_ids: + self.update_video_details_incrementally( + document_id, episode_video_ids, mix_name, mix_id + ) + + # 🎬 生成评论总结(在所有数据收集完成后) + self.generate_comments_summary(document_id, mix_name) + except Exception as e: + logging.error(f'[实时保存] 获取详细内容失败: {item_data.get("mix_name", "未知")} - {e}') + + logging.info(f'[实时保存] 所有数据处理完成,共 {len(self.saved_items)} 个合集') + + # 🔄 同步字段到 Ranking_storage(包括评论总结) + try: + logging.info('[字段同步] 🔄 开始同步字段到 Ranking_storage') + + # 导入同步函数 + import sys + import os + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'routers')) + from rank_api_routes import sync_ranking_storage_fields + + # 获取今天的日期 + today_str = datetime.now().strftime('%Y-%m-%d') + + # 执行同步(强制更新以确保评论总结被同步) + sync_result = sync_ranking_storage_fields(target_date=today_str, force_update=True) + + if sync_result.get("success", False): + logging.info(f'[字段同步] ✅ 同步成功: {sync_result.get("message", "")}') + else: + logging.info(f'[字段同步] ⚠️ 同步完成: {sync_result.get("message", "")}') + + except Exception as e: + logging.error(f'[字段同步] ❌ 同步失败: {e}') + # 同步失败不影响主流程 else: # 传统批量保存模式 self.save_to_mongodb() @@ -1104,12 +1865,18 @@ class DouyinPlayVVScraper: def update_ranks_for_batch(self): """为当前批次的数据更新排名""" - if self.collection is None or not self.saved_items: + target_collection = self.collection # 使用根据模式选择的集合 + if target_collection is None: + logging.warning('[实时保存] 数据库集合未初始化,跳过排名更新') + return + + if not self.saved_items: + logging.warning('[实时保存] 没有已保存的数据,跳过排名更新') return try: # 获取当前批次的所有数据,按播放量排序 - cursor = self.collection.find( + cursor = target_collection.find( {'batch_id': self.batch_id}, {'_id': 1, 'play_vv': 1, 'mix_name': 1} ).sort('play_vv', -1) @@ -1132,7 +1899,7 @@ class DouyinPlayVVScraper: ) if bulk_operations: - result = self.collection.bulk_write(bulk_operations) + result = target_collection.bulk_write(bulk_operations) logging.info(f'[实时保存] 成功更新 {result.modified_count} 个合集的排名') # 输出排名统计 @@ -1142,7 +1909,8 @@ class DouyinPlayVVScraper: except Exception as e: logging.error(f'[实时保存] 更新排名失败: {e}') - + + def extract_douyin_image_id(self, cover_url): """ 从抖音图片URL中提取唯一的图片ID @@ 
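A minimal sketch of the rank-update step performed by update_ranks_for_batch: sort the batch by play_vv descending and bulk-write sequential rank numbers. It assumes a pymongo collection and the batch_id field used in the patch.

from pymongo import UpdateOne

def assign_ranks(collection, batch_id: str) -> int:
    """Re-number `rank` for one batch, ordered by play_vv descending.
    Returns the number of documents whose rank changed."""
    cursor = collection.find({'batch_id': batch_id}, {'_id': 1}).sort('play_vv', -1)
    ops = [UpdateOne({'_id': doc['_id']}, {'$set': {'rank': rank}})
           for rank, doc in enumerate(cursor, start=1)]
    if not ops:
        return 0
    return collection.bulk_write(ops).modified_count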
-1354,7 +2122,8 @@ class DouyinPlayVVScraper: logging.info(f'开始获取合集 {mix_name} 的视频详细互动数据') video_details_list = self.get_collection_video_details( episode_video_ids=episode_video_ids, - mix_name=mix_name + mix_name=mix_name, + mix_id=mix_id ) # 构建每集的详细信息,使用获取到的真实数据 @@ -1423,10 +2192,35 @@ class DouyinPlayVVScraper: } for i in range(item.get('updated_to_episode', 0)) ] + # 生成评论总结 + comments_summary = '' + if self.comments_summarizer and episode_details: + try: + # 收集所有集的评论 + all_comments = [] + for episode in episode_details: + comments = episode.get('comments', []) + if comments: + all_comments.extend(comments) + + if all_comments: + logging.info(f'🎬 开始为短剧 {mix_name} 生成评论总结(共 {len(all_comments)} 条评论)') + comments_summary = self.comments_summarizer.summarize_comments(all_comments, mix_name) + if comments_summary: + logging.info(f'✅ 短剧 {mix_name} 评论总结生成成功') + else: + logging.warning(f'⚠️ 短剧 {mix_name} 评论总结生成失败') + else: + logging.info(f'ℹ️ 短剧 {mix_name} 没有评论,跳过总结') + except Exception as e: + logging.error(f'❌ 生成评论总结时出错: {e}') + comments_summary = '' + # 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + 新增字段 doc = { 'batch_time': batch_time, 'mix_name': mix_name, + 'mix_id': item.get('mix_id', ''), # 合集ID 'video_url': item.get('video_url', ''), 'playcount': item.get('formatted', ''), 'play_vv': item.get('play_vv', 0), @@ -1441,7 +2235,10 @@ class DouyinPlayVVScraper: 'desc': item.get('desc', ''), # 合集描述 'updated_to_episode': item.get('updated_to_episode', 0), # 合集总集数 'episode_video_ids': episode_video_ids, # 每一集的视频ID列表 - 'episode_details': episode_details # 每集的详细信息 + 'episode_details': episode_details, # 每集的详细信息 + 'Manufacturing_Field': item.get('Manufacturing_Field', ''), # 承制信息 + 'Copyright_field': item.get('Copyright_field', ''), # 版权信息 + 'comments_summary': comments_summary, # AI生成的评论总结 } documents.append(doc) @@ -1450,8 +2247,9 @@ class DouyinPlayVVScraper: for i, doc in enumerate(documents, 1): doc['rank'] = i - # 批量插入 - result = self.collection.insert_many(documents) + # 批量插入到目标集合(根据模式选择) + target_collection = self.collection # 使用根据模式选择的集合 + result = target_collection.insert_many(documents) logging.info(f'成功保存 {len(result.inserted_ids)} 条记录到MongoDB') # 输出统计信息 @@ -1459,7 +2257,7 @@ class DouyinPlayVVScraper: max_play_vv = max(doc['play_vv'] for doc in documents) if documents else 0 logging.info(f'MongoDB保存统计: 总播放量={total_play_vv:,}, 最高播放量={max_play_vv:,}') - logging.info(f'保存的字段: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url, cover_upload_success, series_author, desc, updated_to_episode') + logging.info(f'保存的字段: batch_time, mix_name, mix_id, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url, cover_upload_success, series_author, Manufacturing_Field, Copyright_field, desc, updated_to_episode') # 统计封面图片处理情况 cover_count = sum(1 for doc in documents if doc.get('cover_image_url')) @@ -1485,11 +2283,12 @@ class DouyinPlayVVScraper: return None try: - # 生成唯一标识用于去重 - item_key = f"{item_data.get('mix_id', '')}_{item_data.get('play_vv', 0)}" + # 生成唯一标识用于去重(只使用mix_id,不包含播放量) + mix_id = item_data.get('mix_id', '') + item_key = mix_id if item_key in self.saved_items: - logging.warning(f'[立即保存] 合集数据已存在,跳过保存: {item_data.get("mix_name", "")}') + logging.warning(f'[立即保存] 短剧已存在,跳过重复保存: {item_data.get("mix_name", "")} (mix_id: {mix_id})') return None # 增加序号 @@ -1546,6 +2345,10 @@ class DouyinPlayVVScraper: 'cover_upload_success': upload_success, 'cover_backup_urls': item_data.get('cover_backup_urls', []), 
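Flattening the per-episode comments before calling the summarizer is the core of the comments_summary step above. A small sketch of that reduction; summarize_comments follows the signature used in the patch, while the wrapper function and its name are illustrative only.

def build_comments_summary(summarizer, episode_details, mix_name: str) -> str:
    """Collect every episode's comments and ask the summarizer for one digest.
    Returns '' when there is no summarizer or no comments."""
    if summarizer is None:
        return ''
    all_comments = [c for episode in episode_details for c in episode.get('comments', [])]
    if not all_comments:
        return ''
    return summarizer.summarize_comments(all_comments, mix_name) or ''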
'series_author': item_data.get('series_author', ''), + 'Manufacturing_Field': item_data.get('Manufacturing_Field', ''), + 'Copyright_field': item_data.get('Copyright_field', ''), + 'classification_type': '', # 新增:类型/元素(锁定字段,初始为空) + 'release_date': '', # 新增:上线日期(锁定字段,初始为空) 'desc': item_data.get('desc', ''), 'updated_to_episode': current_episode_count, 'episode_video_ids': [], # 稍后更新 @@ -1556,14 +2359,208 @@ class DouyinPlayVVScraper: 'last_updated': datetime.now() } - # 插入文档 - result = self.collection.insert_one(doc) - document_id = result.inserted_id + # 根据运行模式选择数据库集合 + target_collection = self.collection # 使用根据模式选择的集合 + document_id = None + + # 保存到目标数据库(根据模式:定时器模式->Ranking_storage_list,普通模式->Rankings_management) + if target_collection is not None: + try: + # 为目标数据库准备文档数据 + target_doc = doc.copy() + target_doc['last_updated'] = datetime.now() + + # 检查是否已存在该短剧的记录 + existing_doc = target_collection.find_one({'mix_id': mix_id}) + + # 准备更新字段(不包含锁定字段,锁定字段将在后面单独处理) + set_fields = { + # 按照用户指定的字段顺序设置 + 'batch_id': target_doc.get('batch_id', ''), + 'batch_time': target_doc.get('batch_time', datetime.now()), + 'item_sequence': target_doc.get('item_sequence', 0), + 'mix_name': target_doc.get('mix_name', ''), + 'mix_id': mix_id, + 'video_url': target_doc.get('video_url', ''), + 'playcount': target_doc.get('playcount', ''), + 'play_vv': target_doc.get('play_vv', 0), + 'request_id': target_doc.get('request_id', ''), + 'rank': target_doc.get('rank', 0), + 'cover_image_url_original': target_doc.get('cover_image_url_original', ''), + 'cover_image_url': target_doc.get('cover_image_url', ''), + 'cover_upload_success': target_doc.get('cover_upload_success', True), + 'cover_backup_urls': target_doc.get('cover_backup_urls', []), + 'series_author': target_doc.get('series_author', ''), + 'desc': target_doc.get('desc', ''), + 'updated_to_episode': target_doc.get('updated_to_episode', 0), + 'episode_video_ids': target_doc.get('episode_video_ids', []), + 'episode_details': target_doc.get('episode_details', []), + 'data_status': target_doc.get('data_status', ''), + 'realtime_saved': target_doc.get('realtime_saved', True), + 'created_at': target_doc.get('created_at', datetime.now()), + 'last_updated': target_doc['last_updated'] + # 注意:分类字段 Novel_IDs, Anime_IDs, Drama_IDs 不在此处设置 + # 因为爬虫数据不包含这些用户手动设置的分类信息 + # 这些字段只在保护逻辑中处理,避免覆盖现有数据 + } + + # 锁定字段保护逻辑:检查field_lock_status来决定是否更新锁定字段 + # 规则:如果字段被用户锁定(field_lock_status中包含该字段),则跳过更新 + # 如果字段未被锁定,且现有记录中这些字段有值,则跳过更新(保持原值) + # 如果字段未被锁定,且现有记录中这些字段为空,且新数据有值,则更新 + # 如果是新记录,则使用新数据的值 + if existing_doc: + # 记录已存在,检查锁定字段保护 + existing_field_lock_status = existing_doc.get('field_lock_status', {}) + existing_manufacturing = existing_doc.get('Manufacturing_Field', '') + existing_copyright = existing_doc.get('Copyright_field', '') + existing_classification = existing_doc.get('classification_type', '') # 新增 + existing_release_date = existing_doc.get('release_date', '') # 新增 + existing_novel_ids = existing_doc.get('Novel_IDs', []) + existing_anime_ids = existing_doc.get('Anime_IDs', []) + existing_drama_ids = existing_doc.get('Drama_IDs', []) + + new_manufacturing = target_doc.get('Manufacturing_Field', '') + new_copyright = target_doc.get('Copyright_field', '') + # 注意:不从target_doc获取分类字段,因为爬虫数据不包含这些字段 + # 分类字段只能由用户手动设置,爬虫不应该更新它们 + new_novel_ids = [] # 爬虫数据不包含此字段 + new_anime_ids = [] # 爬虫数据不包含此字段 + new_drama_ids = [] # 爬虫数据不包含此字段 + + # Manufacturing_Field 保护逻辑 + if existing_field_lock_status.get('Manufacturing_Field_locked', False): + # 字段被用户锁定,跳过更新 + logging.info(f'[锁定字段] 
跳过Manufacturing_Field更新: {mix_name} -> 字段已被用户锁定') + elif existing_manufacturing: + # 现有字段有值,跳过更新(不添加到set_fields中) + logging.info(f'[锁定字段] 跳过Manufacturing_Field更新: {mix_name} -> 保持现有值 "{existing_manufacturing}"') + elif new_manufacturing: + # 现有字段为空,且新数据有值,则更新 + set_fields['Manufacturing_Field'] = new_manufacturing + logging.info(f'[锁定字段] 更新Manufacturing_Field: {mix_name} -> "{new_manufacturing}"') + # 如果现有为空且新数据也为空,则不设置该字段(保持为空) + + # Copyright_field 保护逻辑 + if existing_field_lock_status.get('Copyright_field_locked', False): + # 字段被用户锁定,跳过更新 + logging.info(f'[锁定字段] 跳过Copyright_field更新: {mix_name} -> 字段已被用户锁定') + elif existing_copyright: + # 现有字段有值,跳过更新(不添加到set_fields中) + logging.info(f'[锁定字段] 跳过Copyright_field更新: {mix_name} -> 保持现有值 "{existing_copyright}"') + elif new_copyright: + # 现有字段为空,且新数据有值,则更新 + set_fields['Copyright_field'] = new_copyright + logging.info(f'[锁定字段] 更新Copyright_field: {mix_name} -> "{new_copyright}"') + # 如果现有为空且新数据也为空,则不设置该字段(保持为空) + + # classification_type 保护逻辑(新增) + existing_classification = existing_doc.get('classification_type') + new_classification = target_doc.get('classification_type', '') + if existing_field_lock_status.get('classification_type_locked', False): + logging.info(f'[锁定字段] 跳过classification_type更新: {mix_name} -> 字段已被用户锁定') + elif existing_classification: + logging.info(f'[锁定字段] 跳过classification_type更新: {mix_name} -> 保持现有值 "{existing_classification}"') + else: + set_fields['classification_type'] = new_classification or '' + if new_classification: + logging.info(f'[锁定字段] 更新classification_type: {mix_name} -> "{new_classification}"') + else: + logging.info(f'[锁定字段] 初始化classification_type: {mix_name} -> 空值') + + # release_date 保护逻辑(新增) + existing_release_date = existing_doc.get('release_date') + new_release_date = target_doc.get('release_date', '') + if existing_field_lock_status.get('release_date_locked', False): + logging.info(f'[锁定字段] 跳过release_date更新: {mix_name} -> 字段已被用户锁定') + elif existing_release_date: + logging.info(f'[锁定字段] 跳过release_date更新: {mix_name} -> 保持现有值 "{existing_release_date}"') + else: + set_fields['release_date'] = new_release_date or '' + if new_release_date: + logging.info(f'[锁定字段] 更新release_date: {mix_name} -> "{new_release_date}"') + else: + logging.info(f'[锁定字段] 初始化release_date: {mix_name} -> 空值') + + # Novel_IDs 保护逻辑 + if existing_field_lock_status.get('Novel_IDs_locked', False): + # 字段被用户锁定,跳过更新 + logging.info(f'[锁定字段] 跳过Novel_IDs更新: {mix_name} -> 字段已被用户锁定') + elif existing_novel_ids and len(existing_novel_ids) > 0: + # 现有字段有值,跳过更新(不添加到set_fields中) + logging.info(f'[锁定字段] 跳过Novel_IDs更新: {mix_name} -> 保持现有值 {existing_novel_ids}') + elif new_novel_ids and len(new_novel_ids) > 0: + # 现有字段为空,且新数据有值,则更新 + set_fields['Novel_IDs'] = new_novel_ids + logging.info(f'[锁定字段] 更新Novel_IDs: {mix_name} -> {new_novel_ids}') + # 如果现有为空且新数据也为空,则不设置该字段(保持为空) + + # Anime_IDs 保护逻辑 + if existing_field_lock_status.get('Anime_IDs_locked', False): + # 字段被用户锁定,跳过更新 + logging.info(f'[锁定字段] 跳过Anime_IDs更新: {mix_name} -> 字段已被用户锁定') + elif existing_anime_ids and len(existing_anime_ids) > 0: + # 现有字段有值,跳过更新(不添加到set_fields中) + logging.info(f'[锁定字段] 跳过Anime_IDs更新: {mix_name} -> 保持现有值 {existing_anime_ids}') + elif new_anime_ids and len(new_anime_ids) > 0: + # 现有字段为空,且新数据有值,则更新 + set_fields['Anime_IDs'] = new_anime_ids + logging.info(f'[锁定字段] 更新Anime_IDs: {mix_name} -> {new_anime_ids}') + # 如果现有为空且新数据也为空,则不设置该字段(保持为空) + + # Drama_IDs 保护逻辑 + if existing_field_lock_status.get('Drama_IDs_locked', False): + # 字段被用户锁定,跳过更新 + logging.info(f'[锁定字段] 跳过Drama_IDs更新: 
{mix_name} -> 字段已被用户锁定') + elif existing_drama_ids and len(existing_drama_ids) > 0: + # 现有字段有值,跳过更新(不添加到set_fields中) + logging.info(f'[锁定字段] 跳过Drama_IDs更新: {mix_name} -> 保持现有值 {existing_drama_ids}') + elif new_drama_ids and len(new_drama_ids) > 0: + # 现有字段为空,且新数据有值,则更新 + set_fields['Drama_IDs'] = new_drama_ids + logging.info(f'[锁定字段] 更新Drama_IDs: {mix_name} -> {new_drama_ids}') + # 如果现有为空且新数据也为空,则不设置该字段(保持为空) + + else: + # 新记录,只设置非分类字段 + set_fields['Manufacturing_Field'] = target_doc.get('Manufacturing_Field', '') + set_fields['Copyright_field'] = target_doc.get('Copyright_field', '') + set_fields['classification_type'] = target_doc.get('classification_type', '') # 新增 + set_fields['release_date'] = target_doc.get('release_date', '') # 新增 + # 注意:不设置分类字段 Novel_IDs, Anime_IDs, Drama_IDs + # 因为爬虫数据不包含这些用户手动设置的分类信息 + # 新记录的分类字段将保持为空,等待用户手动设置 + logging.info(f'[锁定字段] 新记录,设置初始非分类字段(包含新增的2个锁定字段): {mix_name}') + + # 使用upsert操作:如果存在则更新,不存在则插入 + upsert_result = target_collection.update_one( + {'mix_id': mix_id}, # 查询条件 + { + '$set': set_fields, + '$setOnInsert': { + # 只在插入时设置的字段(如果字段已在$set中,则不需要在这里重复) + } + }, + upsert=True # 如果不存在则插入 + ) + + if upsert_result.upserted_id: + # 新插入的文档 + document_id = upsert_result.upserted_id + logging.info(f'[数据保存] ✅ 新短剧添加: {mix_name} - 文档ID: {document_id}') + else: + # 更新的现有文档 + existing_doc = target_collection.find_one({'mix_id': mix_id}, {'_id': 1}) + document_id = existing_doc['_id'] if existing_doc else None + logging.info(f'[数据保存] 🔄 已有短剧更新: {mix_name} - 文档ID: {document_id}') + + except Exception as e: + logging.error(f'[数据保存] 目标数据库操作失败: {mix_name} - 错误: {e}') # 记录已保存的项目 self.saved_items.add(item_key) - logging.info(f'[立即保存] ✅ 成功保存合集基础信息: {mix_name} (播放量: {item_data.get("play_vv", 0):,}) - 文档ID: {document_id}') + logging.info(f'[数据保存] 🎯 合集基础信息保存完成: {mix_name} (播放量: {item_data.get("play_vv", 0):,})') return document_id @@ -1573,7 +2570,8 @@ class DouyinPlayVVScraper: def update_collection_video_ids(self, document_id, mix_id: str, mix_name: str, current_episode_count: int): """更新合集的视频ID列表(第二阶段更新)""" - if not self.realtime_save_enabled or self.collection is None or not document_id: + target_collection = self.collection # 使用根据模式选择的集合 + if not self.realtime_save_enabled or target_collection is None or not document_id: return False try: @@ -1587,23 +2585,29 @@ class DouyinPlayVVScraper: ) if episode_video_ids: - # 更新数据库中的视频ID列表 - update_result = self.collection.update_one( - {'_id': document_id}, - { - '$set': { - 'episode_video_ids': episode_video_ids, - 'data_status': 'video_ids_updated', - 'last_updated': datetime.now() - } + # 管理数据库更新逻辑 + update_data = { + '$set': { + 'episode_video_ids': episode_video_ids, + 'data_status': 'video_ids_updated', + 'last_updated': datetime.now() } - ) + } - if update_result.modified_count > 0: - logging.info(f'[增量更新] ✅ 成功更新视频ID列表: {mix_name} - 共 {len(episode_video_ids)} 个视频') - return episode_video_ids - else: - logging.warning(f'[增量更新] 更新视频ID列表失败: {mix_name}') + # 更新目标数据库 + try: + # 根据mix_id查找目标数据库中的文档 + update_result = target_collection.update_one( + {'mix_id': mix_id}, + update_data + ) + if update_result.modified_count > 0: + logging.info(f'[数据更新] ✅ 视频ID列表更新完成: {mix_name} - 共 {len(episode_video_ids)} 个视频') + return episode_video_ids + else: + logging.warning(f'[数据更新] 视频ID列表更新失败: {mix_name}') + except Exception as e: + logging.error(f'[数据更新] 视频ID更新失败: {mix_name} - 错误: {e}') else: logging.warning(f'[增量更新] 未获取到视频ID: {mix_name}') @@ -1615,7 +2619,15 @@ class DouyinPlayVVScraper: def update_single_video_details(self, document_id, 
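The per-field protection above repeats one rule several times: a user-locked field is never overwritten, an existing non-empty value is kept, and only an empty stored value may be filled from the new data. A table-driven sketch of that rule under those assumptions; protect_field is a hypothetical helper, not part of the patch, and it omits the patch's extra logging and the classification_type/release_date initialisation to empty strings.

def protect_field(existing_doc: dict, field: str, new_value, set_fields: dict) -> None:
    """Apply the lock / keep / fill rule for one field of an existing document."""
    lock_status = existing_doc.get('field_lock_status', {})
    if lock_status.get(f'{field}_locked', False):
        return                          # user locked the field: never overwrite
    if existing_doc.get(field):
        return                          # existing non-empty value wins
    if new_value:
        set_fields[field] = new_value   # fill only when the stored value is empty

# usage sketch:
# for field in ('Manufacturing_Field', 'Copyright_field', 'classification_type', 'release_date'):
#     protect_field(existing_doc, field, target_doc.get(field, ''), set_fields)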
episode_number: int, video_id: str, video_details: dict, mix_name: str): """更新单个视频的详细数据(第三阶段增量更新)""" - if not self.realtime_save_enabled or self.collection is None or not document_id: + target_collection = self.collection # 使用根据模式选择的集合 + if not self.realtime_save_enabled or target_collection is None or not document_id: + return False + + # 确保 episode_number 是整数类型 + try: + episode_number = int(episode_number) + except (ValueError, TypeError): + logging.error(f'update_single_video_details: episode_number 类型转换失败: {episode_number}, 类型: {type(episode_number)}') return False try: @@ -1633,31 +2645,50 @@ class DouyinPlayVVScraper: 'data_status': 'completed' } - # 更新数据库中对应集数的详细信息 - update_result = self.collection.update_one( - {'_id': document_id}, - { - '$set': { - f'episode_details.{episode_number - 1}': episode_info, - 'last_updated': datetime.now() - } + # 双数据库更新逻辑 + update_data = { + '$set': { + f'episode_details.{episode_number - 1}': episode_info, + 'last_updated': datetime.now() } - ) + } - if update_result.modified_count > 0: - logging.info(f'[增量更新] ✅ 成功更新第 {episode_number} 集详细数据: {mix_name} - 点赞: {video_details.get("likes", 0):,}, 评论: {len(video_details.get("comments", []))}') - return True + # 更新目标数据库 + if target_collection is not None: + try: + # 直接使用document_id查找目标数据库中的文档 + update_result = target_collection.update_one( + {'_id': document_id}, + update_data + ) + if update_result.modified_count > 0: + logging.info(f'[数据更新] ✅ 第 {episode_number} 集详细数据更新完成: {mix_name} - 点赞: {video_details.get("likes", 0):,}, 评论: {len(video_details.get("comments", []))}') + return True + else: + logging.warning(f'[数据更新] 第 {episode_number} 集详细数据更新失败: {mix_name}') + return False + except Exception as e: + logging.error(f'[数据更新] 第 {episode_number} 集详细数据更新失败: {mix_name} - 错误: {e}') + return False else: - logging.warning(f'[增量更新] 更新第 {episode_number} 集详细数据失败: {mix_name}') + logging.warning(f'[数据更新] 目标数据库第 {episode_number} 集详细数据更新失败: {mix_name}') return False except Exception as e: logging.error(f'[增量更新] 更新第 {episode_number} 集详细数据失败: {mix_name} - 错误: {e}') return False - def update_video_comments_realtime(self, document_id, episode_number: int, new_comments: list = None, mix_name: str = '', interaction_data: dict = None): + def update_video_comments_realtime(self, document_id, episode_number: int, new_comments: list = None, mix_name: str = '', mix_id: str = '', interaction_data: dict = None): """实时更新视频评论和互动数据(第四阶段实时更新)""" - if not self.realtime_save_enabled or self.collection is None or not document_id: + target_collection = self.collection # 使用根据模式选择的集合 + if not self.realtime_save_enabled or target_collection is None or not document_id: + return False + + # 确保 episode_number 是整数类型 + try: + episode_number = int(episode_number) + except (ValueError, TypeError): + logging.error(f'episode_number 类型转换失败: {episode_number}, 类型: {type(episode_number)}') return False # 检查是否有数据需要更新 @@ -1695,38 +2726,117 @@ class DouyinPlayVVScraper: update_operations['$set'] = set_fields - # 执行更新 - update_result = self.collection.update_one( - {'_id': document_id}, - update_operations - ) - - if update_result.modified_count > 0: - # 构建日志信息 - log_parts = [] - if new_comments: - log_parts.append(f"追加 {len(new_comments)} 条评论") - if interaction_data: - interaction_summary = [] - if 'likes' in interaction_data: - interaction_summary.append(f"点赞={interaction_data.get('likes_formatted', interaction_data['likes'])}") - if 'shares' in interaction_data: - interaction_summary.append(f"分享={interaction_data.get('shares_formatted', 
interaction_data['shares'])}") - if 'favorites' in interaction_data: - interaction_summary.append(f"收藏={interaction_data.get('favorites_formatted', interaction_data['favorites'])}") - if interaction_summary: - log_parts.append(f"更新互动数据({', '.join(interaction_summary)})") - - logging.info(f'[实时更新] ✅ 成功{", ".join(log_parts)}: {mix_name} 第 {episode_number} 集') - return True + # 目标数据库更新逻辑 + if target_collection is not None: + try: + # 直接使用document_id查找目标数据库中的文档 + update_result = target_collection.update_one( + {'_id': document_id}, + update_operations + ) + if update_result.modified_count > 0: + # 构建日志信息 + log_parts = [] + if new_comments: + log_parts.append(f"追加 {len(new_comments)} 条评论") + if interaction_data: + interaction_summary = [] + if 'likes' in interaction_data: + interaction_summary.append(f"点赞={interaction_data.get('likes_formatted', interaction_data['likes'])}") + if 'shares' in interaction_data: + interaction_summary.append(f"分享={interaction_data.get('shares_formatted', interaction_data['shares'])}") + if 'favorites' in interaction_data: + interaction_summary.append(f"收藏={interaction_data.get('favorites_formatted', interaction_data['favorites'])}") + if interaction_summary: + log_parts.append(f"更新互动数据({', '.join(interaction_summary)})") + + logging.info(f'[目标数据库] ✅ 第 {episode_number} 集评论/互动数据更新完成: {mix_name} - {", ".join(log_parts)}') + return True + else: + logging.warning(f'[目标数据库] 第 {episode_number} 集评论/互动数据更新失败: {mix_name}') + return False + except Exception as e: + logging.error(f'[目标数据库] 第 {episode_number} 集评论/互动数据更新失败: {mix_name} - 错误: {e}') + return False else: - logging.warning(f'[实时更新] 更新失败: {mix_name} 第 {episode_number} 集') + logging.error(f'[目标数据库] 目标数据库未初始化') return False except Exception as e: - logging.error(f'[实时更新] 更新失败: {mix_name} 第 {episode_number} 集 - 错误: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': f'实时更新视频评论,合集: {mix_name}, 第 {episode_number} 集, 文档ID: {document_id}, 新评论数: {len(new_comments) if new_comments else 0}' + } + logging.error(f'[实时更新] 更新失败: {mix_name} 第 {episode_number} 集 - {error_details["error_type"]}: {error_details["error_message"]}') + logging.error(f'详细错误信息: {error_details["traceback"]}') + logging.error(f'错误上下文: {error_details["context"]}') return False + def generate_comments_summary(self, document_id, mix_name: str): + """生成评论总结并保存到数据库""" + logging.info(f'[评论总结] 🔍 检查评论总结条件: comments_summarizer={self.comments_summarizer is not None}, document_id={document_id}') + + if not self.comments_summarizer or not document_id: + if not self.comments_summarizer: + logging.warning(f'[评论总结] ⚠️ 评论总结器未初始化,跳过: {mix_name}') + if not document_id: + logging.warning(f'[评论总结] ⚠️ document_id 为空,跳过: {mix_name}') + return + + try: + # 从数据库获取最新的 episode_details + target_collection = self.collection + doc = target_collection.find_one({'_id': document_id}) + logging.info(f'[评论总结] 从数据库查询文档: 找到={doc is not None}') + + if not doc or not doc.get('episode_details'): + logging.warning(f'[评论总结] 未找到文档或episode_details为空: {mix_name}') + return + + # 🔍 检查是否已有评论总结 + existing_summary = doc.get('comments_summary', '') + if existing_summary: + logging.info(f'[评论总结] ⏭️ 短剧 {mix_name} 已有评论总结,跳过生成') + return + + logging.info(f'[评论总结] 🎬 开始为短剧 {mix_name} 生成评论总结') + + # 收集所有集的评论 + all_comments = [] + for episode in doc['episode_details']: + comments = episode.get('comments', []) + if comments: + all_comments.extend(comments) + + if not all_comments: + logging.info(f'[评论总结] ℹ️ 短剧 
{mix_name} 没有评论,跳过总结') + return + + logging.info(f'[评论总结] 共收集到 {len(all_comments)} 条评论') + comments_summary = self.comments_summarizer.summarize_comments(all_comments, mix_name) + + if comments_summary: + # 更新评论总结到数据库 + target_collection.update_one( + {'_id': document_id}, + {'$set': { + 'comments_summary': comments_summary, + 'last_updated': datetime.now() + }} + ) + logging.info(f'[评论总结] ✅ 短剧 {mix_name} 评论总结生成并保存成功') + logging.info(f'[评论总结] 📝 总结内容(前100字): {comments_summary[:100]}...') + else: + logging.warning(f'[评论总结] ⚠️ 短剧 {mix_name} 评论总结生成失败') + + except Exception as e: + logging.error(f'[评论总结] ❌ 生成评论总结时出错: {mix_name} - {e}') + import traceback + logging.error(f'详细错误: {traceback.format_exc()}') + def save_single_item_realtime(self, item_data: dict): """分阶段实时保存合集数据(新版本)""" logging.info(f'[分阶段保存] 开始处理合集: {item_data.get("mix_name", "未知")}') @@ -1746,11 +2856,54 @@ class DouyinPlayVVScraper: # 第三阶段:逐个获取并更新视频详细数据 if episode_video_ids: - self.update_video_details_incrementally(document_id, episode_video_ids, mix_name) + self.update_video_details_incrementally(document_id, episode_video_ids, mix_name, mix_id) + # 🔄 第四阶段:触发字段同步到Ranking_storage(如果存在对应的榜单数据) + try: + if mix_name: # 只有当mix_name存在时才尝试同步 + logging.info(f'[字段同步] 检查是否需要同步字段到Ranking_storage: {mix_name}') + + # 导入同步函数(延迟导入避免循环依赖) + import sys + import os + sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'routers')) + from rank_api_routes import sync_ranking_storage_fields + + # 获取今天的日期 + today_str = datetime.now().strftime('%Y-%m-%d') + + # 检查Ranking_storage中是否存在该短剧的今日数据 + ranking_storage_collection = db['Ranking_storage'] + existing_ranking = ranking_storage_collection.find_one({ + "date": today_str, + "mix_name": mix_name + }) + + if existing_ranking: + # 存在对应的榜单数据,触发同步 + logging.info(f'[字段同步] 发现对应的榜单数据,开始同步: {mix_name}') + sync_result = sync_ranking_storage_fields(target_date=today_str, force_update=False) + + if sync_result.get("success", False): + logging.info(f'[字段同步] ✅ 同步成功: {sync_result.get("message", "")}') + else: + logging.info(f'[字段同步] ⚠️ 同步完成: {sync_result.get("message", "")}') + else: + logging.info(f'[字段同步] 未找到对应的榜单数据,跳过同步: {mix_name}') + + except Exception as sync_error: + logging.warning(f'[字段同步] 同步失败,但不影响数据保存: {mix_name} - {sync_error}') + # 同步失败不影响数据保存的成功状态 + + logging.info(f'[分阶段保存] ✅ 前四阶段完成,准备生成评论总结: {mix_name}') + + # 🎬 第五阶段:生成评论总结(在所有数据收集完成后) + self.generate_comments_summary(document_id, mix_name) + + logging.info(f'[分阶段保存] ✅ 所有阶段完成: {mix_name}') return True - def update_video_details_incrementally(self, document_id, episode_video_ids: list, mix_name: str): + def update_video_details_incrementally(self, document_id, episode_video_ids: list, mix_name: str, mix_id: str = ''): """增量更新视频详细数据""" logging.info(f'[增量更新] 开始逐个获取视频详细数据: {mix_name}') @@ -1762,7 +2915,7 @@ class DouyinPlayVVScraper: try: # 获取单个视频的详细数据 logging.info(f'[增量更新] 获取第 {i}/{len(episode_video_ids)} 集视频详细数据: {mix_name}') - video_details = self.get_video_details(video_id, mix_name, document_id, i) + video_details = self.get_video_details(video_id, mix_name, mix_id, document_id, i) if video_details and video_details.get('success', False): # 立即更新到数据库 @@ -1798,18 +2951,16 @@ class DouyinPlayVVScraper: # 等待页面加载完成 try: - - WebDriverWait(self.driver, 10).until( EC.presence_of_element_located((By.TAG_NAME, "video")) ) except Exception as e: logging.warning(f'等待视频元素超时: {e}') - + # 获取网络请求日志 logs = self.driver.get_log('performance') video_info = {} - + for entry in logs: try: log = json.loads(entry['message'])['message'] @@ -1841,7 +2992,7 @@ 
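A compact outline of the staged save flow implemented by save_single_item_realtime: basic info upsert, episode ID resolution, per-episode details, then the AI comments digest. The method names match the patch (save_collection_basic_info returns the Mongo document id there); the wrapper itself is only an illustration of the ordering, and the Ranking_storage field-sync stage is omitted for brevity.

def staged_save(scraper, item: dict) -> bool:
    """Phased realtime save: each later phase depends on the earlier ones."""
    doc_id = scraper.save_collection_basic_info(item)            # phase 1: upsert by mix_id
    if not doc_id:
        return False
    mix_id = item.get('mix_id', '')
    mix_name = item.get('mix_name', '')
    episode_ids = scraper.update_collection_video_ids(            # phase 2: resolve video IDs
        doc_id, mix_id, mix_name, item.get('updated_to_episode', 0))
    if episode_ids:
        scraper.update_video_details_incrementally(                # phase 3: likes/comments per episode
            doc_id, episode_ids, mix_name, mix_id)
    scraper.generate_comments_summary(doc_id, mix_name)            # final phase: AI digest of all comments
    return True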
class DouyinPlayVVScraper: break except Exception as e: logging.warning(f'解析日志条目时出错: {e}') - + return video_info def get_collection_videos(self, mix_id: str, mix_name: str = '', current_episode_count: int = 0) -> list: @@ -1918,26 +3069,34 @@ class DouyinPlayVVScraper: } all_videos = [] - + # 使用服务端提供的游标进行分页,而不是使用 len(all_videos) + cursor = 0 + seen_cursors = set() + while True: + # 将当前游标设置到请求参数(字符串以兼容部分接口) + params['cursor'] = str(cursor) + response = requests.get( 'https://www.douyin.com/aweme/v1/web/mix/aweme/', params=params, cookies=self.get_cookies_dict(), headers=headers ) - + if response.status_code != 200: logging.error(f"请求失败: {response.status_code}") logging.error(f"响应内容: {response.text}") break - + try: data = response.json() - aweme_list = data.get('aweme_list', []) + # 兼容可能的列表字段名 + aweme_list = data.get('aweme_list') or data.get('mix_aweme_list') or [] if not aweme_list: + logging.info("当前页无视频,结束分页") break - + for aweme in aweme_list: video_id = aweme.get('aweme_id') if video_id: @@ -1945,14 +3104,31 @@ class DouyinPlayVVScraper: 'video_id': video_id, 'episode_num': int(aweme.get('episode_num', 0)) }) - - has_more = data.get('has_more', False) - if not has_more: + + # 读取服务端分页标识 + has_more = data.get('has_more') or data.get('hasMore') or False + next_cursor = ( + data.get('cursor') or + data.get('next_cursor') or + data.get('max_cursor') or + data.get('min_cursor') + ) + + logging.info(f"分页: cursor={cursor}, next_cursor={next_cursor}, has_more={has_more}, 本页视频={len(aweme_list)}, 累计={len(all_videos)}") + + # 退出条件:没有更多或没有有效下一游标 + if not has_more or not next_cursor: break - - params['cursor'] = str(len(all_videos)) + + # 防止重复游标导致的死循环 + if next_cursor in seen_cursors: + logging.warning(f"检测到重复游标 {next_cursor},停止分页以避免死循环") + break + + seen_cursors.add(next_cursor) + cursor = next_cursor time.sleep(1) - + except json.JSONDecodeError as e: logging.error(f"JSON解析错误: {e}") logging.error(f"响应内容: {response.text}") @@ -2002,16 +3178,25 @@ class DouyinPlayVVScraper: return [] def _simulate_comment_scrolling(self, video_id: str, max_scroll_attempts: int = 10, scroll_delay: float = 2.0, - document_id=None, episode_number: int = 0, mix_name: str = '') -> list: + document_id=None, episode_number: int = 0, mix_name: str = '', mix_id: str = '', + max_comments: int = 100) -> list: """ 模拟用户异步滑动机制,向上滑动加载更多评论 Args: video_id: 视频ID max_scroll_attempts: 最大滑动尝试次数,默认10次 scroll_delay: 每次滑动后的延迟时间(秒),默认2秒 + max_comments: 每集最大评论数量限制,默认100条 Returns: list: 收集到的所有评论数据 """ + # 确保 episode_number 是整数类型 + try: + episode_number = int(episode_number) + except (ValueError, TypeError): + logging.error(f'_simulate_comment_scrolling: episode_number 类型转换失败: {episode_number}, 类型: {type(episode_number)}') + episode_number = 0 + # 检查是否应该跳过评论滑动(仅在定时器模式下跳过) if should_skip_function('scroll_comments'): logging.info(f'🚀 定时器模式:跳过视频 {video_id} 的评论滑动加载') @@ -2049,7 +3234,7 @@ class DouyinPlayVVScraper: # 同时提交监控任务 - 监控任务会检测滑动任务状态(5小时超时) monitor_future = executor.submit(self._async_monitor_task_with_state, video_id, collected_comment_ids, shared_state, 18000, - document_id, episode_number, mix_name) + document_id, episode_number, mix_name, mix_id, max_comments) # 等待两个任务完成 scroll_result = scroll_future.result() @@ -2059,82 +3244,6 @@ class DouyinPlayVVScraper: logging.info(f'评论滑动加载完成,共收集到 {len(all_comments)} 条评论') - # 针对评论较少的情况,执行补丁机制确保不遗漏评论 - # 当滑动次数较少(可能只滑动了2-3次就到底)但评论数量也较少时, - # 很可能存在页面上可见但未被网络日志捕获的评论 - - # 智能判断是否需要执行补丁机制 - # 只在评论数量真正过少时才启用补丁机制 - should_apply_patch = False - - # 只有当评论数量少于10条时才启用补丁机制 - if len(all_comments) < 10: - 
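The pagination rewrite above trusts the server-provided cursor instead of len(all_videos) and stops when a cursor repeats. A generic sketch of that loop; the endpoint, field names and duplicate-cursor guard follow the patch, while the timeout value is an added assumption.

import time
import requests

def fetch_all_pages(url: str, params: dict, cookies: dict, headers: dict) -> list:
    """Follow server-side cursors until has_more is false, the next cursor is
    missing, or a cursor repeats (which would otherwise loop forever)."""
    items, cursor, seen = [], 0, set()
    while True:
        params['cursor'] = str(cursor)
        resp = requests.get(url, params=params, cookies=cookies, headers=headers, timeout=15)
        if resp.status_code != 200:
            break
        data = resp.json()
        page = data.get('aweme_list') or []
        if not page:
            break
        items.extend(page)
        next_cursor = data.get('cursor') or data.get('max_cursor')
        if not data.get('has_more') or not next_cursor or next_cursor in seen:
            break
        seen.add(next_cursor)
        cursor = next_cursor
        time.sleep(1)  # be polite between pages
    return items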
should_apply_patch = True - logging.debug(f'评论数量过少({len(all_comments)}条),启用补丁机制') - - # 对于评论数量在10-50条之间的情况,检查是否可能遗漏了评论 - elif len(all_comments) <= 50: - try: - visible_comment_count = self.driver.execute_script(""" - var selectors = [ - '[data-e2e="comment-item"]', - '[class*="comment-item"]', - '[class*="comment-content"]' - ]; - var totalCount = 0; - selectors.forEach(function(selector) { - var elements = document.querySelectorAll(selector); - elements.forEach(function(element) { - if (element.offsetParent !== null && element.textContent.trim().length > 2) { - totalCount++; - } - }); - }); - return totalCount; - """) - - # 只有当页面可见评论数量明显大于已获取数量时才启用补丁 - if visible_comment_count > len(all_comments) * 2: - should_apply_patch = True - logging.debug(f'页面可见评论({visible_comment_count}条) >> 已获取评论({len(all_comments)}条),启用补丁机制') - - except Exception as e: - logging.debug(f'检查页面可见评论数量失败: {e}') - # 检查失败时不启用补丁机制 - - patch_comments = [] - if should_apply_patch: - logging.info('执行评论补丁机制...') - patch_comments = self._extract_comments_patch(video_id) - else: - logging.debug('无需执行补丁机制') - - if patch_comments: - # 去重合并补丁评论 - existing_texts = {comment.get('text', '') for comment in all_comments} - new_patch_comments = [] - - for patch_comment in patch_comments: - if patch_comment.get('text', '') not in existing_texts: - new_patch_comments.append(patch_comment) - existing_texts.add(patch_comment.get('text', '')) - - if new_patch_comments: - all_comments.extend(new_patch_comments) - logging.info(f'补丁机制额外获取到 {len(new_patch_comments)} 条评论,总计 {len(all_comments)} 条评论') - - # 如果有新的评论且启用了实时保存,更新数据库 - if document_id and episode_number and new_patch_comments: - try: - self.update_video_comments_realtime(document_id, episode_number, new_patch_comments, mix_name) - logging.info(f'实时保存补丁评论到数据库: {len(new_patch_comments)} 条') - except Exception as e: - logging.warning(f'实时保存补丁评论失败: {e}') - else: - logging.debug('补丁机制未发现新的评论') - else: - logging.debug('补丁机制未获取到任何评论') - # 保存评论到文件 if all_comments: self.save_comments_to_file(all_comments, video_id) @@ -2147,7 +3256,16 @@ class DouyinPlayVVScraper: return all_comments except Exception as e: - logging.error(f'评论滑动加载机制执行失败: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': f'评论滑动加载机制,视频ID: {video_id}, 最大滑动次数: {max_scroll_attempts}' + } + logging.error(f'评论滑动加载机制执行失败: {error_details["error_type"]} - {error_details["error_message"]}') + logging.error(f'详细错误信息: {error_details["traceback"]}') + logging.error(f'错误上下文: {error_details["context"]}') return all_comments @@ -2164,6 +3282,12 @@ class DouyinPlayVVScraper: attempt += 1 logging.info(f'第 {attempt} 次向上滑动') + # 检查监控任务是否通知停止 + with shared_state['lock']: + if shared_state['scroll_completed']: + logging.info('收到监控任务停止信号,滑动任务结束') + break + # 记录滑动前的位置 current_position = self.driver.execute_script("return window.pageYOffset;") @@ -2410,14 +3534,31 @@ class DouyinPlayVVScraper: time.sleep(1) except Exception as e: - logging.warning(f'监控任务出错: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': f'异步监控评论任务,视频ID: {video_id}, 超时时间: {timeout}秒' + } + logging.warning(f'监控任务出错: {error_details["error_type"]} - {error_details["error_message"]}') + logging.warning(f'详细错误信息: {error_details["traceback"]}') + logging.warning(f'错误上下文: {error_details["context"]}') time.sleep(2) return all_comments def _async_monitor_task_with_state(self, 
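A minimal sketch of the two-task coordination used for comment loading: the scroll task checks a shared flag before each pass, and the monitor task sets that flag once max_comments have been collected or the timeout expires. Names, timings and the callable parameters are illustrative assumptions; the real monitor also parses CDP network logs and saves comments incrementally.

import threading
import time
from concurrent.futures import ThreadPoolExecutor

def run_scroll_and_monitor(scroll_once, poll_comments, max_comments=100, timeout=60.0):
    """scroll_once() performs one scroll pass; poll_comments() returns comments seen so far."""
    state = {'lock': threading.Lock(), 'scroll_completed': False}

    def scroller():
        while True:
            with state['lock']:
                if state['scroll_completed']:
                    return
            scroll_once()
            time.sleep(1)

    def monitor():
        start = time.time()
        comments = []
        while time.time() - start < timeout:
            comments = poll_comments()
            if len(comments) >= max_comments:
                break  # enough comments collected, tell the scroller to stop
            time.sleep(1)
        with state['lock']:
            state['scroll_completed'] = True  # stop the scroller on limit or timeout
        return comments[:max_comments]

    with ThreadPoolExecutor(max_workers=2) as pool:
        pool.submit(scroller)
        return pool.submit(monitor).result()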
video_id: str, collected_comment_ids: set, shared_state: dict, timeout: float, - document_id=None, episode_number: int = 0, mix_name: str = '') -> list: + document_id=None, episode_number: int = 0, mix_name: str = '', mix_id: str = '', + max_comments: int = 100) -> list: """带状态的异步监控任务 - 监控评论并检测滑动任务状态""" + # 确保 episode_number 是整数类型 + try: + episode_number = int(episode_number) + except (ValueError, TypeError): + logging.error(f'_async_monitor_task_with_state: episode_number 类型转换失败: {episode_number}, 类型: {type(episode_number)}') + episode_number = 0 + all_comments = [] start_time = time.time() last_comment_count = 0 @@ -2445,7 +3586,16 @@ class DouyinPlayVVScraper: all_comments.append(comment) time.sleep(0.5) except Exception as e: - logging.warning(f'最终监控阶段出错: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': f'最终监控阶段,视频ID: {video_id}, 剩余监控时间: {5 - (time.time() - final_start):.1f}秒' + } + logging.warning(f'最终监控阶段出错: {error_details["error_type"]} - {error_details["error_message"]}') + logging.warning(f'详细错误信息: {error_details["traceback"]}') + logging.warning(f'错误上下文: {error_details["context"]}') break # 从网络日志获取新评论 @@ -2462,7 +3612,7 @@ class DouyinPlayVVScraper: # 实时保存新评论到数据库 if new_comments_to_save and document_id and episode_number > 0: - self.update_video_comments_realtime(document_id, episode_number, new_comments_to_save, mix_name) + self.update_video_comments_realtime(document_id, episode_number, new_comments_to_save, mix_name, mix_id) # 检查是否有新评论 current_comment_count = len(all_comments) @@ -2476,15 +3626,32 @@ class DouyinPlayVVScraper: if no_new_comments_count % 30 == 0: logging.info(f'监控中...当前总计 {current_comment_count} 条评论,等待滑动任务完成') + # 检查是否达到评论数量限制 + if current_comment_count >= max_comments: + logging.info(f'已收集到 {current_comment_count} 条评论,达到限制数量 {max_comments},通知滑动任务停止') + with shared_state['lock']: + shared_state['scroll_completed'] = True + break + # 短暂等待后继续监控 time.sleep(1) except Exception as e: - logging.warning(f'监控任务出错: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': f'带状态的异步监控评论任务,视频ID: {video_id}, 超时时间: {timeout}秒, 文档ID: {document_id}, 集数: {episode_number}' + } + logging.warning(f'监控任务出错: {error_details["error_type"]} - {error_details["error_message"]}') + logging.warning(f'详细错误信息: {error_details["traceback"]}') + logging.warning(f'错误上下文: {error_details["context"]}') time.sleep(2) logging.info(f'监控任务结束,共收集到 {len(all_comments)} 条评论') - return all_comments + # 确保只返回前max_comments条评论 + return all_comments[:max_comments] def _scroll_to_comment_section(self): """滚动到评论区域""" @@ -2585,7 +3752,16 @@ class DouyinPlayVVScraper: logging.debug(f'点击页面中部失败: {e}') except Exception as e: - logging.warning(f'点击评论区域失败: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': f'点击评论区域,尝试激活评论加载' + } + logging.warning(f'点击评论区域失败: {error_details["error_type"]} - {error_details["error_message"]}') + logging.warning(f'详细错误信息: {error_details["traceback"]}') + logging.warning(f'错误上下文: {error_details["context"]}') def _check_comment_section_bottom(self) -> bool: """ @@ -2696,7 +3872,16 @@ class DouyinPlayVVScraper: return False except Exception as e: - logging.warning(f'检测评论区底部失败: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 
'traceback': traceback.format_exc(), + 'context': f'检测评论区底部,目标文本: "暂时没有更多评论"' + } + logging.warning(f'检测评论区底部失败: {error_details["error_type"]} - {error_details["error_message"]}') + logging.warning(f'详细错误信息: {error_details["traceback"]}') + logging.warning(f'错误上下文: {error_details["context"]}') return False def _extract_comments_from_network_logs(self, video_id: str) -> list: @@ -2757,17 +3942,33 @@ class DouyinPlayVVScraper: continue except Exception as e: - logging.warning(f'提取网络日志评论数据失败: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': f'提取网络日志评论数据,视频ID: {video_id}, 已处理评论数: {len(comments)}' + } + logging.warning(f'提取网络日志评论数据失败: {error_details["error_type"]} - {error_details["error_message"]}') + logging.warning(f'详细错误信息: {error_details["traceback"]}') + logging.warning(f'错误上下文: {error_details["context"]}') return comments - def get_video_details(self, video_id: str, mix_name: str = '', document_id=None, episode_number: int = 0) -> dict: + def get_video_details(self, video_id: str, mix_name: str = '', mix_id: str = '', document_id=None, episode_number: int = 0) -> dict: """获取单个视频的详细互动数据 Args: video_id: 视频ID Returns: dict: 包含点赞数、收藏数、转发数、评论内容的字典 """ + # 确保 episode_number 是整数类型 + try: + episode_number = int(episode_number) + except (ValueError, TypeError): + logging.error(f'get_video_details: episode_number 类型转换失败: {episode_number}, 类型: {type(episode_number)}') + episode_number = 0 + video_details = { 'video_id': video_id, 'likes': 0, @@ -2873,7 +4074,7 @@ class DouyinPlayVVScraper: 'favorites': video_details['favorites'], 'favorites_formatted': video_details['favorites_formatted'] } - self.update_video_comments_realtime(document_id, episode_number, None, mix_name, interaction_data) + self.update_video_comments_realtime(document_id, episode_number, None, mix_name, mix_id, interaction_data) interaction_data_saved = True break @@ -2888,7 +4089,7 @@ class DouyinPlayVVScraper: # 启动滑动机制加载更多评论 logging.info(f'开始为视频 {video_id} 启动滑动机制加载评论') scrolled_comments = self._simulate_comment_scrolling(video_id, max_scroll_attempts=15, scroll_delay=2.0, - document_id=document_id, episode_number=episode_number, mix_name=mix_name) + document_id=document_id, episode_number=episode_number, mix_name=mix_name, mix_id=mix_id, max_comments=100) # 如果滑动机制获取到评论,直接使用 if scrolled_comments: @@ -2952,8 +4153,17 @@ class DouyinPlayVVScraper: return video_details except Exception as e: - error_msg = f'获取视频 {video_id} 详细数据失败: {e}' + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': f'获取视频详细数据,视频ID: {video_id}' + } + error_msg = f'获取视频 {video_id} 详细数据失败: {error_details["error_type"]} - {error_details["error_message"]}' logging.error(error_msg) + logging.error(f'详细错误信息: {error_details["traceback"]}') + logging.error(f'错误上下文: {error_details["context"]}') video_details['error'] = error_msg return video_details @@ -3029,7 +4239,7 @@ class DouyinPlayVVScraper: 'favorites': video_details['favorites'], 'favorites_formatted': video_details['favorites_formatted'] } - self.update_video_comments_realtime(document_id, episode_number, None, mix_name, interaction_data) + self.update_video_comments_realtime(document_id, episode_number, None, mix_name, mix_id, interaction_data) interaction_data_saved = True break @@ -3094,11 +4304,20 @@ class DouyinPlayVVScraper: 'favorites': video_details['favorites'], 'favorites_formatted': 
video_details['favorites_formatted'] } - self.update_video_comments_realtime(document_id, episode_number, None, mix_name, interaction_data) + self.update_video_comments_realtime(document_id, episode_number, None, mix_name, mix_id, interaction_data) interaction_data_saved = True except Exception as e: - logging.warning(f'CSS选择器解析失败: {e}') + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': f'CSS选择器解析视频互动数据,视频ID: {video_id}' + } + logging.warning(f'CSS选择器解析失败: {error_details["error_type"]} - {error_details["error_message"]}') + logging.warning(f'详细错误信息: {error_details["traceback"]}') + logging.warning(f'错误上下文: {error_details["context"]}') # 尝试获取评论(如果还没有获取到) if not video_details['comments']: @@ -3146,7 +4365,7 @@ class DouyinPlayVVScraper: return video_details - def get_collection_video_details(self, episode_video_ids: list, mix_name: str = '') -> list: + def get_collection_video_details(self, episode_video_ids: list, mix_name: str = '', mix_id: str = '') -> list: """获取合集中所有视频的详细互动数据 Args: episode_video_ids: 视频ID列表 @@ -3186,7 +4405,7 @@ class DouyinPlayVVScraper: try: # 获取单个视频的详细数据 - video_details = self.get_video_details(video_id) + video_details = self.get_video_details(video_id, mix_name, '', 0, mix_id) video_details['episode_number'] = i video_details_list.append(video_details) @@ -3225,150 +4444,6 @@ class DouyinPlayVVScraper: self.cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()} return self.cookies - def _extract_comments_patch(self, video_id: str) -> list: - """ - 评论补丁机制 - 更仔细地重新从网络日志获取评论 - 不再抓取页面元素,而是重新触发评论加载并从API获取 - """ - comments = [] - try: - logging.info(f'启动补丁机制,重新仔细获取视频 {video_id} 的评论...') - - # 首先检查是否存在"抢首评"按钮,如果存在说明视频确实没有评论 - if self._check_first_comment_button(): - logging.info('检测到"抢首评"按钮,确认视频没有评论,跳过补丁机制') - return comments - - # 等待页面稳定 - time.sleep(2) - - # 滚动到评论区域确保评论完全加载 - self._scroll_to_comment_section() - time.sleep(1) - - # 点击评论区域,触发评论加载 - try: - self._click_comment_area() - time.sleep(1) - except: - pass - - # 清理旧的网络日志 - self.driver.get_log('performance') - - # 轻微滚动,触发更多评论加载 - for i in range(3): - self.driver.execute_script("window.scrollBy(0, 200);") - time.sleep(0.5) - self.driver.execute_script("window.scrollBy(0, -100);") - time.sleep(0.5) - - # 等待网络请求完成 - time.sleep(3) - - # 重新从网络日志中提取评论(更仔细的方式) - patch_comments = self._extract_comments_from_network_logs_detailed(video_id) - - if patch_comments: - logging.info(f'补丁机制成功重新获取 {len(patch_comments)} 条评论') - comments.extend(patch_comments) - else: - logging.info('补丁机制未找到额外评论') - - except Exception as e: - logging.error(f'评论补丁机制执行失败: {e}') - - return comments - - def _extract_comments_from_network_logs_detailed(self, video_id: str) -> list: - """ - 更详细地从网络日志中提取评论数据(补丁机制专用) - Args: - video_id: 视频ID - Returns: - list: 评论数据列表 - """ - comments = [] - try: - # 获取网络请求日志 - logs = self.driver.get_log('performance') - - for entry in logs: - try: - log = json.loads(entry['message'])['message'] - if ( - 'Network.responseReceived' in log['method'] - and 'response' in log['params'] - and log['params']['response'] - and log['params']['response'].get('url') - ): - url = log['params']['response']['url'] - - # 检查是否是评论相关的API(更宽泛的匹配) - comment_api_patterns = [ - '/aweme/v1/web/comment/list/', - '/comment/list/', - '/comment/detail/', - '/reply/list/' - ] - - is_comment_api = any(pattern in url for pattern in comment_api_patterns) - - if is_comment_api and video_id in url: - try: - # 获取响应体 - response_body 
= self.driver.execute_cdp_cmd( - 'Network.getResponseBody', - {'requestId': log['params']['requestId']} - ) - - if response_body and 'body' in response_body: - data = json.loads(response_body['body']) - - # 尝试多种可能的评论数据结构 - api_comments = [] - - # 标准结构 - if 'comments' in data: - api_comments = data['comments'] - # 备用结构 - elif 'comment_list' in data: - api_comments = data['comment_list'] - elif 'data' in data and isinstance(data['data'], list): - api_comments = data['data'] - elif 'data' in data and 'comments' in data['data']: - api_comments = data['data']['comments'] - - for comment in api_comments: - if isinstance(comment, dict): - comment_text = comment.get('text', '') or comment.get('content', '') - if comment_text and len(comment_text.strip()) > 0: - comment_info = { - 'text': comment_text.strip(), - 'user_name': comment.get('user', {}).get('nickname', '') if comment.get('user') else '', - 'digg_count': int(comment.get('digg_count', 0) or comment.get('like_count', 0)), - 'create_time': comment.get('create_time', 0) or comment.get('timestamp', 0), - 'source': 'patch_api' - } - comments.append(comment_info) - - # 记录API URL信息,用于调试 - if api_comments: - logging.debug(f'补丁机制从API获取到 {len(api_comments)} 条评论: {url}') - - except Exception as e: - logging.debug(f'补丁机制处理响应体失败: {e}') - continue - - except Exception as e: - logging.debug(f'补丁机制处理日志条目失败: {e}') - continue - - except Exception as e: - logging.error(f'补丁机制从网络日志提取评论失败: {e}') - - return comments - def _click_comment_area(self): """ 点击评论区域,触发评论加载 @@ -3492,17 +4567,107 @@ class DouyinPlayVVScraper: logging.debug(f'检测抢首评按钮时出错: {e}') return False + def cleanup_old_management_data(self, days_to_keep: int = 7): + """清理目标数据库Rankings_management中的旧数据,基于last_updated字段保留指定天数的数据""" + target_collection = self.collection # 使用根据模式选择的集合 + if target_collection is None: + logging.warning('[数据清理] 目标集合未初始化,跳过清理') + return False + + try: + # 计算需要保留的最早时间 + from datetime import timedelta + cutoff_datetime = datetime.now() - timedelta(days=days_to_keep) + + # 查询需要删除的数据数量(基于last_updated字段) + old_data_count = target_collection.count_documents({ + 'last_updated': {'$lt': cutoff_datetime} + }) + + if old_data_count == 0: + logging.info(f'[数据清理] 无需清理,没有超过{days_to_keep}天未更新的旧数据') + return True + + # 删除旧数据 + delete_result = target_collection.delete_many({ + 'last_updated': {'$lt': cutoff_datetime} + }) + + if delete_result.deleted_count > 0: + logging.info(f'[数据清理] ✅ 成功清理Rankings_management中{delete_result.deleted_count}条旧数据(保留最近{days_to_keep}天更新的数据)') + return True + else: + logging.warning(f'[数据清理] 清理操作未删除任何数据') + return False + + except Exception as e: + logging.error(f'[数据清理] 清理Rankings_management旧数据失败: {e}') + return False + def run(self): try: + # 在开始抓取前清理旧数据(保留最近7天) + self.cleanup_old_management_data(days_to_keep=7) + self.setup_driver() self.navigate() self.ensure_login() self.trigger_loading() - self.collect_network_bodies() - self.parse_ssr_data() - self.dedupe() + + logging.info('=' * 60) + logging.info('开始统一数据收集') + logging.info('=' * 60) + + # 使用统一数据收集器 + collector = UnifiedDataCollector(self.driver, self.duration_s) + collected_data = collector.collect_all_data() + + # 将收集到的数据转换为原有格式 + self.play_vv_items = [] + for item in collected_data: + self.play_vv_items.append({ + 'play_vv': item.get('play_vv', 0), + 'formatted': item.get('formatted', ''), + 'url': item.get('url', ''), + 'request_id': item.get('request_id', ''), + 'mix_name': item.get('mix_name', ''), + 'video_url': item.get('video_url', ''), + 'mix_id': item.get('mix_id', ''), + 'cover_image_url': 
item.get('cover_image_url', ''), + 'cover_backup_urls': item.get('cover_backup_urls', []), + 'series_author': item.get('series_author', ''), + 'desc': item.get('desc', ''), + 'updated_to_episode': item.get('updated_to_episode', 0), + 'timestamp': item.get('timestamp', '') + }) + + logging.info(f'✅ 统一数据收集完成:{len(self.play_vv_items)} 个合集') + + # 统一数据收集器已实时去重,无需额外去重步骤 + logging.info('=' * 60) + logging.info('数据去重已完成(统一收集器实时处理)') + logging.info('=' * 60) + + logging.info('=' * 60) + logging.info('开始保存数据') + logging.info('=' * 60) self.save_results() - logging.info('完成,play_vv数量: %d', len(self.play_vv_items)) + + logging.info('=' * 60) + logging.info(f'✅ 全部完成!共处理 {len(self.play_vv_items)} 个合集') + logging.info('=' * 60) + except Exception as e: + import traceback + error_details = { + 'error_type': type(e).__name__, + 'error_message': str(e), + 'traceback': traceback.format_exc(), + 'context': '执行抖音播放量抓取任务主流程' + } + logging.error(f'抓取任务执行失败: {error_details["error_type"]} - {error_details["error_message"]}') + logging.error(f'详细错误信息: {error_details["traceback"]}') + logging.error(f'错误上下文: {error_details["context"]}') + raise # 重新抛出异常,让上层调用者处理 finally: if self.driver: try: diff --git a/backend/routers/article_routes.py b/backend/routers/article_routes.py deleted file mode 100644 index 9d99260..0000000 --- a/backend/routers/article_routes.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -文章API服务器 -提供文章列表获取和文章详情获取的接口 -""" - -from flask import Blueprint, request, jsonify -from datetime import datetime, timedelta -import logging -from database import db -from bson import ObjectId - -# 创建蓝图 -article_bp = Blueprint('article', __name__, url_prefix='/api/article') - -# 获取数据库集合 -articles_collection = db['articles'] - -def format_time(time_obj): - """格式化时间""" - if not time_obj: - return "" - - if isinstance(time_obj, datetime): - return time_obj.strftime("%Y-%m-%d %H:%M:%S") - else: - return str(time_obj) - -def format_article_item(doc): - """格式化文章数据项""" - return { - "_id": str(doc.get("_id", "")), - "title": doc.get("title", ""), - "author_id": doc.get("author_id", ""), - "cover_image": doc.get("cover_image", ""), - "status": doc.get("status", ""), - "summary": doc.get("summary", ""), - "created_at": format_time(doc.get("created_at")), - "likes": doc.get("likes", []), - "likes_count": len(doc.get("likes", [])) - } - -def get_article_list(page=1, limit=20, sort_by="created_at", status=None): - """获取文章列表(分页)""" - try: - # 计算跳过的数量 - skip = (page - 1) * limit - - # 构建查询条件 - query_condition = {} - if status: - query_condition["status"] = status - - # 设置排序字段 - sort_field = sort_by if sort_by in ["created_at", "title"] else "created_at" - sort_order = -1 # 降序 - - # 查询数据 - cursor = articles_collection.find(query_condition).sort(sort_field, sort_order).skip(skip).limit(limit) - docs = list(cursor) - - # 获取总数 - total = articles_collection.count_documents(query_condition) - - # 格式化数据 - article_list = [] - for doc in docs: - item = format_article_item(doc) - article_list.append(item) - - return { - "success": True, - "data": article_list, - "pagination": { - "page": page, - "limit": limit, - "total": total, - "pages": (total + limit - 1) // limit, - "has_next": page * limit < total, - "has_prev": page > 1 - }, - "sort_by": sort_by, - "status_filter": status, - "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - } - - except Exception as e: - logging.error(f"获取文章列表失败: {e}") - return {"success": False, "message": f"获取数据失败: {str(e)}"} - -def search_articles(keyword, 
page=1, limit=10): - """搜索文章""" - try: - if not keyword: - return {"success": False, "message": "请提供搜索关键词"} - - # 计算跳过的数量 - skip = (page - 1) * limit - - # 构建搜索条件(模糊匹配标题和内容) - search_condition = { - "$or": [ - {"title": {"$regex": keyword, "$options": "i"}}, - {"content": {"$regex": keyword, "$options": "i"}}, - {"summary": {"$regex": keyword, "$options": "i"}} - ] - } - - # 查询数据 - cursor = articles_collection.find(search_condition).sort("created_at", -1).skip(skip).limit(limit) - docs = list(cursor) - - # 获取搜索结果总数 - total = articles_collection.count_documents(search_condition) - - # 格式化数据 - search_results = [] - for doc in docs: - item = format_article_item(doc) - search_results.append(item) - - return { - "success": True, - "data": search_results, - "keyword": keyword, - "pagination": { - "page": page, - "limit": limit, - "total": total, - "pages": (total + limit - 1) // limit, - "has_next": page * limit < total, - "has_prev": page > 1 - }, - "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - } - - except Exception as e: - logging.error(f"搜索文章失败: {e}") - return {"success": False, "message": f"搜索失败: {str(e)}"} - -def get_article_detail(article_id): - """获取文章详情""" - try: - # 尝试通过ObjectId查找 - try: - doc = articles_collection.find_one({"_id": ObjectId(article_id)}) - except: - # 如果ObjectId无效,尝试其他字段 - doc = articles_collection.find_one({ - "$or": [ - {"title": article_id}, - {"author_id": article_id} - ] - }) - - if not doc: - return {"success": False, "message": "未找到文章信息"} - - # 格式化详细信息 - detail = format_article_item(doc) - - return { - "success": True, - "data": detail, - "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - } - - except Exception as e: - logging.error(f"获取文章详情失败: {e}") - return {"success": False, "message": f"获取详情失败: {str(e)}"} - -def get_statistics(): - """获取统计信息""" - try: - # 基本统计 - total_articles = articles_collection.count_documents({}) - - if total_articles == 0: - return {"success": False, "message": "暂无数据"} - - # 按状态统计 - status_stats = [] - for status in ["draft", "published", "archived"]: - count = articles_collection.count_documents({"status": status}) - status_stats.append({"status": status, "count": count}) - - # 获取最新更新时间 - latest_doc = articles_collection.find().sort("created_at", -1).limit(1) - latest_time = "" - if latest_doc: - latest_list = list(latest_doc) - if latest_list: - latest_time = format_time(latest_list[0].get("created_at")) - - return { - "success": True, - "data": { - "total_articles": total_articles, - "status_stats": status_stats, - "latest_update": latest_time - }, - "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - } - - except Exception as e: - logging.error(f"获取统计信息失败: {e}") - return {"success": False, "message": f"获取统计失败: {str(e)}"} - -# 路由定义 -@article_bp.route('/list') -def get_articles(): - """获取文章列表""" - page = int(request.args.get('page', 1)) - limit = int(request.args.get('limit', 20)) - sort_by = request.args.get('sort', 'created_at') - status = request.args.get('status') - - result = get_article_list(page, limit, sort_by, status) - return jsonify(result) - -@article_bp.route('/search') -def search(): - """搜索文章""" - keyword = request.args.get('q', '') - page = int(request.args.get('page', 1)) - limit = int(request.args.get('limit', 10)) - result = search_articles(keyword, page, limit) - return jsonify(result) - -@article_bp.route('/detail') -def get_detail(): - """获取文章详情""" - article_id = request.args.get('id', '') - result = get_article_detail(article_id) - return jsonify(result) - 
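Both the article routes being removed here and the rank API that follows page their results the same way: skip the first (page - 1) * limit documents and report a ceiling-divided page count. A small worked sketch of that arithmetic, assuming a hypothetical pagination_meta helper (the routes themselves inline this logic):

def pagination_meta(page, limit, total):
    """Pagination block returned alongside list results."""
    return {
        "page": page,
        "limit": limit,
        "total": total,
        "pages": (total + limit - 1) // limit,  # ceiling division
        "has_next": page * limit < total,
        "has_prev": page > 1,
    }

# 45 matching documents, 20 per page, requesting page 2:
# skip = (2 - 1) * 20 = 20 documents skipped, and
# pagination_meta(2, 20, 45) -> {"page": 2, "limit": 20, "total": 45,
#                                "pages": 3, "has_next": True, "has_prev": True}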
-@article_bp.route('/stats') -def get_stats(): - """获取统计信息""" - result = get_statistics() - return jsonify(result) - -@article_bp.route('/health') -def health_check(): - """健康检查""" - try: - # 检查数据库连接 - total_records = articles_collection.count_documents({}) - - return jsonify({ - "success": True, - "message": "服务正常", - "data": { - "database": "连接正常", - "total_records": total_records, - "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - } - }) - except Exception as e: - return jsonify({ - "success": False, - "message": f"服务异常: {str(e)}", - "data": { - "database": "连接失败", - "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") - } - }) \ No newline at end of file diff --git a/backend/routers/rank_api_routes.py b/backend/routers/rank_api_routes.py index ae62d27..8038f0e 100644 --- a/backend/routers/rank_api_routes.py +++ b/backend/routers/rank_api_routes.py @@ -9,14 +9,18 @@ from flask import Blueprint, request, jsonify from datetime import datetime, timedelta import logging import re +import uuid +from werkzeug.utils import secure_filename from database import db +from handlers.Rankings.tos_client import oss_client # 创建蓝图 rank_bp = Blueprint('rank', __name__, url_prefix='/api/rank') # 获取数据库集合 -collection = db['Rankings_list'] -daily_rankings_collection = db['Ranking_storage'] # 榜单存储表 +collection = db['Ranking_storage'] # 主要数据源:榜单存储表(包含data数组) +rankings_management_collection = db['Rankings_management'] # 管理数据库(字段同步源) +claim_applications_collection = db['Claim_Applications'] # 认领申请集合 def format_playcount(playcount_str): """格式化播放量字符串为数字""" @@ -42,18 +46,6 @@ def format_playcount(playcount_str): except: return 0 -def format_cover_url(cover_data): - """格式化封面图片URL""" - if not cover_data: - return "" - - if isinstance(cover_data, str): - return cover_data - elif isinstance(cover_data, dict) and 'url_list' in cover_data: - return cover_data['url_list'][0] if cover_data['url_list'] else "" - else: - return "" - def format_time(time_obj): """格式化时间""" if not time_obj: @@ -64,6 +56,53 @@ def format_time(time_obj): else: return str(time_obj) +def parse_date_string(date_str): + """通用日期解析函数""" + try: + if isinstance(date_str, str): + return datetime.strptime(date_str, '%Y-%m-%d').date() + return date_str + except (ValueError, TypeError): + logging.warning(f"无法解析日期字符串: {date_str}") + return None + +def find_management_data(query, target_date=None): + """ + 通用的管理数据查询函数,优先使用mix_id进行查询 + + Args: + query: 查询条件字典,可以包含mix_id, mix_name等字段 + target_date: 目标日期(已不用于管理库过滤,保留参数兼容) + + Returns: + 查询到的文档或None + """ + try: + # 如果查询条件中有mix_id,优先使用mix_id查询 + if 'mix_id' in query and query['mix_id']: + mix_id_query = {"mix_id": query['mix_id']} + + result = rankings_management_collection.find_one(mix_id_query) + if result: + logging.info(f"通过mix_id找到管理数据: {query['mix_id']}") + return result + + # 如果通过mix_id没找到,或者没有mix_id,尝试其他查询条件 + fallback_query = {k: v for k, v in query.items() if k != 'mix_id'} + + if fallback_query: + result = rankings_management_collection.find_one(fallback_query) + if result: + logging.info(f"通过备用查询找到管理数据: {fallback_query}") + return result + + logging.warning(f"未找到匹配的管理数据: {query}") + return None + + except Exception as e: + logging.error(f"查询管理数据时出错: {e}") + return None + def sort_ranking_data(ranking_data, sort_by, sort_order='desc'): """ 对榜单数据进行动态排序 @@ -117,30 +156,76 @@ def sort_ranking_data(ranking_data, sort_by, sort_order='desc'): # 如果排序失败,返回原始数据 return ranking_data -def format_mix_item(doc): + + +def format_interaction_count(count): + """格式化互动数量为易读格式""" + try: + count = int(count) 
+ if count >= 100000000: # 1亿+ + return f"{count / 100000000:.1f}亿" + elif count >= 10000: # 1万+ + return f"{count / 10000:.1f}万" + else: + return str(count) + except: + return "0" + +def format_mix_item(doc, target_date=None): """格式化合集数据项 - 完全按照数据库原始字段返回""" + mix_name = doc.get("mix_name", "") or doc.get("title", "") + + # 计算总点赞数 + episode_details = doc.get("episode_details", []) + total_likes = 0 + total_comments = 0 + + if episode_details: + for episode in episode_details: + total_likes += episode.get("likes", 0) + total_comments += len(episode.get("comments", [])) + + # 格式化总点赞数 + total_likes_formatted = format_interaction_count(total_likes) + total_comments_formatted = format_interaction_count(total_comments) + return { "_id": str(doc.get("_id", "")), "batch_time": format_time(doc.get("batch_time")), - "mix_name": doc.get("mix_name", ""), + "mix_name": mix_name, + "title": mix_name, "video_url": doc.get("video_url", ""), "playcount": doc.get("playcount", ""), "play_vv": doc.get("play_vv", 0), "request_id": doc.get("request_id", ""), "rank": doc.get("rank", 0), "cover_image_url": doc.get("cover_image_url", ""), - # 新增字段 + # 基础字段 "series_author": doc.get("series_author", ""), + "Manufacturing_Field": doc.get("Manufacturing_Field", ""), + "Copyright_field": doc.get("Copyright_field", ""), + "classification_type": doc.get("classification_type", ""), # 新增:类型/元素 + "release_date": doc.get("release_date", ""), # 新增:上线日期 "desc": doc.get("desc", ""), "updated_to_episode": doc.get("updated_to_episode", 0), "cover_backup_urls": doc.get("cover_backup_urls", []), "mix_id": doc.get("mix_id", ""), "episode_video_ids": doc.get("episode_video_ids", []), - "episode_details": doc.get("episode_details", []) + "episode_details": doc.get("episode_details", []), + # 点赞和评论总数 + "total_likes": total_likes, + "total_likes_formatted": total_likes_formatted, + "total_comments": total_comments, + "total_comments_formatted": total_comments_formatted, + # 播放量变化数据 + "timeline_data": doc.get("timeline_data", []), + # 评论总结 + "comments_summary": doc.get("comments_summary", ""), + } -def get_mix_list(page=1, limit=20, sort_by="playcount"): - """获取合集列表(分页)""" +def get_mix_list(page=1, limit=20, sort_by="playcount", classification_type=None): + """获取合集列表(分页)- 从Ranking_storage的data数组中获取数据,支持分类筛选""" try: # 计算跳过的数量 skip = (page - 1) * limit @@ -150,55 +235,85 @@ def get_mix_list(page=1, limit=20, sort_by="playcount"): # 按增长排序需要特殊处理 return get_growth_mixes(page, limit) else: - sort_field = "play_vv" if sort_by == "playcount" else "batch_time" - sort_order = -1 # 降序 - # 获取今天的日期 today = datetime.now().date() + today_str = today.strftime("%Y-%m-%d") - # 只查询今天的数据 - query_condition = { - "batch_time": { - "$gte": datetime(today.year, today.month, today.day), - "$lt": datetime(today.year, today.month, today.day) + timedelta(days=1) + # 从Ranking_storage中获取今天的数据 + ranking_doc = collection.find_one({ + "date": today_str, + "type": {"$in": ["comprehensive", "playcount"]} # 查找包含播放量数据的榜单 + }, sort=[("calculation_sequence", -1)]) # 获取最新的计算结果 + + if not ranking_doc or "data" not in ranking_doc: + # 如果没有找到今天的数据,返回空结果 + logging.warning(f"Ranking_storage中未找到 {today_str} 的数据") + return { + "success": True, + "message": f"暂无 {today_str} 的数据,请等待定时任务生成", + "data": [], + "pagination": { + "page": page, + "limit": limit, + "total": 0, + "pages": 0, + "has_next": False, + "has_prev": False + }, + "sort_by": sort_by, + "data_source": "ranking_storage", + "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") } - } - # 查询数据并按短剧名称分组,取每个短剧的最新记录 - 
pipeline = [ - {"$match": query_condition}, - {"$sort": {"batch_time": -1}}, # 按时间倒序 - {"$group": { - "_id": "$mix_name", # 按短剧名称分组 - "latest_doc": {"$first": "$$ROOT"} # 取每个分组的第一条记录(最新记录) - }}, - {"$replaceRoot": {"newRoot": "$latest_doc"}}, - {"$sort": {sort_field: sort_order}}, - {"$skip": skip}, - {"$limit": limit} - ] + # 获取data数组中的数据 + mix_data = ranking_doc.get("data", []) + + # 分类筛选逻辑 + if classification_type: + filtered_data = [] + classification_field_map = { + 'novel': 'Novel_IDs', + 'anime': 'Anime_IDs', + 'drama': 'Drama_IDs' + } + + if classification_type in classification_field_map: + field_name = classification_field_map[classification_type] + + for item in mix_data: + mix_id = item.get('mix_id') + if mix_id: + # 检查该mix_id是否在对应的分类字段中 + classification_ids = item.get(field_name, []) + if isinstance(classification_ids, list) and mix_id in classification_ids: + filtered_data.append(item) + + mix_data = filtered_data + logging.info(f"分类筛选 {classification_type}: 筛选出 {len(mix_data)} 条数据") + + # 按播放量排序(如果需要) + if sort_by == "playcount": + mix_data = sorted(mix_data, key=lambda x: x.get("play_vv", 0), reverse=True) - docs = list(collection.aggregate(pipeline)) + # 分页处理 + total = len(mix_data) + paginated_data = mix_data[skip:skip + limit] - # 获取总数 - total_pipeline = [ - {"$match": query_condition}, - {"$sort": {"batch_time": -1}}, - {"$group": {"_id": "$mix_name"}}, - {"$count": "total"} - ] - total_result = list(collection.aggregate(total_pipeline)) - total = total_result[0]["total"] if total_result else 0 - - # 格式化数据 - mix_list = [] - for doc in docs: - item = format_mix_item(doc) - mix_list.append(item) + # 为分页数据添加排名并格式化 + formatted_data = [] + for i, item in enumerate(paginated_data): + item["rank"] = skip + i + 1 + # 确保mix_name字段存在 + if "mix_name" not in item and "title" in item: + item["mix_name"] = item["title"] + + # 使用format_mix_item函数格式化数据,包括计算总点赞数 + formatted_item = format_mix_item(item) + formatted_data.append(formatted_item) return { "success": True, - "data": mix_list, + "data": formatted_data, "pagination": { "page": page, "limit": limit, @@ -208,48 +323,116 @@ def get_mix_list(page=1, limit=20, sort_by="playcount"): "has_prev": page > 1 }, "sort_by": sort_by, - "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "data_source": "ranking_storage", + "update_time": ranking_doc.get("created_at", datetime.now()).strftime("%Y-%m-%d %H:%M:%S") if isinstance(ranking_doc.get("created_at"), datetime) else str(ranking_doc.get("created_at", "")) } except Exception as e: logging.error(f"获取合集列表失败: {e}") return {"success": False, "message": f"获取数据失败: {str(e)}"} -def get_growth_mixes(page=1, limit=20, start_date=None, end_date=None): - """获取按播放量增长排序的合集列表 - 仅从Ranking_storage读取预计算数据""" +def get_yesterday_classification_data(mix_name, field_name): + """ + 获取昨天的分类数据 + + Args: + mix_name: 短剧名称 + field_name: 分类字段名 (Novel_IDs, Anime_IDs, Drama_IDs) + + Returns: + 昨天的分类数据列表或None + """ + try: + # 获取昨天的日期 + yesterday = datetime.now().date() - timedelta(days=1) + yesterday_str = yesterday.strftime("%Y-%m-%d") + + # 从Ranking_storage查询昨天的数据 + yesterday_doc = collection.find_one({ + "date": yesterday_str, + "data.mix_name": mix_name + }) + + if yesterday_doc: + # 在data数组中查找对应的项目 + for data_item in yesterday_doc.get("data", []): + if data_item.get("mix_name") == mix_name: + classification_ids = data_item.get(field_name, []) + if isinstance(classification_ids, list) and classification_ids: + logging.info(f"从昨天数据获取到分类信息: {mix_name} -> {field_name}: {classification_ids}") + return 
classification_ids + + return None + except Exception as e: + logging.error(f"获取昨天分类数据失败: {e}") + return None + +def get_growth_mixes(page=1, limit=20, start_date=None, end_date=None, classification_type=None): + """获取按播放量增长排序的合集列表 - 直接从Ranking_storage读取对应日期的数据""" try: # 计算跳过的数量 skip = (page - 1) * limit - # 如果没有提供日期,默认使用今天和昨天 - if not start_date or not end_date: - end_date = datetime.now().date() - start_date = end_date - timedelta(days=1) - else: - # 转换字符串日期为datetime对象 - if isinstance(start_date, str): - start_date = datetime.strptime(start_date, "%Y-%m-%d").date() + # 简化日期处理:直接使用前端传来的日期 + if start_date and end_date: + # 如果前端提供了日期,直接使用(优先使用end_date作为查询日期) if isinstance(end_date, str): - end_date = datetime.strptime(end_date, "%Y-%m-%d").date() + target_date = end_date + else: + target_date = end_date.strftime("%Y-%m-%d") + elif end_date: + # 如果只提供了end_date,使用end_date + if isinstance(end_date, str): + target_date = end_date + else: + target_date = end_date.strftime("%Y-%m-%d") + elif start_date: + # 如果只提供了start_date,使用start_date + if isinstance(start_date, str): + target_date = start_date + else: + target_date = start_date.strftime("%Y-%m-%d") + else: + # 如果没有提供日期,默认使用今天 + target_date = datetime.now().date().strftime("%Y-%m-%d") - end_date_str = end_date.strftime("%Y-%m-%d") - start_date_str = start_date.strftime("%Y-%m-%d") + logging.info(f"📅 查询日期: {target_date}") + + # 检查并自动同步Ranking_storage字段信息 + # 检查是否需要同步字段信息 + sample_item = collection.find_one({ + "date": target_date, + "mix_name": {"$exists": True} + }) + + if sample_item: + # 检查是否缺少关键字段 + missing_manufacturing = sample_item.get('Manufacturing_Field') is None + missing_copyright = sample_item.get('Copyright_field') is None + + if missing_manufacturing or missing_copyright: + logging.info(f"检测到 {target_date} 的Ranking_storage数据缺少字段信息,开始自动同步...") + sync_result = sync_ranking_storage_fields(target_date, force_update=False) + if sync_result["success"]: + logging.info(f"自动同步完成: {sync_result['stats']}") + else: + logging.warning(f"自动同步失败: {sync_result['message']}") # 从Ranking_storage读取预计算的增长榜数据 - growth_ranking = daily_rankings_collection.find_one({ - "date": end_date_str, + growth_ranking = collection.find_one({ + "date": target_date, "type": "comprehensive" # 使用comprehensive类型,包含增长数据 }, sort=[("calculation_sequence", -1)]) # 获取最新的计算结果 if not growth_ranking or "data" not in growth_ranking: # 如果没有找到comprehensive类型,尝试查找growth类型 - growth_ranking = daily_rankings_collection.find_one({ - "date": end_date_str, + growth_ranking = collection.find_one({ + "date": target_date, "type": "growth" }, sort=[("calculation_sequence", -1)]) if growth_ranking and "data" in growth_ranking: - logging.info(f"📈 从Ranking_storage读取 {end_date_str} 的增长榜数据") + logging.info(f"📈 从Ranking_storage读取 {target_date} 的增长榜数据") # 获取预先计算好的增长榜数据 growth_data = growth_ranking["data"] @@ -261,17 +444,134 @@ def get_growth_mixes(page=1, limit=20, start_date=None, end_date=None): key=lambda x: x.get("timeline_data", {}).get("play_vv_change", 0), reverse=True) + # 根据分类类型筛选数据 + if classification_type: + classification_field_map = { + "novel": "Novel_IDs", + "anime": "Anime_IDs", + "drama": "Drama_IDs" + } + + if classification_type in classification_field_map: + field_name = classification_field_map[classification_type] + filtered_data = [] + + for item in growth_data: + mix_name = item.get("mix_name", "") + mix_id = item.get("mix_id", "") + + # 检查当前数据是否有分类信息 + current_classification_ids = item.get(field_name, []) + + # 如果当前数据有分类信息,直接使用 + if isinstance(current_classification_ids, 
list) and current_classification_ids: + if mix_id and mix_id in current_classification_ids: + filtered_data.append(item) + elif not mix_id and mix_name: + # 如果没有mix_id但有mix_name,检查分类字段是否包含该短剧 + filtered_data.append(item) + else: + # 如果当前数据没有分类信息,尝试从昨天数据获取 + if mix_name: + yesterday_classification_ids = get_yesterday_classification_data(mix_name, field_name) + if yesterday_classification_ids: + # 使用昨天的分类数据 + if mix_id and mix_id in yesterday_classification_ids: + filtered_data.append(item) + elif not mix_id: + # 如果没有mix_id,直接使用昨天的分类数据 + filtered_data.append(item) + logging.info(f"使用昨天分类数据: {mix_name} -> {field_name}") + + growth_data = filtered_data + # 分页处理 total = len(growth_data) paginated_data = growth_data[skip:skip + limit] - # 为分页数据添加排名 + # 为分页数据添加排名和补充完整字段信息 for i, item in enumerate(paginated_data): item["rank"] = skip + i + 1 + # 修复:使用mix_name字段,不要用空的title覆盖它 + mix_name = item.get("mix_name", "") + + if mix_name: + + # 优化:直接从Ranking_storage中获取已同步的字段信息 + # 查找对应日期的Ranking_storage记录 + ranking_storage_item = collection.find_one({ + "date": target_date, + "mix_name": mix_name + }) + + if ranking_storage_item: + # 直接使用Ranking_storage中已同步的字段 + item.update({ + "Manufacturing_Field": ranking_storage_item.get("Manufacturing_Field", ""), + "Copyright_field": ranking_storage_item.get("Copyright_field", ""), + "series_author": ranking_storage_item.get("series_author", item.get("series_author", "")), + "video_id": ranking_storage_item.get("video_id", item.get("video_id", "")), + "video_url": ranking_storage_item.get("video_url", item.get("video_url", "")), + # 保持当前item中的封面和播放量数据(来自榜单计算) + "cover_image_url": item.get("cover_image_url", ranking_storage_item.get("cover_image_url", "")), + "play_vv": item.get("play_vv", ranking_storage_item.get("play_vv", 0)), + "playcount_str": item.get("playcount_str", ranking_storage_item.get("playcount_str", "0")) + }) + logging.info(f"从Ranking_storage获取到同步字段: {mix_name}") + else: + # 如果Ranking_storage中没有对应记录,回退到原有逻辑 + logging.warning(f"Ranking_storage中未找到 {mix_name} 的记录,回退到原有查询逻辑") + + # 根据查询日期判断数据源 + today = datetime.now().date() + # 将target_date字符串转换为日期对象进行比较 + try: + target_date_obj = datetime.strptime(target_date, "%Y-%m-%d").date() + is_historical_date = target_date_obj < today + except: + is_historical_date = False + + management_doc = None + + # 统一从Rankings_management获取数据 + management_doc = rankings_management_collection.find_one({"mix_name": mix_name}) + + if management_doc: + item.update({ + "Manufacturing_Field": management_doc.get("Manufacturing_Field", ""), + "Copyright_field": management_doc.get("Copyright_field", ""), + "series_author": management_doc.get("series_author", item.get("series_author", "")), + "video_id": management_doc.get("video_id", item.get("video_id", "")), + "video_url": management_doc.get("video_url", item.get("video_url", "")), + "cover_image_url": item.get("cover_image_url", management_doc.get("cover_image_url", "")), + "play_vv": item.get("play_vv", management_doc.get("play_vv", 0)), + "playcount_str": item.get("playcount_str", management_doc.get("playcount_str", "0")) + }) + else: + # 设置默认值 + item.update({ + "Manufacturing_Field": "", + "Copyright_field": "", + "series_author": item.get("series_author", ""), + "video_id": item.get("video_id", ""), + "video_url": item.get("video_url", ""), + "cover_image_url": item.get("cover_image_url", ""), + "play_vv": item.get("play_vv", 0), + "playcount_str": item.get("playcount_str", "0") + }) + else: + item["Manufacturing_Field"] = "" + item["Copyright_field"] = "" + + # 
使用format_mix_item函数格式化所有数据,包括计算总点赞数 + formatted_data = [] + for item in paginated_data: + formatted_item = format_mix_item(item) + formatted_data.append(formatted_item) return { "success": True, - "data": paginated_data, + "data": formatted_data, "pagination": { "page": page, "limit": limit, @@ -282,18 +582,18 @@ def get_growth_mixes(page=1, limit=20, start_date=None, end_date=None): }, "sort_by": "growth", "date_range": { - "start_date": start_date_str, - "end_date": end_date_str + "start_date": target_date, + "end_date": target_date }, "data_source": "ranking_storage", # 标识数据来源 "update_time": growth_ranking.get("created_at", datetime.now()).strftime("%Y-%m-%d %H:%M:%S") if isinstance(growth_ranking.get("created_at"), datetime) else str(growth_ranking.get("created_at", "")) } else: # 如果Ranking_storage中没有数据,返回空结果 - logging.warning(f"Ranking_storage中未找到 {end_date_str} 的增长榜数据") + logging.warning(f"Ranking_storage中未找到 {target_date} 的增长榜数据") return { "success": True, - "message": f"暂无 {end_date_str} 的增长榜数据,请等待定时任务生成", + "message": f"暂无 {target_date} 的增长榜数据,请等待定时任务生成", "data": [], "pagination": { "page": page, @@ -305,8 +605,8 @@ def get_growth_mixes(page=1, limit=20, start_date=None, end_date=None): }, "sort_by": "growth", "date_range": { - "start_date": start_date_str, - "end_date": end_date_str + "start_date": target_date, + "end_date": target_date }, "data_source": "ranking_storage", "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") @@ -337,23 +637,19 @@ def get_top_mixes(limit=10): # 按播放量排序获取热门合集 cursor = collection.find().sort("play_vv", -1).limit(limit) docs = list(cursor) - if not docs: return {"success": False, "message": "暂无数据"} - # 格式化数据 top_list = [] for doc in docs: item = format_mix_item(doc) top_list.append(item) - return { "success": True, "data": top_list, "total": len(top_list), "update_time": format_time(docs[0].get("batch_time")) if docs else "" } - except Exception as e: logging.error(f"获取热门合集失败: {e}") return {"success": False, "message": f"获取数据失败: {str(e)}"} @@ -508,19 +804,32 @@ def get_statistics(): return {"success": False, "message": f"获取统计失败: {str(e)}"} # 路由定义 +@rank_bp.route('/growth_mixes') +def get_growth_mixes_route(): + """获取增长榜合集列表""" + page = int(request.args.get('page', 1)) + limit = int(request.args.get('limit', 20)) + start_date = request.args.get('start_date') + end_date = request.args.get('end_date') + classification_type = request.args.get('classification_type') + + result = get_growth_mixes(page, limit, start_date, end_date, classification_type) + return jsonify(result) + @rank_bp.route('/videos') def get_videos(): - """获取合集列表 - 兼容app.py调用""" + """获取合集列表 - 兼容app.py调用,支持分类筛选""" page = int(request.args.get('page', 1)) limit = int(request.args.get('limit', 20)) sort_by = request.args.get('sort', 'playcount') + classification_type = request.args.get('classification_type') # 新增分类筛选参数 if sort_by == 'growth': start_date = request.args.get('start_date') end_date = request.args.get('end_date') - result = get_growth_mixes(page, limit, start_date, end_date) + result = get_growth_mixes(page, limit, start_date, end_date, classification_type) else: - result = get_mix_list(page, limit, sort_by) + result = get_mix_list(page, limit, sort_by, classification_type) return jsonify(result) @@ -606,14 +915,14 @@ def get_rankings(): # 如果没有指定日期,默认获取最新日期的榜单 if not date: - latest_ranking = daily_rankings_collection.find_one( + latest_ranking = collection.find_one( {}, sort=[('date', -1)] ) if latest_ranking: query['date'] = latest_ranking['date'] # 查询榜单 - rankings = 
list(daily_rankings_collection.find(query).sort('generated_at', -1)) + rankings = list(collection.find(query).sort('generated_at', -1)) if not rankings: return jsonify({ @@ -680,7 +989,7 @@ def get_ranking_dates(): """获取可用的榜单日期列表""" try: # 获取所有不重复的日期 - dates = daily_rankings_collection.distinct('date') + dates = collection.distinct('date') dates.sort(reverse=True) # 按日期倒序排列 return jsonify({ @@ -702,7 +1011,7 @@ def get_ranking_types(): """获取支持的榜单类型""" try: # 获取所有不重复的榜单类型 - types = daily_rankings_collection.distinct('ranking_type') + types = collection.distinct('ranking_type') # 添加类型说明 type_descriptions = { @@ -737,7 +1046,7 @@ def get_latest_rankings(): """获取最新的所有类型榜单""" try: # 获取最新日期 - latest_ranking = daily_rankings_collection.find_one( + latest_ranking = collection.find_one( {}, sort=[('date', -1)] ) @@ -754,7 +1063,7 @@ def get_latest_rankings(): latest_date = latest_ranking['date'] # 获取该日期的所有榜单 - rankings = list(daily_rankings_collection.find({ + rankings = list(collection.find({ 'date': latest_date }).sort('ranking_type', 1)) @@ -792,17 +1101,17 @@ def get_rankings_stats(): """获取榜单统计信息""" try: # 统计总榜单数 - total_rankings = daily_rankings_collection.count_documents({}) + total_rankings = collection.count_documents({}) # 统计日期数量 - total_dates = len(daily_rankings_collection.distinct('date')) + total_dates = len(collection.distinct('date')) # 统计榜单类型数量 - total_types = len(daily_rankings_collection.distinct('ranking_type')) + total_types = len(collection.distinct('ranking_type')) # 获取最新和最早日期 - latest_ranking = daily_rankings_collection.find_one({}, sort=[('date', -1)]) - earliest_ranking = daily_rankings_collection.find_one({}, sort=[('date', 1)]) + latest_ranking = collection.find_one({}, sort=[('date', -1)]) + earliest_ranking = collection.find_one({}, sort=[('date', 1)]) latest_date = latest_ranking['date'] if latest_ranking else None earliest_date = earliest_ranking['date'] if earliest_ranking else None @@ -822,4 +1131,1809 @@ def get_rankings_stats(): except Exception as e: logging.error(f"获取榜单统计失败: {e}") - return jsonify({"success": False, "message": f"获取榜单统计失败: {str(e)}"}) \ No newline at end of file + return jsonify({"success": False, "message": f"获取榜单统计失败: {str(e)}"}) + + +@rank_bp.route('/update_drama_info', methods=['POST']) +def update_drama_info(): + """更新短剧信息(支持双向同步)""" + try: + data = request.get_json() + + # 验证必需参数 + if not data or 'mix_name' not in data: + return jsonify({"success": False, "message": "缺少必需参数 mix_name"}) + + mix_name = data['mix_name'] + target_date = data.get('target_date') # 可选参数,用于判断是否为今日数据 + + # 准备更新字段 + update_fields = {} + field_lock_updates = {} + + # 检查并添加需要更新的字段 + if 'title' in data: + update_fields['title'] = data['title'] + if 'series_author' in data: + update_fields['series_author'] = data['series_author'] + if 'Manufacturing_Field' in data: + update_fields['Manufacturing_Field'] = data['Manufacturing_Field'] + # 标记制作方字段已被用户锁定 + field_lock_updates['field_lock_status.Manufacturing_Field_locked'] = True + if 'Copyright_field' in data: + update_fields['Copyright_field'] = data['Copyright_field'] + # 标记版权方字段已被用户锁定 + field_lock_updates['field_lock_status.Copyright_field_locked'] = True + if 'classification_type' in data: + update_fields['classification_type'] = data['classification_type'] + # 标记类型/元素字段已被用户锁定 + field_lock_updates['field_lock_status.classification_type_locked'] = True + if 'release_date' in data: + update_fields['release_date'] = data['release_date'] + # 标记上线日期字段已被用户锁定 + field_lock_updates['field_lock_status.release_date_locked'] = True 
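The handler above (and the lines that continue below) repeats one pattern per editable field: copy the incoming value into update_fields and, for fields the user is allowed to pin, raise a field_lock_status.<field>_locked flag so the scheduled sync will not overwrite the manual edit later. A minimal sketch of that pattern as a standalone helper, assuming a hypothetical build_update_payload name and field whitelist (not part of the patch):

LOCKABLE_FIELDS = {
    "Manufacturing_Field", "Copyright_field",
    "classification_type", "release_date",
    "Novel_IDs", "Anime_IDs", "Drama_IDs",
}

def build_update_payload(data):
    """Split a request body into plain field updates and user-lock flags."""
    update_fields = {}
    field_lock_updates = {}
    for field, value in data.items():
        if field in ("mix_name", "target_date"):
            continue  # routing parameters, not document fields
        update_fields[field] = value
        if field in LOCKABLE_FIELDS:
            # mark the field as user-edited so automatic syncs leave it alone
            field_lock_updates[f"field_lock_status.{field}_locked"] = True
    return update_fields, field_lock_updates

# e.g. editing only the copyright holder locks just that field:
# build_update_payload({"mix_name": "某短剧", "Copyright_field": "某版权方"})
# -> ({"Copyright_field": "某版权方"},
#     {"field_lock_status.Copyright_field_locked": True})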
+ if 'desc' in data: + update_fields['desc'] = data['desc'] + if 'play_vv' in data: + update_fields['play_vv'] = data['play_vv'] + + if 'cover_image_url' in data: + update_fields['cover_image_url'] = data['cover_image_url'] + if 'cover_backup_urls' in data: + update_fields['cover_backup_urls'] = data['cover_backup_urls'] + if 'timeline_data' in data: + update_fields['timeline_data'] = data['timeline_data'] + if 'comments_summary' in data: + update_fields['comments_summary'] = data['comments_summary'] + + # 检查分类字段的锁定状态 + if 'Novel_IDs' in data: + update_fields['Novel_IDs'] = data['Novel_IDs'] + field_lock_updates['field_lock_status.Novel_IDs_locked'] = True + if 'Anime_IDs' in data: + update_fields['Anime_IDs'] = data['Anime_IDs'] + field_lock_updates['field_lock_status.Anime_IDs_locked'] = True + if 'Drama_IDs' in data: + update_fields['Drama_IDs'] = data['Drama_IDs'] + field_lock_updates['field_lock_status.Drama_IDs_locked'] = True + + if not update_fields: + return jsonify({"success": False, "message": "没有提供需要更新的字段"}) + + # 获取今天的日期 + today = datetime.now().date().strftime('%Y-%m-%d') + is_today_data = target_date == today if target_date else True + + updated_count = 0 + + # 首先检查短剧是否存在 + existing_drama = rankings_management_collection.find_one({"mix_name": mix_name}) + if not existing_drama: + return jsonify({ + "success": False, + "message": f"未找到短剧: {mix_name}" + }) + + # 1. 更新Rankings_management数据库 + mgmt_update_data = update_fields.copy() + mgmt_update_data.update(field_lock_updates) # 添加锁定状态更新 + + result_mgmt = rankings_management_collection.update_many( + {"mix_name": mix_name}, + {"$set": mgmt_update_data} + ) + + # 2. 更新Ranking_storage数据库中的data数组 + storage_update_data = {f"data.$.{field}": value for field, value in update_fields.items()} + # 为Ranking_storage也添加锁定状态更新 + for field, value in field_lock_updates.items(): + storage_update_data[f"data.$.{field}"] = value + + result_storage = collection.update_many( + {"data.mix_name": mix_name}, + {"$set": storage_update_data} + ) + + updated_count = result_mgmt.modified_count + result_storage.modified_count + matched_count = result_mgmt.matched_count + result_storage.matched_count + + # 记录锁定状态更新 + locked_fields = [] + if field_lock_updates: + for field_key in field_lock_updates.keys(): + field_name = field_key.replace('field_lock_status.', '').replace('_locked', '') + locked_fields.append(field_name) + + logging.info(f"数据更新: Rankings_management(匹配:{result_mgmt.matched_count}, 修改:{result_mgmt.modified_count}), Ranking_storage(匹配:{result_storage.matched_count}, 修改:{result_storage.modified_count})") + if locked_fields: + logging.info(f"字段锁定状态更新: {', '.join(locked_fields)} 已被标记为用户锁定") + + # 只要找到了数据就算成功,不管是否有修改 + if matched_count > 0: + message = f"成功处理短剧 {mix_name} 的信息" + if updated_count > 0: + message += f",已更新 {updated_count} 条记录" + else: + message += ",数据无变化" + + return jsonify({ + "success": True, + "message": message, + "data": { + "mix_name": mix_name, + "updated_fields": list(update_fields.keys()), + "updated_count": updated_count, + "matched_count": matched_count, + "is_today_data": is_today_data + } + }) + else: + return jsonify({ + "success": False, + "message": f"未找到短剧 {mix_name} 的相关数据" + }) + + except Exception as e: + logging.error(f"更新短剧信息失败: {e}") + return jsonify({"success": False, "message": f"更新短剧信息失败: {str(e)}"}) + + +@rank_bp.route('/update_content_classification', methods=['POST']) +def update_content_classification(): + """更新内容分类(支持将短剧ID添加到对应分类字段中)""" + try: + data = request.get_json() + + # 验证必需参数(支持 mix_id 或 
mix_name 任一) + if not data or ('mix_id' not in data and 'mix_name' not in data) or 'classification_type' not in data: + return jsonify({"success": False, "message": "缺少必需参数:需要 mix_id 或 mix_name,以及 classification_type"}) + + mix_id_param = data.get('mix_id') + mix_name = data.get('mix_name') + classification_type = data['classification_type'] # 'novel', 'anime', 'drama' + action = data.get('action', 'add') # 'add' 或 'remove' + exclusive = data.get('exclusive', True) # 默认启用互斥模式,确保每个短剧只能属于一个分类 + + # 验证分类类型 + valid_types = ['novel', 'anime', 'drama'] + if classification_type not in valid_types: + return jsonify({"success": False, "message": f"无效的分类类型,支持的类型: {valid_types}"}) + + # 映射分类类型到字段名 + field_mapping = { + 'novel': 'Novel_IDs', + 'anime': 'Anime_IDs', + 'drama': 'Drama_IDs' + } + field_name = field_mapping[classification_type] + + # 首先从Rankings_management获取短剧的mix_id,使用今天的日期 + today = datetime.now().date() + start_of_day = datetime.combine(today, datetime.min.time()) + end_of_day = datetime.combine(today, datetime.max.time()) + + mgmt_doc = rankings_management_collection.find_one({ + "mix_name": mix_name + }) + if not mgmt_doc: + return jsonify({"success": False, "message": f"未找到短剧:{mix_name or mix_id_param}"}) + + mix_id = mgmt_doc.get('mix_id') + if not mix_id: + return jsonify({"success": False, "message": f"短剧 {mix_name or '[未知名称]'} 缺少 mix_id"}) + + updated_count = 0 + + # 根据操作类型更新数据 + if action == 'add': + # 如果启用互斥模式,先移除其他分类 + if exclusive: + # 获取其他分类字段名 + other_fields = [f for f in field_mapping.values() if f != field_name] + + # 记录移除操作的结果 + removed_from_other_categories = [] + + # 1. 从Rankings_management中移除其他分类 + for other_field in other_fields: + result = rankings_management_collection.update_many( + {"mix_id": mix_id, other_field: mix_id}, + {"$pull": {other_field: mix_id}} + ) + if result.modified_count > 0: + # 找到对应的分类名称 + for cat_type, field in field_mapping.items(): + if field == other_field: + removed_from_other_categories.append(cat_type) + break + + # 2. 从Ranking_storage中移除其他分类 + for other_field in other_fields: + collection.update_many( + {"data.mix_name": mix_name}, + {"$pull": {f"data.$.{other_field}": mix_id}} + ) + + if removed_from_other_categories: + logging.info(f"互斥模式:已将短剧 {mix_name} 从 {', '.join(removed_from_other_categories)} 分类中移除") + else: + logging.info(f"互斥模式:短剧 {mix_name} 未在其他分类中,无需移除") + + # 添加到分类字段(使用$addToSet避免重复) + # 1. 更新Rankings_management数据库 + result_mgmt = rankings_management_collection.update_many( + {"mix_id": mix_id}, + {"$addToSet": {field_name: mix_id}} + ) + + # 2. 更新Ranking_storage数据库中的data数组 + result_storage = collection.update_many( + {"data.mix_name": mix_name}, + {"$addToSet": {f"data.$.{field_name}": mix_id}} + ) + + updated_count = result_mgmt.modified_count + result_storage.modified_count + message = f"成功将短剧 {mix_name} 添加到 {classification_type} 分类" + if exclusive and removed_from_other_categories: + message += f"(已自动从 {', '.join(removed_from_other_categories)} 分类中移除)" + + elif action == 'remove': + # 从分类字段中移除 + # 1. 更新Rankings_management数据库 + result_mgmt = rankings_management_collection.update_many( + {"mix_id": mix_id}, + {"$pull": {field_name: mix_id}} + ) + + # 2. 
更新Ranking_storage数据库中的data数组 + result_storage = collection.update_many( + {"data.mix_name": mix_name}, + {"$pull": {f"data.$.{field_name}": mix_id}} + ) + + updated_count = result_mgmt.modified_count + result_storage.modified_count + message = f"成功将短剧 {mix_name} 从 {classification_type} 分类中移除" + + else: + return jsonify({"success": False, "message": "无效的操作类型,支持 'add' 或 'remove'"}) + + logging.info(f"分类更新: {message}, Rankings_management({result_mgmt.modified_count}), Ranking_storage({result_storage.modified_count})") + + # 获取更新后的分类状态(按 mix_id 直接查询,不做日期过滤) + updated_mgmt_doc = rankings_management_collection.find_one({"mix_id": mix_id}) + classification_status = { + 'novel': mix_id in updated_mgmt_doc.get('Novel_IDs', []) if updated_mgmt_doc else False, + 'anime': mix_id in updated_mgmt_doc.get('Anime_IDs', []) if updated_mgmt_doc else False, + 'drama': mix_id in updated_mgmt_doc.get('Drama_IDs', []) if updated_mgmt_doc else False + } + + return jsonify({ + "success": True, + "message": message, + "data": { + "mix_name": mix_name, + "mix_id": mix_id, + "classification_type": classification_type, + "field_name": field_name, + "action": action, + "updated_count": updated_count, + "classification_status": classification_status + } + }) + + except Exception as e: + logging.error(f"更新内容分类失败: {e}") + return jsonify({"success": False, "message": f"更新内容分类失败: {str(e)}"}) + + +@rank_bp.route('/get_content_classification', methods=['GET']) +def get_content_classification(): + """获取短剧的分类状态""" + try: + mix_id_param = request.args.get('mix_id') + mix_name = request.args.get('mix_name') + + if not mix_id_param and not mix_name: + return jsonify({"success": False, "message": "缺少必需参数:需要 mix_id 或 mix_name"}) + + # 优先使用 mix_id 获取管理库信息(不做日期过滤) + mgmt_doc = find_management_data({'mix_id': mix_id_param, 'mix_name': mix_name}) + if not mgmt_doc: + return jsonify({"success": False, "message": f"未找到短剧:{mix_name or mix_id_param}"}) + + mix_id = mgmt_doc.get('mix_id') + if not mix_id: + return jsonify({"success": False, "message": f"短剧 {mix_name or '[未知名称]'} 缺少 mix_id"}) + + # 检查短剧在各个分类中的状态 + novel_ids = mgmt_doc.get('Novel_IDs', []) + anime_ids = mgmt_doc.get('Anime_IDs', []) + drama_ids = mgmt_doc.get('Drama_IDs', []) + + classification_status = { + 'novel': mix_id in novel_ids, + 'anime': mix_id in anime_ids, + 'drama': mix_id in drama_ids + } + + return jsonify({ + "success": True, + "message": f"获取短剧 {mgmt_doc.get('mix_name', mix_name)} 分类状态成功", + "data": { + "mix_name": mgmt_doc.get('mix_name', mix_name), + "mix_id": mix_id, + "classification_status": classification_status, + "classification_details": { + "Novel_IDs": novel_ids, + "Anime_IDs": anime_ids, + "Drama_IDs": drama_ids + } + } + }) + + except Exception as e: + logging.error(f"获取内容分类状态失败: {e}") + return jsonify({"success": False, "message": f"获取内容分类状态失败: {str(e)}"}) + + +def validate_and_fix_classification_exclusivity(): + """ + 验证和修复数据库中的分类互斥性 + 确保每个短剧只属于一个分类(Novel_IDs、Anime_IDs、Drama_IDs) + + Returns: + dict: 修复结果统计 + """ + try: + # 获取所有Rankings_management数据 + all_docs = list(rankings_management_collection.find({})) + + fixed_count = 0 + conflict_count = 0 + + for doc in all_docs: + mix_name = doc.get('mix_name', '') + mix_id = doc.get('mix_id') + + if not mix_id: + continue + + # 检查分类字段 + novel_ids = doc.get('Novel_IDs', []) + anime_ids = doc.get('Anime_IDs', []) + drama_ids = doc.get('Drama_IDs', []) + + # 统计该mix_id在多少个分类中出现 + classifications = [] + if mix_id in novel_ids: + classifications.append('novel') + if mix_id in anime_ids: + 
classifications.append('anime') + if mix_id in drama_ids: + classifications.append('drama') + + # 如果出现在多个分类中,需要修复 + if len(classifications) > 1: + conflict_count += 1 + logging.warning(f"发现分类冲突: {mix_name} 同时属于 {classifications}") + + # 保留最后一个分类,移除其他分类 + # 优先级:drama > anime > novel + if 'drama' in classifications: + keep_classification = 'drama' + elif 'anime' in classifications: + keep_classification = 'anime' + else: + keep_classification = 'novel' + + # 更新数据库 + update_fields = {} + if keep_classification == 'novel': + update_fields['Novel_IDs'] = novel_ids + update_fields['Anime_IDs'] = [id for id in anime_ids if id != mix_id] + update_fields['Drama_IDs'] = [id for id in drama_ids if id != mix_id] + elif keep_classification == 'anime': + update_fields['Novel_IDs'] = [id for id in novel_ids if id != mix_id] + update_fields['Anime_IDs'] = anime_ids + update_fields['Drama_IDs'] = [id for id in drama_ids if id != mix_id] + elif keep_classification == 'drama': + update_fields['Novel_IDs'] = [id for id in novel_ids if id != mix_id] + update_fields['Anime_IDs'] = [id for id in anime_ids if id != mix_id] + update_fields['Drama_IDs'] = drama_ids + + # 更新Rankings_management - 优先使用mix_id + if mix_id: + rankings_management_collection.update_one( + {"mix_id": mix_id}, + {"$set": update_fields} + ) + else: + rankings_management_collection.update_one( + {"mix_name": mix_name}, + {"$set": update_fields} + ) + + # 更新Ranking_storage - 优先使用mix_id + if mix_id: + collection.update_many( + {"data.mix_id": mix_id}, + {"$set": { + f"data.$.Novel_IDs": update_fields['Novel_IDs'], + f"data.$.Anime_IDs": update_fields['Anime_IDs'], + f"data.$.Drama_IDs": update_fields['Drama_IDs'] + }} + ) + else: + collection.update_many( + {"data.mix_name": mix_name}, + {"$set": { + f"data.$.Novel_IDs": update_fields['Novel_IDs'], + f"data.$.Anime_IDs": update_fields['Anime_IDs'], + f"data.$.Drama_IDs": update_fields['Drama_IDs'] + }} + ) + + fixed_count += 1 + logging.info(f"修复分类冲突: {mix_name} 保留为 {keep_classification} 分类") + + return { + "success": True, + "message": f"分类互斥性验证完成", + "data": { + "total_checked": len(all_docs), + "conflicts_found": conflict_count, + "conflicts_fixed": fixed_count + } + } + + except Exception as e: + logging.error(f"验证分类互斥性失败: {e}") + return { + "success": False, + "message": f"验证分类互斥性失败: {str(e)}" + } + + +def sync_ranking_storage_fields(target_date=None, force_update=False, max_retries=3, retry_delay=60): + """ + 同步Ranking_storage中的字段信息 + 统一从Rankings_management中获取对应的字段值并保存到Ranking_storage + + Args: + target_date: 目标日期,格式为'YYYY-MM-DD',默认为今天 + force_update: 是否强制更新已有字段,默认False + max_retries: 最大重试次数,默认3次 + retry_delay: 重试间隔(秒),默认60秒 + + Returns: + dict: 同步结果统计 + """ + try: + # 设置目标日期 + if target_date is None: + target_date_obj = datetime.now().date() + target_date = target_date_obj.strftime('%Y-%m-%d') + else: + target_date_obj = datetime.strptime(target_date, '%Y-%m-%d').date() + + # 获取Ranking_storage中指定日期的数据 + ranking_storage_query = {"date": target_date} + ranking_storage_items = list(collection.find(ranking_storage_query)) + + if not ranking_storage_items: + return { + "success": False, + "message": f"未找到日期 {target_date} 的Ranking_storage数据" + } + + # 统计信息 + total_items = len(ranking_storage_items) + updated_items = 0 + skipped_items = 0 + error_items = 0 + retry_count = 0 # 重试次数计数器 + pending_items = [] # 需要重试的项目 + + # 🔄 修复后的同步逻辑:更新data数组中的每个项目 + for ranking_doc in ranking_storage_items: + try: + # 获取data数组 + data_array = ranking_doc.get('data', []) + if not data_array: + 
logging.warning(f"Ranking_storage文档没有data数组: {ranking_doc.get('_id')}") + skipped_items += 1 + continue + + # 标记是否有任何项目被更新 + doc_updated = False + updated_data_array = [] + + # 遍历data数组中的每个项目 + for data_item in data_array: + try: + mix_name = data_item.get('mix_name', '').strip() + + # 🚫 跳过无效数据:确保mix_name不为空 + if not mix_name or mix_name == "" or mix_name.lower() == "null": + logging.warning(f"跳过空的或无效的mix_name记录: {data_item.get('_id', 'unknown')}") + continue # 不添加到updated_data_array,直接跳过 + + # 🔧 优化逻辑:优先使用mix_id进行查询,提高准确性 + source_data = None + mix_id = data_item.get('mix_id') + + # 使用通用查询函数,优先mix_id查询 + query_conditions = {} + if mix_id: + query_conditions['mix_id'] = mix_id + if mix_name: + query_conditions['mix_name'] = mix_name + + # 使用find_management_data函数进行查询 + if query_conditions: + source_data = find_management_data(query_conditions, target_date) + + # 如果还是没找到,尝试通过title匹配 + if not source_data: + title = data_item.get('title') + if title and title.strip(): + title_query = {"mix_name": title.strip()} + source_data = find_management_data(title_query, target_date) + if source_data: + logging.info(f"通过title找到数据: {title} -> {source_data.get('mix_name', 'N/A')}") + + # 如果找到了源数据,更新mix_name(如果原来为空的话) + if source_data and not mix_name: + mix_name = source_data.get('mix_name', '').strip() + if mix_name: + data_item['mix_name'] = mix_name + logging.info(f"修复空的mix_name: {data_item.get('title', 'N/A')} -> {mix_name}") + else: + logging.warning(f"源数据中的mix_name也为空,跳过此记录") + continue # 跳过无效记录 + + # 如果还是没有找到源数据,检查是否有锁定字段需要保护 + if not source_data: + logging.warning(f"无法找到对应的源数据: mix_name={mix_name}, mix_id={data_item.get('mix_id')}, title={data_item.get('title')}") + + # 检查是否有锁定字段,如果有锁定字段,保持原数据不变(从 data_item 获取) + field_lock_status = data_item.get('field_lock_status', {}) + has_locked_fields = any([ + field_lock_status.get('Manufacturing_Field_locked', False), + field_lock_status.get('Copyright_field_locked', False), + field_lock_status.get('classification_type_locked', False), # 新增 + field_lock_status.get('release_date_locked', False), # 新增 + field_lock_status.get('Novel_IDs_locked', False), + field_lock_status.get('Anime_IDs_locked', False), + field_lock_status.get('Drama_IDs_locked', False) + ]) + + # 检查是否有用户设置的数据(锁定字段或分类数据) + has_user_data = has_locked_fields or any([ + data_item.get('Manufacturing_Field'), + data_item.get('Copyright_field'), + data_item.get('classification_type'), # 新增 + data_item.get('release_date'), # 新增 + data_item.get('Novel_IDs'), + data_item.get('Anime_IDs'), + data_item.get('Drama_IDs') + ]) + + if has_locked_fields: + logging.info(f"保持锁定字段不变: {mix_name} (无源数据但有锁定字段)") + updated_data_array.append(data_item) + elif has_user_data: + logging.info(f"保持用户设置的数据: {mix_name} (无源数据但有用户数据)") + updated_data_array.append(data_item) + else: + # 只有当mix_name有效时才保留记录 + if mix_name and mix_name.strip(): + updated_data_array.append(data_item) + continue + + # 检查是否需要更新 - 包含所有Rankings_management字段 + fields_to_check = { + # 基础字段 + 'batch_id': data_item.get('batch_id'), + 'batch_time': data_item.get('batch_time'), + 'item_sequence': data_item.get('item_sequence'), + 'mix_id': data_item.get('mix_id'), + 'playcount': data_item.get('playcount'), + 'request_id': data_item.get('request_id'), + # 封面相关字段 + 'cover_image_url_original': data_item.get('cover_image_url_original'), + 'cover_upload_success': data_item.get('cover_upload_success'), + 'cover_backup_urls': data_item.get('cover_backup_urls'), + # 内容字段 + 'desc': data_item.get('desc'), + 'series_author': data_item.get('series_author'), + 
'updated_to_episode': data_item.get('updated_to_episode'), + 'episode_video_ids': data_item.get('episode_video_ids'), + 'episode_details': data_item.get('episode_details'), + # 状态字段 + 'data_status': data_item.get('data_status'), + 'realtime_saved': data_item.get('realtime_saved'), + 'created_at': data_item.get('created_at'), + 'last_updated': data_item.get('last_updated'), + 'Manufacturing_Field': data_item.get('Manufacturing_Field'), + 'Copyright_field': data_item.get('Copyright_field'), + 'classification_type': data_item.get('classification_type', ''), # 新增:类型/元素 + 'release_date': data_item.get('release_date', ''), # 新增:上线日期 + # 新增:内容分类字段 + 'Novel_IDs': data_item.get('Novel_IDs', []), + 'Anime_IDs': data_item.get('Anime_IDs', []), + 'Drama_IDs': data_item.get('Drama_IDs', []), + # 评论总结字段 + 'comments_summary': data_item.get('comments_summary', ''), + # 计算字段 + } + + # 🔒 检查字段锁定状态(从 data_item 获取,而不是 ranking_doc) + field_lock_status = data_item.get('field_lock_status', {}) + manufacturing_locked = field_lock_status.get('Manufacturing_Field_locked', False) + copyright_locked = field_lock_status.get('Copyright_field_locked', False) + novel_ids_locked = field_lock_status.get('Novel_IDs_locked', False) + anime_ids_locked = field_lock_status.get('Anime_IDs_locked', False) + drama_ids_locked = field_lock_status.get('Drama_IDs_locked', False) + + # 检查哪些字段需要更新(检查目标数据是否缺少字段) + needs_update = False + for field_name, source_field_value in fields_to_check.items(): + # 🔒 字段锁定保护:如果字段已锁定,跳过更新 + if field_name == 'Manufacturing_Field' and manufacturing_locked: + continue + elif field_name == 'Copyright_field' and copyright_locked: + continue + elif field_name == 'Novel_IDs' and novel_ids_locked: + continue + elif field_name == 'Anime_IDs' and anime_ids_locked: + continue + elif field_name == 'Drama_IDs' and drama_ids_locked: + continue + + # 🔑 关键修复:检查目标数据(data_item)中的字段值,而不是源数据 + current_value = data_item.get(field_name) + + # 对于数组字段,检查是否为空数组 + if field_name in ['cover_backup_urls', 'episode_video_ids', 'episode_details', 'Novel_IDs', 'Anime_IDs', 'Drama_IDs']: + if force_update or current_value is None or (isinstance(current_value, list) and len(current_value) == 0): + needs_update = True + break + # 对于其他字段,检查目标数据是否缺少或为空 + elif force_update or current_value is None or current_value == '': + needs_update = True + break + + if not needs_update: + updated_data_array.append(data_item) + continue + + # 从源数据获取字段值并更新data_item + item_updated = False + for field_name, source_field_value in fields_to_check.items(): + # 🔒 字段锁定保护:如果字段已锁定,跳过更新 + if field_name == 'Manufacturing_Field' and manufacturing_locked: + logging.info(f"[字段锁定] 保护Manufacturing_Field不被覆盖: {mix_name}") + continue + elif field_name == 'Copyright_field' and copyright_locked: + logging.info(f"[字段锁定] 保护Copyright_field不被覆盖: {mix_name}") + continue + elif field_name == 'Novel_IDs' and novel_ids_locked: + logging.info(f"[字段锁定] 保护Novel_IDs不被覆盖: {mix_name}") + continue + elif field_name == 'Anime_IDs' and anime_ids_locked: + logging.info(f"[字段锁定] 保护Anime_IDs不被覆盖: {mix_name}") + continue + elif field_name == 'Drama_IDs' and drama_ids_locked: + logging.info(f"[字段锁定] 保护Drama_IDs不被覆盖: {mix_name}") + continue + + # 🔑 关键修复:检查目标数据(data_item)中的字段值 + current_value = data_item.get(field_name) + + # 对于数组字段,检查是否为空数组 + should_update = False + if field_name in ['cover_backup_urls', 'episode_video_ids', 'episode_details', 'Novel_IDs', 'Anime_IDs', 'Drama_IDs']: + should_update = force_update or current_value is None or (isinstance(current_value, list) and len(current_value) 
== 0) + else: + should_update = force_update or current_value is None or current_value == '' + + if should_update: + if field_name == 'episode_details': + # 特殊处理episode_details字段,直接从源数据复制 + data_item[field_name] = source_data.get(field_name, []) + item_updated = True + elif field_name == 'cover_backup_urls': + # 特殊处理cover_backup_urls字段,确保是数组格式 + cover_backup_urls = source_data.get(field_name, []) + if not isinstance(cover_backup_urls, list): + cover_backup_urls = [] + data_item[field_name] = cover_backup_urls + item_updated = True + elif field_name == 'episode_video_ids': + # 特殊处理episode_video_ids字段,确保是数组格式 + episode_video_ids = source_data.get(field_name, []) + if not isinstance(episode_video_ids, list): + episode_video_ids = [] + data_item[field_name] = episode_video_ids + item_updated = True + elif field_name in ['Novel_IDs', 'Anime_IDs', 'Drama_IDs']: + # 特殊处理分类字段,确保是数组格式和互斥性 + classification_ids = source_data.get(field_name, []) + if not isinstance(classification_ids, list): + classification_ids = [] + + # 🔑 关键修复:只有当源数据有值时才更新,否则保留用户设置 + if classification_ids: + # 源数据有值,更新分类字段 + # 确保分类互斥性:如果当前字段有值,清空其他分类字段(但要检查锁定状态) + if field_name == 'Novel_IDs': + # 只有在其他字段未锁定时才清空 + if not anime_ids_locked: + data_item['Anime_IDs'] = [] + if not drama_ids_locked: + data_item['Drama_IDs'] = [] + elif field_name == 'Anime_IDs': + if not novel_ids_locked: + data_item['Novel_IDs'] = [] + if not drama_ids_locked: + data_item['Drama_IDs'] = [] + elif field_name == 'Drama_IDs': + if not novel_ids_locked: + data_item['Novel_IDs'] = [] + if not anime_ids_locked: + data_item['Anime_IDs'] = [] + + data_item[field_name] = classification_ids + item_updated = True + else: + # 源数据为空,检查当前是否有用户设置的值 + current_classification = data_item.get(field_name, []) + if current_classification and isinstance(current_classification, list) and len(current_classification) > 0: + # 用户已设置分类,保留不变 + logging.info(f"[分类保护] 保留用户设置的 {field_name}: {mix_name}") + else: + # 当前也没有值,设置为空数组 + data_item[field_name] = [] + item_updated = True + elif field_name == 'comments_summary': + # 🎬 特殊处理评论总结字段:只有源数据有值时才更新,保护已有的总结 + source_value = source_data.get(field_name, '') + if source_value: # 只有当源数据有评论总结时才更新 + data_item[field_name] = source_value + item_updated = True + logging.info(f"[评论总结] 更新评论总结: {mix_name}") + else: + # 源数据没有总结,保留当前值(不覆盖) + logging.debug(f"[评论总结] 保留现有评论总结: {mix_name}") + else: + # 对于其他字段,直接从源数据获取 + source_value = source_data.get(field_name, '') + data_item[field_name] = source_value + item_updated = True + + # 🔒 保护重要字段:确保不覆盖播放量差值等关键数据 + # timeline_data字段必须保留 + # 保护其他重要的计算字段 + protected_fields = ['rank', 'play_vv', 'video_id', 'video_url', 'cover_image_url', 'playcount_str', 'timeline_data'] + # 这些字段不会被覆盖,因为它们不在fields_to_check中 + + if item_updated: + doc_updated = True + logging.info(f"✅ 成功同步data项目字段: {mix_name}") + + updated_data_array.append(data_item) + + except Exception as e: + logging.error(f"同步data项目失败 {data_item.get('mix_name', 'N/A')}: {e}") + # 保持原数据不变 + updated_data_array.append(data_item) + continue + + # 如果有任何项目被更新,更新整个文档的data数组 + if doc_updated: + collection.update_one( + {"_id": ranking_doc["_id"]}, + {"$set": {"data": updated_data_array}} + ) + updated_items += 1 + logging.info(f"✅ 成功更新Ranking_storage文档的data数组: {ranking_doc.get('date', 'N/A')}") + else: + skipped_items += 1 + + except Exception as e: + logging.error(f"同步Ranking_storage文档失败 {ranking_doc.get('_id')}: {e}") + error_items += 1 + continue + + # 新的同步逻辑已经直接处理data数组,不需要重试机制 + + return { + "success": True, + "message": f"同步完成(重试 {retry_count} 次)", + "stats": 
{ + "target_date": target_date, + "total_items": total_items, + "updated_items": updated_items, + "skipped_items": skipped_items, + "error_items": error_items, + "retry_count": retry_count, + "pending_items_final": len(pending_items), + "data_source": "Rankings_management" + } + } + + except Exception as e: + logging.error(f"同步Ranking_storage字段失败: {e}") + return { + "success": False, + "message": f"同步失败: {str(e)}" + } + + +@rank_bp.route('/sync_ranking_fields', methods=['POST']) +def sync_ranking_fields(): + """ + API端点:同步Ranking_storage字段 + """ + try: + data = request.get_json() or {} + target_date = data.get('target_date') + force_update = data.get('force_update', False) + + result = sync_ranking_storage_fields(target_date, force_update) + + if result["success"]: + return jsonify(result) + else: + return jsonify(result), 400 + + except Exception as e: + logging.error(f"同步API调用失败: {e}") + return jsonify({ + "success": False, + "message": f"API调用失败: {str(e)}" + }), 500 + + +@rank_bp.route('/validate_classification_exclusivity', methods=['POST']) +def validate_classification_exclusivity_api(): + """ + API端点:验证和修复分类互斥性 + 确保每个短剧只属于一个分类(Novel_IDs、Anime_IDs、Drama_IDs) + """ + try: + result = validate_and_fix_classification_exclusivity() + + if result["success"]: + return jsonify(result) + else: + return jsonify(result), 400 + + except Exception as e: + logging.error(f"验证分类互斥性API失败: {e}") + return jsonify({ + "success": False, + "message": f"验证分类互斥性失败: {str(e)}" + }), 500 + + +@rank_bp.route('/get_comments_summary', methods=['GET']) +def get_comments_summary(): + """获取短剧的评论总结(优先使用 mix_id)""" + try: + mix_id = request.args.get('mix_id') + mix_name = request.args.get('mix_name') + date_str = request.args.get('date') + + if not mix_id and not mix_name: + return jsonify({"success": False, "message": "缺少必需参数 mix_id 或 mix_name"}) + + if not date_str: + from datetime import date + date_str = date.today().strftime('%Y-%m-%d') + + # 从 Ranking_storage 获取榜单数据 + ranking_doc = collection.find_one({ + "date": date_str, + "type": "comprehensive" + }, sort=[("created_at", -1)]) + + if not ranking_doc: + return jsonify({ + "success": False, + "message": f"未找到 {date_str} 的榜单数据" + }) + + # 在 data 数组中查找短剧(优先使用 mix_id) + data_items = ranking_doc.get("data", []) + drama_item = None + + for item in data_items: + # 优先使用 mix_id 匹配 + if mix_id and item.get("mix_id") == mix_id: + drama_item = item + break + # 备用:使用 mix_name 匹配 + elif mix_name and item.get("mix_name") == mix_name: + drama_item = item + # 继续查找,看是否有 mix_id 匹配的 + + if not drama_item: + return jsonify({ + "success": False, + "message": f"未找到短剧: {mix_name or mix_id}" + }) + + comments_summary = drama_item.get("comments_summary", "") + + if not comments_summary: + return jsonify({ + "success": False, + "message": "该短剧暂无评论总结" + }) + + return jsonify({ + "success": True, + "data": { + "mix_id": drama_item.get("mix_id"), + "mix_name": drama_item.get("mix_name"), + "date": date_str, + "comments_summary": comments_summary + } + }) + + except Exception as e: + logging.error(f"获取评论总结失败: {e}") + return jsonify({ + "success": False, + "message": f"获取评论总结失败: {str(e)}" + }), 500 + + +@rank_bp.route('/clear_comments_summary', methods=['POST']) +def clear_comments_summary(): + """清空短剧的评论总结(优先使用 mix_id)""" + try: + data = request.get_json() + mix_id = data.get('mix_id') + mix_name = data.get('mix_name') + date_str = data.get('date') + + if not mix_id and not mix_name: + return jsonify({"success": False, "message": "缺少必需参数 mix_id 或 mix_name"}) + + if not date_str: + from 
datetime import date + date_str = date.today().strftime('%Y-%m-%d') + + # 从 Ranking_storage 获取榜单数据 + ranking_doc = collection.find_one({ + "date": date_str, + "type": "comprehensive" + }, sort=[("created_at", -1)]) + + if not ranking_doc: + return jsonify({ + "success": False, + "message": f"未找到 {date_str} 的榜单数据" + }) + + # 在 data 数组中查找短剧并获取 mix_id + data_items = ranking_doc.get("data", []) + target_mix_id = None + target_mix_name = None + + for item in data_items: + if mix_id and item.get("mix_id") == mix_id: + target_mix_id = item.get("mix_id") + target_mix_name = item.get("mix_name") + break + elif mix_name and item.get("mix_name") == mix_name: + target_mix_id = item.get("mix_id") + target_mix_name = item.get("mix_name") + + if not target_mix_id and not target_mix_name: + return jsonify({ + "success": False, + "message": f"未找到短剧: {mix_name or mix_id}" + }) + + # 清空评论总结字段(优先使用 mix_id) + if target_mix_id: + result = collection.update_many( + { + "date": date_str, + "type": "comprehensive", + "data.mix_id": target_mix_id + }, + { + "$set": { + "data.$[elem].comments_summary": "" + } + }, + array_filters=[{"elem.mix_id": target_mix_id}] + ) + else: + # 备用:使用 mix_name + result = collection.update_many( + { + "date": date_str, + "type": "comprehensive", + "data.mix_name": target_mix_name + }, + { + "$set": { + "data.$[elem].comments_summary": "" + } + }, + array_filters=[{"elem.mix_name": target_mix_name}] + ) + + # 同时清空 Rankings_management 中的评论总结 + management_result = None + if target_mix_id: + management_result = rankings_management_collection.update_one( + {"mix_id": target_mix_id}, + {"$set": {"comments_summary": ""}} + ) + elif target_mix_name: + management_result = rankings_management_collection.update_one( + {"mix_name": target_mix_name}, + {"$set": {"comments_summary": ""}} + ) + + if result.modified_count > 0 or (management_result and management_result.modified_count > 0): + return jsonify({ + "success": True, + "message": f"已清空短剧 {target_mix_name} 的评论总结(Ranking_storage: {result.modified_count}, Rankings_management: {management_result.modified_count if management_result else 0})", + "modified_count": result.modified_count + }) + else: + return jsonify({ + "success": False, + "message": "未找到需要清空的评论总结" + }) + + except Exception as e: + logging.error(f"清空评论总结失败: {e}") + return jsonify({ + "success": False, + "message": f"清空评论总结失败: {str(e)}" + }), 500 + + +@rank_bp.route('/drama/<drama_id>') +def get_drama_detail_by_id(drama_id): + """ + 根据短剧ID获取详细信息(用于详情页) + 支持通过 mix_id 或 _id 查询 + """ + try: + # 获取日期参数(可选) + date_str = request.args.get('date') + if not date_str: + date_str = datetime.now().date().strftime("%Y-%m-%d") + + # 首先尝试从 Ranking_storage 中查找 + ranking_doc = collection.find_one({ + "date": date_str, + "type": "comprehensive" + }, sort=[("calculation_sequence", -1)]) + + drama_data = None + + if ranking_doc and "data" in ranking_doc: + # 在 data 数组中查找匹配的短剧 + for item in ranking_doc.get("data", []): + if item.get("mix_id") == drama_id or str(item.get("_id")) == drama_id: + drama_data = item + break + + # 如果在 Ranking_storage 中没找到,尝试从 Rankings_management 查找 + if not drama_data: + from bson import ObjectId + try: + mgmt_doc = rankings_management_collection.find_one({"mix_id": drama_id}) + if not mgmt_doc: + mgmt_doc = rankings_management_collection.find_one({"_id": ObjectId(drama_id)}) + if mgmt_doc: + drama_data = mgmt_doc + except Exception: + # drama_id 不是合法的 ObjectId 时忽略该查询 + pass + + if not drama_data: + return jsonify({ + "success": False, + "message": f"未找到短剧: {drama_id}" + }) + + # 格式化数据(format_mix_item已经包含了所有新字段) + formatted_data = 
format_mix_item(drama_data, date_str) + + return jsonify({ + "success": True, + "data": formatted_data, + "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + }) + + except Exception as e: + logging.error(f"获取短剧详情失败: {e}") + return jsonify({ + "success": False, + "message": f"获取短剧详情失败: {str(e)}" + }) + +def upload_certification_file(file): + """ + 上传认领证明文件到TOS + + Args: + file: 上传的文件对象 + + Returns: + str: TOS永久链接URL + """ + try: + # 获取文件扩展名 + filename = secure_filename(file.filename) + file_extension = '' + if '.' in filename: + file_extension = '.' + filename.rsplit('.', 1)[1].lower() + + # 验证文件类型 + allowed_image_extensions = ['.jpg', '.jpeg', '.png', '.gif'] + allowed_doc_extensions = ['.pdf', '.doc', '.docx'] + + if file_extension not in allowed_image_extensions + allowed_doc_extensions: + raise ValueError(f"不支持的文件类型: {file_extension}") + + # 验证文件大小 + file.seek(0, 2) # 移动到文件末尾 + file_size = file.tell() # 获取文件大小 + file.seek(0) # 重置文件指针 + + max_size = 10 * 1024 * 1024 # 10MB for images + if file_extension in allowed_doc_extensions: + max_size = 20 * 1024 * 1024 # 20MB for documents + + if file_size > max_size: + raise ValueError(f"文件大小超过限制: {file_size / 1024 / 1024:.2f}MB") + + # 生成唯一文件名 + random_filename = f"{uuid.uuid4().hex}{file_extension}" + object_key = f"media/rank/Certification/{random_filename}" + + # 上传到TOS + tos_url = oss_client.upload_bytes( + data=file.read(), + object_key=object_key, + content_type=file.content_type or 'application/octet-stream', + return_url=True + ) + + logging.info(f"文件上传成功: {filename} -> {tos_url}") + return tos_url + + except Exception as e: + logging.error(f"文件上传失败: {str(e)}") + raise + + +@rank_bp.route('/claim', methods=['POST']) +def submit_claim(): + """ + 提交认领申请(新版本:上传文件到TOS并创建待审核申请) + """ + try: + # 获取表单数据 + drama_id = request.form.get('drama_id') + field_type = request.form.get('field_type') # 'copyright' 或 'manufacturing' + company_name = request.form.get('company_name') + description = request.form.get('description', '') + + # 验证必填字段 + if not all([drama_id, field_type, company_name]): + return jsonify({ + "success": False, + "message": "缺少必填字段" + }), 400 + + # 验证字段类型 + if field_type not in ['copyright', 'manufacturing']: + return jsonify({ + "success": False, + "message": "无效的字段类型" + }), 400 + + # 获取短剧信息 + drama_info = rankings_management_collection.find_one({"mix_id": drama_id}) + if not drama_info: + return jsonify({ + "success": False, + "message": "未找到对应的短剧" + }), 404 + + drama_name = drama_info.get('mix_name', '未知短剧') + + # 处理上传的文件并上传到TOS + uploaded_files = request.files.getlist('files') + tos_file_urls = [] + + if uploaded_files: + for file in uploaded_files: + if file and file.filename: + try: + tos_url = upload_certification_file(file) + tos_file_urls.append(tos_url) + except ValueError as ve: + return jsonify({ + "success": False, + "message": str(ve) + }), 400 + except Exception as e: + return jsonify({ + "success": False, + "message": f"文件上传失败: {str(e)}" + }), 500 + + if not tos_file_urls: + return jsonify({ + "success": False, + "message": "请至少上传一个证明文件" + }), 400 + + # 检查是否存在该短剧+该字段类型的待审核申请 + existing_application = claim_applications_collection.find_one({ + "drama_id": drama_id, + "field_type": field_type, + "status": "pending" + }) + + # 如果存在待审核申请,删除旧的(但保留TOS文件) + if existing_application: + claim_applications_collection.delete_one({"_id": existing_application["_id"]}) + logging.info(f"删除旧的待审核申请: {existing_application.get('application_id')}") + + # 创建新的申请记录 + application_id = str(uuid.uuid4()) + application_data = { + 
"application_id": application_id, + "drama_id": drama_id, + "drama_name": drama_name, + "field_type": field_type, + "company_name": company_name, + "description": description, + "tos_file_urls": tos_file_urls, + "status": "pending", + "submit_time": datetime.now(), + "review_time": None, + "reviewer": None, + "reject_reason": None + } + + claim_applications_collection.insert_one(application_data) + + logging.info(f"认领申请创建成功: application_id={application_id}, drama_id={drama_id}, field_type={field_type}") + + return jsonify({ + "success": True, + "message": "认领申请提交成功,等待管理员审核", + "data": { + "application_id": application_id, + "drama_id": drama_id, + "field_type": field_type, + "company_name": company_name, + "file_count": len(tos_file_urls) + } + }) + + except Exception as e: + logging.error(f"提交认领申请失败: {e}") + return jsonify({ + "success": False, + "message": f"提交认领申请失败: {str(e)}" + }), 500 + + +# 获取申请列表 +@rank_bp.route('/claim/applications', methods=['GET']) +def get_claim_applications(): + """ + 获取认领申请列表 + 支持筛选和分页 + """ + try: + # 获取查询参数 + status = request.args.get('status', 'all') # all/pending/approved/rejected + page = int(request.args.get('page', 1)) + limit = int(request.args.get('limit', 20)) + + # 构建查询条件 + query = {} + if status != 'all': + query['status'] = status + + # 查询总数 + total = claim_applications_collection.count_documents(query) + + # 查询数据(按提交时间倒序) + applications = list(claim_applications_collection.find(query) + .sort('submit_time', -1) + .skip((page - 1) * limit) + .limit(limit)) + + # 格式化数据 + formatted_applications = [] + for app in applications: + formatted_applications.append({ + "application_id": app.get('application_id'), + "drama_id": app.get('drama_id'), + "drama_name": app.get('drama_name'), + "field_type": app.get('field_type'), + "field_type_label": "版权方" if app.get('field_type') == 'copyright' else "承制方", + "company_name": app.get('company_name'), + "status": app.get('status'), + "status_label": { + "pending": "待审核", + "approved": "已通过", + "rejected": "已拒绝" + }.get(app.get('status'), "未知"), + "submit_time": app.get('submit_time').strftime("%Y-%m-%d %H:%M:%S") if app.get('submit_time') else "", + "file_count": len(app.get('tos_file_urls', [])) + }) + + return jsonify({ + "success": True, + "data": formatted_applications, + "pagination": { + "page": page, + "limit": limit, + "total": total, + "pages": (total + limit - 1) // limit + } + }) + + except Exception as e: + logging.error(f"获取申请列表失败: {e}") + return jsonify({ + "success": False, + "message": f"获取申请列表失败: {str(e)}" + }), 500 + + +# 获取申请详情 +@rank_bp.route('/claim/application/', methods=['GET']) +def get_claim_application_detail(application_id): + """ + 获取认领申请详情 + """ + try: + application = claim_applications_collection.find_one({"application_id": application_id}) + + if not application: + return jsonify({ + "success": False, + "message": "申请不存在" + }), 404 + + # 格式化数据 + formatted_data = { + "application_id": application.get('application_id'), + "drama_id": application.get('drama_id'), + "drama_name": application.get('drama_name'), + "field_type": application.get('field_type'), + "field_type_label": "版权方" if application.get('field_type') == 'copyright' else "承制方", + "company_name": application.get('company_name'), + "description": application.get('description', ''), + "tos_file_urls": application.get('tos_file_urls', []), + "status": application.get('status'), + "status_label": { + "pending": "待审核", + "approved": "已通过", + "rejected": "已拒绝" + }.get(application.get('status'), "未知"), + "submit_time": 
application.get('submit_time').strftime("%Y-%m-%d %H:%M:%S") if application.get('submit_time') else "", + "review_time": application.get('review_time').strftime("%Y-%m-%d %H:%M:%S") if application.get('review_time') else None, + "reviewer": application.get('reviewer'), + "reject_reason": application.get('reject_reason') + } + + return jsonify({ + "success": True, + "data": formatted_data + }) + + except Exception as e: + logging.error(f"获取申请详情失败: {e}") + return jsonify({ + "success": False, + "message": f"获取申请详情失败: {str(e)}" + }), 500 + + +# 审核申请 +@rank_bp.route('/claim/review', methods=['POST']) +def review_claim_application(): + """ + 审核认领申请 + """ + try: + data = request.get_json() + application_id = data.get('application_id') + action = data.get('action') # 'approve' 或 'reject' + reject_reason = data.get('reject_reason', '') + reviewer = data.get('reviewer', 'admin') # 审核人 + + # 验证参数 + if not application_id or not action: + return jsonify({ + "success": False, + "message": "缺少必填参数" + }), 400 + + if action not in ['approve', 'reject']: + return jsonify({ + "success": False, + "message": "无效的操作类型" + }), 400 + + if action == 'reject' and not reject_reason: + return jsonify({ + "success": False, + "message": "拒绝时必须填写理由" + }), 400 + + # 查找申请 + application = claim_applications_collection.find_one({"application_id": application_id}) + if not application: + return jsonify({ + "success": False, + "message": "申请不存在" + }), 404 + + if application.get('status') != 'pending': + return jsonify({ + "success": False, + "message": "该申请已经被审核过了" + }), 400 + + # 执行审核操作 + if action == 'approve': + # 通过:更新短剧字段并锁定 + drama_id = application.get('drama_id') + field_type = application.get('field_type') + company_name = application.get('company_name') + description = application.get('description', '') + tos_file_urls = application.get('tos_file_urls', []) + + field_name = 'Copyright_field' if field_type == 'copyright' else 'Manufacturing_field' + + # 更新 Rankings_management 数据库 + update_data = { + field_name: company_name, + f"{field_name}_claim_description": description, + f"{field_name}_claim_images": tos_file_urls, + f"{field_name}_claim_time": datetime.now(), + "last_updated": datetime.now() + } + + # 设置锁定状态 + lock_status_update = { + f"field_lock_status.{field_name}": True, + f"field_lock_status.{field_name}_claim_description": True, + f"field_lock_status.{field_name}_claim_images": True, + f"field_lock_status.{field_name}_claim_time": True + } + update_data.update(lock_status_update) + + rankings_management_collection.update_one( + {"mix_id": drama_id}, + {"$set": update_data} + ) + + # 同步更新 Ranking_storage 数据库 + ranking_storage_update = { + f"data.$[elem].{field_name}": company_name, + f"data.$[elem].{field_name}_claim_description": description, + f"data.$[elem].{field_name}_claim_images": tos_file_urls, + f"data.$[elem].{field_name}_claim_time": datetime.now(), + f"data.$[elem].field_lock_status.{field_name}": True, + f"data.$[elem].field_lock_status.{field_name}_claim_description": True, + f"data.$[elem].field_lock_status.{field_name}_claim_images": True, + f"data.$[elem].field_lock_status.{field_name}_claim_time": True + } + + collection.update_many( + {"data.mix_id": drama_id}, + {"$set": ranking_storage_update}, + array_filters=[{"elem.mix_id": drama_id}] + ) + + # 更新申请状态 + claim_applications_collection.update_one( + {"application_id": application_id}, + {"$set": { + "status": "approved", + "review_time": datetime.now(), + "reviewer": reviewer + }} + ) + + logging.info(f"认领申请审核通过: 
application_id={application_id}, drama_id={drama_id}") + + return jsonify({ + "success": True, + "message": "申请已通过,短剧信息已更新" + }) + + else: # reject + # 拒绝:只更新申请状态 + claim_applications_collection.update_one( + {"application_id": application_id}, + {"$set": { + "status": "rejected", + "review_time": datetime.now(), + "reviewer": reviewer, + "reject_reason": reject_reason + }} + ) + + logging.info(f"认领申请已拒绝: application_id={application_id}, reason={reject_reason}") + + return jsonify({ + "success": True, + "message": "申请已拒绝" + }) + + except Exception as e: + logging.error(f"审核申请失败: {e}") + return jsonify({ + "success": False, + "message": f"审核申请失败: {str(e)}" + }), 500 + + +# 获取待审核数量 +@rank_bp.route('/claim/pending-count', methods=['GET']) +def get_pending_claim_count(): + """ + 获取待审核的认领申请数量 + """ + try: + count = claim_applications_collection.count_documents({"status": "pending"}) + + return jsonify({ + "success": True, + "count": count + }) + + except Exception as e: + logging.error(f"获取待审核数量失败: {e}") + return jsonify({ + "success": False, + "message": f"获取待审核数量失败: {str(e)}" + }), 500 + + +# ==================== 文章相关API ==================== +# 获取数据库集合 +articles_collection = db['articles'] + +def format_article_item(doc): + """格式化文章数据项""" + return { + "_id": str(doc.get("_id", "")), + "title": doc.get("title", ""), + "author_id": doc.get("author_id", ""), + "cover_image": doc.get("cover_image", ""), + "status": doc.get("status", ""), + "summary": doc.get("summary", ""), + "created_at": format_time(doc.get("created_at")), + "likes": doc.get("likes", []), + "likes_count": len(doc.get("likes", [])) + } + +def get_article_list_data(page=1, limit=20, sort_by="created_at", status=None): + """获取文章列表(分页)""" + try: + skip = (page - 1) * limit + query_condition = {} + if status: + query_condition["status"] = status + + sort_field = sort_by if sort_by in ["created_at", "title"] else "created_at" + sort_order = -1 + + cursor = articles_collection.find(query_condition).sort(sort_field, sort_order).skip(skip).limit(limit) + docs = list(cursor) + total = articles_collection.count_documents(query_condition) + + article_list = [] + for doc in docs: + item = format_article_item(doc) + article_list.append(item) + + return { + "success": True, + "data": article_list, + "pagination": { + "page": page, + "limit": limit, + "total": total, + "pages": (total + limit - 1) // limit, + "has_next": page * limit < total, + "has_prev": page > 1 + }, + "sort_by": sort_by, + "status_filter": status, + "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + except Exception as e: + logging.error(f"获取文章列表失败: {e}") + return {"success": False, "message": f"获取数据失败: {str(e)}"} + +def search_articles_data(keyword, page=1, limit=10): + """搜索文章""" + try: + if not keyword: + return {"success": False, "message": "请提供搜索关键词"} + + skip = (page - 1) * limit + search_condition = { + "$or": [ + {"title": {"$regex": keyword, "$options": "i"}}, + {"content": {"$regex": keyword, "$options": "i"}}, + {"summary": {"$regex": keyword, "$options": "i"}} + ] + } + + cursor = articles_collection.find(search_condition).sort("created_at", -1).skip(skip).limit(limit) + docs = list(cursor) + total = articles_collection.count_documents(search_condition) + + search_results = [] + for doc in docs: + item = format_article_item(doc) + search_results.append(item) + + return { + "success": True, + "data": search_results, + "keyword": keyword, + "pagination": { + "page": page, + "limit": limit, + "total": total, + "pages": (total + limit - 1) // 
limit, + "has_next": page * limit < total, + "has_prev": page > 1 + }, + "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + except Exception as e: + logging.error(f"搜索文章失败: {e}") + return {"success": False, "message": f"搜索失败: {str(e)}"} + +def get_article_detail_data(article_id): + """获取文章详情""" + try: + from bson import ObjectId + try: + doc = articles_collection.find_one({"_id": ObjectId(article_id)}) + except: + doc = articles_collection.find_one({ + "$or": [ + {"title": article_id}, + {"author_id": article_id} + ] + }) + + if not doc: + return {"success": False, "message": "未找到文章信息"} + + detail = format_article_item(doc) + + return { + "success": True, + "data": detail, + "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + except Exception as e: + logging.error(f"获取文章详情失败: {e}") + return {"success": False, "message": f"获取详情失败: {str(e)}"} + +def get_article_statistics(): + """获取统计信息""" + try: + total_articles = articles_collection.count_documents({}) + + if total_articles == 0: + return {"success": False, "message": "暂无数据"} + + status_stats = [] + for status in ["draft", "published", "archived"]: + count = articles_collection.count_documents({"status": status}) + status_stats.append({"status": status, "count": count}) + + latest_doc = articles_collection.find().sort("created_at", -1).limit(1) + latest_time = "" + if latest_doc: + latest_list = list(latest_doc) + if latest_list: + latest_time = format_time(latest_list[0].get("created_at")) + + return { + "success": True, + "data": { + "total_articles": total_articles, + "status_stats": status_stats, + "latest_update": latest_time + }, + "update_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + except Exception as e: + logging.error(f"获取统计信息失败: {e}") + return {"success": False, "message": f"获取统计失败: {str(e)}"} + +# 文章路由定义 +@rank_bp.route('/article/list') +def get_articles_route(): + """获取文章列表""" + page = int(request.args.get('page', 1)) + limit = int(request.args.get('limit', 20)) + sort_by = request.args.get('sort', 'created_at') + status = request.args.get('status') + result = get_article_list_data(page, limit, sort_by, status) + return jsonify(result) + +@rank_bp.route('/article/search') +def search_articles_route(): + """搜索文章""" + keyword = request.args.get('q', '') + page = int(request.args.get('page', 1)) + limit = int(request.args.get('limit', 10)) + result = search_articles_data(keyword, page, limit) + return jsonify(result) + +@rank_bp.route('/article/detail') +def get_article_detail_route(): + """获取文章详情""" + article_id = request.args.get('id', '') + result = get_article_detail_data(article_id) + return jsonify(result) + +@rank_bp.route('/article/stats') +def get_article_stats_route(): + """获取统计信息""" + result = get_article_statistics() + return jsonify(result) + +@rank_bp.route('/article/health') +def article_health_check(): + """健康检查""" + try: + total_records = articles_collection.count_documents({}) + + return jsonify({ + "success": True, + "message": "服务正常", + "data": { + "database": "连接正常", + "total_records": total_records, + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + }) + except Exception as e: + return jsonify({ + "success": False, + "message": f"服务异常: {str(e)}", + "data": { + "database": "连接失败", + "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + }) diff --git a/frontend/package-lock.json b/frontend/package-lock.json index 4872c7c..abdca4b 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -11,7 +11,8 @@ "axios": "^1.12.2", 
"bootstrap": "^5.3.0-alpha1", "bootstrap-icons": "^1.13.1", - "vue": "^3.5.22" + "vue": "^3.5.22", + "vue-router": "^4.6.3" }, "devDependencies": { "@vitejs/plugin-vue": "^6.0.1", @@ -1333,6 +1334,11 @@ "@vue/shared": "3.5.22" } }, + "node_modules/@vue/devtools-api": { + "version": "6.6.4", + "resolved": "https://registry.npmmirror.com/@vue/devtools-api/-/devtools-api-6.6.4.tgz", + "integrity": "sha512-sGhTPMuXqZ1rVOk32RylztWkfXTRhuS7vgAKv0zjqk8gbsHkJ7xfFf+jbySxt7tWObEJwyKaHMikV/WGDiQm8g==" + }, "node_modules/@vue/devtools-core": { "version": "8.0.3", "resolved": "https://registry.npmmirror.com/@vue/devtools-core/-/devtools-core-8.0.3.tgz", @@ -2643,6 +2649,20 @@ } } }, + "node_modules/vue-router": { + "version": "4.6.3", + "resolved": "https://registry.npmmirror.com/vue-router/-/vue-router-4.6.3.tgz", + "integrity": "sha512-ARBedLm9YlbvQomnmq91Os7ck6efydTSpRP3nuOKCvgJOHNrhRoJDSKtee8kcL1Vf7nz6U+PMBL+hTvR3bTVQg==", + "dependencies": { + "@vue/devtools-api": "^6.6.4" + }, + "funding": { + "url": "https://github.com/sponsors/posva" + }, + "peerDependencies": { + "vue": "^3.5.0" + } + }, "node_modules/wsl-utils": { "version": "0.1.0", "resolved": "https://registry.npmmirror.com/wsl-utils/-/wsl-utils-0.1.0.tgz", @@ -3459,6 +3479,11 @@ "@vue/shared": "3.5.22" } }, + "@vue/devtools-api": { + "version": "6.6.4", + "resolved": "https://registry.npmmirror.com/@vue/devtools-api/-/devtools-api-6.6.4.tgz", + "integrity": "sha512-sGhTPMuXqZ1rVOk32RylztWkfXTRhuS7vgAKv0zjqk8gbsHkJ7xfFf+jbySxt7tWObEJwyKaHMikV/WGDiQm8g==" + }, "@vue/devtools-core": { "version": "8.0.3", "resolved": "https://registry.npmmirror.com/@vue/devtools-core/-/devtools-core-8.0.3.tgz", @@ -4276,6 +4301,14 @@ "@vue/shared": "3.5.22" } }, + "vue-router": { + "version": "4.6.3", + "resolved": "https://registry.npmmirror.com/vue-router/-/vue-router-4.6.3.tgz", + "integrity": "sha512-ARBedLm9YlbvQomnmq91Os7ck6efydTSpRP3nuOKCvgJOHNrhRoJDSKtee8kcL1Vf7nz6U+PMBL+hTvR3bTVQg==", + "requires": { + "@vue/devtools-api": "^6.6.4" + } + }, "wsl-utils": { "version": "0.1.0", "resolved": "https://registry.npmmirror.com/wsl-utils/-/wsl-utils-0.1.0.tgz", diff --git a/frontend/package.json b/frontend/package.json index b5c65b5..bbec006 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -15,7 +15,8 @@ "axios": "^1.12.2", "bootstrap": "^5.3.0-alpha1", "bootstrap-icons": "^1.13.1", - "vue": "^3.5.22" + "vue": "^3.5.22", + "vue-router": "^4.6.3" }, "devDependencies": { "@vitejs/plugin-vue": "^6.0.1", diff --git a/frontend/public/favicon.ico b/frontend/public/favicon.ico deleted file mode 100644 index df36fcf..0000000 Binary files a/frontend/public/favicon.ico and /dev/null differ diff --git a/frontend/public/placeholder-poster.svg b/frontend/public/placeholder-poster.svg deleted file mode 100644 index 0e44635..0000000 --- a/frontend/public/placeholder-poster.svg +++ /dev/null @@ -1,5 +0,0 @@ - - - 暂无 - 图片 - \ No newline at end of file diff --git a/frontend/src/AdminPanel.vue b/frontend/src/AdminPanel.vue new file mode 100644 index 0000000..c746abc --- /dev/null +++ b/frontend/src/AdminPanel.vue @@ -0,0 +1,1043 @@ + + + + + + + \ No newline at end of file diff --git a/frontend/src/App.vue b/frontend/src/App.vue index a1e973a..046ccf9 100644 --- a/frontend/src/App.vue +++ b/frontend/src/App.vue @@ -1,7 +1,11 @@ + + diff --git a/frontend/src/ClaimPage.vue b/frontend/src/ClaimPage.vue new file mode 100644 index 0000000..25114b5 --- /dev/null +++ b/frontend/src/ClaimPage.vue @@ -0,0 +1,759 @@ + + +