From d4d555cdb1775a079202bbe7e297ffb1778c3572 Mon Sep 17 00:00:00 2001 From: Qyir <13521889462@163.com> Date: Thu, 6 Nov 2025 18:28:43 +0800 Subject: [PATCH] =?UTF-8?q?=E7=9B=AE=E5=89=8D=E5=90=8E=E7=AB=AF=E5=92=8C?= =?UTF-8?q?=E5=89=8D=E7=AB=AF=EF=BC=9A=201.=E6=B7=BB=E5=8A=A0=E5=90=8E?= =?UTF-8?q?=E5=8F=B0=E7=AE=A1=E7=90=86=E9=A1=B5=E9=9D=A2=EF=BC=8C=E4=BD=BF?= =?UTF-8?q?=E7=94=A8=E7=BD=91=E5=9D=80=E8=BF=9B=E5=85=A5=E5=90=8E=E5=8F=B0?= =?UTF-8?q?=E7=AE=A1=E7=90=86=E9=A1=B5=E9=9D=A2=EF=BC=9Ahttp://localhost:5?= =?UTF-8?q?174/admin=202.=E5=89=A7=E7=A7=8D=E5=88=86=E7=B1=BB=E5=AE=8C?= =?UTF-8?q?=E6=88=90=EF=BC=8C=E5=8F=AA=E8=A6=81=E7=94=A8=E6=88=B7=E5=86=8D?= =?UTF-8?q?=E5=90=8E=E5=8F=B0=E7=AE=A1=E7=90=86=E9=A1=B5=E9=9D=A2=E9=80=89?= =?UTF-8?q?=E6=8B=A9=E7=B1=BB=E5=9E=8B=E4=B9=8B=E5=90=8E=E4=BC=9A=E4=B8=80?= =?UTF-8?q?=E7=9B=B4=E6=98=BE=E7=A4=BA=203.=E5=9C=A8=E4=B8=A4=E4=B8=AA?= =?UTF-8?q?=E8=84=9A=E6=9C=AC=E8=BF=90=E8=A1=8C=E7=9A=84=E6=97=B6=E5=80=99?= =?UTF-8?q?=E5=8F=AF=E4=BB=A5=E5=90=8C=E6=97=B6=E5=90=AF=E5=8A=A8=E4=B8=A4?= =?UTF-8?q?=E4=B8=AA=E6=B5=8F=E8=A7=88=E5=99=A8=E9=A1=B5=E9=9D=A2=E4=B8=8D?= =?UTF-8?q?=E5=8F=97=E5=BD=B1=E5=93=8D=EF=BC=88=E5=8E=9F=E5=9B=A0=E6=98=AF?= =?UTF-8?q?=EF=BC=9A=E5=BF=85=E9=A1=BB=E8=A6=81=E7=AE=A1=E7=90=86=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=BA=93=E6=9C=89=E4=BA=86=E6=95=B0=E6=8D=AE=E4=B9=8B?= =?UTF-8?q?=E5=90=8E=E5=89=8D=E7=AB=AF=E7=82=B9=E8=B5=9E=E6=89=8D=E5=8F=AF?= =?UTF-8?q?=E4=BB=A5=E6=98=BE=E7=A4=BA=EF=BC=8C=E4=BD=86=E6=98=AF=E4=B8=BB?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=E8=BF=90=E8=A1=8C=E6=85=A2=EF=BC=8C=E4=B8=BA?= =?UTF-8?q?=E4=BA=86=E4=B8=8D=E6=B5=AA=E8=B4=B9=E6=97=B6=E9=97=B4=20?= =?UTF-8?q?=E6=AF=8F=E6=AC=A1=E5=AE=9A=E6=97=B6=E5=99=A8=E8=BF=90=E8=A1=8C?= =?UTF-8?q?=E7=9A=84=E6=97=B6=E5=80=99=E9=83=BD=E4=BC=9A=E9=80=9A=E8=BF=87?= =?UTF-8?q?=E8=A7=86=E9=A2=91ID=E6=9D=A5=E5=90=8C=E6=AD=A5=E7=9F=AD?= =?UTF-8?q?=E5=89=A7=E7=9A=84=E8=AF=A6=E7=BB=86=E4=BF=A1=E6=81=AF=EF=BC=89?= =?UTF-8?q?=20=E5=89=8D=E7=AB=AF=E5=8F=AF=E4=BB=A5=E7=A8=B3=E5=AE=9A?= =?UTF-8?q?=E7=9A=84=E6=98=BE=E7=A4=BA=E6=95=B0=E6=8D=AE=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/Timer_worker.py | 128 ++++++++++++----- .../video_ids_7472792051890554918.json | 8 +- .../video_ids_7529584800245254183.json | 12 +- .../video_ids_7561020459769153570.json | 12 +- .../handlers/Rankings/rank_data_scraper.py | 131 ++++++++++++++---- backend/routers/rank_api_routes.py | 4 - 6 files changed, 220 insertions(+), 75 deletions(-) diff --git a/backend/Timer_worker.py b/backend/Timer_worker.py index 70f11aa..ef2e862 100644 --- a/backend/Timer_worker.py +++ b/backend/Timer_worker.py @@ -80,39 +80,91 @@ class DouyinAutoScheduler: return 0 return play_vv - def _deduplicate_videos_by_mix_name(self, videos, include_rank=False): - """按短剧名称去重,保留播放量最高的记录""" - unique_data = {} - for video in videos: - mix_name = video.get("mix_name", "").strip() - - # 过滤掉空的或无效的mix_name - if not mix_name or mix_name == "" or mix_name.lower() == "null": - self.logger.warning(f"跳过空的或无效的mix_name记录: {video.get('_id', 'unknown')}") - continue - - # 标准化播放量数据类型 - play_vv = self._normalize_play_vv(video.get("play_vv", 0)) - - # 确保播放量大于0,过滤无效数据 - if play_vv <= 0: - self.logger.warning(f"跳过播放量为0或无效的记录: mix_name={mix_name}, play_vv={video.get('play_vv', 0)}") - continue - - if mix_name not in unique_data or play_vv > unique_data[mix_name].get("play_vv", 0): - if include_rank: - # 用于昨天数据的格式 - unique_data[mix_name] = { - "play_vv": play_vv, - "video_id": str(video.get("_id", "")), - "rank": 0 # 稍后计算排名 - } + def check_browser_login_status(self): + """检查浏览器登录状态,如果没有登录则提示用户登录""" + try: + import os + script_dir = os.path.dirname(os.path.abspath(__file__)) + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent') + + # # 检查配置文件目录是否存在 + # if not os.path.exists(profile_dir): + # print("⚠️ 检测到定时器浏览器配置目录不存在,需要首次登录") + # print(" 请在浏览器中完成抖音登录,并导航到【我的】→【收藏】→【合集】页面") + # print(" 完成后按回车键继续...") + # input() + # return + + # 检查配置文件是否为空(可能未登录) + import glob + profile_files = glob.glob(os.path.join(profile_dir, "*")) + if len(profile_files) < 5: # 如果文件太少,可能未登录 + print("⚠️ 检测到定时器浏览器可能未登录") + print(" 请在浏览器中完成抖音登录,并导航到【我的】→【收藏】→【合集】页面") + print(" 完成后按回车键继续...") + input() + else: + print("✅ 定时器浏览器已配置,继续执行...") + + except Exception as e: + logging.warning(f"检查浏览器登录状态时出错: {e}") + print("⚠️ 检查浏览器状态失败,请确保浏览器已正确配置") + print(" 完成后按回车键继续...") + input() + + def _cleanup_chrome_processes(self): + """清理可能占用配置文件的Chrome进程""" + try: + import psutil + import os + + # 获取当前配置文件路径 + script_dir = os.path.dirname(os.path.abspath(__file__)) + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent') + + # 查找使用该配置文件的Chrome进程 + killed_processes = [] + for proc in psutil.process_iter(['pid', 'name', 'cmdline']): + try: + if proc.info['name'] and 'chrome' in proc.info['name'].lower(): + cmdline = proc.info['cmdline'] + if cmdline and any(profile_dir in arg for arg in cmdline): + proc.terminate() + killed_processes.append(proc.info['pid']) + logging.info(f'终止占用配置文件的Chrome进程: PID {proc.info["pid"]}') + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + continue + + # 等待进程终止 + if killed_processes: + import time + time.sleep(2) + + return len(killed_processes) > 0 + + except ImportError: + # 如果没有psutil,使用系统命令 + try: + import subprocess + import os + + script_dir = os.path.dirname(os.path.abspath(__file__)) + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent') + + # 使用taskkill命令终止Chrome进程 + result = subprocess.run(['taskkill', '/F', '/IM', 'chrome.exe'], capture_output=True, text=True) + if result.returncode == 0: + logging.info('使用系统命令终止Chrome进程') + return True else: - # 用于今天数据的格式,直接更新原视频对象 - video["play_vv"] = play_vv - unique_data[mix_name] = video - - return unique_data + logging.warning('无法终止Chrome进程') + return False + except Exception as e: + logging.warning(f'系统命令清理Chrome进程失败: {e}') + return False + except Exception as e: + logging.warning(f'清理Chrome进程时出错: {e}') + return False def run_douyin_scraper(self): """执行抖音播放量抓取任务""" @@ -126,14 +178,14 @@ class DouyinAutoScheduler: scraper = DouyinPlayVVScraper( start_url="https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation", auto_continue=True, - duration_s=60 + duration_s=180 # 增加到180秒,给更多时间收集数据 ) - - print("📁 开始执行抓取任务...") + + print("开始执行抓取任务...") logging.info("📁 开始执行抓取任务...") scraper.run() - print("✅ 抖音播放量抓取任务执行成功") + print("抖音播放量抓取任务执行成功") logging.info("✅ 抖音播放量抓取任务执行成功") # 数据抓取完成后,自动生成当日榜单 @@ -381,7 +433,7 @@ class DouyinAutoScheduler: # 🎯 核心榜单字段 "rank": rank, # 使用排名计数器 "title": mix_name, - "mix_name": mix_name, # 确保包含mix_name字段用于同步 + "mix_name": mix_name, "play_vv": current_play_vv, "series_author": video.get("series_author", ""), "video_id": video_id, @@ -478,7 +530,7 @@ class DouyinAutoScheduler: if item.get("Copyright_field"): items_with_copyright += 1 - print(f"📊 数据完整性统计:") + print(f"数据完整性统计:") print(f" 总项目数: {total_items}") print(f" 从Rankings_management获取到详细信息: {items_with_management_data}") print(f" 包含Manufacturing_Field: {items_with_manufacturing}") diff --git a/backend/handlers/Rankings/episode_video_ids/video_ids_7472792051890554918.json b/backend/handlers/Rankings/episode_video_ids/video_ids_7472792051890554918.json index fe5b079..381ae09 100644 --- a/backend/handlers/Rankings/episode_video_ids/video_ids_7472792051890554918.json +++ b/backend/handlers/Rankings/episode_video_ids/video_ids_7472792051890554918.json @@ -19,9 +19,13 @@ { "video_id": "7471924777410645283", "episode_num": 0 + }, + { + "video_id": "7472791705268325641", + "episode_num": 0 } ], - "total_count": 5, - "last_update": "2025-10-22T09:55:36.943794", + "total_count": 6, + "last_update": "2025-11-06T17:43:54.929209", "mix_name": "《青蛇传》" } \ No newline at end of file diff --git a/backend/handlers/Rankings/episode_video_ids/video_ids_7529584800245254183.json b/backend/handlers/Rankings/episode_video_ids/video_ids_7529584800245254183.json index 2c3a6c6..e0127c7 100644 --- a/backend/handlers/Rankings/episode_video_ids/video_ids_7529584800245254183.json +++ b/backend/handlers/Rankings/episode_video_ids/video_ids_7529584800245254183.json @@ -47,9 +47,17 @@ { "video_id": "7548447317729234239", "episode_num": 0 + }, + { + "video_id": "7568747381357808923", + "episode_num": 0 + }, + { + "video_id": "7568800392985791784", + "episode_num": 0 } ], - "total_count": 12, - "last_update": "2025-10-22T09:55:50.726907", + "total_count": 14, + "last_update": "2025-11-06T17:48:06.014161", "mix_name": "青云修仙传" } \ No newline at end of file diff --git a/backend/handlers/Rankings/episode_video_ids/video_ids_7561020459769153570.json b/backend/handlers/Rankings/episode_video_ids/video_ids_7561020459769153570.json index 2803b24..8abbf1e 100644 --- a/backend/handlers/Rankings/episode_video_ids/video_ids_7561020459769153570.json +++ b/backend/handlers/Rankings/episode_video_ids/video_ids_7561020459769153570.json @@ -107,9 +107,17 @@ { "video_id": "7560551213957500195", "episode_num": 0 + }, + { + "video_id": "7562056353343966464", + "episode_num": 0 + }, + { + "video_id": "7567981488823318927", + "episode_num": 0 } ], - "total_count": 27, - "last_update": "2025-10-22T09:56:16.947762", + "total_count": 29, + "last_update": "2025-11-06T17:15:32.747557", "mix_name": "绝境逆袭" } \ No newline at end of file diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py index d626ff6..8ee16db 100644 --- a/backend/handlers/Rankings/rank_data_scraper.py +++ b/backend/handlers/Rankings/rank_data_scraper.py @@ -181,9 +181,15 @@ class DouyinPlayVVScraper: """清理超过一天的旧临时Chrome配置文件""" try: script_dir = os.path.dirname(os.path.abspath(__file__)) - profile_base_dir = os.path.join(script_dir, 'config', 'chrome_profile') - if not os.path.exists(profile_base_dir): - return + # 清理两个配置目录的旧文件 + profile_dirs = [ + os.path.join(script_dir, 'config', 'chrome_profile_scraper'), + os.path.join(script_dir, 'config', 'chrome_profile_timer') + ] + + for profile_base_dir in profile_dirs: + if not os.path.exists(profile_base_dir): + continue current_time = time.time() one_day_ago = current_time - 24 * 60 * 60 # 24小时前 @@ -219,7 +225,7 @@ class DouyinPlayVVScraper: # 获取当前配置文件路径 script_dir = os.path.dirname(os.path.abspath(__file__)) - profile_dir = os.path.join(script_dir, 'config', 'chrome_profile', 'douyin_persistent') + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_scraper', 'douyin_persistent') # 查找使用该配置文件的Chrome进程 killed_processes = [] @@ -273,14 +279,20 @@ class DouyinPlayVVScraper: def _cleanup_chrome_cache_smart(self, size_threshold_mb=50): """智能清理Chrome配置文件缓存 - + Args: size_threshold_mb (int): 触发清理的大小阈值(MB),默认50MB """ try: script_dir = os.path.dirname(os.path.abspath(__file__)) - profile_dir = os.path.join(script_dir, 'config', 'chrome_profile', 'douyin_persistent') - + # 根据运行模式选择对应的配置目录 + is_timer_mode = os.environ.get('TIMER_MODE') == '1' + + if is_timer_mode: + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent') + else: + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_scraper', 'douyin_persistent') + if not os.path.exists(profile_dir): logging.info('Chrome配置文件目录不存在,跳过缓存清理') return False @@ -353,10 +365,10 @@ class DouyinPlayVVScraper: def setup_driver(self): logging.info('初始化Chrome WebDriver (启用CDP网络日志)') - + # 清理可能占用配置文件的Chrome进程 self._cleanup_chrome_processes() - + chrome_options = Options() chrome_options.add_argument('--no-sandbox') chrome_options.add_argument('--disable-dev-shm-usage') @@ -368,9 +380,20 @@ class DouyinPlayVVScraper: chrome_options.add_argument('--remote-debugging-port=0') chrome_options.add_argument('--start-maximized') chrome_options.add_argument('--lang=zh-CN') - # 使用固定的Chrome配置文件目录以保持登录状态 + + # 根据运行模式选择不同的Chrome配置文件目录 script_dir = os.path.dirname(os.path.abspath(__file__)) - profile_dir = os.path.join(script_dir, 'config', 'chrome_profile', 'douyin_persistent') + is_timer_mode = os.environ.get('TIMER_MODE') == '1' + + if is_timer_mode: + # 定时器模式使用独立的配置目录 + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent') + logging.info(f'[定时器模式] 使用独立Chrome配置文件: {profile_dir}') + else: + # 普通模式使用原有的配置目录 + profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_scraper', 'douyin_persistent') + logging.info(f'[普通模式] 使用独立Chrome配置文件: {profile_dir}') + os.makedirs(profile_dir, exist_ok=True) chrome_options.add_argument(f'--user-data-dir={profile_dir}') logging.info(f'使用持久化Chrome配置文件: {profile_dir}') @@ -487,12 +510,12 @@ class DouyinPlayVVScraper: def ensure_login(self): """确保用户已登录并导航到收藏合集页面""" logging.info("检测登录状态和页面位置...") - + # 首先检查是否已经登录并在正确页面 if self._check_login_and_page(): logging.info("检测到已登录且在收藏合集页面,跳过手动确认") return - + # 如果未登录或不在正确页面,进行手动登录流程 logging.info("请在弹出的浏览器中手动完成登录。") @@ -517,6 +540,24 @@ class DouyinPlayVVScraper: logging.warning(f'错误上下文: {error_details["context"]}') return + # 定时器模式下的登录检查 + is_timer_mode = os.environ.get('TIMER_MODE') == '1' + if is_timer_mode: + logging.info("定时器模式:检查浏览器登录状态...") + # 在定时器模式下,浏览器已经启动并导航到页面,现在检查登录状态 + if not self._check_login_and_page(): + logging.warning("定时器模式:检测到未登录状态,需要手动登录") + print("⚠️ 定时器浏览器未登录") + print(" 请在浏览器中完成抖音登录,并导航到【我的】→【收藏】→【合集】页面") + print(" 完成后按回车键继续...") + input() + # 重新检查登录状态 + if not self._check_login_and_page(): + logging.warning("定时器模式:登录确认后仍然未登录,继续执行...") + else: + logging.info("定时器模式:浏览器已登录,继续执行...") + return + logging.info("进入手动登录确认循环...") while True: # 要求用户输入特定文本确认 @@ -641,6 +682,16 @@ class DouyinPlayVVScraper: def trigger_loading(self): logging.info('触发数据加载:滚动 + 刷新') + + # 在auto_continue模式下增加页面加载等待时间 + if self.auto_continue: + logging.info('自动继续模式:增加页面加载等待时间') + time.sleep(8) # 等待页面完全加载 + else: + # 普通模式也需要增加页面加载等待时间 + logging.info('普通模式:增加页面加载等待时间') + time.sleep(8) # 等待页面完全加载 + # 滚动触发懒加载 for i in range(8): self.driver.execute_script(f'window.scrollTo(0, {i * 900});') @@ -1217,7 +1268,8 @@ class DouyinPlayVVScraper: except Exception as e: logging.error(f'[实时保存] 更新排名失败: {e}') - + + def extract_douyin_image_id(self, cover_url): """ 从抖音图片URL中提取唯一的图片ID @@ -2251,26 +2303,34 @@ class DouyinPlayVVScraper: } all_videos = [] - + # 使用服务端提供的游标进行分页,而不是使用 len(all_videos) + cursor = 0 + seen_cursors = set() + while True: + # 将当前游标设置到请求参数(字符串以兼容部分接口) + params['cursor'] = str(cursor) + response = requests.get( 'https://www.douyin.com/aweme/v1/web/mix/aweme/', params=params, cookies=self.get_cookies_dict(), headers=headers ) - + if response.status_code != 200: logging.error(f"请求失败: {response.status_code}") logging.error(f"响应内容: {response.text}") break - + try: data = response.json() - aweme_list = data.get('aweme_list', []) + # 兼容可能的列表字段名 + aweme_list = data.get('aweme_list') or data.get('mix_aweme_list') or [] if not aweme_list: + logging.info("当前页无视频,结束分页") break - + for aweme in aweme_list: video_id = aweme.get('aweme_id') if video_id: @@ -2278,14 +2338,31 @@ class DouyinPlayVVScraper: 'video_id': video_id, 'episode_num': int(aweme.get('episode_num', 0)) }) - - has_more = data.get('has_more', False) - if not has_more: + + # 读取服务端分页标识 + has_more = data.get('has_more') or data.get('hasMore') or False + next_cursor = ( + data.get('cursor') or + data.get('next_cursor') or + data.get('max_cursor') or + data.get('min_cursor') + ) + + logging.info(f"分页: cursor={cursor}, next_cursor={next_cursor}, has_more={has_more}, 本页视频={len(aweme_list)}, 累计={len(all_videos)}") + + # 退出条件:没有更多或没有有效下一游标 + if not has_more or not next_cursor: break - - params['cursor'] = str(len(all_videos)) + + # 防止重复游标导致的死循环 + if next_cursor in seen_cursors: + logging.warning(f"检测到重复游标 {next_cursor},停止分页以避免死循环") + break + + seen_cursors.add(next_cursor) + cursor = next_cursor time.sleep(1) - + except json.JSONDecodeError as e: logging.error(f"JSON解析错误: {e}") logging.error(f"响应内容: {response.text}") @@ -3725,7 +3802,7 @@ class DouyinPlayVVScraper: return False def cleanup_old_management_data(self, days_to_keep: int = 7): - """清理目标数据库中的旧数据,基于last_updated字段保留指定天数的数据""" + """清理目标数据库Rankings_management中的旧数据,基于last_updated字段保留指定天数的数据""" target_collection = self.collection # 使用根据模式选择的集合 if target_collection is None: logging.warning('[数据清理] 目标集合未初始化,跳过清理') @@ -3824,7 +3901,7 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description='Selenium+CDP 抖音play_vv抓取器') parser.add_argument('--url', default='https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation', help='收藏合集列表页面URL') parser.add_argument('--auto', action='store_true', help='自动继续,跳过回车等待') - parser.add_argument('--duration', type=int, default=60, help='网络响应收集时长(秒)') + parser.add_argument('--duration', type=int, default=180, help='网络响应收集时长(秒)') parser.add_argument('--driver', help='覆盖chromedriver路径') parser.add_argument('--timer', action='store_true', help='启用定时器模式,应用config.py中的定时器配置') args = parser.parse_args() diff --git a/backend/routers/rank_api_routes.py b/backend/routers/rank_api_routes.py index 85251a2..06c8437 100644 --- a/backend/routers/rank_api_routes.py +++ b/backend/routers/rank_api_routes.py @@ -656,23 +656,19 @@ def get_top_mixes(limit=10): # 按播放量排序获取热门合集 cursor = collection.find().sort("play_vv", -1).limit(limit) docs = list(cursor) - if not docs: return {"success": False, "message": "暂无数据"} - # 格式化数据 top_list = [] for doc in docs: item = format_mix_item(doc) top_list.append(item) - return { "success": True, "data": top_list, "total": len(top_list), "update_time": format_time(docs[0].get("batch_time")) if docs else "" } - except Exception as e: logging.error(f"获取热门合集失败: {e}") return {"success": False, "message": f"获取数据失败: {str(e)}"}