From 8b1149da5607f88d8b57aef4417fb70a37fa1874 Mon Sep 17 00:00:00 2001
From: Qyir <13521889462@163.com>
Date: Tue, 21 Oct 2025 17:56:00 +0800
Subject: [PATCH] Add an episode_video_ids field: running rank_data_scraper.py
 or the scheduled task now saves the video ID of every episode of each short
 drama, in addition to the original database writes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Note: after you pull this code and run the script, an episode_video_ids
folder is created automatically. It stores the video ID of every episode
from your first run of the script (used as a cache). The check works as
follows: on each run, the number of cached video IDs for a drama is compared
with the drama's episode count; if they match, the cached video IDs are
used, otherwise they are fetched again. Fetching the video IDs does not take
long.
---
 .../handlers/Rankings/rank_data_scraper.py | 243 +++++++++++++++++-
 backend/routers/rank_api_routes.py         |   3 +-
 2 files changed, 242 insertions(+), 4 deletions(-)

diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py
index b3d3dd7..1869847 100644
--- a/backend/handlers/Rankings/rank_data_scraper.py
+++ b/backend/handlers/Rankings/rank_data_scraper.py
@@ -23,6 +23,7 @@ import logging
 import os
 import shutil
 from datetime import datetime
+import requests
 from selenium import webdriver
 import os
@@ -995,7 +996,20 @@ class DouyinPlayVVScraper:
                 # No cover image; use an empty string
                 permanent_cover_url = ''
 
-            # Keep the 7 user-requested fields + cover_image_url as the full collection cover link + the 3 new fields
+            # Fetch all video IDs in this collection
+            mix_id = item.get('mix_id', '')
+            episode_video_ids = []
+            if mix_id:
+                logging.info(f'Fetching all video IDs for collection {mix_name}')
+                current_episode_count = item.get('updated_to_episode', 0)
+                episode_video_ids = self.get_collection_videos(
+                    mix_id=mix_id,
+                    mix_name=mix_name,
+                    current_episode_count=current_episode_count
+                )
+                logging.info(f'Collection {mix_name}: fetched {len(episode_video_ids)} video IDs')
+
+            # Keep the 7 user-requested fields + cover_image_url as the full collection cover link + the new fields
             doc = {
                 'batch_time': batch_time,
                 'mix_name': mix_name,
@@ -1007,10 +1021,11 @@ class DouyinPlayVVScraper:
                 'cover_image_url_original': original_cover_url,  # Keep the original temporary link for debugging
                 'cover_image_url': permanent_cover_url,  # Permanent link to the collection cover image
                 'cover_backup_urls': item.get('cover_backup_urls', []),  # List of backup cover-image links
-                # The three new fields
+                # The new fields
                 'series_author': item.get('series_author', ''),  # Collection author / production studio
                 'desc': item.get('desc', ''),  # Collection description
-                'updated_to_episode': item.get('updated_to_episode', 0)  # Total episode count of the collection
+                'updated_to_episode': item.get('updated_to_episode', 0),  # Total episode count of the collection
+                'episode_video_ids': episode_video_ids  # List of video IDs, one per episode
             }
             documents.append(doc)
@@ -1042,6 +1057,228 @@ class DouyinPlayVVScraper:
         except Exception as e:
             logging.error(f'Error saving to MongoDB: {e}')
 
+    def get_video_info(self, video_id: str) -> dict:
+        """Fetch detailed information for a single video.
+        Args:
+            video_id: the video ID
+        Returns:
+            dict: a dictionary with the video details
+        """
+        video_url = f'https://www.douyin.com/video/{video_id}'
+        logging.info(f'Fetching video info: {video_url}')
+
+        # Clear the browser cache and cookies so the item-info request is re-issued
+        self.driver.execute_cdp_cmd('Network.clearBrowserCache', {})
+        self.driver.execute_cdp_cmd('Network.clearBrowserCookies', {})
+        self.driver.get(video_url)
+        time.sleep(3)
+
+        # Wait for the page to finish loading
+        try:
+            from selenium.webdriver.support.ui import WebDriverWait
+            from selenium.webdriver.support import expected_conditions as EC
+            from selenium.webdriver.common.by import By
+
+            WebDriverWait(self.driver, 10).until(
+                EC.presence_of_element_located((By.TAG_NAME, "video"))
+            )
+        except Exception as e:
+            logging.warning(f'Timed out waiting for the video element: {e}')
+
+        # Read the captured network request logs
+        logs = self.driver.get_log('performance')
+        video_info = {}
+
+        for entry in logs:
+            try:
+                log = json.loads(entry['message'])['message']
+                if (
+                    'Network.responseReceived' in log['method']
+                    and 'response' in log['params']
+                    and 'url' in log['params']['response']
+                    and '/web/api/v2/aweme/iteminfo' in log['params']['response']['url']
+                ):
+                    request_id = log['params']['requestId']
+                    response = self.driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
+                    if response and 'body' in response:
+                        data = json.loads(response['body'])
+                        if 'item_list' in data and len(data['item_list']) > 0:
+                            item = data['item_list'][0]
+                            video_info = {
+                                'video_id': item.get('aweme_id'),
+                                'create_time': item.get('create_time'),
+                                'desc': item.get('desc'),
+                                'duration': item.get('duration'),
+                                'mix_info': {
+                                    'mix_id': item.get('mix_info', {}).get('mix_id'),
+                                    'mix_name': item.get('mix_info', {}).get('mix_name'),
+                                    'total': item.get('mix_info', {}).get('total')
+                                }
+                            }
+                            break
+            except Exception as e:
+                logging.warning(f'Error parsing a log entry: {e}')
+
+        return video_info
+
+    def get_collection_videos(self, mix_id: str, mix_name: str = '', current_episode_count: int = 0) -> list:
+        """Fetch the IDs of all videos in a collection, with incremental updates.
+        Args:
+            mix_id: collection ID
+            mix_name: collection name, used for logging
+            current_episode_count: currently known episode count
+        Returns:
+            list: video IDs sorted by episode number
+        """
+        # Initialized before the try block so the except handler can always fall back to it
+        cached_videos = []
+        try:
+            # Check the cache file; make sure the cache directory exists
+            cache_dir = os.path.join(os.path.dirname(__file__), 'episode_video_ids')
+            os.makedirs(cache_dir, exist_ok=True)
+            cache_file = os.path.join(cache_dir, f'video_ids_{mix_id}.json')
+
+            try:
+                if os.path.exists(cache_file):
+                    with open(cache_file, 'r', encoding='utf-8') as f:
+                        cache_data = json.load(f)
+                        cached_videos = cache_data.get('episodes', [])
+
+                        # If the cached episode count equals the current count, return the cached result directly
+                        if len(cached_videos) == current_episode_count:
+                            logging.info(f"Using cached video list: {mix_name} (ID: {mix_id})")
+                            return [video['video_id'] for video in cached_videos]
+            except Exception as e:
+                logging.warning(f"Failed to read the cache file: {e}")
+
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                'Accept': 'application/json, text/plain, */*',
+                'Accept-Language': 'zh-CN,zh;q=0.9',
+                'Referer': 'https://www.douyin.com/',
+            }
+
+            params = {
+                'device_platform': 'webapp',
+                'aid': '6383',
+                'channel': 'channel_pc_web',
+                'pc_client_type': '1',
+                'version_code': '170400',
+                'version_name': '17.4.0',
+                'cookie_enabled': 'true',
+                'platform': 'PC',
+                'downlink': '10',
+                'mix_id': mix_id,
+                'cursor': '0',
+                'count': '30',
+                'screen_width': '1920',
+                'screen_height': '1080',
+                'browser_language': 'zh-CN',
+                'browser_platform': 'Win32',
+                'browser_name': 'Chrome',
+                'browser_version': '120.0.0.0',
+                'browser_online': 'true',
+                'engine_name': 'Blink',
+                'engine_version': '120.0.0.0',
+                'os_name': 'Windows',
+                'os_version': '10',
+                'cpu_core_num': '16',
+                'device_memory': '8',
+                'effective_type': '4g',
+                'round_trip_time': '50',
+            }
+
+            all_videos = []
+
+            while True:
+                response = requests.get(
+                    'https://www.douyin.com/aweme/v1/web/mix/aweme/',
+                    params=params,
+                    cookies=self.get_cookies_dict(),
+                    headers=headers
+                )
+
+                if response.status_code != 200:
+                    logging.error(f"Request failed: {response.status_code}")
+                    logging.error(f"Response body: {response.text}")
+                    break
+
+                try:
+                    data = response.json()
+                    aweme_list = data.get('aweme_list', [])
+                    if not aweme_list:
+                        break
+
+                    for aweme in aweme_list:
+                        video_id = aweme.get('aweme_id')
+                        if video_id:
+                            all_videos.append({
+                                'video_id': video_id,
+                                'episode_num': int(aweme.get('episode_num', 0))
+                            })
+
+                    has_more = data.get('has_more', False)
+                    if not has_more:
+                        break
+
+                    # Advance the cursor by the number of videos collected so far
+                    params['cursor'] = str(len(all_videos))
+                    time.sleep(1)
+
+                except json.JSONDecodeError as e:
+                    logging.error(f"JSON decode error: {e}")
+                    logging.error(f"Response body: {response.text}")
+                    break
+
+            if not all_videos:
+                if cached_videos:
+                    logging.warning(f"Failed to fetch the video list; falling back to cached data: {mix_name} (ID: {mix_id})")
+                    return [video['video_id'] for video in cached_videos]
+                return []
+
+            logging.info(f"Fetched {len(all_videos)} video IDs")
+
+            # Sort by episode number
+            all_videos.sort(key=lambda x: x['episode_num'])
+
+            # Collect video ID and episode-number info for the cache file
+            episode_info = [
+                {'video_id': video['video_id'], 'episode_num': video['episode_num']}
+                for video in all_videos
+            ]
+
+            # Check whether new episodes have appeared since the last run
+            if len(episode_info) > len(cached_videos):
+                logging.info(f"New videos found: {mix_name} (ID: {mix_id}), {len(episode_info) - len(cached_videos)} new episodes")
+
+            # Write to the cache file
+            with open(cache_file, 'w', encoding='utf-8') as f:
+                json.dump({
+                    'episodes': episode_info,
+                    'total_count': len(episode_info),
+                    'last_update': datetime.now().isoformat(),
+                    'mix_name': mix_name
+                }, f, ensure_ascii=False, indent=2)
+
+            # Return the list of video IDs
+            return [video['video_id'] for video in all_videos]
+
+        except Exception as e:
+            logging.error(f"Error fetching collection videos: {e}")
+            # On error, fall back to the cached result if one exists
+            if cached_videos:
+                logging.warning(f"Using cached video list: {mix_name} (ID: {mix_id})")
+                return [video['video_id'] for video in cached_videos]
+            return []
+
+    def get_cookies_dict(self):
+        """Return the current browser cookies as a name -> value dict."""
+        if not hasattr(self, 'cookies') or not self.cookies:
+            self.cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
+        return self.cookies
+
     def run(self):
         try:
             self.setup_driver()
diff --git a/backend/routers/rank_api_routes.py b/backend/routers/rank_api_routes.py
index 660cb2b..aec3aae 100644
--- a/backend/routers/rank_api_routes.py
+++ b/backend/routers/rank_api_routes.py
@@ -134,7 +134,8 @@ def format_mix_item(doc):
         "desc": doc.get("desc", ""),
         "updated_to_episode": doc.get("updated_to_episode", 0),
         "cover_backup_urls": doc.get("cover_backup_urls", []),
-        "mix_id": doc.get("mix_id", "")
+        "mix_id": doc.get("mix_id", ""),
+        "episode_video_ids": doc.get("episode_video_ids", [])
     }
 
 def get_mix_list(page=1, limit=20, sort_by="playcount"):
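
As a reading aid, here is a minimal sketch of the cache layout and the
freshness check the commit message describes. The helper name
load_cached_episode_ids is illustrative and not part of the patch; the file
name and JSON keys match what get_collection_videos() writes:

import json
import os

def load_cached_episode_ids(cache_dir, mix_id, episode_count):
    """Return the cached video IDs for a collection, or None if the cache is stale.

    Mirrors the check in get_collection_videos(): the cache is trusted only
    when it holds exactly one entry per known episode.
    """
    cache_file = os.path.join(cache_dir, f'video_ids_{mix_id}.json')
    if not os.path.exists(cache_file):
        return None
    with open(cache_file, 'r', encoding='utf-8') as f:
        cache_data = json.load(f)
    episodes = cache_data.get('episodes', [])
    if len(episodes) != episode_count:
        return None  # episode count changed, so the caller should re-fetch
    return [e['video_id'] for e in episodes]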
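
For API consumers, a sketch of one item as format_mix_item() now returns it.
Only the fields visible in this diff are shown, and every value is made up
for illustration:

mix_item = {
    "desc": "An example short drama",
    "updated_to_episode": 3,
    "cover_backup_urls": [],
    "mix_id": "7300000000000000000",
    "episode_video_ids": [  # new field: one video ID per episode, in order
        "7301000000000000001",
        "7301000000000000002",
        "7301000000000000003",
    ],
}

# A client can now deep-link straight to a given episode:
episode_2_url = f"https://www.douyin.com/video/{mix_item['episode_video_ids'][1]}"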