diff --git a/backend/Timer_worker.py b/backend/Timer_worker.py
index 3aacde8..99a0c82 100644
--- a/backend/Timer_worker.py
+++ b/backend/Timer_worker.py
@@ -61,6 +61,8 @@ class DouyinAutoScheduler:
             # Set the environment variable to ensure auto mode
             os.environ['AUTO_CONTINUE'] = '1'
+            # Set the timer-mode environment variable to skip comment scraping and similar steps
+            os.environ['TIMER_MODE'] = '1'
 
             # Create and run a DouyinPlayVVScraper instance directly
             scraper = DouyinPlayVVScraper(
@@ -68,10 +70,10 @@ class DouyinAutoScheduler:
                 auto_continue=True,
                 duration_s=60
             )
-            
+
             logging.info("📁 Starting the scrape task...")
             scraper.run()
-            
+
             logging.info("✅ Douyin play-count scrape task completed successfully")
 
             # After the scrape finishes, automatically generate today's ranking
@@ -89,7 +91,7 @@ class DouyinAutoScheduler:
             from datetime import timedelta
 
             # Get the collections
-            douyin_collection = db['Rankings_list']  # data from the live scraper
+            douyin_collection = db['Ranking_storage_list']  # data scraped by the timer
             rankings_collection = db['Ranking_storage']
 
             today = date.today()
@@ -107,10 +109,20 @@ class DouyinAutoScheduler:
         try:
             logging.info("🔄 Generating timeline comparison ranking...")
 
-            # Get today's data, dedupe by series name, keep only the highest play count
-            today_videos_raw = list(douyin_collection.find({}).sort("play_vv", -1))
+            # Get the latest batch of data
+            latest_batch = douyin_collection.find_one(sort=[("batch_time", -1)])
+            if not latest_batch:
+                logging.warning("⚠️ No data found")
+                return False
+
+            latest_batch_time = latest_batch.get("batch_time")
+            logging.info(f"📊 Latest batch time: {latest_batch_time}")
 
-            # Dedupe by series name, keeping only the record with the highest play count per series
+            # Fetch only the latest batch
+            today_videos_raw = list(douyin_collection.find({"batch_time": latest_batch_time}).sort("play_vv", -1))
+            logging.info(f"📊 Records in latest batch: {len(today_videos_raw)}")
+
+            # Dedupe by series name (a single batch should not contain duplicates, but keep this for robustness)
             unique_videos = {}
             for video in today_videos_raw:
                 mix_name = video.get("mix_name", "")
@@ -121,26 +133,36 @@ class DouyinAutoScheduler:
 
             logging.info(f"📊 Today's data after dedup: {len(today_videos)} unique series (raw: {len(today_videos_raw)} records)")
 
-            # Get yesterday's ranking (if it exists), using the most recent calculation
-            yesterday_ranking = rankings_collection.find_one({
-                "date": yesterday_str,
-                "type": "comprehensive"
-            }, sort=[("calculation_sequence", -1)])
+            # Get yesterday's last batch of data
+            yesterday_batch = douyin_collection.find_one({
+                "batch_time": {"$regex": f"^{yesterday_str}"}
+            }, sort=[("batch_time", -1)])
 
             yesterday_data = {}
-            if yesterday_ranking and "data" in yesterday_ranking:
-                # Convert yesterday's data into a dict keyed by series name
-                for item in yesterday_ranking["data"]:
-                    title = item.get("title", "")
-                    if title:
-                        yesterday_data[title] = {
-                            "rank": item.get("rank", 0),
-                            "play_vv": item.get("play_vv", 0),
-                            "video_id": item.get("video_id", "")
+            if yesterday_batch:
+                # Fetch every record from yesterday's last batch
+                yesterday_videos = list(douyin_collection.find({
+                    "batch_time": yesterday_batch["batch_time"]
+                }).sort("play_vv", -1))
+
+                # Dedupe by series name, keeping the record with the highest play count
+                for video in yesterday_videos:
+                    mix_name = video.get("mix_name", "")
+                    if mix_name and (mix_name not in yesterday_data or video.get("play_vv", 0) > yesterday_data[mix_name].get("play_vv", 0)):
+                        yesterday_data[mix_name] = {
+                            "play_vv": video.get("play_vv", 0),
+                            "video_id": str(video.get("_id", "")),
+                            "rank": 0  # rank is computed below
                         }
-                logging.info(f"📊 Found yesterday's ranking data: {len(yesterday_data)} series")
+
+                # Compute ranks
+                sorted_videos = sorted(yesterday_data.items(), key=lambda x: x[1]["play_vv"], reverse=True)
+                for rank, (mix_name, data) in enumerate(sorted_videos, 1):
+                    yesterday_data[mix_name]["rank"] = rank
+
+                logging.info(f"📊 Found yesterday's raw data: {len(yesterday_data)} series")
             else:
-                logging.info("📊 No ranking data found for yesterday; treating this as the first run")
+                logging.info("📊 No raw data found for yesterday; treating this as the first run")
 
             if today_videos:
                 # First compute the play-count delta for every video
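Note: the dedupe-and-rank pass that replaces the old ranking lookup is easy to sanity-check in isolation. A minimal sketch with made-up records — the mix_name and play_vv field names come from the diff, everything else is illustrative:

# Minimal sketch of the dedupe-and-rank pass over one batch, using made-up records.
records = [
    {"mix_name": "Series A", "play_vv": 120_000},
    {"mix_name": "Series B", "play_vv": 90_000},
    {"mix_name": "Series A", "play_vv": 150_000},  # duplicate: the higher play count wins
]

yesterday_data = {}
for video in records:
    name = video.get("mix_name", "")
    if name and (name not in yesterday_data or video["play_vv"] > yesterday_data[name]["play_vv"]):
        yesterday_data[name] = {"play_vv": video["play_vv"], "rank": 0}

# Rank by play count, descending
for rank, (name, data) in enumerate(
        sorted(yesterday_data.items(), key=lambda kv: kv[1]["play_vv"], reverse=True), 1):
    yesterday_data[name]["rank"] = rank

assert yesterday_data["Series A"] == {"play_vv": 150_000, "rank": 1}
assert yesterday_data["Series B"] == {"play_vv": 90_000, "rank": 2}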
diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py
index 1869847..bd11206 100644
--- a/backend/handlers/Rankings/rank_data_scraper.py
+++ b/backend/handlers/Rankings/rank_data_scraper.py
@@ -83,11 +83,13 @@ class DouyinPlayVVScraper:
             # Use the connection from database.py
             self.db = db
 
-            # Select the collection
-            mongo_collection = os.environ.get('MONGO_COLLECTION', 'Rankings_list')
+            # Select the collection based on the run mode
+            is_timer_mode = os.environ.get('TIMER_MODE') == '1'
+            mongo_collection = 'Ranking_storage_list' if is_timer_mode else 'Rankings_list'
             self.collection = self.db[mongo_collection]
 
             logging.info(f'MongoDB connected, database: {self.db.name}, collection: {mongo_collection}')
+            logging.info(f'Run mode: {"timer" if is_timer_mode else "normal"}')
         except Exception as e:
             logging.error(f'MongoDB connection failed: {e}')
@@ -447,6 +449,28 @@ class DouyinPlayVVScraper:
         if n >= 10_000:
             return f"{n/10_000:.1f}万"
         return str(n)
+
+    def format_interaction_count(self, n: int) -> str:
+        """Format an interaction count into a string with a unit suffix.
+        Args:
+            n: the count
+        Returns:
+            str: the formatted string, e.g. 27898 -> "2.8W", 1234 -> "1234"
+        """
+        if n >= 100_000_000:
+            result = n / 100_000_000
+            if result == int(result):
+                return f"{int(result)}亿"
+            else:
+                return f"{result:.1f}亿"
+        elif n >= 10_000:
+            result = n / 10_000
+            if result == int(result):
+                return f"{int(result)}W"
+            else:
+                return f"{result:.1f}W"
+        else:
+            return str(n)
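The new format_interaction_count intentionally mixes units — 亿 for hundreds of millions but W (not 万) for tens of thousands, unlike the existing play-count formatter above it. A quick standalone check of the expected outputs:

# Standalone copy of format_interaction_count for testing outside the class.
def format_interaction_count(n: int) -> str:
    if n >= 100_000_000:                      # 1亿 = 100 million
        r = n / 100_000_000
        return f"{int(r)}亿" if r == int(r) else f"{r:.1f}亿"
    if n >= 10_000:                           # 1W = 10 thousand
        r = n / 10_000
        return f"{int(r)}W" if r == int(r) else f"{r:.1f}W"
    return str(n)

assert format_interaction_count(1234) == "1234"
assert format_interaction_count(27898) == "2.8W"
assert format_interaction_count(30000) == "3W"
assert format_interaction_count(250_000_000) == "2.5亿"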
@@ -1008,6 +1032,80 @@ class DouyinPlayVVScraper:
                         current_episode_count=current_episode_count
                     )
                     logging.info(f'Collection {mix_name}: fetched {len(episode_video_ids)} video IDs')
+
+                    # Fetch detailed interaction data for each video
+                    logging.info(f'Fetching per-video interaction data for collection {mix_name}')
+                    video_details_list = self.get_collection_video_details(
+                        episode_video_ids=episode_video_ids,
+                        mix_name=mix_name,
+                        max_comments_per_video=10  # fetch at most 10 comments per video
+                    )
+
+                    # Build per-episode details from the fetched data
+                    episode_details = []
+                    total_episodes = item.get('updated_to_episode', 0)
+
+                    for i in range(total_episodes):
+                        episode_number = i + 1
+                        video_id = episode_video_ids[i] if i < len(episode_video_ids) else ''
+
+                        # Look up the matching per-video detail record
+                        video_detail = None
+                        if i < len(video_details_list):
+                            video_detail = video_details_list[i]
+
+                        if video_detail and video_detail.get('success', False):
+                            # Use the fetched data
+                            likes = video_detail.get('likes', 0)
+                            shares = video_detail.get('shares', 0)
+                            favorites = video_detail.get('favorites', 0)
+
+                            episode_info = {
+                                'episode_number': episode_number,
+                                'video_id': video_id,
+                                'likes': likes,
+                                'shares': shares,
+                                'favorites': favorites,
+                                'likes_formatted': self.format_interaction_count(likes),
+                                'shares_formatted': self.format_interaction_count(shares),
+                                'favorites_formatted': self.format_interaction_count(favorites),
+                                'comments': video_detail.get('comments', [])
+                            }
+                        else:
+                            # Fall back to defaults
+                            episode_info = {
+                                'episode_number': episode_number,
+                                'video_id': video_id,
+                                'likes': 0,
+                                'shares': 0,
+                                'favorites': 0,
+                                'likes_formatted': '0',
+                                'shares_formatted': '0',
+                                'favorites_formatted': '0',
+                                'comments': []
+                            }
+
+                        episode_details.append(episode_info)
+
+                    # Summarize the fetched data
+                    total_likes = sum(ep.get('likes', 0) for ep in episode_details)
+                    total_comments = sum(len(ep.get('comments', [])) for ep in episode_details)
+                    logging.info(f'Collection {mix_name} detail summary: total likes={total_likes:,}, total comments={total_comments}')
+                else:
+                    # No video IDs fetched; fall back to the default episode_details
+                    episode_details = [
+                        {
+                            'episode_number': i + 1,
+                            'video_id': '',
+                            'likes': 0,
+                            'shares': 0,
+                            'favorites': 0,
+                            'likes_formatted': '0',
+                            'shares_formatted': '0',
+                            'favorites_formatted': '0',
+                            'comments': []
+                        } for i in range(item.get('updated_to_episode', 0))
+                    ]
 
                 # Keep the 7 user-requested fields + cover_image_url as the full collection cover URL + new fields
                 doc = {
@@ -1025,7 +1123,8 @@ class DouyinPlayVVScraper:
                     'series_author': item.get('series_author', ''),  # collection author / studio
                     'desc': item.get('desc', ''),  # collection description
                     'updated_to_episode': item.get('updated_to_episode', 0),  # total episode count
-                    'episode_video_ids': episode_video_ids  # per-episode video ID list
+                    'episode_video_ids': episode_video_ids,  # per-episode video ID list
+                    'episode_details': episode_details  # per-episode detail records
                 }
                 documents.append(doc)
@@ -1095,6 +1194,7 @@ class DouyinPlayVVScraper:
                 if (
                     'Network.responseReceived' in log['method']
                     and 'response' in log['params']
+                    and log['params']['response']
                     and 'url' in log['params']['response']
                     and '/web/api/v2/aweme/iteminfo' in log['params']['response']['url']
                 ):
@@ -1130,6 +1230,11 @@ class DouyinPlayVVScraper:
         Returns:
             list: video IDs ordered by episode number
         """
+        # Skip this function in timer mode
+        if os.environ.get('TIMER_MODE') == '1':
+            logging.info('Timer mode: skipping get_collection_videos')
+            return []
+
         try:
             # Check the cache file
             cache_dir = os.path.join(os.path.dirname(__file__), 'episode_video_ids')
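The extra log['params']['response'] truthiness guard matters because Chrome's performance log can carry entries whose response field is empty, and indexing into it raises. A minimal sketch of the same filtering pattern, assuming a Chrome driver started with the performance log enabled via goog:loggingPrefs (the helper name is hypothetical):

import json

# Sketch: collect CDP request IDs for the iteminfo API from the performance log.
# Assumes `driver` is a selenium.webdriver.Chrome created with
# options.set_capability('goog:loggingPrefs', {'performance': 'ALL'}).
def iteminfo_request_ids(driver):
    request_ids = []
    for entry in driver.get_log('performance'):
        log = json.loads(entry['message'])['message']
        params = log.get('params', {})
        response = params.get('response')  # may be missing or empty -> guard before indexing
        if (
            log.get('method') == 'Network.responseReceived'
            and response
            and '/web/api/v2/aweme/iteminfo' in response.get('url', '')
        ):
            request_ids.append(params['requestId'])
    return request_ids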
@@ -1273,6 +1378,374 @@ class DouyinPlayVVScraper:
                 return [video['video_id'] for video in cached_videos]
             return []
 
+    def get_video_details(self, video_id: str, max_comments: int = 20) -> dict:
+        """Fetch detailed interaction data for a single video.
+        Args:
+            video_id: the video ID
+            max_comments: maximum number of comments, default 20
+        Returns:
+            dict: likes, favorites, shares, and comment contents
+        """
+        video_details = {
+            'video_id': video_id,
+            'likes': 0,
+            'shares': 0,
+            'favorites': 0,
+            'likes_formatted': '0',
+            'shares_formatted': '0',
+            'favorites_formatted': '0',
+            'comments': [],
+            'success': False,
+            'error': None
+        }
+
+        try:
+            # Make sure the driver is initialized
+            if self.driver is None:
+                logging.info('Driver not initialized; setting up...')
+                self.setup_driver()
+                if self.driver is None:
+                    raise Exception("Unable to initialize the WebDriver")
+
+            video_url = f'https://www.douyin.com/video/{video_id}'
+            logging.info(f'Fetching video details: {video_url}')
+
+            # Navigate to the video page
+            self.driver.get(video_url)
+            time.sleep(3)
+
+            # Wait for the page to finish loading
+            try:
+                from selenium.webdriver.support.ui import WebDriverWait
+                from selenium.webdriver.support import expected_conditions as EC
+                from selenium.webdriver.common.by import By
+
+                WebDriverWait(self.driver, 10).until(
+                    EC.presence_of_element_located((By.TAG_NAME, "video"))
+                )
+            except Exception as e:
+                logging.warning(f'Timed out waiting for the video element: {e}')
+
+            # Collect the network request log
+            logs = self.driver.get_log('performance')
+
+            # Parse the network log for the video's detail data
+            for entry in logs:
+                try:
+                    log = json.loads(entry['message'])['message']
+                    if (
+                        'Network.responseReceived' in log['method']
+                        and 'response' in log['params']
+                        and log['params']['response']
+                        and log['params']['response'].get('url')
+                    ):
+                        url = log['params']['response']['url']
+
+                        # Is this the video detail API?
+                        if '/aweme/v1/web/aweme/detail/' in url and video_id in url:
+                            try:
+                                # Fetch the response body
+                                response_body = self.driver.execute_cdp_cmd(
+                                    'Network.getResponseBody',
+                                    {'requestId': log['params']['requestId']}
+                                )
+
+                                if response_body and 'body' in response_body:
+                                    data = json.loads(response_body['body'])
+                                    aweme_detail = data.get('aweme_detail', {})
+
+                                    if aweme_detail:
+                                        # Read the statistics block
+                                        statistics = aweme_detail.get('statistics', {})
+                                        video_details['likes'] = int(statistics.get('digg_count', 0))
+                                        video_details['shares'] = int(statistics.get('share_count', 0))
+                                        video_details['favorites'] = int(statistics.get('collect_count', 0))
+
+                                        # Add the formatted fields
+                                        video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
+                                        video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
+                                        video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])
+
+                                        logging.info(f'Video {video_id} interactions: likes={video_details["likes_formatted"]}, shares={video_details["shares_formatted"]}, favorites={video_details["favorites_formatted"]}')
+
+                            except Exception as e:
+                                logging.warning(f'Failed to parse the video detail API response: {e}')
+                                continue
+
+                        # Is this the comment API?
+                        elif '/aweme/v1/web/comment/list/' in url and video_id in url:
+                            try:
+                                # Fetch the response body
+                                response_body = self.driver.execute_cdp_cmd(
+                                    'Network.getResponseBody',
+                                    {'requestId': log['params']['requestId']}
+                                )
+
+                                if response_body and 'body' in response_body:
+                                    data = json.loads(response_body['body'])
+                                    comments = data.get('comments', [])
+
+                                    for comment in comments[:max_comments]:
+                                        comment_info = {
+                                            'text': comment.get('text', ''),
+                                            'user_name': comment.get('user', {}).get('nickname', ''),
+                                            'digg_count': int(comment.get('digg_count', 0)),
+                                            'create_time': comment.get('create_time', 0)
+                                        }
+                                        video_details['comments'].append(comment_info)
+
+                                    logging.info(f'Video {video_id}: fetched {len(video_details["comments"])} comments')
+
+                            except Exception as e:
+                                logging.warning(f'Failed to parse the comment API response: {e}')
+                                continue
+
+                except Exception:
+                    continue
+
+            # If the network log yielded nothing, fall back to parsing the page
+            if video_details['likes'] == 0 and video_details['shares'] == 0 and video_details['favorites'] == 0:
+                video_details = self._parse_video_details_from_page(video_id, video_details, max_comments)
+
+            video_details['success'] = True
+            return video_details
+
+        except Exception as e:
+            error_msg = f'Failed to fetch details for video {video_id}: {e}'
+            logging.error(error_msg)
+            video_details['error'] = error_msg
+            return video_details
+
+    def _parse_video_details_from_page(self, video_id: str, video_details: dict, max_comments: int = 20) -> dict:
+        """Parse video details from page elements (fallback path).
+        Args:
+            video_id: the video ID
+            video_details: the existing details dict
+            max_comments: maximum number of comments
+        Returns:
+            dict: the updated details dict
+        """
+        try:
+            logging.info(f'Trying to parse details for video {video_id} from page elements')
+
+            # Try to parse the SSR data embedded in the page
+            try:
+                # Find the script tags that carry the video data
+                scripts = self.driver.find_elements("tag name", "script")
+                for script in scripts:
+                    script_content = script.get_attribute('innerHTML')
+                    if script_content and ('window._SSR_HYDRATED_DATA' in script_content or 'RENDER_DATA' in script_content):
+                        # Extract the JSON payload
+                        if 'window._SSR_HYDRATED_DATA' in script_content:
+                            match = re.search(r'window\._SSR_HYDRATED_DATA\s*=\s*({.*?});', script_content, re.DOTALL)
+                        else:
+                            match = re.search(r'window\.RENDER_DATA\s*=\s*({.*?});', script_content, re.DOTALL)
+
+                        if match:
+                            data = json.loads(match.group(1))
+
+                            # Recursively search for the video's detail record
+                            def find_video_data(obj, target_id):
+                                if isinstance(obj, dict):
+                                    for key, value in obj.items():
+                                        if key == 'aweme_id' and str(value) == str(target_id):
+                                            return obj
+                                        elif isinstance(value, (dict, list)):
+                                            result = find_video_data(value, target_id)
+                                            if result:
+                                                return result
+                                elif isinstance(obj, list):
+                                    for item in obj:
+                                        result = find_video_data(item, target_id)
+                                        if result:
+                                            return result
+                                return None
+
+                            video_data = find_video_data(data, video_id)
+                            if video_data:
+                                statistics = video_data.get('statistics', {})
+                                video_details['likes'] = int(statistics.get('digg_count', 0))
+                                video_details['shares'] = int(statistics.get('share_count', 0))
+                                video_details['favorites'] = int(statistics.get('collect_count', 0))
+
+                                # Add the formatted fields
+                                video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
+                                video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
+                                video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])
+
+                                logging.info(f'Parsed interactions for video {video_id} from SSR data: likes={video_details["likes_formatted"]}, shares={video_details["shares_formatted"]}, favorites={video_details["favorites_formatted"]}')
+                                break
+
+            except Exception as e:
+                logging.warning(f'Failed to parse the SSR data: {e}')
+
+            # If SSR parsing failed, try CSS selectors
+            if video_details['likes'] == 0 and video_details['shares'] == 0 and video_details['favorites'] == 0:
+                try:
+                    # Try the usual like/share/favorite button selectors
+                    selectors = {
+                        'likes': [
+                            '[data-e2e="video-like-count"]',
+                            '[class*="like"] [class*="count"]',
+                            '[class*="digg"] [class*="count"]'
+                        ],
+                        'shares': [
+                            '[data-e2e="video-share-count"]',
+                            '[class*="share"] [class*="count"]'
+                        ],
+                        'favorites': [
+                            '[data-e2e="video-collect-count"]',
+                            '[class*="collect"] [class*="count"]',
+                            '[class*="favorite"] [class*="count"]'
+                        ]
+                    }
+
+                    for data_type, selector_list in selectors.items():
+                        for selector in selector_list:
+                            try:
+                                elements = self.driver.find_elements("css selector", selector)
+                                if elements:
+                                    text = elements[0].text.strip()
+                                    if text and text.replace('.', '').replace('万', '').replace('亿', '').isdigit():
+                                        # Convert the formatted display number
+                                        if '亿' in text:
+                                            video_details[data_type] = int(float(text.replace('亿', '')) * 100000000)
+                                        elif '万' in text:
+                                            video_details[data_type] = int(float(text.replace('万', '')) * 10000)
+                                        else:
+                                            video_details[data_type] = int(text)
+                                        break
+                            except Exception:
+                                continue
+
+                    if video_details['likes'] > 0 or video_details['shares'] > 0 or video_details['favorites'] > 0:
+                        # Add the formatted fields
+                        video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
+                        video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
+                        video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])
+
+                        logging.info(f'Parsed interactions for video {video_id} from page elements: likes={video_details["likes_formatted"]}, shares={video_details["shares_formatted"]}, favorites={video_details["favorites_formatted"]}')
+
+                except Exception as e:
+                    logging.warning(f'CSS selector parsing failed: {e}')
+
+            # Try to fetch comments (if none were collected yet)
+            if not video_details['comments']:
+                try:
+                    # Scroll to the comment area
+                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                    time.sleep(2)
+
+                    # Try the usual comment selectors
+                    comment_selectors = [
+                        '[data-e2e="comment-item"]',
+                        '[class*="comment-item"]',
+                        '[class*="comment"] [class*="content"]'
+                    ]
+
+                    for selector in comment_selectors:
+                        try:
+                            comment_elements = self.driver.find_elements("css selector", selector)[:max_comments]
+                            if comment_elements:
+                                for element in comment_elements:
+                                    try:
+                                        comment_text = element.text.strip()
+                                        if comment_text:
+                                            comment_info = {
+                                                'text': comment_text,
+                                                'user_name': '',
+                                                'digg_count': 0,
+                                                'create_time': 0
+                                            }
+                                            video_details['comments'].append(comment_info)
+                                    except Exception:
+                                        continue
+
+                                if video_details['comments']:
+                                    logging.info(f'Fetched {len(video_details["comments"])} comments for video {video_id} from page elements')
+                                break
+                        except Exception:
+                            continue
+
+                except Exception as e:
+                    logging.warning(f'Failed to fetch comments: {e}')
+
+        except Exception as e:
+            logging.warning(f'Page-level detail parsing failed: {e}')
+
+        return video_details
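The CSS fallback converts display strings such as 2.8万 back into integers inline; the same conversion as a standalone helper, with illustrative inputs (parse_count_text is a hypothetical name, not part of the diff):

# Sketch of the display-text -> integer conversion used by the CSS fallback,
# pulled out as a helper for clarity.
def parse_count_text(text: str) -> int:
    text = text.strip()
    if '亿' in text:
        return int(float(text.replace('亿', '')) * 100_000_000)
    if '万' in text:
        return int(float(text.replace('万', '')) * 10_000)
    return int(text)

assert parse_count_text('2.8万') == 28000
assert parse_count_text('1.5亿') == 150_000_000
assert parse_count_text('734') == 734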
+    def get_collection_video_details(self, episode_video_ids: list, mix_name: str = '', max_comments_per_video: int = 10) -> list:
+        """Fetch detailed interaction data for every video in a collection.
+        Args:
+            episode_video_ids: list of video IDs
+            mix_name: collection name, used for logging
+            max_comments_per_video: maximum comments per video, default 10
+        Returns:
+            list: per-video detail dicts
+        """
+        # Skip this function in timer mode
+        if os.environ.get('TIMER_MODE') == '1':
+            logging.info('Timer mode: skipping get_collection_video_details')
+            return []
+
+        if not episode_video_ids:
+            logging.info(f'Collection {mix_name} has no video IDs; skipping the detail fetch')
+            return []
+
+        logging.info(f'Fetching details for {len(episode_video_ids)} videos in collection {mix_name}')
+
+        video_details_list = []
+
+        for i, video_id in enumerate(episode_video_ids, 1):
+            if not video_id:
+                logging.warning(f'Collection {mix_name} episode {i} has an empty video ID; skipping')
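Both get_collection_videos and get_collection_video_details return early under TIMER_MODE, so a timer run never opens per-video pages. A minimal sketch of that gate with a hypothetical stand-in function:

import logging
import os

# Sketch of the TIMER_MODE gate shared by both collection-detail functions.
# `expensive_fetch` is a hypothetical stand-in for the per-video scraping work.
def expensive_fetch(video_ids):
    if os.environ.get('TIMER_MODE') == '1':
        logging.info('Timer mode: skipping the expensive per-video fetch')
        return []
    return [{'video_id': vid} for vid in video_ids]

os.environ['TIMER_MODE'] = '1'
assert expensive_fetch(['a', 'b']) == []       # timer run: skipped
os.environ.pop('TIMER_MODE')
assert len(expensive_fetch(['a', 'b'])) == 2   # normal run: fetched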
+                video_details_list.append({
+                    'episode_number': i,
+                    'video_id': '',
+                    'likes': 0,
+                    'shares': 0,
+                    'favorites': 0,
+                    'comments': [],
+                    'success': False,
+                    'error': 'empty video ID'
+                })
+                continue
+
+            logging.info(f'Fetching details for collection {mix_name} episode {i}/{len(episode_video_ids)}: {video_id}')
+
+            try:
+                # Fetch the details for a single video
+                video_details = self.get_video_details(video_id, max_comments_per_video)
+                video_details['episode_number'] = i
+                video_details_list.append(video_details)
+
+                # Small delay to avoid requesting too quickly
+                time.sleep(2)
+
+            except Exception as e:
+                error_msg = f'Error fetching details for video {video_id}: {e}'
+                logging.error(error_msg)
+                video_details_list.append({
+                    'episode_number': i,
+                    'video_id': video_id,
+                    'likes': 0,
+                    'shares': 0,
+                    'favorites': 0,
+                    'comments': [],
+                    'success': False,
+                    'error': error_msg
+                })
+
+        # Summarize the results
+        success_count = sum(1 for detail in video_details_list if detail.get('success', False))
+        total_likes = sum(detail.get('likes', 0) for detail in video_details_list)
+        total_comments = sum(len(detail.get('comments', [])) for detail in video_details_list)
+
+        logging.info(f'Collection {mix_name} detail fetch done: {success_count}/{len(episode_video_ids)} succeeded, total likes={total_likes:,}, total comments={total_comments}')
+
+        return video_details_list
+
     def get_cookies_dict(self):
         """Get the current page's cookies"""
         if not hasattr(self, 'cookies') or not self.cookies:
diff --git a/backend/routers/rank_api_routes.py b/backend/routers/rank_api_routes.py
index aec3aae..ae62d27 100644
--- a/backend/routers/rank_api_routes.py
+++ b/backend/routers/rank_api_routes.py
@@ -135,7 +135,8 @@ def format_mix_item(doc):
         "updated_to_episode": doc.get("updated_to_episode", 0),
         "cover_backup_urls": doc.get("cover_backup_urls", []),
         "mix_id": doc.get("mix_id", ""),
-        "episode_video_ids": doc.get("episode_video_ids", [])
+        "episode_video_ids": doc.get("episode_video_ids", []),
+        "episode_details": doc.get("episode_details", [])
     }
 
 def get_mix_list(page=1, limit=20, sort_by="playcount"):
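For API consumers, format_mix_item now passes episode_details through unchanged. A sketch of the shape a client can expect per episode — the keys mirror the episode_info dicts built in rank_data_scraper.py, and all values are made up:

# Illustrative shape of one element of episode_details as returned by format_mix_item.
example_episode = {
    'episode_number': 1,
    'video_id': '7400000000000000001',
    'likes': 27898,
    'shares': 312,
    'favorites': 1045,
    'likes_formatted': '2.8W',
    'shares_formatted': '312',
    'favorites_formatted': '1045',
    'comments': [
        {'text': 'great episode', 'user_name': 'viewer', 'digg_count': 12, 'create_time': 1700000000},
    ],
}

# A client summing interactions across a mix:
def total_likes(mix: dict) -> int:
    return sum(ep.get('likes', 0) for ep in mix.get('episode_details', []))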