diff --git a/backend/config.py b/backend/config.py
index 38d7cc5..96aed7a 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -18,6 +18,19 @@ LOG_DIR = 'logs'
 # 定时器配置
 SCHEDULER_TIME = "24:00"  # 定时器执行时间,格式为 HH:MM (24小时制)
 
+# Environment variables applied when running under the scheduler
+TIMER_ENV_CONFIG = {
+    'TIMER_MODE': '1',  # timer mode: data is saved to the Ranking_storage_list collection
+    'AUTO_CONTINUE': '1'  # auto mode: skip detailed data fetching for performance
+}
+
+# Functions skipped in auto mode
+AUTO_CONTINUE_SKIP_FUNCTIONS = [
+    'get_collection_video_details',  # skip fetching per-video details of a collection
+    'scroll_comments',  # skip comment scrolling
+    # add further function names to skip here
+]
+
 # TOS/火山云对象存储配置
 TOS_CONFIG = {
     'access_key_id': os.getenv('TOS_ACCESS_KEY_ID', 'AKLTYjQyYmE1ZDAwZTY5NGZiOWI3ODZkZDhhOWE4MzVjODE'),
@@ -39,4 +52,13 @@ API_CONFIG = {
     'OSS_HOST': TOS_CONFIG['self_domain']
 }
 
+def apply_timer_environment() -> None:
+    """Copy every TIMER_ENV_CONFIG entry into os.environ."""
+    for key, value in TIMER_ENV_CONFIG.items():
+        os.environ[key] = value
+
+def get_skip_functions() -> list:
+    """Return a copy of the function names skipped in auto mode."""
+    return AUTO_CONTINUE_SKIP_FUNCTIONS.copy()
+
 print(f"Successfully loaded configuration for environment: {APP_ENV}")
\ No newline at end of file
diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py
index 3134963..69e1e2f 100644
--- a/backend/handlers/Rankings/rank_data_scraper.py
+++ b/backend/handlers/Rankings/rank_data_scraper.py
@@ -713,7 +713,7 @@ class DouyinPlayVVScraper:
                         except ValueError:
                             pass  # 忽略无法转换为整数的情况
 
-                    self.play_vv_items.append({
+                    item_data = {
                         'play_vv': vv,
                         'formatted': self.format_count(vv),
                         'url': source_url,
@@ -727,7 +727,9 @@ class DouyinPlayVVScraper:
                         'desc': desc,  # 合集描述
                         'updated_to_episode': updated_to_episode,  # 合集总集数
                         'timestamp': datetime.now().isoformat()
-                    })
+                    }
+
+                    self.play_vv_items.append(item_data)
                     logging.info(f'提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
                     if series_author:
                         logging.info(f' 作者: {series_author}')
@@ -735,6 +737,14 @@ class DouyinPlayVVScraper:
                         logging.info(f' 描述: {desc[:100]}{"..." if len(desc) > 100 else ""}')
                     if updated_to_episode > 0:
                         logging.info(f' 总集数: {updated_to_episode}')
+
+                    # Save in real time only when not running in timer mode
+                    is_timer_mode = os.environ.get('TIMER_MODE') == '1'
+                    if not is_timer_mode:
+                        logging.info(f'立即保存合集数据: {mix_name}')
+                        self.save_single_item_to_mongodb(item_data)
+                    else:
+                        logging.info(f'定时器模式:暂存合集数据: {mix_name},将在最后批量保存')
 
                 # 递归搜索子对象
                 for key, value in obj.items():
@@ -766,7 +776,7 @@ class DouyinPlayVVScraper:
                         if episodes > 0:
                             logging.info(f"从statis.updated_to_episode提取到集数: {episodes}")
 
-                    self.play_vv_items.append({
+                    item_data = {
                         'play_vv': vv,
                         'formatted': self.format_count(vv),
                         'url': source_url,
@@ -776,8 +786,18 @@ class DouyinPlayVVScraper:
                         'mix_id': mix_id,  # 合集ID
                         'updated_to_episode': episodes if episodes > 0 else None,  # 从statis.updated_to_episode提取的集数
                         'timestamp': datetime.now().isoformat()
-                    })
+                    }
+
+                    self.play_vv_items.append(item_data)
                     logging.info(f'正则提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
+
+                    # Save in real time only when not running in timer mode
+                    is_timer_mode = os.environ.get('TIMER_MODE') == '1'
+                    if not is_timer_mode:
+                        logging.info(f'立即保存正则提取的合集数据: {mix_name}')
+                        self.save_single_item_to_mongodb(item_data)
+                    else:
+                        logging.info(f'定时器模式:暂存正则提取的合集数据: {mix_name},将在最后批量保存')
                 except Exception:
                     continue
 
@@ -787,7 +807,7 @@ class DouyinPlayVVScraper:
                     vv = int(match)
                     # 检查是否已经存在相同的play_vv
                     if not any(item['play_vv'] == vv for item in self.play_vv_items):
-                        self.play_vv_items.append({
+                        item_data = {
                             'play_vv': vv,
                             'formatted': self.format_count(vv),
                             'url': source_url,
@@ -797,7 +817,18 @@ class DouyinPlayVVScraper:
                             'mix_id': '',  # 未知mix_id
                             'updated_to_episode': None,  # 未知集数
                             'timestamp': datetime.now().isoformat()
-                        })
+                        }
+
+                        self.play_vv_items.append(item_data)
+                        logging.info(f'兜底提取到播放量: {vv:,}')
+
+                        # Save in real time only when not running in timer mode
+                        is_timer_mode = os.environ.get('TIMER_MODE') == '1'
+                        if not is_timer_mode:
+                            logging.info(f'立即保存兜底提取的数据: {vv:,} 播放量')
+                            self.save_single_item_to_mongodb(item_data)
+                        else:
+                            logging.info(f'定时器模式:暂存兜底提取的数据: {vv:,} 播放量,将在最后批量保存')
                 except Exception:
                     continue
 
@@ -1065,7 +1096,11 @@ class DouyinPlayVVScraper:
             return cover_url  # 上传失败时返回原链接
 
     def save_to_mongodb(self):
-        """将数据保存到MongoDB"""
+        """
+        Batch-save the collected data to MongoDB.
+        Note: kept as a fallback; the normal flow saves items in real time
+        (save_single_item_to_mongodb) to avoid storing the same data twice.
+        """
         if self.collection is None:
             logging.warning('MongoDB未连接,跳过数据库保存')
             return
@@ -1253,6 +1288,128 @@ class DouyinPlayVVScraper:
         except Exception as e:
             logging.error(f'保存到MongoDB时出错: {e}')
 
+    def save_single_item_to_mongodb(self, item: dict):
+        """Insert a single scraped item into MongoDB right away.
+
+        A new document is always inserted (existing ones are never
+        updated) and the ``rank`` of today's records with a lower
+        ``play_vv`` is incremented. Failures are logged, not raised.
+
+        Args:
+            item: Dict holding the collection (mix) information.
+        """
+        if self.collection is None:
+            logging.warning('MongoDB未连接,跳过单条数据保存')
+            return
+
+        try:
+            batch_time = datetime.now()
+
+            # Original cover image URL
+            original_cover_url = item.get('cover_image_url', '')
+            mix_name = item.get('mix_name', '')
+            mix_id = item.get('mix_id', '')
+
+            # Cover image handling
+            permanent_cover_url = ''
+            upload_success = False
+
+            if original_cover_url:
+                # Upload the cover to TOS to obtain a permanent link
+                permanent_cover_url = self.upload_cover_image(original_cover_url, mix_name)
+
+                # upload_cover_image returns the original URL on failure
+                if permanent_cover_url != original_cover_url:
+                    upload_success = True
+                    logging.info(f'封面图片上传成功: {mix_name}')
+                else:
+                    upload_success = False
+                    logging.warning(f'封面图片上传失败,使用原始链接: {mix_name}')
+            else:
+                permanent_cover_url = ''
+                upload_success = True  # no image is not a failure
+
+            # The extractors store None when the episode count is unknown;
+            # normalise to 0 so range() below cannot raise TypeError.
+            total_episodes = item.get('updated_to_episode') or 0
+
+            # Video IDs of the collection (no per-episode interaction data is fetched here)
+            episode_video_ids = []
+            episode_details = []
+
+            if mix_id:
+                logging.info(f'获取合集 {mix_name} 的视频ID')
+                episode_video_ids = self.get_collection_videos(
+                    mix_id=mix_id,
+                    mix_name=mix_name,
+                    current_episode_count=total_episodes
+                )
+
+                # Build one placeholder entry per episode
+                for i in range(total_episodes):
+                    episode_number = i + 1
+                    video_id = episode_video_ids[i] if i < len(episode_video_ids) else ''
+
+                    episode_info = {
+                        'episode_number': episode_number,
+                        'video_id': video_id,
+                        'likes': 0,  # detailed stats are not fetched here
+                        'shares': 0,
+                        'favorites': 0,
+                        'likes_formatted': '0',
+                        'shares_formatted': '0',
+                        'favorites_formatted': '0',
+                        'comments': []
+                    }
+                    episode_details.append(episode_info)
+
+            # Rank within today's records: 1 + number of records with a higher play_vv
+            day_start = batch_time.replace(hour=0, minute=0, second=0, microsecond=0)
+            play_vv = item.get('play_vv', 0)
+            higher_count = self.collection.count_documents({
+                'play_vv': {'$gt': play_vv},
+                'batch_time': {'$gte': day_start}
+            })
+            current_rank = higher_count + 1
+
+            # Build the document - always a new record so history is kept
+            doc = {
+                'batch_time': batch_time,
+                'mix_name': mix_name,
+                'video_url': item.get('video_url', ''),
+                'playcount': item.get('formatted', ''),
+                'play_vv': play_vv,
+                'request_id': item.get('request_id', ''),
+                'rank': current_rank,
+                'cover_image_url_original': original_cover_url,
+                'cover_image_url': permanent_cover_url,
+                'cover_upload_success': upload_success,
+                'cover_backup_urls': item.get('cover_backup_urls', []),
+                'series_author': item.get('series_author', ''),
+                'desc': item.get('desc', ''),
+                'updated_to_episode': total_episodes,
+                'episode_video_ids': episode_video_ids,
+                'episode_details': episode_details,
+                'created_at': datetime.now()
+            }
+
+            # Insert - never update an existing record
+            result = self.collection.insert_one(doc)
+            logging.info(f'边抓取边保存新记录: {mix_name} - {play_vv:,} 播放量 (排名: {current_rank})')
+
+            # Shift down the rank of today's records with a lower play_vv
+            self.collection.update_many(
+                {
+                    'play_vv': {'$lt': play_vv},
+                    'batch_time': {'$gte': day_start},
+                    '_id': {'$ne': result.inserted_id}
+                },
+                {'$inc': {'rank': 1}}
+            )
+
+        except Exception as e:
+            logging.exception(f'实时保存单条数据到MongoDB时出错: {e}')
+
     def get_video_info(self, video_id: str) -> dict:
         """获取视频详细信息
         Args:
@@ -2569,8 +2726,17 @@ class DouyinPlayVVScraper:
             self.collect_network_bodies()
             self.parse_ssr_data()
             self.dedupe()
-            self.save_results()
-            logging.info('完成,play_vv数量: %d', len(self.play_vv_items))
+
+            # Choose how to persist depending on the mode
+            is_timer_mode = os.environ.get('TIMER_MODE') == '1'
+            if is_timer_mode:
+                # Timer mode: batch save so all records share one batch_time
+                self.save_results()
+                logging.info('定时器模式:完成批量保存,play_vv数量: %d', len(self.play_vv_items))
+            else:
+                # Normal mode: items were already saved one by one during extraction
+                logging.info('普通模式:完成,play_vv数量: %d', len(self.play_vv_items))
+                logging.info('所有数据已通过实时保存功能保存到数据库')
         finally:
             if self.driver:
                 try: