主代码可以实时更新

定时器由于要进行播放量插值计算，所以要有固定的时间戳，还是统一保存。
2025-10-27 18:55:54 +08:00 · 2025-10-27 18:55:54 +08:00 · 9295e77cf1
commit 9295e77cf1
parent e8baaa4ce9
2 changed files with 188 additions and 9 deletions
--- a/backend/config.py
+++ b/backend/config.py
@@ -18,6 +18,19 @@ LOG_DIR = 'logs'
 # 定时器配置
 SCHEDULER_TIME = "24:00"  # 定时器执行时间，格式为 HH:MM (24小时制)

+# Environment variables applied when running under the timer (scheduler)
+TIMER_ENV_CONFIG = {
+    'TIMER_MODE': '1',      # enable timer mode so data is saved to the Ranking_storage_list collection
+    'AUTO_CONTINUE': '1'    # enable auto mode: skip detailed data fetching to improve performance
+}
+
+# Names of the functions to skip in auto mode
+AUTO_CONTINUE_SKIP_FUNCTIONS = [
+    'get_collection_video_details',  # skip fetching detailed data for collection videos
+    'scroll_comments',               # skip comment scrolling
+    # more function names to skip can be added here
+]
+
 # TOS/火山云对象存储配置
 TOS_CONFIG = {
    'access_key_id': os.getenv('TOS_ACCESS_KEY_ID', 'AKLTYjQyYmE1ZDAwZTY5NGZiOWI3ODZkZDhhOWE4MzVjODE'),
@@ -39,4 +52,13 @@ API_CONFIG = {
    'OSS_HOST': TOS_CONFIG['self_domain']
 }

+def apply_timer_environment():
+    """Export every TIMER_ENV_CONFIG entry into this process's environment."""
+    # os.environ.update assigns key by key, exactly like an explicit loop.
+    os.environ.update(TIMER_ENV_CONFIG)
+
+def get_skip_functions():
+    """Return a fresh list of the function names that auto mode skips."""
+    return list(AUTO_CONTINUE_SKIP_FUNCTIONS)
+
 print(f"Successfully loaded configuration for environment: {APP_ENV}")
--- a/backend/handlers/Rankings/rank_data_scraper.py
+++ b/backend/handlers/Rankings/rank_data_scraper.py
@@ -713,7 +713,7 @@ class DouyinPlayVVScraper:
                                except ValueError:
                                    pass  # 忽略无法转换为整数的情况

-                            self.play_vv_items.append({
+                            item_data = {
                                'play_vv': vv,
                                'formatted': self.format_count(vv),
                                'url': source_url,
@@ -727,7 +727,9 @@ class DouyinPlayVVScraper:
                                'desc': desc,  # 合集描述
                                'updated_to_episode': updated_to_episode,  # 合集总集数
                                'timestamp': datetime.now().isoformat()
-                            })
+                            }
+                            
+                            self.play_vv_items.append(item_data)
                            logging.info(f'提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
                            if series_author:
                                logging.info(f'  作者: {series_author}')
@@ -736,6 +738,14 @@ class DouyinPlayVVScraper:
                            if updated_to_episode > 0:
                                logging.info(f'  总集数: {updated_to_episode}')
                            
+                            # 只在非定时器模式下使用实时保存
+                            is_timer_mode = os.environ.get('TIMER_MODE') == '1'
+                            if not is_timer_mode:
+                                logging.info(f'立即保存合集数据: {mix_name}')
+                                self.save_single_item_to_mongodb(item_data)
+                            else:
+                                logging.info(f'定时器模式：暂存合集数据: {mix_name}，将在最后批量保存')
+                
                # 递归搜索子对象
                for key, value in obj.items():
                    if isinstance(value, (dict, list)):
@@ -766,7 +776,7 @@ class DouyinPlayVVScraper:
                if episodes > 0:
                    logging.info(f"从statis.updated_to_episode提取到集数: {episodes}")

-                self.play_vv_items.append({
+                item_data = {
                    'play_vv': vv,
                    'formatted': self.format_count(vv),
                    'url': source_url,
@@ -776,8 +786,18 @@ class DouyinPlayVVScraper:
                    'mix_id': mix_id,  # 合集ID
                    'updated_to_episode': episodes if episodes > 0 else None,  # 从statis.updated_to_episode提取的集数
                    'timestamp': datetime.now().isoformat()
-                })
+                }
+                
+                self.play_vv_items.append(item_data)
                logging.info(f'正则提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
+                
+                # 只在非定时器模式下使用实时保存
+                is_timer_mode = os.environ.get('TIMER_MODE') == '1'
+                if not is_timer_mode:
+                    logging.info(f'立即保存正则提取的合集数据: {mix_name}')
+                    self.save_single_item_to_mongodb(item_data)
+                else:
+                    logging.info(f'定时器模式：暂存正则提取的合集数据: {mix_name}，将在最后批量保存')
            except Exception:
                continue
        
@@ -787,7 +807,7 @@ class DouyinPlayVVScraper:
                vv = int(match)
                # 检查是否已经存在相同的play_vv
                if not any(item['play_vv'] == vv for item in self.play_vv_items):
-                    self.play_vv_items.append({
+                    item_data = {
                        'play_vv': vv,
                        'formatted': self.format_count(vv),
                        'url': source_url,
@@ -797,7 +817,18 @@ class DouyinPlayVVScraper:
                        'mix_id': '',  # 未知mix_id
                        'updated_to_episode': None,  # 未知集数
                        'timestamp': datetime.now().isoformat()
-                    })
+                    }
+                    
+                    self.play_vv_items.append(item_data)
+                    logging.info(f'兜底提取到播放量: {vv:,}')
+                    
+                    # 只在非定时器模式下使用实时保存
+                    is_timer_mode = os.environ.get('TIMER_MODE') == '1'
+                    if not is_timer_mode:
+                        logging.info(f'立即保存兜底提取的数据: {vv:,} 播放量')
+                        self.save_single_item_to_mongodb(item_data)
+                    else:
+                        logging.info(f'定时器模式：暂存兜底提取的数据: {vv:,} 播放量，将在最后批量保存')
            except Exception:
                continue

@@ -1065,7 +1096,11 @@ class DouyinPlayVVScraper:
        return cover_url  # 上传失败时返回原链接

    def save_to_mongodb(self):
-        """将数据保存到MongoDB"""
+        """
+        将数据批量保存到MongoDB
+        注意：此方法现在作为备用保留，正常流程使用实时保存功能(save_single_item_to_mongodb)
+        避免重复保存数据
+        """
        if self.collection is None:
            logging.warning('MongoDB未连接，跳过数据库保存')
            return
@@ -1253,6 +1288,119 @@ class DouyinPlayVVScraper:
        except Exception as e:
            logging.error(f'保存到MongoDB时出错: {e}')

+    def save_single_item_to_mongodb(self, item: dict):
+        """Insert one scraped collection item into MongoDB immediately.
+
+        Uploads the cover via ``upload_cover_image``, fetches the collection's
+        video IDs via ``get_collection_videos``, then inserts a new document
+        (existing documents are never overwritten). Per-episode interaction
+        counters are stored as zero placeholders; they are not fetched here.
+
+        Args:
+            item: Dict describing one collection as built by the extraction
+                code. ``updated_to_episode`` may be ``None`` (count unknown).
+
+        Returns:
+            None. Skips saving when ``self.collection`` is ``None``; any
+            exception is logged and swallowed so scraping can continue.
+        """
+        if self.collection is None:
+            logging.warning('MongoDB未连接，跳过单条数据保存')
+            return
+
+        try:
+            batch_time = datetime.now()
+            # Ranking only considers documents saved since local midnight.
+            day_start = batch_time.replace(hour=0, minute=0, second=0, microsecond=0)
+            play_vv = item.get('play_vv', 0)
+            original_cover_url = item.get('cover_image_url', '')
+            mix_name = item.get('mix_name', '')
+            mix_id = item.get('mix_id', '')
+            raw_episode_count = item.get('updated_to_episode', 0)
+            # Some extraction paths store None for an unknown episode count;
+            # normalise to 0 so range() below cannot raise TypeError, which
+            # previously aborted the save for those items.
+            total_episodes = raw_episode_count or 0
+
+            # Upload the cover to TOS to obtain a permanent link.
+            permanent_cover_url = ''
+            upload_success = True  # having no cover is not a failure
+            if original_cover_url:
+                permanent_cover_url = self.upload_cover_image(original_cover_url, mix_name)
+                # An unchanged URL means the upload fell back to the original.
+                upload_success = permanent_cover_url != original_cover_url
+                if upload_success:
+                    logging.info(f'封面图片上传成功: {mix_name}')
+                else:
+                    logging.warning(f'封面图片上传失败，使用原始链接: {mix_name}')
+
+            episode_video_ids = []
+            episode_details = []
+            if mix_id:
+                logging.info(f'获取合集 {mix_name} 的视频ID')
+                episode_video_ids = self.get_collection_videos(
+                    mix_id=mix_id,
+                    mix_name=mix_name,
+                    current_episode_count=raw_episode_count
+                )
+
+                # One entry per episode; video_id is '' when no ID was returned.
+                for index in range(total_episodes):
+                    has_id = index < len(episode_video_ids)
+                    episode_details.append({
+                        'episode_number': index + 1,
+                        'video_id': episode_video_ids[index] if has_id else '',
+                        'likes': 0,  # placeholder, not fetched by this method
+                        'shares': 0,
+                        'favorites': 0,
+                        'likes_formatted': '0',
+                        'shares_formatted': '0',
+                        'favorites_formatted': '0',
+                        'comments': []
+                    })
+
+            # Rank = 1 + number of today's documents with a higher play count.
+            higher_count = self.collection.count_documents({
+                'play_vv': {'$gt': play_vv},
+                'batch_time': {'$gte': day_start}
+            })
+            current_rank = higher_count + 1
+
+            # Always insert a new record so historical data is preserved.
+            doc = {
+                'batch_time': batch_time,
+                'mix_name': mix_name,
+                'video_url': item.get('video_url', ''),
+                'playcount': item.get('formatted', ''),
+                'play_vv': play_vv,
+                'request_id': item.get('request_id', ''),
+                'rank': current_rank,
+                'cover_image_url_original': original_cover_url,
+                'cover_image_url': permanent_cover_url,
+                'cover_upload_success': upload_success,
+                'cover_backup_urls': item.get('cover_backup_urls', []),
+                'series_author': item.get('series_author', ''),
+                'desc': item.get('desc', ''),
+                'updated_to_episode': raw_episode_count,
+                'episode_video_ids': episode_video_ids,
+                'episode_details': episode_details,
+                'created_at': datetime.now()
+            }
+            result = self.collection.insert_one(doc)
+            logging.info(f'边抓取边保存新记录: {mix_name} - {play_vv:,} 播放量 (排名: {current_rank})')
+
+            # Shift today's lower-play-count documents down one place.
+            self.collection.update_many(
+                {
+                    'play_vv': {'$lt': play_vv},
+                    'batch_time': {'$gte': day_start},
+                    '_id': {'$ne': result.inserted_id}
+                },
+                {'$inc': {'rank': 1}}
+            )
+        except Exception as e:
+            logging.exception(f'实时保存单条数据到MongoDB时出错: {e}')
+
    def get_video_info(self, video_id: str) -> dict:
        """获取视频详细信息
        Args:
@@ -2569,8 +2717,17 @@ class DouyinPlayVVScraper:
            self.collect_network_bodies()
            self.parse_ssr_data()
            self.dedupe()
+            
+            # 根据模式选择保存方式
+            is_timer_mode = os.environ.get('TIMER_MODE') == '1'
+            if is_timer_mode:
+                # 定时器模式：使用批量保存，所有数据使用相同的batch_time
                self.save_results()
-            logging.info('完成，play_vv数量: %d', len(self.play_vv_items))
+                logging.info('定时器模式：完成批量保存，play_vv数量: %d', len(self.play_vv_items))
+            else:
+                # 普通模式：数据已通过实时保存功能保存
+                logging.info('普通模式：完成，play_vv数量: %d', len(self.play_vv_items))
+                logging.info('所有数据已通过实时保存功能保存到数据库')
        finally:
            if self.driver:
                try: