diff --git a/backend/Timer_worker.py b/backend/Timer_worker.py
index ef2e862..e514454 100644
--- a/backend/Timer_worker.py
+++ b/backend/Timer_worker.py
@@ -87,13 +87,6 @@ class DouyinAutoScheduler:
         script_dir = os.path.dirname(os.path.abspath(__file__))
         profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent')
 
-        # # Check whether the profile directory exists
-        # if not os.path.exists(profile_dir):
-        #     print("⚠️ Timer browser profile directory not found; first-time login required")
-        #     print("   Please log in to Douyin in the browser and go to [Me] → [Favorites] → [Collections]")
-        #     print("   Press Enter to continue when done...")
-        #     input()
-        #     return
 
         # Check whether the profile is empty (possibly not logged in)
         import glob
@@ -178,7 +171,7 @@ class DouyinAutoScheduler:
             scraper = DouyinPlayVVScraper(
                 start_url="https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation",
                 auto_continue=True,
-                duration_s=180  # increased to 180 s to give more time to collect data
+                duration_s=60  # collection window shortened to 60 s
             )
 
             print("Starting scrape task...")
diff --git a/backend/handlers/Rankings/episode_video_ids/video_ids_7486423445933951028.json b/backend/handlers/Rankings/episode_video_ids/video_ids_7486423445933951028.json
index 619bea2..dbb2d4e 100644
--- a/backend/handlers/Rankings/episode_video_ids/video_ids_7486423445933951028.json
+++ b/backend/handlers/Rankings/episode_video_ids/video_ids_7486423445933951028.json
@@ -147,9 +147,17 @@
     {
       "video_id": "7567050545257516331",
       "episode_num": 0
+    },
+    {
+      "video_id": "7568152326477942022",
+      "episode_num": 0
+    },
+    {
+      "video_id": "7569217928420183332",
+      "episode_num": 0
     }
   ],
-  "total_count": 37,
-  "last_update": "2025-10-31T09:50:18.533027",
+  "total_count": 39,
+  "last_update": "2025-11-06T11:06:44.598400",
   "mix_name": "末世系列"
 }
\ No newline at end of file
diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py
index 8ee16db..4c24f95 100644
--- a/backend/handlers/Rankings/rank_data_scraper.py
+++ b/backend/handlers/Rankings/rank_data_scraper.py
@@ -686,22 +686,32 @@ class DouyinPlayVVScraper:
         # Increase the page-load wait in auto_continue mode
         if self.auto_continue:
             logging.info('Auto-continue mode: extending page-load wait')
-            time.sleep(8)  # wait for the page to fully load
+            time.sleep(10)  # increased to 10 s to make sure the page fully loads
         else:
             # Normal mode also needs a longer page-load wait
             logging.info('Normal mode: extending page-load wait')
-            time.sleep(8)  # wait for the page to fully load
+            time.sleep(10)  # increased to 10 s to make sure the page fully loads
 
-        # Scroll to trigger lazy loading
-        for i in range(8):
+        # First scroll pass: trigger lazy loading
+        logging.info('First scroll pass: triggering lazy loading')
+        for i in range(10):  # more scroll steps
             self.driver.execute_script(f'window.scrollTo(0, {i * 900});')
-            time.sleep(1.2)
+            time.sleep(1.5)  # longer pause between steps
+
+        # Wait for data to load
+        logging.info('Waiting for data to load...')
+        time.sleep(5)
 
+        # Refresh to trigger fresh requests
+        logging.info('Refreshing the page to trigger fresh requests')
         self.driver.refresh()
-        time.sleep(4)
-        for i in range(6):
+        time.sleep(6)  # longer wait after the refresh
+
+        # Second scroll pass: make sure all data is loaded
+        logging.info('Second scroll pass: making sure all data is loaded')
+        for i in range(8):
             self.driver.execute_script(f'window.scrollTo(0, {i * 1200});')
-            time.sleep(1.3)
+            time.sleep(1.5)
 
     def format_count(self, n: int) -> str:
         if n >= 100_000_000:
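The retimed waits and the second scroll round above are easier to see in isolation. Below is a minimal, self-contained sketch of the two-pass scroll-and-refresh pattern, assuming an already-initialized Selenium WebDriver; the helper name is illustrative, while the step sizes and pauses mirror this hunk. Per the hunk's own comments, the mid-sequence refresh makes the page re-issue its list requests, giving the network collector a second chance at responses the first pass missed.

import time
from selenium.webdriver.remote.webdriver import WebDriver

def trigger_lazy_loading(driver: WebDriver) -> None:
    # First pass: scroll down in ~900 px steps so lazy-loaded cards start rendering.
    for i in range(10):
        driver.execute_script(f'window.scrollTo(0, {i * 900});')
        time.sleep(1.5)
    time.sleep(5)  # let in-flight list requests settle

    # Refresh so the page re-issues its requests, then scroll again in larger
    # steps to pull in anything the first pass missed.
    driver.refresh()
    time.sleep(6)
    for i in range(8):
        driver.execute_script(f'window.scrollTo(0, {i * 1200});')
        time.sleep(1.5)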
@@ -794,41 +804,19 @@ class DouyinPlayVVScraper:
             mix_name = obj.get('mix_name', '')
             statis = obj.get('statis', {})
 
-            # Debug: dump the full structure of objects containing mix_id (first 3 only)
-            if len(self.play_vv_items) < 3:
-                logging.info(f"=== Debug: collection object structure ===")
-                logging.info(f"Full object keys: {list(obj.keys())}")
-                # Look for likely video-related fields and newly added fields
-                for key, value in obj.items():
-                    if 'aweme' in key.lower() or 'video' in key.lower() or 'item' in key.lower() or 'ids' in key.lower():
-                        logging.info(f"Possible video field {key}: {type(value)} - {str(value)[:200]}")
-                    # Check keys related to the new fields
-                    elif any(keyword in key.lower() for keyword in ['author', 'creator', 'user', 'desc', 'description', 'total', 'count', 'episode']):
-                        logging.info(f"Possible new field {key}: {type(value)} - {str(value)[:200]}")
-
-                # Special check of the ids field
-                if 'ids' in obj:
-                    ids_value = obj['ids']
-                    logging.info(f"ids field details: {type(ids_value)} - {ids_value}")
-                    if isinstance(ids_value, list) and len(ids_value) > 0:
-                        logging.info(f"ids list length: {len(ids_value)}")
-                        logging.info(f"First ID: {ids_value[0]}")
-                        if len(ids_value) > 1:
-                            logging.info(f"Second ID: {ids_value[1]}")
-
             if isinstance(statis, dict) and 'play_vv' in statis:
                 play_vv = statis.get('play_vv')
                 if isinstance(play_vv, (int, str)) and str(play_vv).isdigit():
                     vv = int(play_vv)
 
-                    # Data validation: play count must be > 0 and mix name non-empty
-                    if vv <= 0:
-                        logging.warning(f"Skipping invalid play-count data: mix_name={mix_name}, play_vv={vv}")
+                    # Data validation: mix name must be non-empty
+                    if not mix_name or mix_name.strip() == "":
+                        logging.warning(f"Skipping data with a missing mix name: play_vv={vv}, mix_id={mix_id}")
                         return
 
-                    if not mix_name or mix_name.strip() == "":
-                        logging.warning(f"Skipping data with a missing mix name: play_vv={vv}")
-                        return
+                    # 🔧 Fix: keep items whose play count is 0 instead of skipping them
+                    # A zero count may just mean the page loaded incompletely; the collection itself exists
+                    # The warning moves below the duplicate check, so it fires only when an item is actually added
 
                     # Build the collection URL
                     video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
@@ -994,12 +982,40 @@ class DouyinPlayVVScraper:
                         'timestamp': datetime.now().isoformat()
                     }
 
-                    # Append to the list (keep the original logic)
-                    self.play_vv_items.append(item_data)
+                    # 🔧 Fix: check for an existing entry before adding (avoid duplicates)
+                    # Look for an item that already has the same mix_id
+                    existing_item = None
+                    for existing in self.play_vv_items:
+                        if existing.get('mix_id') == mix_id:
+                            existing_item = existing
+                            break
 
-                    # Save to the database in real time
-                    if self.realtime_save_enabled:
-                        self.save_single_item_realtime(item_data)
+                    if existing_item:
+                        # Already present: compare play counts and keep the larger one
+                        existing_vv = existing_item.get('play_vv', 0)
+                        if vv > existing_vv:
+                            # The current item is better; replace
+                            logging.info(f'🔄 Updating duplicate series: {mix_name} (play count: {existing_vv:,} → {vv:,})')
+                            self.play_vv_items.remove(existing_item)
+                            self.play_vv_items.append(item_data)
+                        else:
+                            # The existing item is better; skip
+                            logging.info(f'⏭️ Skipping duplicate series: {mix_name} (current: {vv:,}, existing: {existing_vv:,})')
+                            return  # skip the current item
+                    else:
+                        # Not present yet; add directly
+                        self.play_vv_items.append(item_data)
+
+                    # Warn about a zero play count only when the item is actually added
+                    if vv <= 0:
+                        logging.warning(f"⚠️ Added an item with a play count of 0: {mix_name} (ID: {mix_id})")
+                        logging.warning(f"   Its play count may need to be re-fetched later")
+
+                    # 🔧 Fix: no real-time saving during the collection phase
+                    # Real-time saving triggers detail fetching, which interrupts collection
+                    # Instead, everything is processed once collection has finished
+                    # if self.realtime_save_enabled:
+                    #     self.save_single_item_realtime(item_data)
 
                     logging.info(f'Extracted collection: {mix_name} (ID: {mix_id}) - {vv:,} plays')
                     if series_author:
@@ -1064,10 +1080,6 @@ class DouyinPlayVVScraper:
                     # Append to the list (keep the original logic)
                     self.play_vv_items.append(item_data)
 
-                    # Save to the database in real time
-                    if self.realtime_save_enabled:
-                        self.save_single_item_realtime(item_data)
-
                     logging.info(f'Regex-extracted collection: {mix_name} (ID: {mix_id}) - {vv:,} plays')
             except Exception:
                 continue
@@ -1165,6 +1177,13 @@ class DouyinPlayVVScraper:
             time.sleep(0.8)
 
         logging.info(f'Network collection finished; found {len(self.play_vv_items)} targets')
+        logging.info('=' * 60)
+        logging.info('Network collection phase stats:')
+        logging.info(f'  - total: {len(self.play_vv_items)} collections')
+        logging.info(f'  - play count 0: {sum(1 for item in self.play_vv_items if item.get("play_vv", 0) == 0)}')
+        logging.info(f'  - play count > 0: {sum(1 for item in self.play_vv_items if item.get("play_vv", 0) > 0)}')
+        logging.info('=' * 60)
+
         logging.info(f'Starting SSR data parsing...')
 
     def parse_ssr_data(self):
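The duplicate check above rescans self.play_vv_items for every extracted object, which is O(n) per insert. Below is a hedged alternative sketch of the same keep-the-larger-play_vv rule, using a dict keyed the way dedupe() keys items (mix_id, with mix_name as fallback); the function and variable names are illustrative, not part of this PR.

from typing import Dict

def upsert_item(index: Dict[str, dict], item: dict) -> bool:
    """Keep at most one entry per series, preferring the higher play_vv.
    Returns True if the index changed."""
    mix_id = item.get('mix_id', '')
    key = f"id_{mix_id}" if mix_id else f"name_{item.get('mix_name', '')}"
    existing = index.get(key)
    if existing is None or item.get('play_vv', 0) > existing.get('play_vv', 0):
        index[key] = item  # first occurrence, or a better duplicate
        return True
    return False  # an equal-or-better entry already exists

# Example: the 1,200-play entry wins over the zero-play one.
index: Dict[str, dict] = {}
upsert_item(index, {'mix_id': 'm1', 'mix_name': 'demo', 'play_vv': 0})
upsert_item(index, {'mix_id': 'm1', 'mix_name': 'demo', 'play_vv': 1200})
assert index['id_m1']['play_vv'] == 1200

With an index like this, the dedupe() pass below becomes mostly a safety net for items appended through the regex path, which still skips the duplicate check.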
identifier = f"name_{mix_name}" + else: + identifier = f"id_{mix_id}" + + # 如果是第一次遇到这个identifier,直接添加 + if identifier not in unique_dict: + unique_dict[identifier] = item + else: + # 如果已经存在,比较播放量,保留播放量大的 + existing_play_vv = unique_dict[identifier].get('play_vv', 0) + current_play_vv = item.get('play_vv', 0) + + if current_play_vv > existing_play_vv: + # 当前数据的播放量更大,替换 + logging.info(f'去重:发现重复短剧 {item.get("mix_name", "未知")},保留播放量更大的数据 ({existing_play_vv:,} → {current_play_vv:,})') + unique_dict[identifier] = item + else: + # 已有数据的播放量更大或相等,跳过当前数据 + logging.debug(f'去重:跳过重复的短剧 {item.get("mix_name", "未知")} (mix_id: {mix_id})') + + # 转换回列表 + unique = list(unique_dict.values()) + + removed_count = len(self.play_vv_items) - len(unique) + if removed_count > 0: + logging.info(f'去重完成:移除 {removed_count} 个重复项,保留 {len(unique)} 个唯一短剧') + else: + logging.info(f'去重完成:没有重复项,保留 {len(unique)} 个唯一短剧') + self.play_vv_items = unique def save_results(self): - if self.realtime_save_enabled and self.saved_items: - # 实时保存模式:只更新排名和统计信息 - self.update_ranks_for_batch() - logging.info(f'[实时保存] 所有数据已通过实时保存功能保存到数据库,共 {len(self.saved_items)} 个合集') + if self.realtime_save_enabled: + # 🔧 修复:在数据收集完成后,统一进行实时保存 + logging.info(f'[实时保存] 开始保存 {len(self.play_vv_items)} 个合集的数据') logging.info(f'[实时保存] 批次ID: {self.batch_id}') + + # 先保存所有合集的基础信息(不获取详细内容) + for item_data in self.play_vv_items: + try: + logging.info(f'[实时保存] 保存合集基础信息: {item_data.get("mix_name", "未知")}') + self.save_collection_basic_info(item_data) + except Exception as e: + logging.error(f'[实时保存] 保存合集基础信息失败: {item_data.get("mix_name", "未知")} - {e}') + + # 更新排名 + try: + self.update_ranks_for_batch() + except Exception as e: + logging.error(f'[实时保存] 更新排名失败: {e}') + + # 然后逐个获取详细内容(如果需要) + logging.info(f'[实时保存] 基础信息保存完成,开始获取详细内容') + for item_data in self.play_vv_items: + try: + mix_id = item_data.get('mix_id', '') + mix_name = item_data.get('mix_name', '') + current_episode_count = item_data.get('updated_to_episode', 0) + + if mix_id and current_episode_count > 0: + # 查找已保存的文档ID + target_collection = self.collection + if target_collection is not None: + existing_doc = target_collection.find_one({'mix_id': mix_id}, {'_id': 1}) + if existing_doc: + document_id = existing_doc['_id'] + logging.info(f'[实时保存] 开始获取详细内容: {mix_name}') + + # 获取视频ID列表 + episode_video_ids = self.update_collection_video_ids( + document_id, mix_id, mix_name, current_episode_count + ) + + # 获取视频详细数据 + if episode_video_ids: + self.update_video_details_incrementally( + document_id, episode_video_ids, mix_name, mix_id + ) + except Exception as e: + logging.error(f'[实时保存] 获取详细内容失败: {item_data.get("mix_name", "未知")} - {e}') + + logging.info(f'[实时保存] 所有数据处理完成,共 {len(self.saved_items)} 个合集') else: # 传统批量保存模式 self.save_to_mongodb() @@ -1230,7 +1325,12 @@ class DouyinPlayVVScraper: def update_ranks_for_batch(self): """为当前批次的数据更新排名""" target_collection = self.collection # 使用根据模式选择的集合 - if target_collection is None or not self.saved_items: + if target_collection is None: + logging.warning('[实时保存] 数据库集合未初始化,跳过排名更新') + return + + if not self.saved_items: + logging.warning('[实时保存] 没有已保存的数据,跳过排名更新') return try: @@ -3847,11 +3947,32 @@ class DouyinPlayVVScraper: self.navigate() self.ensure_login() self.trigger_loading() + + logging.info('=' * 60) + logging.info('开始数据收集阶段') + logging.info('=' * 60) self.collect_network_bodies() + logging.info(f'✅ 网络数据收集完成:{len(self.play_vv_items)} 个合集') + self.parse_ssr_data() + logging.info(f'✅ SSR数据解析完成:{len(self.play_vv_items)} 个合集') + + logging.info('=' * 60) + logging.info('开始数据去重') + 
@@ -3847,11 +3947,32 @@ class DouyinPlayVVScraper:
             self.navigate()
             self.ensure_login()
             self.trigger_loading()
+
+            logging.info('=' * 60)
+            logging.info('Starting the data collection phase')
+            logging.info('=' * 60)
             self.collect_network_bodies()
+            logging.info(f'✅ Network data collection finished: {len(self.play_vv_items)} collections')
+
             self.parse_ssr_data()
+            logging.info(f'✅ SSR data parsing finished: {len(self.play_vv_items)} collections')
+
+            logging.info('=' * 60)
+            logging.info('Starting deduplication')
+            logging.info('=' * 60)
+            before_dedupe = len(self.play_vv_items)
             self.dedupe()
+            after_dedupe = len(self.play_vv_items)
+            logging.info(f'✅ Dedupe finished: {before_dedupe} → {after_dedupe} (removed {before_dedupe - after_dedupe} duplicates)')
+
+            logging.info('=' * 60)
+            logging.info('Starting the data save')
+            logging.info('=' * 60)
             self.save_results()
-            logging.info('Done, play_vv count: %d', len(self.play_vv_items))
+
+            logging.info('=' * 60)
+            logging.info(f'✅ All done! Processed {len(self.play_vv_items)} collections')
+            logging.info('=' * 60)
         except Exception as e:
             import traceback
             error_details = {
@@ -3901,7 +4022,7 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Selenium+CDP Douyin play_vv scraper')
     parser.add_argument('--url', default='https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation', help='URL of the favorited-collections list page')
     parser.add_argument('--auto', action='store_true', help='continue automatically, skipping the Enter-key wait')
-    parser.add_argument('--duration', type=int, default=180, help='network response collection duration (seconds)')
+    parser.add_argument('--duration', type=int, default=60, help='network response collection duration (seconds)')
    parser.add_argument('--driver', help='override the chromedriver path')
     parser.add_argument('--timer', action='store_true', help='enable timer mode, applying the timer settings from config.py')
     args = parser.parse_args()
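For reference, Timer_worker.py drives the scraper with the same arguments the CLI now defaults to. A minimal usage sketch; the import path and the run() entry-point name are assumptions, while the constructor arguments are taken verbatim from this diff.

from handlers.Rankings.rank_data_scraper import DouyinPlayVVScraper  # path assumed

scraper = DouyinPlayVVScraper(
    start_url="https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation",
    auto_continue=True,  # same effect as passing --auto on the CLI
    duration_s=60,       # the new default collection window
)
# Entry-point name assumed; the diff shows the navigate → collect → parse SSR
# → dedupe → save sequence running inside it.
scraper.run()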