commit 43ec2c397e
Author: qiaoyirui0819
Date:   2025-10-25 19:28:01 +08:00

2 changed files with 27 additions and 1160 deletions


@@ -103,12 +103,6 @@ class DouyinAutoScheduler:
         # Set environment variable to ensure automatic mode
         os.environ['AUTO_CONTINUE'] = '1'
-        # Set the timer-mode environment variable to skip steps such as comment scraping
-        os.environ['TIMER_MODE'] = '1'
-        # Enable quiet mode only in timer mode (not in test, single-run, or rankings-only mode)
-        if hasattr(self, '_is_timer_mode') and self._is_timer_mode:
-            os.environ['QUIET_MODE'] = '1'
         # Create and run a DouyinPlayVVScraper instance directly
         scraper = DouyinPlayVVScraper(
@@ -117,10 +111,10 @@ class DouyinAutoScheduler:
             duration_s=60
         )
-        logging.warning("📁 Starting scraping task...")
+        logging.info("📁 Starting scraping task...")
         scraper.run()
-        logging.warning("✅ Douyin play-count scraping task completed successfully")
+        logging.info("✅ Douyin play-count scraping task completed successfully")
         # After scraping completes, automatically generate today's rankings
         self.generate_daily_rankings()
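
Net effect of the two hunks above: the scheduled task no longer sets TIMER_MODE or QUIET_MODE, and its start/finish messages drop from WARNING to INFO. A minimal, self-contained sketch of the resulting task flow is below; the method name run_scraping_task and the scraper stub are assumptions for illustration, not the repository's actual code.

```python
import logging
import os

logging.basicConfig(level=logging.INFO)


class DouyinPlayVVScraper:
    """Stand-in stub so the sketch runs on its own; the real class scrapes play counts."""

    def __init__(self, duration_s: int = 60):
        self.duration_s = duration_s

    def run(self) -> None:
        pass  # the real implementation drives the Douyin play-count scrape


class DouyinAutoScheduler:
    def run_scraping_task(self) -> None:  # hypothetical method name
        # Only AUTO_CONTINUE survives this commit; TIMER_MODE and QUIET_MODE are gone.
        os.environ['AUTO_CONTINUE'] = '1'
        scraper = DouyinPlayVVScraper(duration_s=60)
        logging.info("📁 Starting scraping task...")
        scraper.run()
        logging.info("✅ Douyin play-count scraping task completed successfully")
        # Scraping and ranking generation happen in one pass.
        self.generate_daily_rankings()

    def generate_daily_rankings(self) -> None:
        pass  # rewritten in the next hunk


DouyinAutoScheduler().run_scraping_task()
```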
@@ -168,43 +162,35 @@ class DouyinAutoScheduler:
         today_videos_raw = list(douyin_collection.find({"batch_time": latest_batch_time}).sort("play_vv", -1))
         logging.info(f"📊 Latest batch record count: {len(today_videos_raw)}")

-        # Debug: inspect the raw data
-        if today_videos_raw:
-            sample_video = today_videos_raw[0]
-            logging.info(f"🔍 Sample record check:")
-            logging.info(f"  mix_name: {sample_video.get('mix_name')}")
-            logging.info(f"  play_vv: {sample_video.get('play_vv')} (type: {type(sample_video.get('play_vv'))})")
-            logging.info(f"  author: {sample_video.get('author')}")
-
-        # Deduplicate by short-drama name and ensure correct data types
-        unique_videos = self._deduplicate_videos_by_mix_name(today_videos_raw, include_rank=False)
+        # Deduplicate by short-drama name, keeping only the highest-play-count record per drama
+        unique_videos = {}
+        for video in today_videos_raw:
+            mix_name = video.get("mix_name", "")
+            if mix_name and (mix_name not in unique_videos or video.get("play_vv", 0) > unique_videos[mix_name].get("play_vv", 0)):
+                unique_videos[mix_name] = video
         today_videos = list(unique_videos.values())
         logging.info(f"📊 Today's data after deduplication: {len(today_videos)} unique short dramas (raw records: {len(today_videos_raw)})")
-        # Fetch yesterday's last batch of data
-        yesterday_start = datetime(yesterday.year, yesterday.month, yesterday.day)
-        yesterday_end = yesterday_start + timedelta(days=1)
-        yesterday_batch = douyin_collection.find_one({
-            "batch_time": {"$gte": yesterday_start, "$lt": yesterday_end}
-        }, sort=[("batch_time", -1)])
+        # Fetch yesterday's rankings (if any), taking the latest calculation result
+        yesterday_ranking = rankings_collection.find_one({
+            "date": yesterday_str,
+            "type": "comprehensive"
+        }, sort=[("calculation_sequence", -1)])

         yesterday_data = {}
-        if yesterday_batch:
-            # Fetch all records from yesterday's last batch
-            yesterday_videos = list(douyin_collection.find({
-                "batch_time": yesterday_batch["batch_time"]
-            }).sort("play_vv", -1))
-
-            # Deduplicate by short-drama name, keep the highest-play-count record, and ensure correct data types
-            yesterday_data = self._deduplicate_videos_by_mix_name(yesterday_videos, include_rank=True)
-
-            # Compute the rankings
-            sorted_videos = sorted(yesterday_data.items(), key=lambda x: x[1]["play_vv"], reverse=True)
-            for rank, (mix_name, data) in enumerate(sorted_videos, 1):
-                yesterday_data[mix_name]["rank"] = rank
-            logging.info(f"📊 Found yesterday's raw data: {len(yesterday_data)} short dramas")
+        if yesterday_ranking and "data" in yesterday_ranking:
+            # Convert yesterday's data into a dict keyed by short-drama title
+            for item in yesterday_ranking["data"]:
+                title = item.get("title", "")
+                if title:
+                    yesterday_data[title] = {
+                        "rank": item.get("rank", 0),
+                        "play_vv": item.get("play_vv", 0),
+                        "video_id": item.get("video_id", "")
+                    }
+            logging.info(f"📊 Found yesterday's rankings: {len(yesterday_data)} short dramas")
         else:
             logging.info("📊 No raw data found for yesterday; generating as a first run")

File diff suppressed because it is too large