qiaoyirui0819 2025-10-25 19:28:01 +08:00
commit 43ec2c397e
2 changed files with 27 additions and 1160 deletions


@@ -103,12 +103,6 @@ class DouyinAutoScheduler:
         # Set environment variables to ensure automatic mode
         os.environ['AUTO_CONTINUE'] = '1'
-        # Set the timer-mode environment variable to skip comment scraping and similar steps
-        os.environ['TIMER_MODE'] = '1'
-        # Set quiet mode only in timer mode (not for tests, single runs, or rankings-only generation)
-        if hasattr(self, '_is_timer_mode') and self._is_timer_mode:
-            os.environ['QUIET_MODE'] = '1'
         # Create and run a DouyinPlayVVScraper instance directly
         scraper = DouyinPlayVVScraper(
@@ -116,11 +110,11 @@ class DouyinAutoScheduler:
             auto_continue=True,
             duration_s=60
         )
-        logging.warning("📁 Starting the scraping task...")
+        logging.info("📁 Starting the scraping task...")
         scraper.run()
-        logging.warning("✅ Douyin play-count scraping task completed successfully")
+        logging.info("✅ Douyin play-count scraping task completed successfully")
         # After data scraping completes, automatically generate the day's rankings
         self.generate_daily_rankings()
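
DouyinPlayVVScraper's internals are not visible in this diff, so how it consumed the now-deleted TIMER_MODE / QUIET_MODE variables is not shown. A minimal sketch of the kind of gating those lines imply; the helper and hook names are hypothetical, and only the skip-comment-scraping behavior is stated in the deleted comment:

import os
import logging

def timer_mode_enabled():
    # Hypothetical helper mirroring the env var the old scheduler code exported.
    return os.environ.get('TIMER_MODE') == '1'

def maybe_fetch_comments(video_id):
    # Hypothetical scraper hook: in timer mode, comment scraping is skipped
    # entirely (as the deleted comment describes); QUIET_MODE demoting
    # progress logs to DEBUG is an assumption based on the variable's name.
    if timer_mode_enabled():
        logging.debug("TIMER_MODE=1, skipping comment scraping for %s", video_id)
        return
    level = logging.DEBUG if os.environ.get('QUIET_MODE') == '1' else logging.INFO
    logging.log(level, "Fetching comments for %s", video_id)
    # ... real comment-fetch logic would live here ...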
@@ -168,43 +162,35 @@ class DouyinAutoScheduler:
         today_videos_raw = list(douyin_collection.find({"batch_time": latest_batch_time}).sort("play_vv", -1))
         logging.info(f"📊 Record count for the latest batch: {len(today_videos_raw)}")
-        # Debug: inspect the raw data
-        if today_videos_raw:
-            sample_video = today_videos_raw[0]
-            logging.info(f"🔍 Sample record check:")
-            logging.info(f"  mix_name: {sample_video.get('mix_name')}")
-            logging.info(f"  play_vv: {sample_video.get('play_vv')} (type: {type(sample_video.get('play_vv'))})")
-            logging.info(f"  author: {sample_video.get('author')}")
-        # Deduplicate by short-drama name, keeping only the highest-play-count record per drama
-        unique_videos = {}
-        for video in today_videos_raw:
-            mix_name = video.get("mix_name", "")
-            if mix_name and (mix_name not in unique_videos or video.get("play_vv", 0) > unique_videos[mix_name].get("play_vv", 0)):
-                unique_videos[mix_name] = video
+        # Deduplicate by short-drama name and ensure correct data types
+        unique_videos = self._deduplicate_videos_by_mix_name(today_videos_raw, include_rank=False)
         today_videos = list(unique_videos.values())
         logging.info(f"📊 Today's data after deduplication: {len(today_videos)} unique short dramas (raw records: {len(today_videos_raw)})")
-        # Fetch yesterday's last batch of data
-        yesterday_start = datetime(yesterday.year, yesterday.month, yesterday.day)
-        yesterday_end = yesterday_start + timedelta(days=1)
-        yesterday_batch = douyin_collection.find_one({
-            "batch_time": {"$gte": yesterday_start, "$lt": yesterday_end}
-        }, sort=[("batch_time", -1)])
+        # Fetch yesterday's rankings data (if it exists), taking the latest calculation result
+        yesterday_ranking = rankings_collection.find_one({
+            "date": yesterday_str,
+            "type": "comprehensive"
+        }, sort=[("calculation_sequence", -1)])
         yesterday_data = {}
-        if yesterday_batch:
-            # Fetch all records from yesterday's last batch
-            yesterday_videos = list(douyin_collection.find({
-                "batch_time": yesterday_batch["batch_time"]
-            }).sort("play_vv", -1))
-            # Deduplicate by short-drama name, keep the highest-play-count record, and ensure correct data types
-            yesterday_data = self._deduplicate_videos_by_mix_name(yesterday_videos, include_rank=True)
-            # Compute ranks
-            sorted_videos = sorted(yesterday_data.items(), key=lambda x: x[1]["play_vv"], reverse=True)
-            for rank, (mix_name, data) in enumerate(sorted_videos, 1):
-                yesterday_data[mix_name]["rank"] = rank
-            logging.info(f"📊 Found yesterday's raw data: {len(yesterday_data)} short dramas")
+        if yesterday_ranking and "data" in yesterday_ranking:
+            # Convert yesterday's data into a dict keyed by short-drama name
+            for item in yesterday_ranking["data"]:
+                title = item.get("title", "")
+                if title:
+                    yesterday_data[title] = {
+                        "rank": item.get("rank", 0),
+                        "play_vv": item.get("play_vv", 0),
+                        "video_id": item.get("video_id", "")
+                    }
+            logging.info(f"📊 Found yesterday's rankings data: {len(yesterday_data)} short dramas")
         else:
             logging.info("📊 Yesterday's raw data not found; will treat this as the first generation")

File diff suppressed because it is too large
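
yesterday_data, keyed by title with rank, play_vv, and video_id, presumably feeds a day-over-day comparison later in generate_daily_rankings, inside the suppressed portion of the diff. A hedged sketch of what that matching step might look like; the function name and delta fields are illustrative, not taken from the repository:

def attach_day_over_day(today_videos, yesterday_data):
    # Illustrative only: annotate each of today's deduplicated entries with
    # rank movement and play-count growth relative to yesterday's snapshot.
    ranked = sorted(today_videos, key=lambda v: v.get("play_vv", 0), reverse=True)
    results = []
    for rank, video in enumerate(ranked, 1):
        title = video.get("mix_name", "")
        prev = yesterday_data.get(title)
        results.append({
            "rank": rank,
            "title": title,
            "play_vv": video.get("play_vv", 0),
            # None marks a short drama that was absent from yesterday's ranking.
            "rank_change": prev["rank"] - rank if prev else None,
            "play_vv_delta": video.get("play_vv", 0) - prev["play_vv"] if prev else None,
        })
    return results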