This commit is contained in:
qiaoyirui0819 2025-11-08 16:29:33 +08:00
parent ad54ff0398
commit 13b05ae252
2 changed files with 109 additions and 102 deletions

View File

@ -249,9 +249,9 @@ class DouyinAutoScheduler:
if not mix_id or mix_id == "" or mix_id.lower() == "null": if not mix_id or mix_id == "" or mix_id.lower() == "null":
continue continue
# 过滤掉播放量为0或无效的记录 # 注意:播放量为0的数据也会被保留,可能是新发布的短剧
if play_vv <= 0: if play_vv <= 0:
continue logging.warning(f"⚠️ 发现播放量为0的数据: mix_name={mix_name}, play_vv={play_vv},仍会保留")
if mix_id not in unique_videos or play_vv > unique_videos[mix_id].get("play_vv", 0): if mix_id not in unique_videos or play_vv > unique_videos[mix_id].get("play_vv", 0):
unique_videos[mix_id] = video unique_videos[mix_id] = video
@ -283,7 +283,7 @@ class DouyinAutoScheduler:
}).sort("play_vv", -1)) }).sort("play_vv", -1))
# 按短剧ID去重,每个短剧只保留播放量最高的一条 # 按短剧ID去重,每个短剧只保留播放量最高的一条
# 🚫 过滤掉空的或无效的mix_id和播放量为0的记录 # 🚫 过滤掉空的或无效的mix_id
unique_yesterday_videos = {} unique_yesterday_videos = {}
for video in yesterday_videos_raw: for video in yesterday_videos_raw:
mix_id = video.get("mix_id", "").strip() mix_id = video.get("mix_id", "").strip()
@ -294,9 +294,9 @@ class DouyinAutoScheduler:
if not mix_id or mix_id == "" or mix_id.lower() == "null": if not mix_id or mix_id == "" or mix_id.lower() == "null":
continue continue
# 过滤掉播放量为0或无效的记录 # 注意:播放量为0的数据也会被保留,可能是新发布的短剧
if play_vv <= 0: if play_vv <= 0:
continue logging.warning(f"⚠️ 昨天数据中发现播放量为0: mix_name={mix_name}, play_vv={play_vv},仍会保留")
if mix_id not in unique_yesterday_videos or play_vv > unique_yesterday_videos[mix_id].get("play_vv", 0): if mix_id not in unique_yesterday_videos or play_vv > unique_yesterday_videos[mix_id].get("play_vv", 0):
unique_yesterday_videos[mix_id] = video unique_yesterday_videos[mix_id] = video
@ -369,15 +369,14 @@ class DouyinAutoScheduler:
current_play_vv = video.get("play_vv", 0) current_play_vv = video.get("play_vv", 0)
mix_name = video.get("mix_name", "").strip() mix_name = video.get("mix_name", "").strip()
# 🚫 跳过无效数据:确保mix_name不为空且播放量大于0 # 🚫 跳过无效数据:确保mix_name不为空
# 注意:这些数据应该已经在去重阶段被过滤掉了,这里是双重保险 # 注意:播放量为0的数据也会被保留,可能是新发布的短剧
if not mix_name or mix_name == "" or mix_name.lower() == "null": if not mix_name or mix_name == "" or mix_name.lower() == "null":
self.logger.warning(f"跳过空的mix_name记录video_id: {video_id}") self.logger.warning(f"跳过空的mix_name记录video_id: {video_id}")
continue continue
if current_play_vv <= 0: if current_play_vv <= 0:
self.logger.warning(f"跳过播放量无效的记录: mix_name={mix_name}, play_vv={current_play_vv}") self.logger.warning(f"⚠️ 榜单中发现播放量为0的记录: mix_name={mix_name}, play_vv={current_play_vv},仍会保留")
continue
# 计算排名变化(基于昨天的排名) # 计算排名变化(基于昨天的排名)
rank_change = 0 rank_change = 0

View File

@ -809,11 +809,15 @@ class DouyinPlayVVScraper:
if isinstance(play_vv, (int, str)) and str(play_vv).isdigit(): if isinstance(play_vv, (int, str)) and str(play_vv).isdigit():
vv = int(play_vv) vv = int(play_vv)
# 数据验证:确保合集名称不为空 # 数据验证:确保有mix_id(按短剧ID去重,所以必须有mix_id)
if not mix_id or mix_id.strip() == "":
logging.warning(f"跳过缺少mix_id的数据: play_vv={vv}, mix_name={mix_name}")
# 跳过当前项,但继续递归解析其他数据(不使用return)
else:
# 如果mix_name为空,使用mix_id作为名称
if not mix_name or mix_name.strip() == "": if not mix_name or mix_name.strip() == "":
logging.warning(f"跳过缺少合集名称的数据: play_vv={vv}, mix_id={mix_id}") mix_name = f"短剧_{mix_id}"
return logging.warning(f"⚠️ mix_name为空使用mix_id作为名称: {mix_name}")
# 🔧 修复:不跳过播放量为0的数据,而是标记并保留 # 🔧 修复:不跳过播放量为0的数据,而是标记并保留
# 这些数据可能是因为页面加载不完整,但合集本身是存在的 # 这些数据可能是因为页面加载不完整,但合集本身是存在的
# 警告信息移到去重检查之后,只有真正添加时才警告 # 警告信息移到去重检查之后,只有真正添加时才警告
@ -999,9 +1003,9 @@ class DouyinPlayVVScraper:
self.play_vv_items.remove(existing_item) self.play_vv_items.remove(existing_item)
self.play_vv_items.append(item_data) self.play_vv_items.append(item_data)
else: else:
# 已有数据更好,跳过 # 已有数据更好,跳过当前数据,但继续递归解析其他数据
logging.info(f'⏭️ 跳过重复短剧: {mix_name} (当前: {vv:,}, 已有: {existing_vv:,})') logging.info(f'⏭️ 跳过重复短剧: {mix_name} (当前: {vv:,}, 已有: {existing_vv:,})')
return # 跳过当前数据 # 注意不使用return避免中断递归解析
else: else:
# 不存在,直接添加 # 不存在,直接添加
self.play_vv_items.append(item_data) self.play_vv_items.append(item_data)
@ -1049,14 +1053,20 @@ class DouyinPlayVVScraper:
vv = int(match.group(3)) vv = int(match.group(3))
episodes = int(match.group(4)) episodes = int(match.group(4))
# 数据验证:确保播放量大于0且合集名称不为空 # 数据验证:确保有mix_id(按短剧ID去重)
# 注意:播放量为0的数据也会被保存,可能是新发布的短剧
if vv <= 0: if vv <= 0:
logging.warning(f"正则提取跳过无效的播放量数据: mix_name={mix_name}, play_vv={vv}") logging.warning(f"⚠️ 发现播放量为0的数据: mix_name={mix_name}, play_vv={vv},仍会保存")
# 检查mix_id,如果没有则跳过
if not mix_id or mix_id.strip() == "":
logging.warning(f"正则提取跳过缺少mix_id的数据: play_vv={vv}, mix_name={mix_name}")
continue continue
# 如果mix_name为空,使用mix_id作为名称
if not mix_name or mix_name.strip() == "": if not mix_name or mix_name.strip() == "":
logging.warning(f"正则提取跳过缺少合集名称的数据: play_vv={vv}") mix_name = f"短剧_{mix_id}"
continue logging.warning(f"⚠️ mix_name为空使用mix_id作为名称: {mix_name}")
# 构建合集链接 # 构建合集链接
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else "" video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
@ -1088,10 +1098,9 @@ class DouyinPlayVVScraper:
for match in re.findall(r'"play_vv"\s*:\s*(\d+)', text): for match in re.findall(r'"play_vv"\s*:\s*(\d+)', text):
try: try:
vv = int(match) vv = int(match)
# 数据验证:跳过无效的播放量数据 # 数据验证:播放量为0的数据也会被保存
if vv <= 0: if vv <= 0:
logging.warning(f"跳过无效的播放量数据: play_vv={vv}") logging.warning(f"⚠️ 发现播放量为0的数据: play_vv={vv},仍会保存")
continue
# 检查是否已经存在相同的play_vv # 检查是否已经存在相同的play_vv
if not any(item['play_vv'] == vv for item in self.play_vv_items): if not any(item['play_vv'] == vv for item in self.play_vv_items):
@ -1208,10 +1217,9 @@ class DouyinPlayVVScraper:
for m in re.findall(r'"statis"\s*:\s*\{[^}]*"play_vv"\s*:\s*(\d+)[^}]*\}', page_source): for m in re.findall(r'"statis"\s*:\s*\{[^}]*"play_vv"\s*:\s*(\d+)[^}]*\}', page_source):
try: try:
vv = int(m) vv = int(m)
# 数据验证:跳过无效的播放量数据 # 数据验证:播放量为0的数据也会被保存
if vv <= 0: if vv <= 0:
logging.warning(f"跳过无效的播放量数据: play_vv={vv}") logging.warning(f"⚠️ 发现播放量为0的数据: play_vv={vv},仍会保存")
continue
# 检查是否已经存在相同的play_vv # 检查是否已经存在相同的play_vv
if not any(item['play_vv'] == vv for item in self.play_vv_items): if not any(item['play_vv'] == vv for item in self.play_vv_items):