修改
This commit is contained in:
parent
ad54ff0398
commit
13b05ae252
@ -249,9 +249,9 @@ class DouyinAutoScheduler:
|
|||||||
if not mix_id or mix_id == "" or mix_id.lower() == "null":
|
if not mix_id or mix_id == "" or mix_id.lower() == "null":
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 过滤掉播放量为0或无效的记录
|
# 注意:播放量为0的数据也会被保留,可能是新发布的短剧
|
||||||
if play_vv <= 0:
|
if play_vv <= 0:
|
||||||
continue
|
logging.warning(f"⚠️ 发现播放量为0的数据: mix_name={mix_name}, play_vv={play_vv},仍会保留")
|
||||||
|
|
||||||
if mix_id not in unique_videos or play_vv > unique_videos[mix_id].get("play_vv", 0):
|
if mix_id not in unique_videos or play_vv > unique_videos[mix_id].get("play_vv", 0):
|
||||||
unique_videos[mix_id] = video
|
unique_videos[mix_id] = video
|
||||||
@ -283,7 +283,7 @@ class DouyinAutoScheduler:
|
|||||||
}).sort("play_vv", -1))
|
}).sort("play_vv", -1))
|
||||||
|
|
||||||
# 按短剧ID去重,每个短剧只保留播放量最高的一条
|
# 按短剧ID去重,每个短剧只保留播放量最高的一条
|
||||||
# 🚫 过滤掉空的或无效的mix_id和播放量为0的记录
|
# 🚫 过滤掉空的或无效的mix_id
|
||||||
unique_yesterday_videos = {}
|
unique_yesterday_videos = {}
|
||||||
for video in yesterday_videos_raw:
|
for video in yesterday_videos_raw:
|
||||||
mix_id = video.get("mix_id", "").strip()
|
mix_id = video.get("mix_id", "").strip()
|
||||||
@ -294,9 +294,9 @@ class DouyinAutoScheduler:
|
|||||||
if not mix_id or mix_id == "" or mix_id.lower() == "null":
|
if not mix_id or mix_id == "" or mix_id.lower() == "null":
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 过滤掉播放量为0或无效的记录
|
# 注意:播放量为0的数据也会被保留,可能是新发布的短剧
|
||||||
if play_vv <= 0:
|
if play_vv <= 0:
|
||||||
continue
|
logging.warning(f"⚠️ 昨天数据中发现播放量为0: mix_name={mix_name}, play_vv={play_vv},仍会保留")
|
||||||
|
|
||||||
if mix_id not in unique_yesterday_videos or play_vv > unique_yesterday_videos[mix_id].get("play_vv", 0):
|
if mix_id not in unique_yesterday_videos or play_vv > unique_yesterday_videos[mix_id].get("play_vv", 0):
|
||||||
unique_yesterday_videos[mix_id] = video
|
unique_yesterday_videos[mix_id] = video
|
||||||
@ -369,15 +369,14 @@ class DouyinAutoScheduler:
|
|||||||
current_play_vv = video.get("play_vv", 0)
|
current_play_vv = video.get("play_vv", 0)
|
||||||
mix_name = video.get("mix_name", "").strip()
|
mix_name = video.get("mix_name", "").strip()
|
||||||
|
|
||||||
# 🚫 跳过无效数据:确保mix_name不为空且播放量大于0
|
# 🚫 跳过无效数据:确保mix_name不为空
|
||||||
# 注意:这些数据应该已经在去重阶段被过滤掉了,这里是双重保险
|
# 注意:播放量为0的数据也会被保留,可能是新发布的短剧
|
||||||
if not mix_name or mix_name == "" or mix_name.lower() == "null":
|
if not mix_name or mix_name == "" or mix_name.lower() == "null":
|
||||||
self.logger.warning(f"跳过空的mix_name记录,video_id: {video_id}")
|
self.logger.warning(f"跳过空的mix_name记录,video_id: {video_id}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if current_play_vv <= 0:
|
if current_play_vv <= 0:
|
||||||
self.logger.warning(f"跳过播放量无效的记录: mix_name={mix_name}, play_vv={current_play_vv}")
|
self.logger.warning(f"⚠️ 榜单中发现播放量为0的记录: mix_name={mix_name}, play_vv={current_play_vv},仍会保留")
|
||||||
continue
|
|
||||||
|
|
||||||
# 计算排名变化(基于昨天的排名)
|
# 计算排名变化(基于昨天的排名)
|
||||||
rank_change = 0
|
rank_change = 0
|
||||||
|
|||||||
@ -809,91 +809,95 @@ class DouyinPlayVVScraper:
|
|||||||
if isinstance(play_vv, (int, str)) and str(play_vv).isdigit():
|
if isinstance(play_vv, (int, str)) and str(play_vv).isdigit():
|
||||||
vv = int(play_vv)
|
vv = int(play_vv)
|
||||||
|
|
||||||
# 数据验证:确保合集名称不为空
|
# 数据验证:确保有mix_id(按短剧ID去重,所以必须有mix_id)
|
||||||
if not mix_name or mix_name.strip() == "":
|
if not mix_id or mix_id.strip() == "":
|
||||||
logging.warning(f"跳过缺少合集名称的数据: play_vv={vv}, mix_id={mix_id}")
|
logging.warning(f"跳过缺少mix_id的数据: play_vv={vv}, mix_name={mix_name}")
|
||||||
return
|
# 跳过当前项,但继续递归解析其他数据(不使用return)
|
||||||
|
else:
|
||||||
# 🔧 修复:不跳过播放量为0的数据,而是标记并保留
|
# 如果mix_name为空,使用mix_id作为名称
|
||||||
# 这些数据可能是因为页面加载不完整,但合集本身是存在的
|
if not mix_name or mix_name.strip() == "":
|
||||||
# 警告信息移到去重检查之后,只有真正添加时才警告
|
mix_name = f"短剧_{mix_id}"
|
||||||
|
logging.warning(f"⚠️ mix_name为空,使用mix_id作为名称: {mix_name}")
|
||||||
# 构建合集链接
|
# 🔧 修复:不跳过播放量为0的数据,而是标记并保留
|
||||||
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
|
# 这些数据可能是因为页面加载不完整,但合集本身是存在的
|
||||||
|
# 警告信息移到去重检查之后,只有真正添加时才警告
|
||||||
# 提取合集封面图片URL - 直接存储完整的图片链接
|
|
||||||
cover_image_url = ""
|
# 构建合集链接
|
||||||
cover_image_backup_urls = [] # 备用链接列表
|
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
|
||||||
|
|
||||||
|
# 提取合集封面图片URL - 直接存储完整的图片链接
|
||||||
|
cover_image_url = ""
|
||||||
|
cover_image_backup_urls = [] # 备用链接列表
|
||||||
|
|
||||||
# 查找封面图片字段,优先获取完整的URL链接
|
# 查找封面图片字段,优先获取完整的URL链接
|
||||||
if 'cover' in obj:
|
if 'cover' in obj:
|
||||||
cover = obj['cover']
|
cover = obj['cover']
|
||||||
if isinstance(cover, dict) and 'url_list' in cover and cover['url_list']:
|
if isinstance(cover, dict) and 'url_list' in cover and cover['url_list']:
|
||||||
# 主链接
|
# 主链接
|
||||||
cover_image_url = cover['url_list'][0]
|
cover_image_url = cover['url_list'][0]
|
||||||
# 备用链接
|
# 备用链接
|
||||||
cover_image_backup_urls = cover['url_list'][1:] if len(cover['url_list']) > 1 else []
|
cover_image_backup_urls = cover['url_list'][1:] if len(cover['url_list']) > 1 else []
|
||||||
elif isinstance(cover, str):
|
elif isinstance(cover, str):
|
||||||
cover_image_url = cover
|
cover_image_url = cover
|
||||||
elif 'cover_url' in obj:
|
elif 'cover_url' in obj:
|
||||||
cover_url = obj['cover_url']
|
cover_url = obj['cover_url']
|
||||||
if isinstance(cover_url, dict) and 'url_list' in cover_url and cover_url['url_list']:
|
if isinstance(cover_url, dict) and 'url_list' in cover_url and cover_url['url_list']:
|
||||||
cover_image_url = cover_url['url_list'][0]
|
cover_image_url = cover_url['url_list'][0]
|
||||||
cover_image_backup_urls = cover_url['url_list'][1:] if len(cover_url['url_list']) > 1 else []
|
cover_image_backup_urls = cover_url['url_list'][1:] if len(cover_url['url_list']) > 1 else []
|
||||||
elif isinstance(cover_url, str):
|
elif isinstance(cover_url, str):
|
||||||
cover_image_url = cover_url
|
cover_image_url = cover_url
|
||||||
elif 'image' in obj:
|
elif 'image' in obj:
|
||||||
image = obj['image']
|
image = obj['image']
|
||||||
if isinstance(image, dict) and 'url_list' in image and image['url_list']:
|
if isinstance(image, dict) and 'url_list' in image and image['url_list']:
|
||||||
cover_image_url = image['url_list'][0]
|
cover_image_url = image['url_list'][0]
|
||||||
cover_image_backup_urls = image['url_list'][1:] if len(image['url_list']) > 1 else []
|
cover_image_backup_urls = image['url_list'][1:] if len(image['url_list']) > 1 else []
|
||||||
elif isinstance(image, str):
|
elif isinstance(image, str):
|
||||||
cover_image_url = image
|
cover_image_url = image
|
||||||
elif 'pic' in obj:
|
elif 'pic' in obj:
|
||||||
pic = obj['pic']
|
pic = obj['pic']
|
||||||
if isinstance(pic, dict) and 'url_list' in pic and pic['url_list']:
|
if isinstance(pic, dict) and 'url_list' in pic and pic['url_list']:
|
||||||
cover_image_url = pic['url_list'][0]
|
cover_image_url = pic['url_list'][0]
|
||||||
cover_image_backup_urls = pic['url_list'][1:] if len(pic['url_list']) > 1 else []
|
cover_image_backup_urls = pic['url_list'][1:] if len(pic['url_list']) > 1 else []
|
||||||
elif isinstance(pic, str):
|
elif isinstance(pic, str):
|
||||||
cover_image_url = pic
|
cover_image_url = pic
|
||||||
|
|
||||||
# 提取新增的五个字段
|
# 提取新增的五个字段
|
||||||
series_author = ""
|
series_author = ""
|
||||||
desc = ""
|
desc = ""
|
||||||
updated_to_episode = 0
|
updated_to_episode = 0
|
||||||
manufacturing_field = "" # 承制信息
|
manufacturing_field = "" # 承制信息
|
||||||
copyright_field = "" # 版权信息
|
copyright_field = "" # 版权信息
|
||||||
|
|
||||||
# 提取合集作者/影视工作室
|
# 提取合集作者/影视工作室
|
||||||
if 'author' in obj:
|
if 'author' in obj:
|
||||||
author = obj['author']
|
author = obj['author']
|
||||||
if isinstance(author, dict):
|
if isinstance(author, dict):
|
||||||
# 尝试多个可能的作者字段
|
# 尝试多个可能的作者字段
|
||||||
series_author = (author.get('nickname') or
|
series_author = (author.get('nickname') or
|
||||||
author.get('unique_id') or
|
author.get('unique_id') or
|
||||||
author.get('short_id') or
|
author.get('short_id') or
|
||||||
author.get('name') or '')
|
author.get('name') or '')
|
||||||
elif isinstance(author, str):
|
elif isinstance(author, str):
|
||||||
series_author = author
|
series_author = author
|
||||||
elif 'creator' in obj:
|
elif 'creator' in obj:
|
||||||
creator = obj['creator']
|
creator = obj['creator']
|
||||||
if isinstance(creator, dict):
|
if isinstance(creator, dict):
|
||||||
series_author = (creator.get('nickname') or
|
series_author = (creator.get('nickname') or
|
||||||
creator.get('unique_id') or
|
creator.get('unique_id') or
|
||||||
creator.get('name') or '')
|
creator.get('name') or '')
|
||||||
elif isinstance(creator, str):
|
elif isinstance(creator, str):
|
||||||
series_author = creator
|
series_author = creator
|
||||||
elif 'user' in obj:
|
elif 'user' in obj:
|
||||||
user = obj['user']
|
user = obj['user']
|
||||||
if isinstance(user, dict):
|
if isinstance(user, dict):
|
||||||
series_author = (user.get('nickname') or
|
series_author = (user.get('nickname') or
|
||||||
user.get('unique_id') or
|
user.get('unique_id') or
|
||||||
user.get('name') or '')
|
user.get('name') or '')
|
||||||
elif isinstance(user, str):
|
elif isinstance(user, str):
|
||||||
series_author = user
|
series_author = user
|
||||||
|
|
||||||
# 提取合集描述 - 扩展更多可能的字段
|
# 提取合集描述 - 扩展更多可能的字段
|
||||||
description_fields = ['desc', 'share_info'] # 保持字段列表
|
description_fields = ['desc', 'share_info'] # 保持字段列表
|
||||||
|
|
||||||
# 先检查desc字段
|
# 先检查desc字段
|
||||||
if 'desc' in obj and obj['desc']:
|
if 'desc' in obj and obj['desc']:
|
||||||
@ -999,9 +1003,9 @@ class DouyinPlayVVScraper:
|
|||||||
self.play_vv_items.remove(existing_item)
|
self.play_vv_items.remove(existing_item)
|
||||||
self.play_vv_items.append(item_data)
|
self.play_vv_items.append(item_data)
|
||||||
else:
|
else:
|
||||||
# 已有数据更好,跳过
|
# 已有数据更好,跳过当前数据但继续递归解析其他数据
|
||||||
logging.info(f'⏭️ 跳过重复短剧: {mix_name} (当前: {vv:,}, 已有: {existing_vv:,})')
|
logging.info(f'⏭️ 跳过重复短剧: {mix_name} (当前: {vv:,}, 已有: {existing_vv:,})')
|
||||||
return # 跳过当前数据
|
# 注意:不使用return,避免中断递归解析
|
||||||
else:
|
else:
|
||||||
# 不存在,直接添加
|
# 不存在,直接添加
|
||||||
self.play_vv_items.append(item_data)
|
self.play_vv_items.append(item_data)
|
||||||
@ -1049,14 +1053,20 @@ class DouyinPlayVVScraper:
|
|||||||
vv = int(match.group(3))
|
vv = int(match.group(3))
|
||||||
episodes = int(match.group(4))
|
episodes = int(match.group(4))
|
||||||
|
|
||||||
# 数据验证:确保播放量大于0且合集名称不为空
|
# 数据验证:确保有mix_id(按短剧ID去重)
|
||||||
|
# 注意:播放量为0的数据也会被保存,可能是新发布的短剧
|
||||||
if vv <= 0:
|
if vv <= 0:
|
||||||
logging.warning(f"正则提取跳过无效的播放量数据: mix_name={mix_name}, play_vv={vv}")
|
logging.warning(f"⚠️ 发现播放量为0的数据: mix_name={mix_name}, play_vv={vv},仍会保存")
|
||||||
|
|
||||||
|
# 检查mix_id,如果没有则跳过
|
||||||
|
if not mix_id or mix_id.strip() == "":
|
||||||
|
logging.warning(f"正则提取跳过缺少mix_id的数据: play_vv={vv}, mix_name={mix_name}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# 如果mix_name为空,使用mix_id作为名称
|
||||||
if not mix_name or mix_name.strip() == "":
|
if not mix_name or mix_name.strip() == "":
|
||||||
logging.warning(f"正则提取跳过缺少合集名称的数据: play_vv={vv}")
|
mix_name = f"短剧_{mix_id}"
|
||||||
continue
|
logging.warning(f"⚠️ mix_name为空,使用mix_id作为名称: {mix_name}")
|
||||||
|
|
||||||
# 构建合集链接
|
# 构建合集链接
|
||||||
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
|
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
|
||||||
@ -1088,10 +1098,9 @@ class DouyinPlayVVScraper:
|
|||||||
for match in re.findall(r'"play_vv"\s*:\s*(\d+)', text):
|
for match in re.findall(r'"play_vv"\s*:\s*(\d+)', text):
|
||||||
try:
|
try:
|
||||||
vv = int(match)
|
vv = int(match)
|
||||||
# 数据验证:跳过无效的播放量数据
|
# 数据验证:播放量为0的数据也会被保存
|
||||||
if vv <= 0:
|
if vv <= 0:
|
||||||
logging.warning(f"跳过无效的播放量数据: play_vv={vv}")
|
logging.warning(f"⚠️ 发现播放量为0的数据: play_vv={vv},仍会保存")
|
||||||
continue
|
|
||||||
|
|
||||||
# 检查是否已经存在相同的play_vv
|
# 检查是否已经存在相同的play_vv
|
||||||
if not any(item['play_vv'] == vv for item in self.play_vv_items):
|
if not any(item['play_vv'] == vv for item in self.play_vv_items):
|
||||||
@ -1208,10 +1217,9 @@ class DouyinPlayVVScraper:
|
|||||||
for m in re.findall(r'"statis"\s*:\s*\{[^}]*"play_vv"\s*:\s*(\d+)[^}]*\}', page_source):
|
for m in re.findall(r'"statis"\s*:\s*\{[^}]*"play_vv"\s*:\s*(\d+)[^}]*\}', page_source):
|
||||||
try:
|
try:
|
||||||
vv = int(m)
|
vv = int(m)
|
||||||
# 数据验证:跳过无效的播放量数据
|
# 数据验证:播放量为0的数据也会被保存
|
||||||
if vv <= 0:
|
if vv <= 0:
|
||||||
logging.warning(f"跳过无效的播放量数据: play_vv={vv}")
|
logging.warning(f"⚠️ 发现播放量为0的数据: play_vv={vv},仍会保存")
|
||||||
continue
|
|
||||||
|
|
||||||
# 检查是否已经存在相同的play_vv
|
# 检查是否已经存在相同的play_vv
|
||||||
if not any(item['play_vv'] == vv for item in self.play_vv_items):
|
if not any(item['play_vv'] == vv for item in self.play_vv_items):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user