From 13b05ae252a12aa05bd63c7000020426c04b19f9 Mon Sep 17 00:00:00 2001 From: qiaoyirui0819 <3160533978@qq.com> Date: Sat, 8 Nov 2025 16:29:33 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/Timer_worker.py | 17 +- .../handlers/Rankings/rank_data_scraper.py | 194 +++++++++--------- 2 files changed, 109 insertions(+), 102 deletions(-) diff --git a/backend/Timer_worker.py b/backend/Timer_worker.py index e514454..3fcebd3 100644 --- a/backend/Timer_worker.py +++ b/backend/Timer_worker.py @@ -249,9 +249,9 @@ class DouyinAutoScheduler: if not mix_id or mix_id == "" or mix_id.lower() == "null": continue - # 过滤掉播放量为0或无效的记录 + # 注意:播放量为0的数据也会被保留,可能是新发布的短剧 if play_vv <= 0: - continue + logging.warning(f"⚠️ 发现播放量为0的数据: mix_name={mix_name}, play_vv={play_vv},仍会保留") if mix_id not in unique_videos or play_vv > unique_videos[mix_id].get("play_vv", 0): unique_videos[mix_id] = video @@ -283,7 +283,7 @@ class DouyinAutoScheduler: }).sort("play_vv", -1)) # 按短剧ID去重,每个短剧只保留播放量最高的一条 - # 🚫 过滤掉空的或无效的mix_id和播放量为0的记录 + # 🚫 过滤掉空的或无效的mix_id unique_yesterday_videos = {} for video in yesterday_videos_raw: mix_id = video.get("mix_id", "").strip() @@ -294,9 +294,9 @@ class DouyinAutoScheduler: if not mix_id or mix_id == "" or mix_id.lower() == "null": continue - # 过滤掉播放量为0或无效的记录 + # 注意:播放量为0的数据也会被保留,可能是新发布的短剧 if play_vv <= 0: - continue + logging.warning(f"⚠️ 昨天数据中发现播放量为0: mix_name={mix_name}, play_vv={play_vv},仍会保留") if mix_id not in unique_yesterday_videos or play_vv > unique_yesterday_videos[mix_id].get("play_vv", 0): unique_yesterday_videos[mix_id] = video @@ -369,15 +369,14 @@ class DouyinAutoScheduler: current_play_vv = video.get("play_vv", 0) mix_name = video.get("mix_name", "").strip() - # 🚫 跳过无效数据:确保mix_name不为空且播放量大于0 - # 注意:这些数据应该已经在去重阶段被过滤掉了,这里是双重保险 + # 🚫 跳过无效数据:确保mix_name不为空 + # 注意:播放量为0的数据也会被保留,可能是新发布的短剧 if not mix_name or mix_name == "" or mix_name.lower() == "null": self.logger.warning(f"跳过空的mix_name记录,video_id: {video_id}") continue if current_play_vv <= 0: - self.logger.warning(f"跳过播放量无效的记录: mix_name={mix_name}, play_vv={current_play_vv}") - continue + self.logger.warning(f"⚠️ 榜单中发现播放量为0的记录: mix_name={mix_name}, play_vv={current_play_vv},仍会保留") # 计算排名变化(基于昨天的排名) rank_change = 0 diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py index 4c24f95..a682bb5 100644 --- a/backend/handlers/Rankings/rank_data_scraper.py +++ b/backend/handlers/Rankings/rank_data_scraper.py @@ -809,91 +809,95 @@ class DouyinPlayVVScraper: if isinstance(play_vv, (int, str)) and str(play_vv).isdigit(): vv = int(play_vv) - # 数据验证:确保合集名称不为空 - if not mix_name or mix_name.strip() == "": - logging.warning(f"跳过缺少合集名称的数据: play_vv={vv}, mix_id={mix_id}") - return - - # 🔧 修复:不跳过播放量为0的数据,而是标记并保留 - # 这些数据可能是因为页面加载不完整,但合集本身是存在的 - # 警告信息移到去重检查之后,只有真正添加时才警告 - - # 构建合集链接 - video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else "" - - # 提取合集封面图片URL - 直接存储完整的图片链接 - cover_image_url = "" - cover_image_backup_urls = [] # 备用链接列表 + # 数据验证:确保有mix_id(按短剧ID去重,所以必须有mix_id) + if not mix_id or mix_id.strip() == "": + logging.warning(f"跳过缺少mix_id的数据: play_vv={vv}, mix_name={mix_name}") + # 跳过当前项,但继续递归解析其他数据(不使用return) + else: + # 如果mix_name为空,使用mix_id作为名称 + if not mix_name or mix_name.strip() == "": + mix_name = f"短剧_{mix_id}" + logging.warning(f"⚠️ mix_name为空,使用mix_id作为名称: {mix_name}") + # 🔧 修复:不跳过播放量为0的数据,而是标记并保留 + # 这些数据可能是因为页面加载不完整,但合集本身是存在的 + # 警告信息移到去重检查之后,只有真正添加时才警告 + + # 构建合集链接 + video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else "" + + # 提取合集封面图片URL - 直接存储完整的图片链接 + cover_image_url = "" + cover_image_backup_urls = [] # 备用链接列表 - # 查找封面图片字段,优先获取完整的URL链接 - if 'cover' in obj: - cover = obj['cover'] - if isinstance(cover, dict) and 'url_list' in cover and cover['url_list']: - # 主链接 - cover_image_url = cover['url_list'][0] - # 备用链接 - cover_image_backup_urls = cover['url_list'][1:] if len(cover['url_list']) > 1 else [] - elif isinstance(cover, str): - cover_image_url = cover - elif 'cover_url' in obj: - cover_url = obj['cover_url'] - if isinstance(cover_url, dict) and 'url_list' in cover_url and cover_url['url_list']: - cover_image_url = cover_url['url_list'][0] - cover_image_backup_urls = cover_url['url_list'][1:] if len(cover_url['url_list']) > 1 else [] - elif isinstance(cover_url, str): - cover_image_url = cover_url - elif 'image' in obj: - image = obj['image'] - if isinstance(image, dict) and 'url_list' in image and image['url_list']: - cover_image_url = image['url_list'][0] - cover_image_backup_urls = image['url_list'][1:] if len(image['url_list']) > 1 else [] - elif isinstance(image, str): - cover_image_url = image - elif 'pic' in obj: - pic = obj['pic'] - if isinstance(pic, dict) and 'url_list' in pic and pic['url_list']: - cover_image_url = pic['url_list'][0] - cover_image_backup_urls = pic['url_list'][1:] if len(pic['url_list']) > 1 else [] - elif isinstance(pic, str): - cover_image_url = pic + # 查找封面图片字段,优先获取完整的URL链接 + if 'cover' in obj: + cover = obj['cover'] + if isinstance(cover, dict) and 'url_list' in cover and cover['url_list']: + # 主链接 + cover_image_url = cover['url_list'][0] + # 备用链接 + cover_image_backup_urls = cover['url_list'][1:] if len(cover['url_list']) > 1 else [] + elif isinstance(cover, str): + cover_image_url = cover + elif 'cover_url' in obj: + cover_url = obj['cover_url'] + if isinstance(cover_url, dict) and 'url_list' in cover_url and cover_url['url_list']: + cover_image_url = cover_url['url_list'][0] + cover_image_backup_urls = cover_url['url_list'][1:] if len(cover_url['url_list']) > 1 else [] + elif isinstance(cover_url, str): + cover_image_url = cover_url + elif 'image' in obj: + image = obj['image'] + if isinstance(image, dict) and 'url_list' in image and image['url_list']: + cover_image_url = image['url_list'][0] + cover_image_backup_urls = image['url_list'][1:] if len(image['url_list']) > 1 else [] + elif isinstance(image, str): + cover_image_url = image + elif 'pic' in obj: + pic = obj['pic'] + if isinstance(pic, dict) and 'url_list' in pic and pic['url_list']: + cover_image_url = pic['url_list'][0] + cover_image_backup_urls = pic['url_list'][1:] if len(pic['url_list']) > 1 else [] + elif isinstance(pic, str): + cover_image_url = pic - # 提取新增的五个字段 - series_author = "" - desc = "" - updated_to_episode = 0 - manufacturing_field = "" # 承制信息 - copyright_field = "" # 版权信息 + # 提取新增的五个字段 + series_author = "" + desc = "" + updated_to_episode = 0 + manufacturing_field = "" # 承制信息 + copyright_field = "" # 版权信息 - # 提取合集作者/影视工作室 - if 'author' in obj: - author = obj['author'] - if isinstance(author, dict): - # 尝试多个可能的作者字段 - series_author = (author.get('nickname') or - author.get('unique_id') or - author.get('short_id') or - author.get('name') or '') - elif isinstance(author, str): - series_author = author - elif 'creator' in obj: - creator = obj['creator'] - if isinstance(creator, dict): - series_author = (creator.get('nickname') or - creator.get('unique_id') or - creator.get('name') or '') - elif isinstance(creator, str): - series_author = creator - elif 'user' in obj: - user = obj['user'] - if isinstance(user, dict): - series_author = (user.get('nickname') or - user.get('unique_id') or - user.get('name') or '') - elif isinstance(user, str): - series_author = user + # 提取合集作者/影视工作室 + if 'author' in obj: + author = obj['author'] + if isinstance(author, dict): + # 尝试多个可能的作者字段 + series_author = (author.get('nickname') or + author.get('unique_id') or + author.get('short_id') or + author.get('name') or '') + elif isinstance(author, str): + series_author = author + elif 'creator' in obj: + creator = obj['creator'] + if isinstance(creator, dict): + series_author = (creator.get('nickname') or + creator.get('unique_id') or + creator.get('name') or '') + elif isinstance(creator, str): + series_author = creator + elif 'user' in obj: + user = obj['user'] + if isinstance(user, dict): + series_author = (user.get('nickname') or + user.get('unique_id') or + user.get('name') or '') + elif isinstance(user, str): + series_author = user - # 提取合集描述 - 扩展更多可能的字段 - description_fields = ['desc', 'share_info'] # 保持字段列表 + # 提取合集描述 - 扩展更多可能的字段 + description_fields = ['desc', 'share_info'] # 保持字段列表 # 先检查desc字段 if 'desc' in obj and obj['desc']: @@ -999,9 +1003,9 @@ class DouyinPlayVVScraper: self.play_vv_items.remove(existing_item) self.play_vv_items.append(item_data) else: - # 已有数据更好,跳过 + # 已有数据更好,跳过当前数据但继续递归解析其他数据 logging.info(f'⏭️ 跳过重复短剧: {mix_name} (当前: {vv:,}, 已有: {existing_vv:,})') - return # 跳过当前数据 + # 注意:不使用return,避免中断递归解析 else: # 不存在,直接添加 self.play_vv_items.append(item_data) @@ -1049,14 +1053,20 @@ class DouyinPlayVVScraper: vv = int(match.group(3)) episodes = int(match.group(4)) - # 数据验证:确保播放量大于0且合集名称不为空 + # 数据验证:确保有mix_id(按短剧ID去重) + # 注意:播放量为0的数据也会被保存,可能是新发布的短剧 if vv <= 0: - logging.warning(f"正则提取跳过无效的播放量数据: mix_name={mix_name}, play_vv={vv}") + logging.warning(f"⚠️ 发现播放量为0的数据: mix_name={mix_name}, play_vv={vv},仍会保存") + + # 检查mix_id,如果没有则跳过 + if not mix_id or mix_id.strip() == "": + logging.warning(f"正则提取跳过缺少mix_id的数据: play_vv={vv}, mix_name={mix_name}") continue + # 如果mix_name为空,使用mix_id作为名称 if not mix_name or mix_name.strip() == "": - logging.warning(f"正则提取跳过缺少合集名称的数据: play_vv={vv}") - continue + mix_name = f"短剧_{mix_id}" + logging.warning(f"⚠️ mix_name为空,使用mix_id作为名称: {mix_name}") # 构建合集链接 video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else "" @@ -1088,10 +1098,9 @@ class DouyinPlayVVScraper: for match in re.findall(r'"play_vv"\s*:\s*(\d+)', text): try: vv = int(match) - # 数据验证:跳过无效的播放量数据 + # 数据验证:播放量为0的数据也会被保存 if vv <= 0: - logging.warning(f"跳过无效的播放量数据: play_vv={vv}") - continue + logging.warning(f"⚠️ 发现播放量为0的数据: play_vv={vv},仍会保存") # 检查是否已经存在相同的play_vv if not any(item['play_vv'] == vv for item in self.play_vv_items): @@ -1208,10 +1217,9 @@ class DouyinPlayVVScraper: for m in re.findall(r'"statis"\s*:\s*\{[^}]*"play_vv"\s*:\s*(\d+)[^}]*\}', page_source): try: vv = int(m) - # 数据验证:跳过无效的播放量数据 + # 数据验证:播放量为0的数据也会被保存 if vv <= 0: - logging.warning(f"跳过无效的播放量数据: play_vv={vv}") - continue + logging.warning(f"⚠️ 发现播放量为0的数据: play_vv={vv},仍会保存") # 检查是否已经存在相同的play_vv if not any(item['play_vv'] == vv for item in self.play_vv_items):