From 8b607f6e248d18aa11ef369389c4de4d22be6dfe Mon Sep 17 00:00:00 2001
From: Qyir <13521889462@163.com>
Date: Tue, 21 Oct 2025 15:12:18 +0800
Subject: [PATCH] Add three fields to the Rankings_list database: series
 author, series description, and total episode count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../handlers/Rankings/rank_data_scraper.py | 140 +++++++++++++++++-
 backend/routers/rank_api_routes.py         |   8 +-
 2 files changed, 141 insertions(+), 7 deletions(-)

diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py
index 95c885b..b3d3dd7 100644
--- a/backend/handlers/Rankings/rank_data_scraper.py
+++ b/backend/handlers/Rankings/rank_data_scraper.py
@@ -481,10 +481,13 @@ class DouyinPlayVVScraper:
             if len(self.play_vv_items) < 3:
                 logging.info(f"=== Debug: mix object structure ===")
                 logging.info(f"Full object keys: {list(obj.keys())}")
-                # Look for likely video-related fields
+                # Look for likely video-related fields and for the new fields
                 for key, value in obj.items():
                     if 'aweme' in key.lower() or 'video' in key.lower() or 'item' in key.lower() or 'ids' in key.lower():
                         logging.info(f"Possible video field {key}: {type(value)} - {str(value)[:200]}")
+                    # Check keys that may carry the new fields
+                    elif any(keyword in key.lower() for keyword in ['author', 'creator', 'user', 'desc', 'description', 'total', 'count', 'episode']):
+                        logging.info(f"Possible new field {key}: {type(value)} - {str(value)[:200]}")
 
                 # Specifically inspect the ids field
                 if 'ids' in obj:
@@ -539,6 +542,112 @@ class DouyinPlayVVScraper:
                     elif isinstance(pic, str):
                         cover_image_url = pic
 
+            # Extract the three new fields
+            series_author = ""
+            desc = ""
+            updated_to_episode = 0
+
+            # Extract the mix author / studio
+            if 'author' in obj:
+                author = obj['author']
+                if isinstance(author, dict):
+                    # Try several possible author fields
+                    series_author = (author.get('nickname') or
+                                     author.get('unique_id') or
+                                     author.get('short_id') or
+                                     author.get('name') or '')
+                elif isinstance(author, str):
+                    series_author = author
+            elif 'creator' in obj:
+                creator = obj['creator']
+                if isinstance(creator, dict):
+                    series_author = (creator.get('nickname') or
+                                     creator.get('unique_id') or
+                                     creator.get('name') or '')
+                elif isinstance(creator, str):
+                    series_author = creator
+            elif 'user' in obj:
+                user = obj['user']
+                if isinstance(user, dict):
+                    series_author = (user.get('nickname') or
+                                     user.get('unique_id') or
+                                     user.get('name') or '')
+                elif isinstance(user, str):
+                    series_author = user
+
+            # Extract the mix description, checking the desc field first
+            if 'desc' in obj and obj['desc']:
+                desc_value = str(obj['desc']).strip()
+                if desc_value:
+                    desc = desc_value
+                    logging.info(f"Description extracted from desc")
+
+            # If desc yielded nothing, check share_info.share_desc
+            if not desc and 'share_info' in obj and isinstance(obj['share_info'], dict):
+                share_desc = obj['share_info'].get('share_desc', '').strip()
+                if share_desc:
+                    desc = share_desc
+                    logging.info(f"Description extracted from share_info.share_desc")
+
+            # If there is still no description, search nested objects for a desc field
+            if not desc:
+                def search_nested_desc(data, depth=0):
+                    if depth > 3:  # limit recursion depth
+                        return None
+
+                    if isinstance(data, dict):
+                        # Check the desc field at the current level
+                        if 'desc' in data and data['desc']:
+                            desc_value = str(data['desc']).strip()
+                            # Keep only plausibly sized descriptions
+                            if 5 <= len(desc_value) <= 1000:
+                                return desc_value
+
+                        # Recurse into nested objects
+                        for value in data.values():
+                            if isinstance(value, dict):
+                                nested_result = search_nested_desc(value, depth + 1)
+                                if nested_result:
+                                    return nested_result
+                    return None
+
+                desc = search_nested_desc(obj) or ''
+
+            # Extract the total episode count from the statis field
+            if 'statis' in obj and isinstance(obj['statis'], dict):
+                statis = obj['statis']
+                if 'updated_to_episode' in statis:
+                    try:
+                        episodes = int(statis['updated_to_episode'])
+                        if episodes > 0:
+                            updated_to_episode = episodes
+                            logging.info(f"Episode count extracted from statis.updated_to_episode: {episodes}")
+                    except (TypeError, ValueError):
+                        logging.warning("updated_to_episode value could not be converted to an integer")
+            else:
+                logging.info("statis field not found or statis is not a dict")
+                # Fall back to a top-level updated_to_episode field, if present
+                if 'updated_to_episode' in obj:
+                    try:
+                        episodes = int(obj['updated_to_episode'])
+                        if episodes > 0:
+                            updated_to_episode = episodes
+                            logging.info(f"Episode count extracted from updated_to_episode: {episodes}")
+                    except (TypeError, ValueError):
+                        pass  # ignore values that cannot be converted to an integer
+
             self.play_vv_items.append({
                 'play_vv': vv,
                 'formatted': self.format_count(vv),
@@ -549,9 +658,18 @@ class DouyinPlayVVScraper:
                 'mix_id': mix_id,  # mix ID
                 'cover_image_url': cover_image_url,  # main cover image link (full URL)
                 'cover_backup_urls': cover_image_backup_urls,  # backup cover image links
+                'series_author': series_author,  # mix author / studio
+                'desc': desc,  # mix description
+                'updated_to_episode': updated_to_episode,  # total episode count
                 'timestamp': datetime.now().isoformat()
             })
             logging.info(f'Extracted mix: {mix_name} (ID: {mix_id}) - {vv:,} plays')
+            if series_author:
+                logging.info(f'  author: {series_author}')
+            if desc:
+                logging.info(f'  description: {desc[:100]}{"..." if len(desc) > 100 else ""}')
+            if updated_to_episode > 0:
+                logging.info(f'  total episodes: {updated_to_episode}')
 
         # Recursively search child objects
         for key, value in obj.items():
@@ -567,17 +685,21 @@ class DouyinPlayVVScraper:
 
     def _extract_from_text_regex(self, text: str, source_url: str, request_id: str = None):
         """Extract information from text using regular expressions"""
-        # Look for JSON fragments containing complete mix info
-        mix_pattern = r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)[^{}]*\}[^{}]*\}'
+        # Look for JSON fragments with complete mix info, now also capturing
+        # updated_to_episode inside statis; the group is optional so mixes
+        # without the field still match
+        mix_pattern = r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)(?:[^{}]*"updated_to_episode"\s*:\s*(\d+))?[^{}]*\}[^{}]*\}'
 
         for match in re.finditer(mix_pattern, text):
             try:
                 mix_id = match.group(1)
                 mix_name = match.group(2)
                 vv = int(match.group(3))
+                episodes = int(match.group(4)) if match.group(4) else 0
 
                 # Build the mix link
                 video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
+
+                if episodes > 0:
+                    logging.info(f"Episode count extracted from statis.updated_to_episode: {episodes}")
 
                 self.play_vv_items.append({
                     'play_vv': vv,
@@ -587,6 +709,7 @@ class DouyinPlayVVScraper:
                     'mix_name': mix_name,
                     'video_url': video_url,  # mix link
                     'mix_id': mix_id,  # mix ID
+                    'updated_to_episode': episodes if episodes > 0 else None,  # episode count from statis.updated_to_episode
                     'timestamp': datetime.now().isoformat()
                 })
                 logging.info(f'Regex-extracted mix: {mix_name} (ID: {mix_id}) - {vv:,} plays')
@@ -607,6 +730,7 @@ class DouyinPlayVVScraper:
                     'mix_name': '',  # unknown mix name
                     'video_url': '',  # unknown link
                     'mix_id': '',  # unknown mix_id
+                    'updated_to_episode': None,  # unknown episode count
                     'timestamp': datetime.now().isoformat()
                 })
             except Exception:
@@ -871,7 +995,7 @@ class DouyinPlayVVScraper:
                     # No cover image; use an empty string
                     permanent_cover_url = ''
 
-                # Keep the 7 user-requested fields + cover_image_url as the full mix cover link
+                # Keep the 7 user-requested fields + cover_image_url as the full mix cover link + the 3 new fields
                 doc = {
                     'batch_time': batch_time,
                     'mix_name': mix_name,
                     'rank': 0,  # set temporarily; recalculated later
                     'cover_image_url_original': original_cover_url,  # keep the original temporary link for debugging
                     'cover_image_url': permanent_cover_url,  # permanent cover image link
-                    'cover_backup_urls': item.get('cover_backup_urls', [])  # backup cover image links
+                    'cover_backup_urls': item.get('cover_backup_urls', []),  # backup cover image links
+                    # the three new fields
+                    'series_author': item.get('series_author', ''),  # mix author / studio
+                    'desc': item.get('desc', ''),  # mix description
+                    'updated_to_episode': item.get('updated_to_episode', 0)  # total episode count
                 }
                 documents.append(doc)
@@ -900,7 +1028,7 @@ class DouyinPlayVVScraper:
             max_play_vv = max(doc['play_vv'] for doc in documents) if documents else 0
             logging.info(f'MongoDB save stats: total plays={total_play_vv:,}, max plays={max_play_vv:,}')
 
-            logging.info(f'Saved fields: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url')
+            logging.info(f'Saved fields: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url, series_author, desc, updated_to_episode')
 
             # Count how cover images were handled
             cover_count = sum(1 for doc in documents if doc.get('cover_image_url'))
diff --git a/backend/routers/rank_api_routes.py b/backend/routers/rank_api_routes.py
index 54061b5..660cb2b 100644
--- a/backend/routers/rank_api_routes.py
+++ b/backend/routers/rank_api_routes.py
@@ -128,7 +128,13 @@ def format_mix_item(doc):
         "play_vv": doc.get("play_vv", 0),
         "request_id": doc.get("request_id", ""),
         "rank": doc.get("rank", 0),
-        "cover_image_url": doc.get("cover_image_url", "")
+        "cover_image_url": doc.get("cover_image_url", ""),
+        # new fields
+        "series_author": doc.get("series_author", ""),
+        "desc": doc.get("desc", ""),
+        "updated_to_episode": doc.get("updated_to_episode", 0),
+        "cover_backup_urls": doc.get("cover_backup_urls", []),
+        "mix_id": doc.get("mix_id", "")
     }
 
 def get_mix_list(page=1, limit=20, sort_by="playcount"):
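
Verification note (not part of the patch): a minimal sanity check for the
extended mix_pattern, run against made-up JSON fragments shaped like the
statis objects the scraper targets. The mix IDs, names, and counts are
invented; the pattern assumes play_vv precedes updated_to_episode inside
statis, as it does in the diff above.

    import re

    # Same pattern as in _extract_from_text_regex, with the optional
    # updated_to_episode group
    mix_pattern = (
        r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"'
        r'[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)'
        r'(?:[^{}]*"updated_to_episode"\s*:\s*(\d+))?[^{}]*\}[^{}]*\}'
    )

    samples = [
        # invented fragment with an episode count
        '{"mix_id": "7100000000000000001", "mix_name": "Sample Mix",'
        ' "statis": {"play_vv": 123456, "updated_to_episode": 24}}',
        # invented fragment without one; group 4 is None and episodes stays 0
        '{"mix_id": "7100000000000000002", "mix_name": "No Episode Count",'
        ' "statis": {"play_vv": 789}}',
    ]

    for text in samples:
        match = re.search(mix_pattern, text)
        assert match is not None
        episodes = int(match.group(4)) if match.group(4) else 0
        print(match.group(1), match.group(2), int(match.group(3)), episodes)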
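
A related sketch, also not in the patch: documents written before this change
lack the three new fields, so reads currently rely on the doc.get defaults in
format_mix_item. If backfilling is preferred, something like the following
could write the same defaults in place. The connection URL and database name
are assumptions, since the patch does not show the MongoDB setup; only the
collection name Rankings_list comes from the subject line.

    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")  # assumed connection URL
    coll = client["douyin"]["Rankings_list"]           # database name is hypothetical

    # Give pre-patch documents the defaults that format_mix_item falls back to
    for field, default in (("series_author", ""), ("desc", ""),
                           ("updated_to_episode", 0)):
        result = coll.update_many({field: {"$exists": False}},
                                  {"$set": {field: default}})
        print(f"{field}: backfilled {result.modified_count} documents")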