Add three fields to the Rankings_list database: series author, series description, and total episode count

Qyir 2025-10-21 15:12:18 +08:00
parent be44334960
commit 8b607f6e24
2 changed files with 141 additions and 7 deletions
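
In storage terms, each ranking document gains series_author, desc, and updated_to_episode. A minimal read-side sketch of the result, assuming pymongo plus a placeholder URI and hypothetical db/collection names (none of which appear in this diff):

    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")   # placeholder URI
    coll = client["douyin"]["Rankings_list"]            # hypothetical db/collection handle

    # Print the three new fields alongside the mix name, highest play_vv first.
    for doc in coll.find().sort("play_vv", -1).limit(5):
        print(doc.get("mix_name", ""),
              doc.get("series_author", ""),       # new: series author / studio
              doc.get("updated_to_episode", 0),   # new: total episode count
              (doc.get("desc", "") or "")[:50])   # new: description, truncated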


@@ -481,10 +481,13 @@ class DouyinPlayVVScraper:
                 if len(self.play_vv_items) < 3:
                     logging.info("=== debug: mix object structure ===")
                     logging.info(f"full object keys: {list(obj.keys())}")
-                    # look for possible video-related fields
+                    # look for possible video-related fields and the new fields
                     for key, value in obj.items():
                         if 'aweme' in key.lower() or 'video' in key.lower() or 'item' in key.lower() or 'ids' in key.lower():
                             logging.info(f"possible video field {key}: {type(value)} - {str(value)[:200]}")
+                        # check keys related to the newly added fields
+                        elif any(keyword in key.lower() for keyword in ['author', 'creator', 'user', 'desc', 'description', 'total', 'count', 'episode']):
+                            logging.info(f"possible new field {key}: {type(value)} - {str(value)[:200]}")

                 # specifically check the ids field
                 if 'ids' in obj:
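
The elif added above widens the debug scan from video-related keys to keys that might hold the new fields. A self-contained illustration of what that scan surfaces, run against a synthetic object (all values fabricated):

    obj = {
        "mix_id": "7001",
        "author": {"nickname": "studio_x"},
        "desc": "A short-drama series.",
        "statis": {"play_vv": 123456, "updated_to_episode": 80},
    }
    for key, value in obj.items():
        if any(k in key.lower() for k in ['author', 'creator', 'user', 'desc',
                                          'description', 'total', 'count', 'episode']):
            print(f"possible new field {key}: {type(value)} - {str(value)[:200]}")
    # prints the author and desc entries; mix_id and statis match none of the keywords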
@@ -539,6 +542,112 @@ class DouyinPlayVVScraper:
                     elif isinstance(pic, str):
                         cover_image_url = pic

+                # extract the three new fields
+                series_author = ""
+                desc = ""
+                updated_to_episode = 0
+
+                # extract the series author / studio
+                if 'author' in obj:
+                    author = obj['author']
+                    if isinstance(author, dict):
+                        # try several possible author fields
+                        series_author = (author.get('nickname') or
+                                         author.get('unique_id') or
+                                         author.get('short_id') or
+                                         author.get('name') or '')
+                    elif isinstance(author, str):
+                        series_author = author
+                elif 'creator' in obj:
+                    creator = obj['creator']
+                    if isinstance(creator, dict):
+                        series_author = (creator.get('nickname') or
+                                         creator.get('unique_id') or
+                                         creator.get('name') or '')
+                    elif isinstance(creator, str):
+                        series_author = creator
+                elif 'user' in obj:
+                    user = obj['user']
+                    if isinstance(user, dict):
+                        series_author = (user.get('nickname') or
+                                         user.get('unique_id') or
+                                         user.get('name') or '')
+                    elif isinstance(user, str):
+                        series_author = user
+
+                # extract the series description
+                description_fields = ['desc', 'share_info']  # candidate description fields
+
+                # check the desc field first
+                if 'desc' in obj and obj['desc']:
+                    desc_value = str(obj['desc']).strip()
+                    if desc_value:
+                        desc = desc_value
+                        logging.info("description extracted from desc")
+
+                # if desc yielded nothing usable, check share_info
+                if not desc and 'share_info' in obj and isinstance(obj['share_info'], dict):
+                    share_desc = obj['share_info'].get('share_desc', '').strip()
+                    if share_desc:
+                        desc = share_desc
+                        logging.info("description extracted from share_info.share_desc")
+
+                # still nothing: fall back to the candidate field list
+                # (string values only; mostly redundant with the checks above, kept as a safety net)
+                if not desc:
+                    for field in description_fields:
+                        if field in obj and isinstance(obj[field], str) and obj[field].strip():
+                            desc = obj[field].strip()
+                            logging.info(f"description extracted from {field}")
+                            break
+
+                # as a last resort, search nested objects for a desc field
+                if not desc:
+                    def search_nested_desc(data, depth=0):
+                        if depth > 3:  # limit recursion depth
+                            return None
+                        if isinstance(data, dict):
+                            # check the desc field at the current level
+                            if 'desc' in data and data['desc']:
+                                desc_value = str(data['desc']).strip()
+                                if 5 <= len(desc_value) <= 1000:
+                                    return desc_value
+                            # recurse into nested objects
+                            for value in data.values():
+                                if isinstance(value, dict):
+                                    nested_result = search_nested_desc(value, depth + 1)
+                                    if nested_result:
+                                        return nested_result
+                        return None
+
+                    desc = search_nested_desc(obj) or ""
+
+                # extract the total episode count from the statis field
+                if 'statis' in obj and isinstance(obj['statis'], dict):
+                    statis = obj['statis']
+                    if 'updated_to_episode' in statis:
+                        try:
+                            episodes = int(statis['updated_to_episode'])
+                            if episodes > 0:
+                                updated_to_episode = episodes
+                                logging.info(f"episode count extracted from statis.updated_to_episode: {episodes}")
+                        except (TypeError, ValueError):
+                            logging.warning("updated_to_episode value could not be converted to an integer")
+                else:
+                    logging.info("statis field missing or not a dict; trying top-level updated_to_episode")
+                    try:
+                        episodes = int(obj.get('updated_to_episode', 0))
+                        if episodes > 0:
+                            updated_to_episode = episodes
+                            logging.info(f"episode count extracted from updated_to_episode: {episodes}")
+                    except (TypeError, ValueError):
+                        pass  # ignore values that cannot be converted to an integer
+
                 self.play_vv_items.append({
                     'play_vv': vv,
                     'formatted': self.format_count(vv),
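
The author extraction above repeats one nickname/unique_id/name fallback across author, creator, and user. A hedged consolidation sketch: the helper names are hypothetical, not part of the commit, and it slightly generalizes by trying short_id for every holder where the commit only tries it for author:

    def _first_nonempty(container, keys):
        # Return the first truthy value among the candidate keys, as a string.
        for key in keys:
            value = container.get(key)
            if value:
                return str(value)
        return ""

    def extract_series_author(obj):
        # Same precedence as the commit: author, then creator, then user.
        # Like the commit's elif chain, the first field present wins even if
        # it yields an empty string.
        for field in ('author', 'creator', 'user'):
            if field in obj:
                holder = obj[field]
                if isinstance(holder, dict):
                    return _first_nonempty(holder, ('nickname', 'unique_id', 'short_id', 'name'))
                if isinstance(holder, str):
                    return holder
                return ""  # present but neither dict nor str
        return ""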
@@ -549,9 +658,18 @@ class DouyinPlayVVScraper:
                     'mix_id': mix_id,  # mix ID
                     'cover_image_url': cover_image_url,  # full primary cover image URL for the mix
                     'cover_backup_urls': cover_image_backup_urls,  # list of backup cover image URLs
+                    'series_author': series_author,  # series author / studio
+                    'desc': desc,  # series description
+                    'updated_to_episode': updated_to_episode,  # total episode count
                     'timestamp': datetime.now().isoformat()
                 })

                 logging.info(f'extracted mix: {mix_name} (ID: {mix_id}) - {vv:,} plays')
+                if series_author:
+                    logging.info(f'  author: {series_author}')
+                if desc:
+                    logging.info(f'  description: {desc[:100]}{"..." if len(desc) > 100 else ""}')
+                if updated_to_episode > 0:
+                    logging.info(f'  total episodes: {updated_to_episode}')

         # recursively search child objects
         for key, value in obj.items():
@@ -567,17 +685,21 @@ class DouyinPlayVVScraper:
     def _extract_from_text_regex(self, text: str, source_url: str, request_id: str = None):
         """Extract information from the text using regular expressions"""

-        # find JSON fragments containing complete mix information
-        mix_pattern = r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)[^{}]*\}[^{}]*\}'
+        # find JSON fragments containing complete mix information, including updated_to_episode inside statis
+        mix_pattern = r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)[^{}]*"updated_to_episode"\s*:\s*(\d+)[^{}]*\}[^{}]*\}'

         for match in re.finditer(mix_pattern, text):
             try:
                 mix_id = match.group(1)
                 mix_name = match.group(2)
                 vv = int(match.group(3))
+                episodes = int(match.group(4))

                 # build the mix URL
                 video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
+                if episodes > 0:
+                    logging.info(f"episode count extracted from statis.updated_to_episode: {episodes}")

                 self.play_vv_items.append({
                     'play_vv': vv,
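
A quick self-contained check of the extended pattern against a fabricated fragment. Note the pattern assumes play_vv precedes updated_to_episode inside statis; fragments with the opposite key order will not match:

    import re

    mix_pattern = r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)[^{}]*"updated_to_episode"\s*:\s*(\d+)[^{}]*\}[^{}]*\}'

    # Fabricated fragment shaped like the page data the scraper parses.
    text = '{"mix_id": "7001", "mix_name": "demo series", "statis": {"play_vv": 123456, "updated_to_episode": 80}}'

    m = re.search(mix_pattern, text)
    assert m is not None
    print(m.group(1), m.group(2), int(m.group(3)), int(m.group(4)))
    # -> 7001 demo series 123456 80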
@@ -587,6 +709,7 @@ class DouyinPlayVVScraper:
                     'mix_name': mix_name,
                     'video_url': video_url,  # mix URL
                     'mix_id': mix_id,  # mix ID
+                    'updated_to_episode': episodes if episodes > 0 else None,  # episode count from statis.updated_to_episode
                     'timestamp': datetime.now().isoformat()
                 })

                 logging.info(f'regex-extracted mix: {mix_name} (ID: {mix_id}) - {vv:,} plays')
@@ -607,6 +730,7 @@ class DouyinPlayVVScraper:
                     'mix_name': '',  # unknown mix name
                     'video_url': '',  # unknown URL
                     'mix_id': '',  # unknown mix_id
+                    'updated_to_episode': None,  # unknown episode count
                     'timestamp': datetime.now().isoformat()
                 })
             except Exception:
@@ -871,7 +995,7 @@ class DouyinPlayVVScraper:
                 # no cover image: use an empty string
                 permanent_cover_url = ''

-            # keep the 7 user-requested fields + cover_image_url as the full mix cover URL
+            # keep the 7 user-requested fields + cover_image_url as the full mix cover URL + the 3 new fields
             doc = {
                 'batch_time': batch_time,
                 'mix_name': mix_name,
@@ -882,7 +1006,11 @@ class DouyinPlayVVScraper:
                 'rank': 0,  # temporary value, recalculated later
                 'cover_image_url_original': original_cover_url,  # keep the original temporary URL for debugging
                 'cover_image_url': permanent_cover_url,  # permanent mix cover image URL
-                'cover_backup_urls': item.get('cover_backup_urls', [])  # list of backup cover image URLs
+                'cover_backup_urls': item.get('cover_backup_urls', []),  # list of backup cover image URLs
+                # the three new fields
+                'series_author': item.get('series_author', ''),  # series author / studio
+                'desc': item.get('desc', ''),  # series description
+                'updated_to_episode': item.get('updated_to_episode', 0)  # total episode count
             }
             documents.append(doc)
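
The doc above stores rank as 0 with a note that it is recalculated later; that recalculation is outside this diff. A hedged sketch of the conventional approach (sort by play_vv descending, then number sequentially), with fabricated sample docs:

    documents = [
        {"mix_name": "a", "play_vv": 500},
        {"mix_name": "b", "play_vv": 900},
    ]
    documents.sort(key=lambda d: d["play_vv"], reverse=True)
    for i, doc in enumerate(documents, start=1):
        doc["rank"] = i
    print([(d["mix_name"], d["rank"]) for d in documents])  # -> [('b', 1), ('a', 2)]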
@@ -900,7 +1028,7 @@ class DouyinPlayVVScraper:
         max_play_vv = max(doc['play_vv'] for doc in documents) if documents else 0
         logging.info(f'MongoDB save stats: total plays={total_play_vv:,}, max plays={max_play_vv:,}')
-        logging.info('saved fields: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url')
+        logging.info('saved fields: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url, series_author, desc, updated_to_episode')

         # count cover image processing results
         cover_count = sum(1 for doc in documents if doc.get('cover_image_url'))


@@ -128,7 +128,13 @@ def format_mix_item(doc):
         "play_vv": doc.get("play_vv", 0),
         "request_id": doc.get("request_id", ""),
         "rank": doc.get("rank", 0),
-        "cover_image_url": doc.get("cover_image_url", "")
+        "cover_image_url": doc.get("cover_image_url", ""),
+        # new fields
+        "series_author": doc.get("series_author", ""),
+        "desc": doc.get("desc", ""),
+        "updated_to_episode": doc.get("updated_to_episode", 0),
+        "cover_backup_urls": doc.get("cover_backup_urls", []),
+        "mix_id": doc.get("mix_id", "")
     }

 def get_mix_list(page=1, limit=20, sort_by="playcount"):
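
A usage sketch of format_mix_item with a fabricated document shaped like the ones the scraper now saves (assumes format_mix_item from this file is in scope; all values are made up):

    sample_doc = {
        "play_vv": 123456,
        "request_id": "req-1",
        "rank": 1,
        "cover_image_url": "https://example.com/cover.jpg",
        "series_author": "studio_x",        # new field
        "desc": "A short-drama series.",    # new field
        "updated_to_episode": 80,           # new field
        "cover_backup_urls": [],
        "mix_id": "7001",
    }
    item = format_mix_item(sample_doc)
    print(item["series_author"], item["updated_to_episode"])  # -> studio_x 80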