在Rankings_list数据库里面添加三个字段:合集作者,合集描述,合集总集数
This commit is contained in:
parent
be44334960
commit
8b607f6e24
@ -481,10 +481,13 @@ class DouyinPlayVVScraper:
|
|||||||
if len(self.play_vv_items) < 3:
|
if len(self.play_vv_items) < 3:
|
||||||
logging.info(f"=== 调试:合集对象结构 ===")
|
logging.info(f"=== 调试:合集对象结构 ===")
|
||||||
logging.info(f"完整对象键: {list(obj.keys())}")
|
logging.info(f"完整对象键: {list(obj.keys())}")
|
||||||
# 查找可能的视频相关字段
|
# 查找可能的视频相关字段和新增字段
|
||||||
for key, value in obj.items():
|
for key, value in obj.items():
|
||||||
if 'aweme' in key.lower() or 'video' in key.lower() or 'item' in key.lower() or 'ids' in key.lower():
|
if 'aweme' in key.lower() or 'video' in key.lower() or 'item' in key.lower() or 'ids' in key.lower():
|
||||||
logging.info(f"可能的视频字段 {key}: {type(value)} - {str(value)[:200]}")
|
logging.info(f"可能的视频字段 {key}: {type(value)} - {str(value)[:200]}")
|
||||||
|
# 检查新增字段相关的键
|
||||||
|
elif any(keyword in key.lower() for keyword in ['author', 'creator', 'user', 'desc', 'description', 'total', 'count', 'episode']):
|
||||||
|
logging.info(f"可能的新字段 {key}: {type(value)} - {str(value)[:200]}")
|
||||||
|
|
||||||
# 特别检查ids字段
|
# 特别检查ids字段
|
||||||
if 'ids' in obj:
|
if 'ids' in obj:
|
||||||
@ -539,6 +542,112 @@ class DouyinPlayVVScraper:
|
|||||||
elif isinstance(pic, str):
|
elif isinstance(pic, str):
|
||||||
cover_image_url = pic
|
cover_image_url = pic
|
||||||
|
|
||||||
|
# 提取新增的三个字段
|
||||||
|
series_author = ""
|
||||||
|
desc = ""
|
||||||
|
updated_to_episode = 0
|
||||||
|
|
||||||
|
# 提取合集作者/影视工作室
|
||||||
|
if 'author' in obj:
|
||||||
|
author = obj['author']
|
||||||
|
if isinstance(author, dict):
|
||||||
|
# 尝试多个可能的作者字段
|
||||||
|
series_author = (author.get('nickname') or
|
||||||
|
author.get('unique_id') or
|
||||||
|
author.get('short_id') or
|
||||||
|
author.get('name') or '')
|
||||||
|
elif isinstance(author, str):
|
||||||
|
series_author = author
|
||||||
|
elif 'creator' in obj:
|
||||||
|
creator = obj['creator']
|
||||||
|
if isinstance(creator, dict):
|
||||||
|
series_author = (creator.get('nickname') or
|
||||||
|
creator.get('unique_id') or
|
||||||
|
creator.get('name') or '')
|
||||||
|
elif isinstance(creator, str):
|
||||||
|
series_author = creator
|
||||||
|
elif 'user' in obj:
|
||||||
|
user = obj['user']
|
||||||
|
if isinstance(user, dict):
|
||||||
|
series_author = (user.get('nickname') or
|
||||||
|
user.get('unique_id') or
|
||||||
|
user.get('name') or '')
|
||||||
|
elif isinstance(user, str):
|
||||||
|
series_author = user
|
||||||
|
|
||||||
|
# 提取合集描述 - 扩展更多可能的字段
|
||||||
|
description_fields = ['desc', 'share_info'] # 保持字段列表
|
||||||
|
|
||||||
|
# 先检查desc字段
|
||||||
|
if 'desc' in obj and obj['desc']:
|
||||||
|
desc_value = str(obj['desc']).strip()
|
||||||
|
if desc_value:
|
||||||
|
desc = desc_value
|
||||||
|
logging.info(f"从desc提取到描述")
|
||||||
|
|
||||||
|
# 如果desc中没有找到有效描述,检查share_info
|
||||||
|
if not desc and 'share_info' in obj and isinstance(obj['share_info'], dict):
|
||||||
|
share_desc = obj['share_info'].get('share_desc', '').strip()
|
||||||
|
if share_desc:
|
||||||
|
desc = share_desc
|
||||||
|
logging.info(f"从share_info.share_desc提取到描述")
|
||||||
|
|
||||||
|
# 如果share_info中没有找到有效描述,继续检查desc字段
|
||||||
|
if not desc:
|
||||||
|
for field in description_fields:
|
||||||
|
if field in obj and obj[field]:
|
||||||
|
desc_value = str(obj[field]).strip()
|
||||||
|
if desc_value:
|
||||||
|
desc = desc_value
|
||||||
|
logging.info(f"从{field}提取到描述")
|
||||||
|
break
|
||||||
|
|
||||||
|
# 如果还没有找到描述,尝试从嵌套对象中查找desc字段
|
||||||
|
if not desc:
|
||||||
|
def search_nested_desc(data, depth=0):
    """Return the first usable ``desc`` string found in nested dicts.

    The search is depth-first and follows dict values only; lists and other
    containers are not descended into.

    Args:
        data: Object to inspect. Anything that is not a dict yields None.
        depth: Current nesting level; the top-level call uses 0.

    Returns:
        The stripped ``desc`` text of the first dict (in iteration order)
        whose ``desc`` is a string of 5 to 1000 characters after stripping,
        or None when nothing qualifies within the depth limit.
    """
    # Cap the recursion: levels 0 through 3 are inspected, deeper ones are not.
    if depth > 3 or not isinstance(data, dict):
        return None

    candidate = data.get('desc')
    # Only real strings count. A dict/list/number stored under 'desc' used to
    # be passed through str(), which returned its repr as the description
    # instead of searching inside it.
    if isinstance(candidate, str):
        text = candidate.strip()
        # Length window kept from the original; presumably it filters out
        # placeholder-short and oversized values -- TODO confirm the intent.
        if 5 <= len(text) <= 1000:
            return text

    # No usable description at this level: try each nested dict in turn.
    for child in data.values():
        if isinstance(child, dict):
            found = search_nested_desc(child, depth + 1)
            if found:
                return found
    return None
|
||||||
|
|
||||||
|
desc = search_nested_desc(obj)
|
||||||
|
|
||||||
|
|
||||||
|
# 提取合集总集数 - 从statis字段中获取
|
||||||
|
updated_to_episode = 0 # 初始化默认值
|
||||||
|
if 'statis' in obj and isinstance(obj['statis'], dict):
|
||||||
|
statis = obj['statis']
|
||||||
|
if 'updated_to_episode' in statis:
|
||||||
|
try:
|
||||||
|
episodes = int(statis['updated_to_episode'])
|
||||||
|
if episodes > 0:
|
||||||
|
updated_to_episode = episodes
|
||||||
|
logging.info(f"从statis.updated_to_episode提取到集数: {episodes}")
|
||||||
|
except ValueError:
|
||||||
|
logging.warning("updated_to_episode字段值无法转换为整数")
|
||||||
|
else:
|
||||||
|
logging.info("未找到statis字段或statis不是字典类型")
|
||||||
|
try:
|
||||||
|
episodes = int(obj['updated_to_episode'])
|
||||||
|
if episodes > 0:
|
||||||
|
updated_to_episode = episodes
|
||||||
|
logging.info(f"从updated_to_episode提取到集数: {episodes}")
|
||||||
|
except ValueError:
|
||||||
|
pass # 忽略无法转换为整数的情况
|
||||||
|
|
||||||
self.play_vv_items.append({
|
self.play_vv_items.append({
|
||||||
'play_vv': vv,
|
'play_vv': vv,
|
||||||
'formatted': self.format_count(vv),
|
'formatted': self.format_count(vv),
|
||||||
@ -549,9 +658,18 @@ class DouyinPlayVVScraper:
|
|||||||
'mix_id': mix_id, # 合集ID
|
'mix_id': mix_id, # 合集ID
|
||||||
'cover_image_url': cover_image_url, # 合集封面图片主链接(完整URL)
|
'cover_image_url': cover_image_url, # 合集封面图片主链接(完整URL)
|
||||||
'cover_backup_urls': cover_image_backup_urls, # 封面图片备用链接列表
|
'cover_backup_urls': cover_image_backup_urls, # 封面图片备用链接列表
|
||||||
|
'series_author': series_author, # 合集作者/影视工作室
|
||||||
|
'desc': desc, # 合集描述
|
||||||
|
'updated_to_episode': updated_to_episode, # 合集总集数
|
||||||
'timestamp': datetime.now().isoformat()
|
'timestamp': datetime.now().isoformat()
|
||||||
})
|
})
|
||||||
logging.info(f'提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
logging.info(f'提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
||||||
|
if series_author:
|
||||||
|
logging.info(f' 作者: {series_author}')
|
||||||
|
if desc:
|
||||||
|
logging.info(f' 描述: {desc[:100]}{"..." if len(desc) > 100 else ""}')
|
||||||
|
if updated_to_episode > 0:
|
||||||
|
logging.info(f' 总集数: {updated_to_episode}')
|
||||||
|
|
||||||
# 递归搜索子对象
|
# 递归搜索子对象
|
||||||
for key, value in obj.items():
|
for key, value in obj.items():
|
||||||
@ -567,17 +685,21 @@ class DouyinPlayVVScraper:
|
|||||||
|
|
||||||
def _extract_from_text_regex(self, text: str, source_url: str, request_id: str = None):
|
def _extract_from_text_regex(self, text: str, source_url: str, request_id: str = None):
|
||||||
"""使用正则表达式从文本中提取信息"""
|
"""使用正则表达式从文本中提取信息"""
|
||||||
# 查找包含完整合集信息的JSON片段
|
# 查找包含完整合集信息的JSON片段,包括statis中的updated_to_episode
|
||||||
mix_pattern = r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)[^{}]*\}[^{}]*\}'
|
mix_pattern = r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)[^{}]*"updated_to_episode"\s*:\s*(\d+)[^{}]*\}[^{}]*\}'
|
||||||
|
|
||||||
for match in re.finditer(mix_pattern, text):
|
for match in re.finditer(mix_pattern, text):
|
||||||
try:
|
try:
|
||||||
mix_id = match.group(1)
|
mix_id = match.group(1)
|
||||||
mix_name = match.group(2)
|
mix_name = match.group(2)
|
||||||
vv = int(match.group(3))
|
vv = int(match.group(3))
|
||||||
|
episodes = int(match.group(4))
|
||||||
|
|
||||||
# 构建合集链接
|
# 构建合集链接
|
||||||
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
|
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
|
||||||
|
|
||||||
|
if episodes > 0:
|
||||||
|
logging.info(f"从statis.updated_to_episode提取到集数: {episodes}")
|
||||||
|
|
||||||
self.play_vv_items.append({
|
self.play_vv_items.append({
|
||||||
'play_vv': vv,
|
'play_vv': vv,
|
||||||
@ -587,6 +709,7 @@ class DouyinPlayVVScraper:
|
|||||||
'mix_name': mix_name,
|
'mix_name': mix_name,
|
||||||
'video_url': video_url, # 合集链接
|
'video_url': video_url, # 合集链接
|
||||||
'mix_id': mix_id, # 合集ID
|
'mix_id': mix_id, # 合集ID
|
||||||
|
'updated_to_episode': episodes if episodes > 0 else None, # 从statis.updated_to_episode提取的集数
|
||||||
'timestamp': datetime.now().isoformat()
|
'timestamp': datetime.now().isoformat()
|
||||||
})
|
})
|
||||||
logging.info(f'正则提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
logging.info(f'正则提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
||||||
@ -607,6 +730,7 @@ class DouyinPlayVVScraper:
|
|||||||
'mix_name': '', # 未知合集名称
|
'mix_name': '', # 未知合集名称
|
||||||
'video_url': '', # 未知链接
|
'video_url': '', # 未知链接
|
||||||
'mix_id': '', # 未知mix_id
|
'mix_id': '', # 未知mix_id
|
||||||
|
'updated_to_episode': None, # 未知集数
|
||||||
'timestamp': datetime.now().isoformat()
|
'timestamp': datetime.now().isoformat()
|
||||||
})
|
})
|
||||||
except Exception:
|
except Exception:
|
||||||
@ -871,7 +995,7 @@ class DouyinPlayVVScraper:
|
|||||||
# 没有封面图片,使用空字符串
|
# 没有封面图片,使用空字符串
|
||||||
permanent_cover_url = ''
|
permanent_cover_url = ''
|
||||||
|
|
||||||
# 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接
|
# 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + 新增的3个字段
|
||||||
doc = {
|
doc = {
|
||||||
'batch_time': batch_time,
|
'batch_time': batch_time,
|
||||||
'mix_name': mix_name,
|
'mix_name': mix_name,
|
||||||
@ -882,7 +1006,11 @@ class DouyinPlayVVScraper:
|
|||||||
'rank': 0, # 临时设置,后面会重新计算
|
'rank': 0, # 临时设置,后面会重新计算
|
||||||
'cover_image_url_original': original_cover_url, # 保存原始临时链接用于调试
|
'cover_image_url_original': original_cover_url, # 保存原始临时链接用于调试
|
||||||
'cover_image_url': permanent_cover_url, # 合集封面图片永久链接
|
'cover_image_url': permanent_cover_url, # 合集封面图片永久链接
|
||||||
'cover_backup_urls': item.get('cover_backup_urls', []) # 封面图片备用链接列表
|
'cover_backup_urls': item.get('cover_backup_urls', []), # 封面图片备用链接列表
|
||||||
|
# 新增的三个字段
|
||||||
|
'series_author': item.get('series_author', ''), # 合集作者/影视工作室
|
||||||
|
'desc': item.get('desc', ''), # 合集描述
|
||||||
|
'updated_to_episode': item.get('updated_to_episode', 0) # 合集总集数
|
||||||
}
|
}
|
||||||
documents.append(doc)
|
documents.append(doc)
|
||||||
|
|
||||||
@ -900,7 +1028,7 @@ class DouyinPlayVVScraper:
|
|||||||
max_play_vv = max(doc['play_vv'] for doc in documents) if documents else 0
|
max_play_vv = max(doc['play_vv'] for doc in documents) if documents else 0
|
||||||
|
|
||||||
logging.info(f'MongoDB保存统计: 总播放量={total_play_vv:,}, 最高播放量={max_play_vv:,}')
|
logging.info(f'MongoDB保存统计: 总播放量={total_play_vv:,}, 最高播放量={max_play_vv:,}')
|
||||||
logging.info(f'保存的字段: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url')
|
logging.info(f'保存的字段: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url, series_author, desc, updated_to_episode')
|
||||||
|
|
||||||
# 统计封面图片处理情况
|
# 统计封面图片处理情况
|
||||||
cover_count = sum(1 for doc in documents if doc.get('cover_image_url'))
|
cover_count = sum(1 for doc in documents if doc.get('cover_image_url'))
|
||||||
|
|||||||
@ -128,7 +128,13 @@ def format_mix_item(doc):
|
|||||||
"play_vv": doc.get("play_vv", 0),
|
"play_vv": doc.get("play_vv", 0),
|
||||||
"request_id": doc.get("request_id", ""),
|
"request_id": doc.get("request_id", ""),
|
||||||
"rank": doc.get("rank", 0),
|
"rank": doc.get("rank", 0),
|
||||||
"cover_image_url": doc.get("cover_image_url", "")
|
"cover_image_url": doc.get("cover_image_url", ""),
|
||||||
|
# 新增字段
|
||||||
|
"series_author": doc.get("series_author", ""),
|
||||||
|
"desc": doc.get("desc", ""),
|
||||||
|
"updated_to_episode": doc.get("updated_to_episode", 0),
|
||||||
|
"cover_backup_urls": doc.get("cover_backup_urls", []),
|
||||||
|
"mix_id": doc.get("mix_id", "")
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_mix_list(page=1, limit=20, sort_by="playcount"):
|
def get_mix_list(page=1, limit=20, sort_by="playcount"):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user