Added the episode_video_ids field: when rank_data_scraper.py or the timer code runs, in addition to writing the original data to the database, the video ID of every episode of each short drama is now saved as well.

Note:
After the script runs, the fetch code creates an episode_video_ids folder
that stores the video ID of every episode from your first run (as a cache).
The check works as follows: after the script runs, it compares the number of
cached episodes with the drama's episode count; if they match, the cached
video IDs are used, otherwise the IDs are fetched again.

Fetching the video IDs does not take long.
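For illustration, here is a minimal standalone sketch of that cache check (a hypothetical helper written for this note, not part of the commit; the real logic lives in get_collection_videos in the diff below, and the file layout matches what that method writes):

import json
import os

def cached_ids_if_complete(mix_id, episode_count, cache_dir='episode_video_ids'):
    """Return the cached video IDs if the cache already covers every episode, else None."""
    cache_file = os.path.join(cache_dir, f'video_ids_{mix_id}.json')
    if not os.path.exists(cache_file):
        return None
    with open(cache_file, 'r', encoding='utf-8') as f:
        cache_data = json.load(f)
    # The cache stores {'episodes': [{'video_id': ..., 'episode_num': ...}, ...],
    # 'total_count': ..., 'last_update': ..., 'mix_name': ...}.
    episodes = cache_data.get('episodes', [])
    # Same rule as the scraper: reuse the cache only when the cached episode
    # count matches the collection's known episode count.
    if len(episodes) == episode_count:
        return [e['video_id'] for e in episodes]
    return None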
Qyir 2025-10-21 17:56:00 +08:00
parent 8b607f6e24
commit 8b1149da56
2 changed files with 242 additions and 4 deletions

View File

@@ -23,6 +23,7 @@ import logging
import os
import shutil
from datetime import datetime
import requests
from selenium import webdriver
import os
@@ -995,7 +996,20 @@ class DouyinPlayVVScraper:
# No cover image; use an empty string
permanent_cover_url = ''
# Keep the 7 user-requested fields + cover_image_url as the full collection-cover URL + the 3 new fields
# Fetch all video IDs in the collection
mix_id = item.get('mix_id', '')
episode_video_ids = []
if mix_id:
logging.info(f'Fetching all video IDs for collection {mix_name}')
current_episode_count = item.get('updated_to_episode', 0)
episode_video_ids = self.get_collection_videos(
mix_id=mix_id,
mix_name=mix_name,
current_episode_count=current_episode_count
)
logging.info(f'Collection {mix_name}: fetched {len(episode_video_ids)} video IDs')
# Keep the 7 user-requested fields + cover_image_url as the full collection-cover URL + new fields
doc = {
'batch_time': batch_time,
'mix_name': mix_name,
@@ -1007,10 +1021,11 @@ class DouyinPlayVVScraper:
'cover_image_url_original': original_cover_url, # keep the original temporary URL for debugging
'cover_image_url': permanent_cover_url, # permanent URL of the collection cover image
'cover_backup_urls': item.get('cover_backup_urls', []), # list of backup cover image URLs
# The three new fields
# New fields
'series_author': item.get('series_author', ''), # collection author / studio
'desc': item.get('desc', ''), # collection description
'updated_to_episode': item.get('updated_to_episode', 0) # total number of episodes
'updated_to_episode': item.get('updated_to_episode', 0), # total number of episodes
'episode_video_ids': episode_video_ids # list of per-episode video IDs
}
documents.append(doc)
@@ -1042,6 +1057,228 @@ class DouyinPlayVVScraper:
except Exception as e:
logging.error(f'Error saving to MongoDB: {e}')
def get_video_info(self, video_id: str) -> dict:
"""获取视频详细信息
Args:
video_id: 视频ID
Returns:
dict: 包含视频详细信息的字典
"""
video_url = f'https://www.douyin.com/video/{video_id}'
logging.info(f'Fetching video info: {video_url}')
# Clear browser cache and cookies so the page issues fresh network requests
self.driver.execute_cdp_cmd('Network.clearBrowserCache', {})
self.driver.execute_cdp_cmd('Network.clearBrowserCookies', {})
self.driver.get(video_url)
time.sleep(3)
# Wait for the page to finish loading
try:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
WebDriverWait(self.driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "video"))
)
except Exception as e:
logging.warning(f'Timed out waiting for the video element: {e}')
# Read the browser's performance (network) logs
logs = self.driver.get_log('performance')
video_info = {}
for entry in logs:
try:
log = json.loads(entry['message'])['message']
if (
'Network.responseReceived' in log['method']
and 'response' in log['params']
and 'url' in log['params']['response']
and '/web/api/v2/aweme/iteminfo' in log['params']['response']['url']
):
request_id = log['params']['requestId']
response = self.driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
if response and 'body' in response:
data = json.loads(response['body'])
if 'item_list' in data and len(data['item_list']) > 0:
item = data['item_list'][0]
video_info = {
'video_id': item.get('aweme_id'),
'create_time': item.get('create_time'),
'desc': item.get('desc'),
'duration': item.get('duration'),
'mix_info': {
'mix_id': item.get('mix_info', {}).get('mix_id'),
'mix_name': item.get('mix_info', {}).get('mix_name'),
'total': item.get('mix_info', {}).get('total')
}
}
break
except Exception as e:
logging.warning(f'Error parsing a log entry: {e}')
return video_info
def get_collection_videos(self, mix_id: str, mix_name: str = '', current_episode_count: int = 0) -> list:
"""获取合集中的所有视频ID列表支持增量更新
Args:
mix_id: 合集ID
mix_name: 合集名称用于日志
current_episode_count: 当前已知的集数
Returns:
list: 按集数排序的视频ID列表
"""
cached_videos = [] # defined before the try so the except handler below can fall back to it safely
try:
# Check for an existing cache file
cache_dir = os.path.join(os.path.dirname(__file__), 'episode_video_ids')
# Make sure the cache directory exists
os.makedirs(cache_dir, exist_ok=True)
cache_file = os.path.join(cache_dir, f'video_ids_{mix_id}.json')
try:
if os.path.exists(cache_file):
with open(cache_file, 'r', encoding='utf-8') as f:
cache_data = json.load(f)
cached_videos = cache_data.get('episodes', [])
last_update = cache_data.get('last_update')
# If the cached episode count equals the current count, return the cached result directly
if len(cached_videos) == current_episode_count:
logging.info(f"Using cached video list: {mix_name} (ID: {mix_id})")
return [video['video_id'] for video in cached_videos]
except Exception as e:
logging.warning(f"读取缓存文件失败: {e}")
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Referer': 'https://www.douyin.com/',
}
params = {
'device_platform': 'webapp',
'aid': '6383',
'channel': 'channel_pc_web',
'pc_client_type': '1',
'version_code': '170400',
'version_name': '17.4.0',
'cookie_enabled': 'true',
'platform': 'PC',
'downlink': '10',
'mix_id': mix_id,
'cursor': '0',
'count': '30',
'screen_width': '1920',
'screen_height': '1080',
'browser_language': 'zh-CN',
'browser_platform': 'Win32',
'browser_name': 'Chrome',
'browser_version': '120.0.0.0',
'browser_online': 'true',
'engine_name': 'Blink',
'engine_version': '120.0.0.0',
'os_name': 'Windows',
'os_version': '10',
'cpu_core_num': '16',
'device_memory': '8',
'effective_type': '4g',
'round_trip_time': '50',
}
all_videos = []
while True:
response = requests.get(
'https://www.douyin.com/aweme/v1/web/mix/aweme/',
params=params,
cookies=self.get_cookies_dict(),
headers=headers
)
if response.status_code != 200:
logging.error(f"请求失败: {response.status_code}")
logging.error(f"响应内容: {response.text}")
break
try:
data = response.json()
aweme_list = data.get('aweme_list', [])
if not aweme_list:
break
for aweme in aweme_list:
video_id = aweme.get('aweme_id')
if video_id:
all_videos.append({
'video_id': video_id,
'episode_num': int(aweme.get('episode_num', 0))
})
has_more = data.get('has_more', False)
if not has_more:
break
params['cursor'] = str(len(all_videos)) # advance the cursor by the number of videos fetched so far
time.sleep(1)
except json.JSONDecodeError as e:
logging.error(f"JSON解析错误: {e}")
logging.error(f"响应内容: {response.text}")
break
if not all_videos:
if cached_videos:
logging.warning(f"获取视频列表失败,使用缓存数据: {mix_name} (ID: {mix_id})")
return [video['video_id'] for video in cached_videos]
return []
logging.info(f"获取到 {len(all_videos)} 个视频ID")
# 按集数排序
all_videos.sort(key=lambda x: x['episode_num'])
# Collect video ID and episode-number info
episode_info = []
for video in all_videos:
episode_info.append({
'video_id': video['video_id'],
'episode_num': video['episode_num']
})
# Check whether new videos were added
if len(episode_info) > len(cached_videos):
logging.info(f"Found new videos: {mix_name} (ID: {mix_id}), {len(episode_info) - len(cached_videos)} newly added")
# Save to the cache file
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump({
'episodes': episode_info,
'total_count': len(episode_info),
'last_update': datetime.now().isoformat(),
'mix_name': mix_name
}, f, ensure_ascii=False, indent=2)
# Return the list of video IDs
return [video['video_id'] for video in all_videos]
except Exception as e:
logging.error(f"获取合集视频时出错: {e}")
# 如果出错且有缓存,返回缓存的结果
if cached_videos:
logging.warning(f"使用缓存的视频列表: {mix_name} (ID: {mix_id})")
return [video['video_id'] for video in cached_videos]
return []
def get_cookies_dict(self):
"""获取当前页面的cookies"""
if not hasattr(self, 'cookies') or not self.cookies:
self.cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
return self.cookies
def run(self):
try:
self.setup_driver()

View File

@@ -134,7 +134,8 @@ def format_mix_item(doc):
"desc": doc.get("desc", ""),
"updated_to_episode": doc.get("updated_to_episode", 0),
"cover_backup_urls": doc.get("cover_backup_urls", []),
"mix_id": doc.get("mix_id", "")
"mix_id": doc.get("mix_id", ""),
"episode_video_ids": doc.get("episode_video_ids", [])
}
def get_mix_list(page=1, limit=20, sort_by="playcount"):
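As a usage note, the new episode_video_ids field returned by format_mix_item can be combined with the https://www.douyin.com/video/<id> page pattern seen in get_video_info above; a minimal, hypothetical consumer-side sketch:

# Hypothetical consumer-side sketch (not part of this commit).
item = format_mix_item(doc)  # doc: a MongoDB document written by the scraper
episode_urls = [
    f'https://www.douyin.com/video/{vid}'  # per-episode watch-page URL
    for vid in item.get('episode_video_ids', [])
]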