Added the episode_video_ids field. When rank_data_scraper.py or the scheduled timer code runs, the scraper still writes the original data to the database
and now also saves the video ID of every episode of each short drama.
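
For illustration, a minimal sketch (not part of this commit) of how a consumer could read the new field back out of MongoDB and turn each episode's video ID into a watch link. The connection string, database name, collection name and query value below are placeholders; only the episode_video_ids field name and the video URL format come from the code in this commit:

    from pymongo import MongoClient

    client = MongoClient('mongodb://localhost:27017')    # placeholder connection string
    col = client['douyin']['short_drama_rank']           # placeholder database/collection names
    doc = col.find_one({'mix_name': 'Example drama'})    # placeholder query
    if doc:
        for i, video_id in enumerate(doc.get('episode_video_ids', []), start=1):
            # episode order follows the stored list, which is sorted by episode number
            print(f'Episode {i}: https://www.douyin.com/video/{video_id}')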

Note:
After you pull the code, running the script will automatically create an episode_video_ids folder.
It stores, as a cache, the video ID of every episode collected on your first run of the script.
The check works as follows: on each run the script compares the number of cached episode IDs with the drama's episode count; if they match, the cached video IDs are used,
otherwise the IDs are fetched again (a small sketch of this check follows below).

Fetching the video IDs does not take long.
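
A minimal sketch of that cache check, assuming the cache file layout written by get_collection_videos in the diff below (video_ids_<mix_id>.json containing an "episodes" list whose entries hold a video_id). The helper name load_cached_episode_ids is made up for illustration:

    import json
    import os

    def load_cached_episode_ids(cache_dir, mix_id, episode_count):
        """Return the cached video IDs if the cache covers every episode, otherwise None."""
        cache_file = os.path.join(cache_dir, f'video_ids_{mix_id}.json')
        if not os.path.exists(cache_file):
            return None  # no cache yet, so the IDs have to be fetched
        with open(cache_file, 'r', encoding='utf-8') as f:
            episodes = json.load(f).get('episodes', [])
        if len(episodes) == episode_count:
            return [ep['video_id'] for ep in episodes]  # cache matches the episode count, reuse it
        return None  # episode count changed, so the IDs are fetched again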
Qyir 2025-10-21 17:56:00 +08:00
parent 8b607f6e24
commit 8b1149da56
2 changed files with 242 additions and 4 deletions


@@ -23,6 +23,7 @@ import logging
import os
import shutil
from datetime import datetime
import requests
from selenium import webdriver
import os
@@ -995,7 +996,20 @@ class DouyinPlayVVScraper:
# No cover image, use an empty string
permanent_cover_url = ''
# Fetch all video IDs in this collection
mix_id = item.get('mix_id', '')
episode_video_ids = []
if mix_id:
logging.info(f'Fetching all video IDs for collection {mix_name}')
current_episode_count = item.get('updated_to_episode', 0)
episode_video_ids = self.get_collection_videos(
mix_id=mix_id,
mix_name=mix_name,
current_episode_count=current_episode_count
)
logging.info(f'Collection {mix_name}: fetched {len(episode_video_ids)} video IDs in total')
# Keep the 7 fields the user asked for + cover_image_url as the full permanent link to the collection cover + the new fields
doc = {
'batch_time': batch_time,
'mix_name': mix_name,
@@ -1007,10 +1021,11 @@ class DouyinPlayVVScraper:
'cover_image_url_original': original_cover_url, # keep the original temporary link for debugging
'cover_image_url': permanent_cover_url, # permanent link to the collection cover image
'cover_backup_urls': item.get('cover_backup_urls', []), # list of backup cover image links
# The new fields
'series_author': item.get('series_author', ''), # collection author / studio
'desc': item.get('desc', ''), # collection description
'updated_to_episode': item.get('updated_to_episode', 0), # total number of episodes in the collection
'episode_video_ids': episode_video_ids # list of video IDs, one per episode
}
documents.append(doc)
@@ -1042,6 +1057,228 @@ class DouyinPlayVVScraper:
except Exception as e:
logging.error(f'Error while saving to MongoDB: {e}')
def get_video_info(self, video_id: str) -> dict:
"""Fetch detailed information for a single video
Args:
video_id: the video ID
Returns:
dict: a dictionary with the video's details
"""
video_url = f'https://www.douyin.com/video/{video_id}'
logging.info(f'Fetching video info: {video_url}')
# Reset browser state (cache and cookies) before loading the page
self.driver.execute_cdp_cmd('Network.clearBrowserCache', {})
self.driver.execute_cdp_cmd('Network.clearBrowserCookies', {})
self.driver.get(video_url)
time.sleep(3)
# Wait for the page to finish loading
try:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
WebDriverWait(self.driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "video"))
)
except Exception as e:
logging.warning(f'Timed out waiting for the video element: {e}')
# Read the browser's network request logs
logs = self.driver.get_log('performance')
video_info = {}
for entry in logs:
try:
log = json.loads(entry['message'])['message']
if (
'Network.responseReceived' in log['method']
and 'response' in log['params']
and 'url' in log['params']['response']
and '/web/api/v2/aweme/iteminfo' in log['params']['response']['url']
):
request_id = log['params']['requestId']
response = self.driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
if response and 'body' in response:
data = json.loads(response['body'])
if 'item_list' in data and len(data['item_list']) > 0:
item = data['item_list'][0]
video_info = {
'video_id': item.get('aweme_id'),
'create_time': item.get('create_time'),
'desc': item.get('desc'),
'duration': item.get('duration'),
'mix_info': {
'mix_id': item.get('mix_info', {}).get('mix_id'),
'mix_name': item.get('mix_info', {}).get('mix_name'),
'total': item.get('mix_info', {}).get('total')
}
}
break
except Exception as e:
logging.warning(f'Error while parsing a log entry: {e}')
return video_info
def get_collection_videos(self, mix_id: str, mix_name: str = '', current_episode_count: int = 0) -> list:
"""Fetch all video IDs in a collection, with support for incremental updates
Args:
mix_id: the collection ID
mix_name: the collection name (used for logging)
current_episode_count: the currently known episode count
Returns:
list: video IDs sorted by episode number
"""
cached_videos = []  # initialized before the try so the except fallback below can always reference it
try:
# Check for a cached file first
cache_dir = os.path.join(os.path.dirname(__file__), 'episode_video_ids')
# Make sure the cache directory exists
os.makedirs(cache_dir, exist_ok=True)
cache_file = os.path.join(cache_dir, f'video_ids_{mix_id}.json')
try:
if os.path.exists(cache_file):
with open(cache_file, 'r', encoding='utf-8') as f:
cache_data = json.load(f)
cached_videos = cache_data.get('episodes', [])
last_update = cache_data.get('last_update')
# If the cached episode count equals the current count, return the cached result directly
if len(cached_videos) == current_episode_count:
logging.info(f"Using the cached video list: {mix_name} (ID: {mix_id})")
return [video['video_id'] for video in cached_videos]
except Exception as e:
logging.warning(f"Failed to read the cache file: {e}")
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Referer': 'https://www.douyin.com/',
}
params = {
'device_platform': 'webapp',
'aid': '6383',
'channel': 'channel_pc_web',
'pc_client_type': '1',
'version_code': '170400',
'version_name': '17.4.0',
'cookie_enabled': 'true',
'platform': 'PC',
'downlink': '10',
'mix_id': mix_id,
'cursor': '0',
'count': '30',
'screen_width': '1920',
'screen_height': '1080',
'browser_language': 'zh-CN',
'browser_platform': 'Win32',
'browser_name': 'Chrome',
'browser_version': '120.0.0.0',
'browser_online': 'true',
'engine_name': 'Blink',
'engine_version': '120.0.0.0',
'os_name': 'Windows',
'os_version': '10',
'cpu_core_num': '16',
'device_memory': '8',
'effective_type': '4g',
'round_trip_time': '50',
}
all_videos = []
while True:
response = requests.get(
'https://www.douyin.com/aweme/v1/web/mix/aweme/',
params=params,
cookies=self.get_cookies_dict(),
headers=headers
)
if response.status_code != 200:
logging.error(f"Request failed: {response.status_code}")
logging.error(f"Response body: {response.text}")
break
try:
data = response.json()
aweme_list = data.get('aweme_list', [])
if not aweme_list:
break
for aweme in aweme_list:
video_id = aweme.get('aweme_id')
if video_id:
all_videos.append({
'video_id': video_id,
'episode_num': int(aweme.get('episode_num', 0))
})
has_more = data.get('has_more', False)
if not has_more:
break
params['cursor'] = str(len(all_videos))
time.sleep(1)
except json.JSONDecodeError as e:
logging.error(f"JSON decode error: {e}")
logging.error(f"Response body: {response.text}")
break
if not all_videos:
if cached_videos:
logging.warning(f"Failed to fetch the video list, falling back to cached data: {mix_name} (ID: {mix_id})")
return [video['video_id'] for video in cached_videos]
return []
logging.info(f"Fetched {len(all_videos)} video IDs")
# Sort by episode number
all_videos.sort(key=lambda x: x['episode_num'])
# Collect video ID and episode number info
episode_info = []
for video in all_videos:
episode_info.append({
'video_id': video['video_id'],
'episode_num': video['episode_num']
})
# Check whether new episodes have been added
if len(episode_info) > len(cached_videos):
logging.info(f"New episodes found for {mix_name} (ID: {mix_id}): {len(episode_info) - len(cached_videos)} new")
# Save to the cache file
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump({
'episodes': episode_info,
'total_count': len(episode_info),
'last_update': datetime.now().isoformat(),
'mix_name': mix_name
}, f, ensure_ascii=False, indent=2)
# Return the list of video IDs
return [video['video_id'] for video in all_videos]
except Exception as e:
logging.error(f"Error while fetching collection videos: {e}")
# On error, return the cached result if there is one
if cached_videos:
logging.warning(f"Falling back to the cached video list: {mix_name} (ID: {mix_id})")
return [video['video_id'] for video in cached_videos]
return []
def get_cookies_dict(self):
"""Get the current page's cookies as a dict."""
if not hasattr(self, 'cookies') or not self.cookies:
self.cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
return self.cookies
def run(self):
try:
self.setup_driver()


@@ -134,7 +134,8 @@ def format_mix_item(doc):
"desc": doc.get("desc", ""),
"updated_to_episode": doc.get("updated_to_episode", 0),
"cover_backup_urls": doc.get("cover_backup_urls", []),
"mix_id": doc.get("mix_id", ""),
"episode_video_ids": doc.get("episode_video_ids", [])
}
def get_mix_list(page=1, limit=20, sort_by="playcount"):