添加了 episode_video_ids 字段。在运行 rank_data_scraper.py
或定时器代码时,不仅保留了原有的数据入库逻辑,还会额外保存每一部短剧每一集的视频 ID。提示:脚本首次运行后会自动创建 episode_video_ids 文件夹,其中缓存每一集的视频 ID。判断逻辑是:每次运行脚本时检查缓存的视频 ID 数量是否与该剧的集数相同——相同则直接使用缓存的视频 ID,不同则重新获取;重新获取视频 ID 的耗时不长。
This commit is contained in:
parent
8b607f6e24
commit
8b1149da56
@ -23,6 +23,7 @@ import logging
|
||||
import os
|
||||
import shutil
|
||||
from datetime import datetime
|
||||
import requests
|
||||
|
||||
from selenium import webdriver
|
||||
import os
|
||||
@ -995,7 +996,20 @@ class DouyinPlayVVScraper:
|
||||
# 没有封面图片,使用空字符串
|
||||
permanent_cover_url = ''
|
||||
|
||||
# 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + 新增的3个字段
|
||||
# 获取合集中的所有视频ID
|
||||
mix_id = item.get('mix_id', '')
|
||||
episode_video_ids = []
|
||||
if mix_id:
|
||||
logging.info(f'获取合集 {mix_name} 的所有视频ID')
|
||||
current_episode_count = item.get('updated_to_episode', 0)
|
||||
episode_video_ids = self.get_collection_videos(
|
||||
mix_id=mix_id,
|
||||
mix_name=mix_name,
|
||||
current_episode_count=current_episode_count
|
||||
)
|
||||
logging.info(f'合集 {mix_name} 共获取到 {len(episode_video_ids)} 个视频ID')
|
||||
|
||||
# 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + 新增字段
|
||||
doc = {
|
||||
'batch_time': batch_time,
|
||||
'mix_name': mix_name,
|
||||
@ -1007,10 +1021,11 @@ class DouyinPlayVVScraper:
|
||||
'cover_image_url_original': original_cover_url, # 保存原始临时链接用于调试
|
||||
'cover_image_url': permanent_cover_url, # 合集封面图片永久链接
|
||||
'cover_backup_urls': item.get('cover_backup_urls', []), # 封面图片备用链接列表
|
||||
# 新增的三个字段
|
||||
# 新增的字段
|
||||
'series_author': item.get('series_author', ''), # 合集作者/影视工作室
|
||||
'desc': item.get('desc', ''), # 合集描述
|
||||
'updated_to_episode': item.get('updated_to_episode', 0) # 合集总集数
|
||||
'updated_to_episode': item.get('updated_to_episode', 0), # 合集总集数
|
||||
'episode_video_ids': episode_video_ids # 每一集的视频ID列表
|
||||
}
|
||||
documents.append(doc)
|
||||
|
||||
@ -1042,6 +1057,228 @@ class DouyinPlayVVScraper:
|
||||
except Exception as e:
|
||||
logging.error(f'保存到MongoDB时出错: {e}')
|
||||
|
||||
def get_video_info(self, video_id: str) -> dict:
    """Fetch detail info for one Douyin video by sniffing CDP network logs.

    Navigates the Selenium-driven browser to the video page, then scans the
    performance log for the ``/web/api/v2/aweme/iteminfo`` response and
    extracts a summary dict from its JSON body.

    Args:
        video_id: Douyin video (aweme) ID.

    Returns:
        dict: summary with video_id, create_time, desc, duration and
        mix_info; empty dict when no matching response was captured.
    """
    video_url = f'https://www.douyin.com/video/{video_id}'
    logging.info(f'获取视频信息: {video_url}')

    # Reset browser state so the page issues fresh network requests.
    self.driver.execute_cdp_cmd('Network.clearBrowserCache', {})
    self.driver.execute_cdp_cmd('Network.clearBrowserCookies', {})
    self.driver.get(video_url)
    time.sleep(3)

    # Best-effort wait for the <video> element; a timeout is non-fatal.
    try:
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC
        from selenium.webdriver.common.by import By

        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "video"))
        )
    except Exception as e:
        logging.warning(f'等待视频元素超时: {e}')

    video_info = {}
    for entry in self.driver.get_log('performance'):
        try:
            message = json.loads(entry['message'])['message']
            is_item_info_response = (
                'Network.responseReceived' in message['method']
                and 'response' in message['params']
                and 'url' in message['params']['response']
                and '/web/api/v2/aweme/iteminfo' in message['params']['response']['url']
            )
            if not is_item_info_response:
                continue

            body = self.driver.execute_cdp_cmd(
                'Network.getResponseBody',
                {'requestId': message['params']['requestId']},
            )
            if not body or 'body' not in body:
                continue

            payload = json.loads(body['body'])
            item_list = payload.get('item_list')
            if not item_list:
                continue

            item = item_list[0]
            mix = item.get('mix_info', {})
            video_info = {
                'video_id': item.get('aweme_id'),
                'create_time': item.get('create_time'),
                'desc': item.get('desc'),
                'duration': item.get('duration'),
                'mix_info': {
                    'mix_id': mix.get('mix_id'),
                    'mix_name': mix.get('mix_name'),
                    'total': mix.get('total'),
                },
            }
            break
        except Exception as e:
            # One malformed log entry should not abort the whole scan.
            logging.warning(f'解析日志条目时出错: {e}')

    return video_info
|
||||
|
||||
def get_collection_videos(self, mix_id: str, mix_name: str = '', current_episode_count: int = 0) -> list:
    """Return the video ID of every episode in a Douyin collection (mix).

    Results are cached on disk under ``episode_video_ids/`` next to this
    file. When the cached episode count equals ``current_episode_count``
    the cache is returned directly and no network request is made; on any
    failure the cache (if present) is used as a fallback.

    Args:
        mix_id: Collection (mix) ID.
        mix_name: Collection name, used only for log messages.
        current_episode_count: Currently known episode count, used to
            decide whether the cache is still complete.

    Returns:
        list: Video IDs sorted by episode number; empty list on failure
        with no usable cache.
    """
    # Initialize BEFORE the try block so the except/fallback paths below
    # can always reference it (previously a failure before assignment
    # raised NameError inside the exception handler, masking the error).
    cached_videos = []
    try:
        # abspath() guards against a relative __file__, whose dirname
        # would be '' and make the cache dir depend on the CWD.
        cache_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'episode_video_ids')
        # Make sure the cache directory exists.
        os.makedirs(cache_dir, exist_ok=True)
        cache_file = os.path.join(cache_dir, f'video_ids_{mix_id}.json')

        try:
            if os.path.exists(cache_file):
                with open(cache_file, 'r', encoding='utf-8') as f:
                    cache_data = json.load(f)
                cached_videos = cache_data.get('episodes', [])

                # Cache is complete when it already holds every known
                # episode — return it without hitting the network.
                if len(cached_videos) == current_episode_count:
                    logging.info(f"使用缓存的视频列表: {mix_name} (ID: {mix_id})")
                    return [video['video_id'] for video in cached_videos]
        except Exception as e:
            logging.warning(f"读取缓存文件失败: {e}")

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Referer': 'https://www.douyin.com/',
        }

        params = {
            'device_platform': 'webapp',
            'aid': '6383',
            'channel': 'channel_pc_web',
            'pc_client_type': '1',
            'version_code': '170400',
            'version_name': '17.4.0',
            'cookie_enabled': 'true',
            'platform': 'PC',
            'downlink': '10',
            'mix_id': mix_id,
            'cursor': '0',
            'count': '30',
            'screen_width': '1920',
            'screen_height': '1080',
            'browser_language': 'zh-CN',
            'browser_platform': 'Win32',
            'browser_name': 'Chrome',
            'browser_version': '120.0.0.0',
            'browser_online': 'true',
            'engine_name': 'Blink',
            'engine_version': '120.0.0.0',
            'os_name': 'Windows',
            'os_version': '10',
            'cpu_core_num': '16',
            'device_memory': '8',
            'effective_type': '4g',
            'round_trip_time': '50',
        }

        all_videos = []

        while True:
            response = requests.get(
                'https://www.douyin.com/aweme/v1/web/mix/aweme/',
                params=params,
                cookies=self.get_cookies_dict(),
                headers=headers,
                timeout=30,  # avoid hanging forever on a stalled connection
            )

            if response.status_code != 200:
                logging.error(f"请求失败: {response.status_code}")
                logging.error(f"响应内容: {response.text}")
                break

            try:
                data = response.json()
                aweme_list = data.get('aweme_list', [])
                if not aweme_list:
                    break

                for aweme in aweme_list:
                    video_id = aweme.get('aweme_id')
                    if video_id:
                        all_videos.append({
                            'video_id': video_id,
                            'episode_num': int(aweme.get('episode_num', 0))
                        })

                if not data.get('has_more', False):
                    break

                # Prefer the server-provided cursor; fall back to the total
                # number of items fetched so far. Counting every received
                # item (not only those with an aweme_id, as before) avoids
                # re-requesting the same page when some items lack an ID.
                next_cursor = data.get('cursor', int(params['cursor']) + len(aweme_list))
                params['cursor'] = str(next_cursor)
                time.sleep(1)  # be polite between paginated requests

            except json.JSONDecodeError as e:
                logging.error(f"JSON解析错误: {e}")
                logging.error(f"响应内容: {response.text}")
                break

        if not all_videos:
            # Network fetch produced nothing — fall back to the cache.
            if cached_videos:
                logging.warning(f"获取视频列表失败,使用缓存数据: {mix_name} (ID: {mix_id})")
                return [video['video_id'] for video in cached_videos]
            return []

        logging.info(f"获取到 {len(all_videos)} 个视频ID")

        # Sort by episode number so the returned list is in episode order.
        all_videos.sort(key=lambda x: x['episode_num'])

        # Log when new episodes appeared since the last cached run.
        if len(all_videos) > len(cached_videos):
            logging.info(f"发现新增视频: {mix_name} (ID: {mix_id}), 新增 {len(all_videos) - len(cached_videos)} 集")

        # Persist to the cache file. all_videos already has the cached
        # episode shape ({'video_id', 'episode_num'}); the previous copy
        # into a separate episode_info list was redundant.
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump({
                'episodes': all_videos,
                'total_count': len(all_videos),
                'last_update': datetime.now().isoformat(),
                'mix_name': mix_name
            }, f, ensure_ascii=False, indent=2)

        return [video['video_id'] for video in all_videos]

    except Exception as e:
        logging.error(f"获取合集视频时出错: {e}")
        # On any unexpected error, return the cached result if we have one.
        if cached_videos:
            logging.warning(f"使用缓存的视频列表: {mix_name} (ID: {mix_id})")
            return [video['video_id'] for video in cached_videos]
        return []
|
||||
|
||||
def get_cookies_dict(self):
    """Return the browser cookies as a name -> value dict, memoized on self.

    The cookie jar is read from the Selenium driver once and stored on
    ``self.cookies``; subsequent calls return the cached mapping.
    """
    cached = getattr(self, 'cookies', None)
    if not cached:
        cached = {c['name']: c['value'] for c in self.driver.get_cookies()}
        self.cookies = cached
    return cached
|
||||
|
||||
def run(self):
|
||||
try:
|
||||
self.setup_driver()
|
||||
|
||||
@ -134,7 +134,8 @@ def format_mix_item(doc):
|
||||
"desc": doc.get("desc", ""),
|
||||
"updated_to_episode": doc.get("updated_to_episode", 0),
|
||||
"cover_backup_urls": doc.get("cover_backup_urls", []),
|
||||
"mix_id": doc.get("mix_id", "")
|
||||
"mix_id": doc.get("mix_id", ""),
|
||||
"episode_video_ids": doc.get("episode_video_ids", [])
|
||||
}
|
||||
|
||||
def get_mix_list(page=1, limit=20, sort_by="playcount"):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user