Add an episode_video_ids field: when rank_data_scraper.py or the scheduled-task code runs, the original data is still written to the database as before, and in addition the video ID of every episode of every short drama series is saved.

Note: after you run the script, the code automatically creates an episode_video_ids folder holding each series' per-episode video IDs from the first run, used as a cache. On each run it checks whether the number of cached episode IDs matches the series' episode count: if they match, the cached video IDs are reused; if not, the IDs are fetched again. Fetching the video IDs does not take long.
commit 8b1149da56
parent 8b607f6e24
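The cache check described in the message amounts to comparing the number of entries stored in episode_video_ids/video_ids_{mix_id}.json with the series' updated_to_episode value. The sketch below is a minimal illustration of that decision, assuming the cache layout written by get_collection_videos in the diff (the actual refetch is left to the caller); it is not part of the commit itself.

import json
import os

def load_cached_episode_ids(cache_dir: str, mix_id: str, current_episode_count: int):
    """Return cached video IDs if the cache covers every episode, else None.

    Sketch only: mirrors the cache layout written by get_collection_videos
    in this commit ({'episodes': [{'video_id': ..., 'episode_num': ...}, ...]}).
    """
    cache_file = os.path.join(cache_dir, f'video_ids_{mix_id}.json')
    if not os.path.exists(cache_file):
        return None  # no cache yet -> caller fetches from the API
    with open(cache_file, 'r', encoding='utf-8') as f:
        episodes = json.load(f).get('episodes', [])
    if len(episodes) == current_episode_count:
        return [ep['video_id'] for ep in episodes]  # cache is complete, reuse it
    return None  # episode count changed -> caller refetches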
@@ -23,6 +23,7 @@ import logging
 import os
 import shutil
 from datetime import datetime
+import requests
 
 from selenium import webdriver
 import os
@@ -995,7 +996,20 @@ class DouyinPlayVVScraper:
                    # No cover image; use an empty string
                    permanent_cover_url = ''

-                # Keep the 7 fields requested by the user + cover_image_url as the full link to the collection cover image + the 3 new fields
+                # Fetch all video IDs in the collection
+                mix_id = item.get('mix_id', '')
+                episode_video_ids = []
+                if mix_id:
+                    logging.info(f'Fetching all video IDs for collection {mix_name}')
+                    current_episode_count = item.get('updated_to_episode', 0)
+                    episode_video_ids = self.get_collection_videos(
+                        mix_id=mix_id,
+                        mix_name=mix_name,
+                        current_episode_count=current_episode_count
+                    )
+                    logging.info(f'Collection {mix_name}: fetched {len(episode_video_ids)} video IDs')
+
+                # Keep the 7 fields requested by the user + cover_image_url as the full link to the collection cover image + the new fields
                 doc = {
                     'batch_time': batch_time,
                     'mix_name': mix_name,
@@ -1007,10 +1021,11 @@
                     'cover_image_url_original': original_cover_url,  # Keep the original temporary link for debugging
                     'cover_image_url': permanent_cover_url,  # Permanent link to the collection cover image
                     'cover_backup_urls': item.get('cover_backup_urls', []),  # List of backup cover image links
-                    # The three new fields
+                    # New fields
                     'series_author': item.get('series_author', ''),  # Collection author / studio
                     'desc': item.get('desc', ''),  # Collection description
-                    'updated_to_episode': item.get('updated_to_episode', 0)  # Total episode count of the collection
+                    'updated_to_episode': item.get('updated_to_episode', 0),  # Total episode count of the collection
+                    'episode_video_ids': episode_video_ids  # Per-episode video ID list
                 }
                 documents.append(doc)
 
@@ -1042,6 +1057,228 @@
         except Exception as e:
             logging.error(f'Error saving to MongoDB: {e}')
 
+    def get_video_info(self, video_id: str) -> dict:
+        """Fetch detailed information for a single video.
+
+        Args:
+            video_id: the video ID
+
+        Returns:
+            dict: a dictionary with the video's details
+        """
+        video_url = f'https://www.douyin.com/video/{video_id}'
+        logging.info(f'Fetching video info: {video_url}')
+
+        # Clear the browser cache and cookies before loading the page
+        self.driver.execute_cdp_cmd('Network.clearBrowserCache', {})
+        self.driver.execute_cdp_cmd('Network.clearBrowserCookies', {})
+        self.driver.get(video_url)
+        time.sleep(3)
+
+        # Wait for the page to finish loading
+        try:
+            from selenium.webdriver.support.ui import WebDriverWait
+            from selenium.webdriver.support import expected_conditions as EC
+            from selenium.webdriver.common.by import By
+
+            WebDriverWait(self.driver, 10).until(
+                EC.presence_of_element_located((By.TAG_NAME, "video"))
+            )
+        except Exception as e:
+            logging.warning(f'Timed out waiting for the video element: {e}')
+
+        # Read the performance (network) logs captured by the browser
+        logs = self.driver.get_log('performance')
+        video_info = {}
+
+        for entry in logs:
+            try:
+                log = json.loads(entry['message'])['message']
+                if (
+                    'Network.responseReceived' in log['method']
+                    and 'response' in log['params']
+                    and 'url' in log['params']['response']
+                    and '/web/api/v2/aweme/iteminfo' in log['params']['response']['url']
+                ):
+                    request_id = log['params']['requestId']
+                    response = self.driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
+                    if response and 'body' in response:
+                        data = json.loads(response['body'])
+                        if 'item_list' in data and len(data['item_list']) > 0:
+                            item = data['item_list'][0]
+                            video_info = {
+                                'video_id': item.get('aweme_id'),
+                                'create_time': item.get('create_time'),
+                                'desc': item.get('desc'),
+                                'duration': item.get('duration'),
+                                'mix_info': {
+                                    'mix_id': item.get('mix_info', {}).get('mix_id'),
+                                    'mix_name': item.get('mix_info', {}).get('mix_name'),
+                                    'total': item.get('mix_info', {}).get('total')
+                                }
+                            }
+                            break
+            except Exception as e:
+                logging.warning(f'Error parsing log entry: {e}')
+
+        return video_info
+
+    def get_collection_videos(self, mix_id: str, mix_name: str = '', current_episode_count: int = 0) -> list:
+        """Fetch all video IDs in a collection, with incremental updates.
+
+        Args:
+            mix_id: the collection ID
+            mix_name: the collection name, used for logging
+            current_episode_count: the currently known episode count
+
+        Returns:
+            list: video IDs sorted by episode number
+        """
+        try:
+            # Check the cache file
+            cache_dir = os.path.join(os.path.dirname(__file__), 'episode_video_ids')
+            # Make sure the cache directory exists
+            os.makedirs(cache_dir, exist_ok=True)
+            cache_file = os.path.join(cache_dir, f'video_ids_{mix_id}.json')
+            cached_videos = []
+
+            try:
+                if os.path.exists(cache_file):
+                    with open(cache_file, 'r', encoding='utf-8') as f:
+                        cache_data = json.load(f)
+                        cached_videos = cache_data.get('episodes', [])
+                        last_update = cache_data.get('last_update')
+
+                    # If the cached episode count equals the current count, return the cached result directly
+                    if len(cached_videos) == current_episode_count:
+                        logging.info(f"Using cached video list: {mix_name} (ID: {mix_id})")
+                        return [video['video_id'] for video in cached_videos]
+            except Exception as e:
+                logging.warning(f"Failed to read cache file: {e}")
+
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+                'Accept': 'application/json, text/plain, */*',
+                'Accept-Language': 'zh-CN,zh;q=0.9',
+                'Referer': 'https://www.douyin.com/',
+            }
+
+            params = {
+                'device_platform': 'webapp',
+                'aid': '6383',
+                'channel': 'channel_pc_web',
+                'pc_client_type': '1',
+                'version_code': '170400',
+                'version_name': '17.4.0',
+                'cookie_enabled': 'true',
+                'platform': 'PC',
+                'downlink': '10',
+                'mix_id': mix_id,
+                'cursor': '0',
+                'count': '30',
+                'screen_width': '1920',
+                'screen_height': '1080',
+                'browser_language': 'zh-CN',
+                'browser_platform': 'Win32',
+                'browser_name': 'Chrome',
+                'browser_version': '120.0.0.0',
+                'browser_online': 'true',
+                'engine_name': 'Blink',
+                'engine_version': '120.0.0.0',
+                'os_name': 'Windows',
+                'os_version': '10',
+                'cpu_core_num': '16',
+                'device_memory': '8',
+                'effective_type': '4g',
+                'round_trip_time': '50',
+            }
+
+            all_videos = []
+
+            while True:
+                response = requests.get(
+                    'https://www.douyin.com/aweme/v1/web/mix/aweme/',
+                    params=params,
+                    cookies=self.get_cookies_dict(),
+                    headers=headers
+                )
+
+                if response.status_code != 200:
+                    logging.error(f"Request failed: {response.status_code}")
+                    logging.error(f"Response body: {response.text}")
+                    break
+
+                try:
+                    data = response.json()
+                    aweme_list = data.get('aweme_list', [])
+                    if not aweme_list:
+                        break
+
+                    for aweme in aweme_list:
+                        video_id = aweme.get('aweme_id')
+                        if video_id:
+                            all_videos.append({
+                                'video_id': video_id,
+                                'episode_num': int(aweme.get('episode_num', 0))
+                            })
+
+                    has_more = data.get('has_more', False)
+                    if not has_more:
+                        break
+
+                    params['cursor'] = str(len(all_videos))
+                    time.sleep(1)
+
+                except json.JSONDecodeError as e:
+                    logging.error(f"JSON decode error: {e}")
+                    logging.error(f"Response body: {response.text}")
+                    break
+
+            if not all_videos:
+                if cached_videos:
+                    logging.warning(f"Failed to fetch the video list, falling back to cached data: {mix_name} (ID: {mix_id})")
+                    return [video['video_id'] for video in cached_videos]
+                return []
+
+            logging.info(f"Fetched {len(all_videos)} video IDs")
+
+            # Sort by episode number
+            all_videos.sort(key=lambda x: x['episode_num'])
+
+            # Collect video IDs together with their episode numbers
+            episode_info = []
+            for video in all_videos:
+                episode_info.append({
+                    'video_id': video['video_id'],
+                    'episode_num': video['episode_num']
+                })
+
+            # Check for newly added episodes
+            if len(episode_info) > len(cached_videos):
+                logging.info(f"New episodes found: {mix_name} (ID: {mix_id}), {len(episode_info) - len(cached_videos)} new")
+
+            # Save to the cache file
+            with open(cache_file, 'w', encoding='utf-8') as f:
+                json.dump({
+                    'episodes': episode_info,
+                    'total_count': len(episode_info),
+                    'last_update': datetime.now().isoformat(),
+                    'mix_name': mix_name
+                }, f, ensure_ascii=False, indent=2)
+
+            # Return the list of video IDs
+            return [video['video_id'] for video in all_videos]
+
+        except Exception as e:
+            logging.error(f"Error fetching collection videos: {e}")
+            # On error, fall back to the cached result if available
+            if cached_videos:
+                logging.warning(f"Using cached video list: {mix_name} (ID: {mix_id})")
+                return [video['video_id'] for video in cached_videos]
+            return []
+
+    def get_cookies_dict(self):
+        """Return the cookies of the current page as a dict."""
+        if not hasattr(self, 'cookies') or not self.cookies:
+            self.cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
+        return self.cookies
+
     def run(self):
         try:
             self.setup_driver()
@@ -134,7 +134,8 @@ def format_mix_item(doc):
         "desc": doc.get("desc", ""),
         "updated_to_episode": doc.get("updated_to_episode", 0),
         "cover_backup_urls": doc.get("cover_backup_urls", []),
-        "mix_id": doc.get("mix_id", "")
+        "mix_id": doc.get("mix_id", ""),
+        "episode_video_ids": doc.get("episode_video_ids", [])
     }
 
 def get_mix_list(page=1, limit=20, sort_by="playcount"):
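For consumers of format_mix_item, episode_video_ids is a plain list of Douyin video IDs ordered by episode number. A small usage sketch follows; the ID values are made up for illustration, and the URL pattern is the one get_video_info uses in this commit.

# Sketch: turn the new field into per-episode page URLs.
item = {
    "mix_name": "Example Series",                    # hypothetical value
    "updated_to_episode": 3,                         # hypothetical value
    "episode_video_ids": ["7301", "7302", "7303"],   # hypothetical IDs
}

for episode_num, video_id in enumerate(item["episode_video_ids"], start=1):
    # Same URL pattern as get_video_info: https://www.douyin.com/video/{video_id}
    print(f"Episode {episode_num}: https://www.douyin.com/video/{video_id}")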