1. Add guard logic: when the scraper is launched by the timer, certain functions in the main code are skipped so the timer can compute play count deltas correctly.
2. New feature: fetch like, favorite, and share counts plus a list of comment contents (incomplete; optimization in progress).
3. Add a database folder. When the timer runs, data is stored in Ranking_storage_list; play count deltas are computed from the data in Ranking_storage_list and the results are stored in Ranking_storage. Running rank_data_scraper.py standalone stores into Rankings_list.
   Rationale: Rankings_list stores richer document structures, Ranking_storage_list stores mainly play counts, and Ranking_storage stores the play count deltas.
This commit is contained in:
parent 8b1149da56
commit 2a32b2a8c0
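
A minimal sketch of the storage routing described above, for orientation (the collection names come from this commit; the client setup, database name, and delta field name are assumptions):

    # Sketch only: the 'douyin' database name and 'play_vv_diff' field are hypothetical.
    import os
    from pymongo import MongoClient

    db = MongoClient()['douyin']

    def target_collection():
        # Timer runs write raw play counts to Ranking_storage_list;
        # standalone runs of rank_data_scraper.py write to Rankings_list.
        if os.environ.get('TIMER_MODE') == '1':
            return db['Ranking_storage_list']
        return db['Rankings_list']

    def store_play_vv_delta(today_doc, yesterday_doc):
        # Deltas computed from Ranking_storage_list batches land in Ranking_storage.
        db['Ranking_storage'].insert_one({
            'mix_name': today_doc.get('mix_name', ''),
            'play_vv_diff': today_doc.get('play_vv', 0) - yesterday_doc.get('play_vv', 0)
        })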
@@ -61,6 +61,8 @@ class DouyinAutoScheduler:
         # Set environment variable to ensure automatic mode
         os.environ['AUTO_CONTINUE'] = '1'
+        # Set the timer-mode environment variable so comment scraping and similar functions are skipped
+        os.environ['TIMER_MODE'] = '1'

         # Create and run a DouyinPlayVVScraper instance directly
         scraper = DouyinPlayVVScraper(
@@ -68,10 +70,10 @@ class DouyinAutoScheduler:
             auto_continue=True,
             duration_s=60
         )

         logging.info("📁 Starting scraping task...")
         scraper.run()

         logging.info("✅ Douyin play count scraping task completed successfully")

         # After scraping completes, automatically generate today's ranking
@@ -89,7 +91,7 @@ class DouyinAutoScheduler:
         from datetime import timedelta

         # Get the collections
-        douyin_collection = db['Rankings_list']  # use the real scraped data
+        douyin_collection = db['Ranking_storage_list']  # use the timer-scraped data
         rankings_collection = db['Ranking_storage']

         today = date.today()
@@ -107,10 +109,20 @@ class DouyinAutoScheduler:
         try:
             logging.info("🔄 Generating timeline comparison ranking...")

-            # Get today's data, deduplicated by short drama name, keeping only the highest play count
-            today_videos_raw = list(douyin_collection.find({}).sort("play_vv", -1))
+            # Get the latest batch of data
+            latest_batch = douyin_collection.find_one(sort=[("batch_time", -1)])
+            if not latest_batch:
+                logging.warning("⚠️ No data found")
+                return False
+
+            latest_batch_time = latest_batch.get("batch_time")
+            logging.info(f"📊 Found latest batch time: {latest_batch_time}")
+
+            # Deduplicate by short drama name, keeping only the highest-play-count record per drama
+            # Only fetch data from the latest batch
+            today_videos_raw = list(douyin_collection.find({"batch_time": latest_batch_time}).sort("play_vv", -1))
+            logging.info(f"📊 Latest batch record count: {len(today_videos_raw)}")

             # Deduplicate by short drama name (a single batch should not contain duplicates, but keep this logic for robustness)
             unique_videos = {}
             for video in today_videos_raw:
                 mix_name = video.get("mix_name", "")
@@ -121,26 +133,36 @@ class DouyinAutoScheduler:
             logging.info(f"📊 Today's data after deduplication: {len(today_videos)} unique short dramas (from {len(today_videos_raw)} raw records)")

-            # Get yesterday's ranking data (if it exists), taking the latest calculation result
-            yesterday_ranking = rankings_collection.find_one({
-                "date": yesterday_str,
-                "type": "comprehensive"
-            }, sort=[("calculation_sequence", -1)])
+            # Get yesterday's last batch of data
+            yesterday_batch = douyin_collection.find_one({
+                "batch_time": {"$regex": f"^{yesterday_str}"}
+            }, sort=[("batch_time", -1)])

             yesterday_data = {}
-            if yesterday_ranking and "data" in yesterday_ranking:
-                # Convert yesterday's data into a dict keyed by short drama name
-                for item in yesterday_ranking["data"]:
-                    title = item.get("title", "")
-                    if title:
-                        yesterday_data[title] = {
-                            "rank": item.get("rank", 0),
-                            "play_vv": item.get("play_vv", 0),
-                            "video_id": item.get("video_id", "")
+            if yesterday_batch:
+                # Fetch all data from yesterday's last batch
+                yesterday_videos = list(douyin_collection.find({
+                    "batch_time": yesterday_batch["batch_time"]
+                }).sort("play_vv", -1))
+
+                # Deduplicate by short drama name, keeping the highest-play-count record
+                for video in yesterday_videos:
+                    mix_name = video.get("mix_name", "")
+                    if mix_name and (mix_name not in yesterday_data or video.get("play_vv", 0) > yesterday_data[mix_name].get("play_vv", 0)):
+                        yesterday_data[mix_name] = {
+                            "play_vv": video.get("play_vv", 0),
+                            "video_id": str(video.get("_id", "")),
+                            "rank": 0  # rank is computed below
                         }
-                logging.info(f"📊 Found yesterday's ranking data: {len(yesterday_data)} short dramas")
+
+                # Compute ranks
+                sorted_videos = sorted(yesterday_data.items(), key=lambda x: x[1]["play_vv"], reverse=True)
+                for rank, (mix_name, data) in enumerate(sorted_videos, 1):
+                    yesterday_data[mix_name]["rank"] = rank
+
+                logging.info(f"📊 Found yesterday's raw data: {len(yesterday_data)} short dramas")
             else:
-                logging.info("📊 Yesterday's ranking data not found; treating this as the first run")
+                logging.info("📊 Yesterday's raw data not found; treating this as the first run")

             if today_videos:
                 # First compute the play count delta for every video
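
The hunk above ends just as the delta step begins. A minimal sketch of what "compute the play count delta for every video" could look like, given the today_videos and yesterday_data structures built above (the play_vv_diff field name is an assumption):

    for video in today_videos:
        # Delta of play_vv against yesterday's deduplicated record (0 if the drama is new)
        mix_name = video.get("mix_name", "")
        prev_play_vv = yesterday_data.get(mix_name, {}).get("play_vv", 0)
        video["play_vv_diff"] = video.get("play_vv", 0) - prev_play_vv  # hypothetical field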
@@ -83,11 +83,13 @@ class DouyinPlayVVScraper:
             # Use the connection from database.py
             self.db = db

             # Set the collection
-            mongo_collection = os.environ.get('MONGO_COLLECTION', 'Rankings_list')
+            # Choose the collection based on run mode
+            is_timer_mode = os.environ.get('TIMER_MODE') == '1'
+            mongo_collection = 'Ranking_storage_list' if is_timer_mode else 'Rankings_list'
             self.collection = self.db[mongo_collection]

             logging.info(f'MongoDB connected; database: {self.db.name}, collection: {mongo_collection}')
+            logging.info(f'Current run mode: {"timer mode" if is_timer_mode else "normal mode"}')

         except Exception as e:
             logging.error(f'MongoDB connection failed: {e}')
@@ -447,6 +449,28 @@ class DouyinPlayVVScraper:
         if n >= 10_000:
             return f"{n/10_000:.1f}万"
         return str(n)

+    def format_interaction_count(self, n: int) -> str:
+        """Format an interaction count as a string with a unit suffix.
+
+        Args:
+            n: the count
+        Returns:
+            str: the formatted string, e.g. 27898 -> 2.8W, 1234 -> 1234
+        """
+        if n >= 100_000_000:
+            result = n / 100_000_000
+            if result == int(result):
+                return f"{int(result)}亿"
+            else:
+                return f"{result:.1f}亿"
+        elif n >= 10_000:
+            result = n / 10_000
+            if result == int(result):
+                return f"{int(result)}W"
+            else:
+                return f"{result:.1f}W"
+        else:
+            return str(n)
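
A quick sanity check of the thresholds in format_interaction_count above; the expected outputs follow directly from its branches (scraper is any DouyinPlayVVScraper instance):

    scraper.format_interaction_count(1234)         # -> '1234'  (below 10_000, returned as-is)
    scraper.format_interaction_count(27898)        # -> '2.8W'
    scraper.format_interaction_count(250_000)      # -> '25W'   (integral results drop the decimal)
    scraper.format_interaction_count(130_000_000)  # -> '1.3亿'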
@@ -1008,6 +1032,80 @@ class DouyinPlayVVScraper:
                     current_episode_count=current_episode_count
                 )
                 logging.info(f'Collection {mix_name}: fetched {len(episode_video_ids)} video IDs')

+                # Fetch detailed interaction data for each video
+                logging.info(f'Starting to fetch detailed video interaction data for collection {mix_name}')
+                video_details_list = self.get_collection_video_details(
+                    episode_video_ids=episode_video_ids,
+                    mix_name=mix_name,
+                    max_comments_per_video=10  # fetch at most 10 comments per video
+                )
+
+                # Build per-episode details from the fetched real data
+                episode_details = []
+                total_episodes = item.get('updated_to_episode', 0)
+
+                for i in range(total_episodes):
+                    episode_number = i + 1
+                    video_id = episode_video_ids[i] if i < len(episode_video_ids) else ''
+
+                    # Look up the matching video detail record
+                    video_detail = None
+                    if i < len(video_details_list):
+                        video_detail = video_details_list[i]
+
+                    if video_detail and video_detail.get('success', False):
+                        # Use the fetched real data
+                        likes = video_detail.get('likes', 0)
+                        shares = video_detail.get('shares', 0)
+                        favorites = video_detail.get('favorites', 0)
+
+                        episode_info = {
+                            'episode_number': episode_number,
+                            'video_id': video_id,
+                            'likes': likes,
+                            'shares': shares,
+                            'favorites': favorites,
+                            'likes_formatted': self.format_interaction_count(likes),
+                            'shares_formatted': self.format_interaction_count(shares),
+                            'favorites_formatted': self.format_interaction_count(favorites),
+                            'comments': video_detail.get('comments', [])
+                        }
+                    else:
+                        # Use default values
+                        episode_info = {
+                            'episode_number': episode_number,
+                            'video_id': video_id,
+                            'likes': 0,
+                            'shares': 0,
+                            'favorites': 0,
+                            'likes_formatted': '0',
+                            'shares_formatted': '0',
+                            'favorites_formatted': '0',
+                            'comments': []
+                        }
+
+                    episode_details.append(episode_info)
+
+                # Aggregate the fetched data
+                total_likes = sum(ep.get('likes', 0) for ep in episode_details)
+                total_comments = sum(len(ep.get('comments', [])) for ep in episode_details)
+                logging.info(f'Collection {mix_name} detail stats: total likes={total_likes:,}, total comments={total_comments}')
+            else:
+                # If no video IDs were fetched, fall back to default episode_details
+                episode_details = [
+                    {
+                        'episode_number': i + 1,
+                        'video_id': '',
+                        'likes': 0,
+                        'shares': 0,
+                        'favorites': 0,
+                        'likes_formatted': '0',
+                        'shares_formatted': '0',
+                        'favorites_formatted': '0',
+                        'comments': []
+                    } for i in range(item.get('updated_to_episode', 0))
+                ]

             # Keep the 7 user-requested fields + cover_image_url as the full collection cover image URL + the new fields
             doc = {
@@ -1025,7 +1123,8 @@ class DouyinPlayVVScraper:
                 'series_author': item.get('series_author', ''),  # collection author / studio
                 'desc': item.get('desc', ''),  # collection description
                 'updated_to_episode': item.get('updated_to_episode', 0),  # total episode count
-                'episode_video_ids': episode_video_ids  # list of video IDs, one per episode
+                'episode_video_ids': episode_video_ids,  # list of video IDs, one per episode
+                'episode_details': episode_details  # per-episode details
             }
             documents.append(doc)
@@ -1095,6 +1194,7 @@ class DouyinPlayVVScraper:
                 if (
                     'Network.responseReceived' in log['method']
                     and 'response' in log['params']
+                    and log['params']['response']
                     and 'url' in log['params']['response']
                     and '/web/api/v2/aweme/iteminfo' in log['params']['response']['url']
                 ):
@@ -1130,6 +1230,11 @@ class DouyinPlayVVScraper:
         Returns:
             list: list of video IDs sorted by episode number
         """
+        # Skip this function in timer mode
+        if os.environ.get('TIMER_MODE') == '1':
+            logging.info('Timer mode: skipping get_collection_videos')
+            return []
+
         try:
             # Check cache files
             cache_dir = os.path.join(os.path.dirname(__file__), 'episode_video_ids')
@@ -1273,6 +1378,374 @@ class DouyinPlayVVScraper:
                 return [video['video_id'] for video in cached_videos]
         return []

+    def get_video_details(self, video_id: str, max_comments: int = 20) -> dict:
+        """Fetch detailed interaction data for a single video.
+
+        Args:
+            video_id: the video ID
+            max_comments: maximum number of comments, default 20
+        Returns:
+            dict: like, share, and favorite counts plus comment contents
+        """
+        video_details = {
+            'video_id': video_id,
+            'likes': 0,
+            'shares': 0,
+            'favorites': 0,
+            'likes_formatted': '0',
+            'shares_formatted': '0',
+            'favorites_formatted': '0',
+            'comments': [],
+            'success': False,
+            'error': None
+        }
+
+        try:
+            # Make sure the driver is initialized
+            if self.driver is None:
+                logging.info('Driver not initialized, setting up...')
+                self.setup_driver()
+                if self.driver is None:
+                    raise Exception("Unable to initialize WebDriver")
+
+            video_url = f'https://www.douyin.com/video/{video_id}'
+            logging.info(f'Fetching video details: {video_url}')
+
+            # Navigate to the video page
+            self.driver.get(video_url)
+            time.sleep(3)
+
+            # Wait for the page to finish loading
+            try:
+                from selenium.webdriver.support.ui import WebDriverWait
+                from selenium.webdriver.support import expected_conditions as EC
+                from selenium.webdriver.common.by import By
+
+                WebDriverWait(self.driver, 10).until(
+                    EC.presence_of_element_located((By.TAG_NAME, "video"))
+                )
+            except Exception as e:
+                logging.warning(f'Timed out waiting for the video element: {e}')
+
+            # Get network request logs
+            logs = self.driver.get_log('performance')
+
+            # Parse the network logs for video details
+            for entry in logs:
+                try:
+                    log = json.loads(entry['message'])['message']
+                    if (
+                        'Network.responseReceived' in log['method']
+                        and 'response' in log['params']
+                        and log['params']['response']
+                        and log['params']['response'].get('url')
+                    ):
+                        url = log['params']['response']['url']
+
+                        # Check whether this is the video detail API
+                        if '/aweme/v1/web/aweme/detail/' in url and video_id in url:
+                            try:
+                                # Get the response body
+                                response_body = self.driver.execute_cdp_cmd(
+                                    'Network.getResponseBody',
+                                    {'requestId': log['params']['requestId']}
+                                )
+
+                                if response_body and 'body' in response_body:
+                                    data = json.loads(response_body['body'])
+                                    aweme_detail = data.get('aweme_detail', {})
+
+                                    if aweme_detail:
+                                        # Read the statistics block
+                                        statistics = aweme_detail.get('statistics', {})
+                                        video_details['likes'] = int(statistics.get('digg_count', 0))
+                                        video_details['shares'] = int(statistics.get('share_count', 0))
+                                        video_details['favorites'] = int(statistics.get('collect_count', 0))
+
+                                        # Add formatted fields
+                                        video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
+                                        video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
+                                        video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])
+
+                                        logging.info(f'Video {video_id} interactions: likes={video_details["likes_formatted"]}, shares={video_details["shares_formatted"]}, favorites={video_details["favorites_formatted"]}')
+
+                            except Exception as e:
+                                logging.warning(f'Failed to parse video detail API response: {e}')
+                                continue
+
+                        # Check whether this is the comment API
+                        elif '/aweme/v1/web/comment/list/' in url and video_id in url:
+                            try:
+                                # Get the response body
+                                response_body = self.driver.execute_cdp_cmd(
+                                    'Network.getResponseBody',
+                                    {'requestId': log['params']['requestId']}
+                                )
+
+                                if response_body and 'body' in response_body:
+                                    data = json.loads(response_body['body'])
+                                    comments = data.get('comments', [])
+
+                                    for comment in comments[:max_comments]:
+                                        comment_info = {
+                                            'text': comment.get('text', ''),
+                                            'user_name': comment.get('user', {}).get('nickname', ''),
+                                            'digg_count': int(comment.get('digg_count', 0)),
+                                            'create_time': comment.get('create_time', 0)
+                                        }
+                                        video_details['comments'].append(comment_info)
+
+                                    logging.info(f'Video {video_id}: fetched {len(video_details["comments"])} comments')
+
+                            except Exception as e:
+                                logging.warning(f'Failed to parse comment API response: {e}')
+                                continue
+
+                except Exception:
+                    continue
+
+            # If the network logs yielded no data, fall back to page parsing
+            if video_details['likes'] == 0 and video_details['shares'] == 0 and video_details['favorites'] == 0:
+                video_details = self._parse_video_details_from_page(video_id, video_details, max_comments)
+
+            video_details['success'] = True
+            return video_details
+
+        except Exception as e:
+            error_msg = f'Failed to fetch details for video {video_id}: {e}'
+            logging.error(error_msg)
+            video_details['error'] = error_msg
+            return video_details
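Note that get_video_details relies on driver.get_log('performance') and the Network.getResponseBody CDP command, which only work if Chrome performance logging was enabled when the driver was created. setup_driver is not shown in this diff, so the following is only a sketch of the kind of setup it presumably contains:

    # Assumed driver setup: without the 'goog:loggingPrefs' capability,
    # driver.get_log('performance') returns no entries.
    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
    driver = webdriver.Chrome(options=options)
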
+    def _parse_video_details_from_page(self, video_id: str, video_details: dict, max_comments: int = 20) -> dict:
+        """Parse video details from page elements (fallback).
+
+        Args:
+            video_id: the video ID
+            video_details: the existing video details dict
+            max_comments: maximum number of comments
+        Returns:
+            dict: the updated video details dict
+        """
+        try:
+            logging.info(f'Trying to parse details for video {video_id} from page elements')
+
+            # Try to parse SSR data embedded in the page
+            try:
+                # Find script tags that contain video data
+                scripts = self.driver.find_elements("tag name", "script")
+                for script in scripts:
+                    script_content = script.get_attribute('innerHTML')
+                    if script_content and ('window._SSR_HYDRATED_DATA' in script_content or 'RENDER_DATA' in script_content):
+                        # Extract the JSON payload
+                        if 'window._SSR_HYDRATED_DATA' in script_content:
+                            match = re.search(r'window\._SSR_HYDRATED_DATA\s*=\s*({.*?});', script_content, re.DOTALL)
+                        else:
+                            match = re.search(r'window\.RENDER_DATA\s*=\s*({.*?});', script_content, re.DOTALL)
+
+                        if match:
+                            data = json.loads(match.group(1))
+
+                            # Locate the video detail data
+                            def find_video_data(obj, target_id):
+                                if isinstance(obj, dict):
+                                    for key, value in obj.items():
+                                        if key == 'aweme_id' and str(value) == str(target_id):
+                                            return obj
+                                        elif isinstance(value, (dict, list)):
+                                            result = find_video_data(value, target_id)
+                                            if result:
+                                                return result
+                                elif isinstance(obj, list):
+                                    for item in obj:
+                                        result = find_video_data(item, target_id)
+                                        if result:
+                                            return result
+                                return None
+
+                            video_data = find_video_data(data, video_id)
+                            if video_data:
+                                statistics = video_data.get('statistics', {})
+                                video_details['likes'] = int(statistics.get('digg_count', 0))
+                                video_details['shares'] = int(statistics.get('share_count', 0))
+                                video_details['favorites'] = int(statistics.get('collect_count', 0))
+
+                                # Add formatted fields
+                                video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
+                                video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
+                                video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])
+
+                                logging.info(f'Parsed interactions for video {video_id} from SSR data: likes={video_details["likes_formatted"]}, shares={video_details["shares_formatted"]}, favorites={video_details["favorites_formatted"]}')
+                                break
+
+            except Exception as e:
+                logging.warning(f'Failed to parse SSR data: {e}')
+
+            # If SSR parsing failed, try CSS selectors
+            if video_details['likes'] == 0 and video_details['shares'] == 0 and video_details['favorites'] == 0:
+                try:
+                    # Try common like/share/favorite count selectors
+                    selectors = {
+                        'likes': [
+                            '[data-e2e="video-like-count"]',
+                            '[class*="like"] [class*="count"]',
+                            '[class*="digg"] [class*="count"]'
+                        ],
+                        'shares': [
+                            '[data-e2e="video-share-count"]',
+                            '[class*="share"] [class*="count"]'
+                        ],
+                        'favorites': [
+                            '[data-e2e="video-collect-count"]',
+                            '[class*="collect"] [class*="count"]',
+                            '[class*="favorite"] [class*="count"]'
+                        ]
+                    }
+
+                    for data_type, selector_list in selectors.items():
+                        for selector in selector_list:
+                            try:
+                                elements = self.driver.find_elements("css selector", selector)
+                                if elements:
+                                    text = elements[0].text.strip()
+                                    if text and text.replace('.', '').replace('万', '').replace('亿', '').isdigit():
+                                        # Convert the displayed format back to an integer
+                                        if '亿' in text:
+                                            video_details[data_type] = int(float(text.replace('亿', '')) * 100000000)
+                                        elif '万' in text:
+                                            video_details[data_type] = int(float(text.replace('万', '')) * 10000)
+                                        else:
+                                            video_details[data_type] = int(text)
+                                        break
+                            except Exception:
+                                continue
+
+                    if video_details['likes'] > 0 or video_details['shares'] > 0 or video_details['favorites'] > 0:
+                        # Add formatted fields
+                        video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
+                        video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
+                        video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])
+
+                        logging.info(f'Parsed interactions for video {video_id} from page elements: likes={video_details["likes_formatted"]}, shares={video_details["shares_formatted"]}, favorites={video_details["favorites_formatted"]}')
+
+                except Exception as e:
+                    logging.warning(f'CSS selector parsing failed: {e}')
+
+            # Try to fetch comments (if none were collected yet)
+            if not video_details['comments']:
+                try:
+                    # Scroll to the comment area
+                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                    time.sleep(2)
+
+                    # Try common comment selectors
+                    comment_selectors = [
+                        '[data-e2e="comment-item"]',
+                        '[class*="comment-item"]',
+                        '[class*="comment"] [class*="content"]'
+                    ]
+
+                    for selector in comment_selectors:
+                        try:
+                            comment_elements = self.driver.find_elements("css selector", selector)[:max_comments]
+                            if comment_elements:
+                                for element in comment_elements:
+                                    try:
+                                        comment_text = element.text.strip()
+                                        if comment_text:
+                                            comment_info = {
+                                                'text': comment_text,
+                                                'user_name': '',
+                                                'digg_count': 0,
+                                                'create_time': 0
+                                            }
+                                            video_details['comments'].append(comment_info)
+                                    except Exception:
+                                        continue
+
+                                if video_details['comments']:
+                                    logging.info(f'Fetched {len(video_details["comments"])} comments for video {video_id} from page elements')
+                                    break
+                        except Exception:
+                            continue
+
+                except Exception as e:
+                    logging.warning(f'Failed to fetch comments: {e}')
+
+        except Exception as e:
+            logging.warning(f'Failed to parse video details from page: {e}')
+
+        return video_details
+    def get_collection_video_details(self, episode_video_ids: list, mix_name: str = '', max_comments_per_video: int = 10) -> list:
+        """Fetch detailed interaction data for every video in a collection.
+
+        Args:
+            episode_video_ids: list of video IDs
+            mix_name: collection name, used for logging
+            max_comments_per_video: maximum comments per video, default 10
+        Returns:
+            list: one detail dict per video
+        """
+        # Skip this function in timer mode
+        if os.environ.get('TIMER_MODE') == '1':
+            logging.info('Timer mode: skipping get_collection_video_details')
+            return []
+
+        if not episode_video_ids:
+            logging.info(f'Collection {mix_name} has no video IDs; skipping detail fetch')
+            return []
+
+        logging.info(f'Starting to fetch details for {len(episode_video_ids)} videos in collection {mix_name}')
+
+        video_details_list = []
+
+        for i, video_id in enumerate(episode_video_ids, 1):
+            if not video_id:
+                logging.warning(f'Collection {mix_name} episode {i} has an empty video ID; skipping')
+                video_details_list.append({
+                    'episode_number': i,
+                    'video_id': '',
+                    'likes': 0,
+                    'shares': 0,
+                    'favorites': 0,
+                    'comments': [],
+                    'success': False,
+                    'error': 'empty video ID'
+                })
+                continue
+
+            logging.info(f'Fetching details for collection {mix_name} episode {i}/{len(episode_video_ids)}: {video_id}')
+
+            try:
+                # Fetch details for a single video
+                video_details = self.get_video_details(video_id, max_comments_per_video)
+                video_details['episode_number'] = i
+                video_details_list.append(video_details)
+
+                # Add a delay to avoid requesting too quickly
+                time.sleep(2)
+
+            except Exception as e:
+                error_msg = f'Error fetching details for video {video_id}: {e}'
+                logging.error(error_msg)
+                video_details_list.append({
+                    'episode_number': i,
+                    'video_id': video_id,
+                    'likes': 0,
+                    'shares': 0,
+                    'favorites': 0,
+                    'comments': [],
+                    'success': False,
+                    'error': error_msg
+                })

+        # Aggregate results
+        success_count = sum(1 for detail in video_details_list if detail.get('success', False))
+        total_likes = sum(detail.get('likes', 0) for detail in video_details_list)
+        total_comments = sum(len(detail.get('comments', [])) for detail in video_details_list)
+
+        logging.info(f'Collection {mix_name} detail fetch complete: {success_count}/{len(episode_video_ids)} succeeded, total likes={total_likes:,}, total comments={total_comments}')
+
+        return video_details_list
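
Hypothetical standalone usage of the method above (normal mode, so the TIMER_MODE guard does not trip; the constructor arguments mirror the scheduler code earlier in this diff, and the video IDs are placeholders):

    scraper = DouyinPlayVVScraper(auto_continue=True, duration_s=60)
    details = scraper.get_collection_video_details(
        episode_video_ids=['7300000000000000001', '7300000000000000002'],  # placeholder IDs
        mix_name='example-mix',
        max_comments_per_video=10
    )
    for d in details:
        print(d['episode_number'], d.get('likes_formatted', '0'), len(d['comments']))
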
     def get_cookies_dict(self):
         """Get the current page's cookies."""
         if not hasattr(self, 'cookies') or not self.cookies:
@@ -135,7 +135,8 @@ def format_mix_item(doc):
         "updated_to_episode": doc.get("updated_to_episode", 0),
         "cover_backup_urls": doc.get("cover_backup_urls", []),
         "mix_id": doc.get("mix_id", ""),
-        "episode_video_ids": doc.get("episode_video_ids", [])
+        "episode_video_ids": doc.get("episode_video_ids", []),
+        "episode_details": doc.get("episode_details", [])
     }

 def get_mix_list(page=1, limit=20, sort_by="playcount"):