1.添加判断代码,启动定时器时不调用主代码的某几个函数,确保定时器正常计算播放量差值
2.新增功能:获取点赞、收藏、转发数量 + 评论内容列表(不完整,正在继续优化)
3.增加数据库文件夹:启动定时器时,数据存储到 Ranking_storage_list 中,并按照 Ranking_storage_list 中的数据计算播放量差值,计算结果存入 Ranking_storage 中;单独运行 rank_data_scraper.py 时,数据存入 Rankings_list。
原因:Rankings_list 里面存储的数据结构较多;Ranking_storage_list 里面存储的主要是播放量;Ranking_storage 里面存入的是播放量差值。
This commit is contained in:
parent
8b1149da56
commit
2a32b2a8c0
@ -61,6 +61,8 @@ class DouyinAutoScheduler:
|
|||||||
|
|
||||||
# 设置环境变量,确保自动模式
|
# 设置环境变量,确保自动模式
|
||||||
os.environ['AUTO_CONTINUE'] = '1'
|
os.environ['AUTO_CONTINUE'] = '1'
|
||||||
|
# 设置定时器模式环境变量,跳过评论抓取等函数
|
||||||
|
os.environ['TIMER_MODE'] = '1'
|
||||||
|
|
||||||
# 直接创建并运行 DouyinPlayVVScraper 实例
|
# 直接创建并运行 DouyinPlayVVScraper 实例
|
||||||
scraper = DouyinPlayVVScraper(
|
scraper = DouyinPlayVVScraper(
|
||||||
@ -89,7 +91,7 @@ class DouyinAutoScheduler:
|
|||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
|
||||||
# 获取集合
|
# 获取集合
|
||||||
douyin_collection = db['Rankings_list'] # 使用真实抓取的数据
|
douyin_collection = db['Ranking_storage_list'] # 使用定时器抓取的数据
|
||||||
rankings_collection = db['Ranking_storage']
|
rankings_collection = db['Ranking_storage']
|
||||||
|
|
||||||
today = date.today()
|
today = date.today()
|
||||||
@ -107,10 +109,20 @@ class DouyinAutoScheduler:
|
|||||||
try:
|
try:
|
||||||
logging.info("🔄 正在生成时间轴对比榜单...")
|
logging.info("🔄 正在生成时间轴对比榜单...")
|
||||||
|
|
||||||
# 获取今天的数据,按短剧名称去重,只保留播放量最高的
|
# 获取最新批次的数据
|
||||||
today_videos_raw = list(douyin_collection.find({}).sort("play_vv", -1))
|
latest_batch = douyin_collection.find_one(sort=[("batch_time", -1)])
|
||||||
|
if not latest_batch:
|
||||||
|
logging.warning("⚠️ 未找到任何数据")
|
||||||
|
return False
|
||||||
|
|
||||||
# 按短剧名称去重,每个短剧只保留播放量最高的一条
|
latest_batch_time = latest_batch.get("batch_time")
|
||||||
|
logging.info(f"📊 找到最新批次时间: {latest_batch_time}")
|
||||||
|
|
||||||
|
# 只获取最新批次的数据
|
||||||
|
today_videos_raw = list(douyin_collection.find({"batch_time": latest_batch_time}).sort("play_vv", -1))
|
||||||
|
logging.info(f"📊 最新批次数据数量: {len(today_videos_raw)}")
|
||||||
|
|
||||||
|
# 按短剧名称去重(虽然同一批次应该不会有重复,但为了代码健壮性保留此逻辑)
|
||||||
unique_videos = {}
|
unique_videos = {}
|
||||||
for video in today_videos_raw:
|
for video in today_videos_raw:
|
||||||
mix_name = video.get("mix_name", "")
|
mix_name = video.get("mix_name", "")
|
||||||
@ -121,26 +133,36 @@ class DouyinAutoScheduler:
|
|||||||
|
|
||||||
logging.info(f"📊 今日数据去重后:{len(today_videos)} 个独特短剧(原始数据:{len(today_videos_raw)} 条)")
|
logging.info(f"📊 今日数据去重后:{len(today_videos)} 个独特短剧(原始数据:{len(today_videos_raw)} 条)")
|
||||||
|
|
||||||
# 获取昨天的榜单数据(如果存在),取最新的计算结果
|
# 获取昨天最后一批次的数据
|
||||||
yesterday_ranking = rankings_collection.find_one({
|
yesterday_batch = douyin_collection.find_one({
|
||||||
"date": yesterday_str,
|
"batch_time": {"$regex": f"^{yesterday_str}"}
|
||||||
"type": "comprehensive"
|
}, sort=[("batch_time", -1)])
|
||||||
}, sort=[("calculation_sequence", -1)])
|
|
||||||
|
|
||||||
yesterday_data = {}
|
yesterday_data = {}
|
||||||
if yesterday_ranking and "data" in yesterday_ranking:
|
if yesterday_batch:
|
||||||
# 将昨天的数据转换为字典,以短剧名称为键
|
# 获取昨天最后一批次的所有数据
|
||||||
for item in yesterday_ranking["data"]:
|
yesterday_videos = list(douyin_collection.find({
|
||||||
title = item.get("title", "")
|
"batch_time": yesterday_batch["batch_time"]
|
||||||
if title:
|
}).sort("play_vv", -1))
|
||||||
yesterday_data[title] = {
|
|
||||||
"rank": item.get("rank", 0),
|
# 按短剧名称去重,保留播放量最高的记录
|
||||||
"play_vv": item.get("play_vv", 0),
|
for video in yesterday_videos:
|
||||||
"video_id": item.get("video_id", "")
|
mix_name = video.get("mix_name", "")
|
||||||
|
if mix_name and (mix_name not in yesterday_data or video.get("play_vv", 0) > yesterday_data[mix_name].get("play_vv", 0)):
|
||||||
|
yesterday_data[mix_name] = {
|
||||||
|
"play_vv": video.get("play_vv", 0),
|
||||||
|
"video_id": str(video.get("_id", "")),
|
||||||
|
"rank": 0 # 稍后计算排名
|
||||||
}
|
}
|
||||||
logging.info(f"📊 找到昨天的榜单数据,共 {len(yesterday_data)} 个短剧")
|
|
||||||
|
# 计算排名
|
||||||
|
sorted_videos = sorted(yesterday_data.items(), key=lambda x: x[1]["play_vv"], reverse=True)
|
||||||
|
for rank, (mix_name, data) in enumerate(sorted_videos, 1):
|
||||||
|
yesterday_data[mix_name]["rank"] = rank
|
||||||
|
|
||||||
|
logging.info(f"📊 找到昨天的原始数据,共 {len(yesterday_data)} 个短剧")
|
||||||
else:
|
else:
|
||||||
logging.info("📊 未找到昨天的榜单数据,将作为首次生成")
|
logging.info("📊 未找到昨天的原始数据,将作为首次生成")
|
||||||
|
|
||||||
if today_videos:
|
if today_videos:
|
||||||
# 先计算所有视频的播放量差值
|
# 先计算所有视频的播放量差值
|
||||||
|
|||||||
@ -83,11 +83,13 @@ class DouyinPlayVVScraper:
|
|||||||
# 使用 database.py 中的连接
|
# 使用 database.py 中的连接
|
||||||
self.db = db
|
self.db = db
|
||||||
|
|
||||||
# 设置集合
|
# 根据运行模式选择集合
|
||||||
mongo_collection = os.environ.get('MONGO_COLLECTION', 'Rankings_list')
|
is_timer_mode = os.environ.get('TIMER_MODE') == '1'
|
||||||
|
mongo_collection = 'Ranking_storage_list' if is_timer_mode else 'Rankings_list'
|
||||||
self.collection = self.db[mongo_collection]
|
self.collection = self.db[mongo_collection]
|
||||||
|
|
||||||
logging.info(f'MongoDB连接成功,使用数据库: {self.db.name},集合: {mongo_collection}')
|
logging.info(f'MongoDB连接成功,使用数据库: {self.db.name},集合: {mongo_collection}')
|
||||||
|
logging.info(f'当前运行模式: {"定时器模式" if is_timer_mode else "普通模式"}')
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f'MongoDB连接失败: {e}')
|
logging.error(f'MongoDB连接失败: {e}')
|
||||||
@ -448,6 +450,28 @@ class DouyinPlayVVScraper:
|
|||||||
return f"{n/10_000:.1f}万"
|
return f"{n/10_000:.1f}万"
|
||||||
return str(n)
|
return str(n)
|
||||||
|
|
||||||
|
def format_interaction_count(self, n: int) -> str:
    """Format an interaction count (likes/shares/favorites) with a unit suffix.

    Counts of at least 100,000,000 are expressed in units of 亿 and counts of
    at least 10,000 in units of W, with at most one decimal place. A trailing
    ".0" is always dropped, so values that merely *round* to a whole number
    are rendered the same way as exact multiples. Smaller counts are returned
    as plain digits.

    Args:
        n: The count to format. ``None`` is treated as 0.

    Returns:
        str: The formatted count, e.g. 27898 -> "2.8W", 20000 -> "2W",
        20001 -> "2W", 150000000 -> "1.5亿", 1234 -> "1234".
    """
    if n is None:
        return "0"

    if n >= 100_000_000:
        scaled, unit = n / 100_000_000, "亿"
    elif n >= 10_000:
        scaled, unit = n / 10_000, "W"
    else:
        return str(n)

    text = f"{scaled:.1f}"

    # A value just below 100,000,000 can round up to "10000.0" W; express it
    # in the next unit instead of emitting a five-digit W figure.
    if unit == "W" and float(text) >= 10_000:
        text, unit = f"{n / 100_000_000:.1f}", "亿"

    # Drop a redundant ".0" regardless of whether the division was exact.
    if text.endswith(".0"):
        text = text[:-2]

    return f"{text}{unit}"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def parse_play_vv_from_text(self, text: str, source_url: str, request_id: str = None):
|
def parse_play_vv_from_text(self, text: str, source_url: str, request_id: str = None):
|
||||||
@ -1009,6 +1033,80 @@ class DouyinPlayVVScraper:
|
|||||||
)
|
)
|
||||||
logging.info(f'合集 {mix_name} 共获取到 {len(episode_video_ids)} 个视频ID')
|
logging.info(f'合集 {mix_name} 共获取到 {len(episode_video_ids)} 个视频ID')
|
||||||
|
|
||||||
|
# 获取每个视频的详细互动数据
|
||||||
|
logging.info(f'开始获取合集 {mix_name} 的视频详细互动数据')
|
||||||
|
video_details_list = self.get_collection_video_details(
|
||||||
|
episode_video_ids=episode_video_ids,
|
||||||
|
mix_name=mix_name,
|
||||||
|
max_comments_per_video=10 # 每个视频最多获取10条评论
|
||||||
|
)
|
||||||
|
|
||||||
|
# 构建每集的详细信息,使用获取到的真实数据
|
||||||
|
episode_details = []
|
||||||
|
total_episodes = item.get('updated_to_episode', 0)
|
||||||
|
|
||||||
|
for i in range(total_episodes):
|
||||||
|
episode_number = i + 1
|
||||||
|
video_id = episode_video_ids[i] if i < len(episode_video_ids) else ''
|
||||||
|
|
||||||
|
# 查找对应的视频详细数据
|
||||||
|
video_detail = None
|
||||||
|
if i < len(video_details_list):
|
||||||
|
video_detail = video_details_list[i]
|
||||||
|
|
||||||
|
if video_detail and video_detail.get('success', False):
|
||||||
|
# 使用获取到的真实数据
|
||||||
|
likes = video_detail.get('likes', 0)
|
||||||
|
shares = video_detail.get('shares', 0)
|
||||||
|
favorites = video_detail.get('favorites', 0)
|
||||||
|
|
||||||
|
episode_info = {
|
||||||
|
'episode_number': episode_number,
|
||||||
|
'video_id': video_id,
|
||||||
|
'likes': likes,
|
||||||
|
'shares': shares,
|
||||||
|
'favorites': favorites,
|
||||||
|
'likes_formatted': self.format_interaction_count(likes),
|
||||||
|
'shares_formatted': self.format_interaction_count(shares),
|
||||||
|
'favorites_formatted': self.format_interaction_count(favorites),
|
||||||
|
'comments': video_detail.get('comments', [])
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
# 使用默认值
|
||||||
|
episode_info = {
|
||||||
|
'episode_number': episode_number,
|
||||||
|
'video_id': video_id,
|
||||||
|
'likes': 0,
|
||||||
|
'shares': 0,
|
||||||
|
'favorites': 0,
|
||||||
|
'likes_formatted': '0',
|
||||||
|
'shares_formatted': '0',
|
||||||
|
'favorites_formatted': '0',
|
||||||
|
'comments': []
|
||||||
|
}
|
||||||
|
|
||||||
|
episode_details.append(episode_info)
|
||||||
|
|
||||||
|
# 统计获取到的数据
|
||||||
|
total_likes = sum(ep.get('likes', 0) for ep in episode_details)
|
||||||
|
total_comments = sum(len(ep.get('comments', [])) for ep in episode_details)
|
||||||
|
logging.info(f'合集 {mix_name} 详细数据统计: 总点赞数={total_likes:,}, 总评论数={total_comments}')
|
||||||
|
else:
|
||||||
|
# 如果没有获取到视频ID,使用默认的episode_details
|
||||||
|
episode_details = [
|
||||||
|
{
|
||||||
|
'episode_number': i + 1,
|
||||||
|
'video_id': '',
|
||||||
|
'likes': 0,
|
||||||
|
'shares': 0,
|
||||||
|
'favorites': 0,
|
||||||
|
'likes_formatted': '0',
|
||||||
|
'shares_formatted': '0',
|
||||||
|
'favorites_formatted': '0',
|
||||||
|
'comments': []
|
||||||
|
} for i in range(item.get('updated_to_episode', 0))
|
||||||
|
]
|
||||||
|
|
||||||
# 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + 新增字段
|
# 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + 新增字段
|
||||||
doc = {
|
doc = {
|
||||||
'batch_time': batch_time,
|
'batch_time': batch_time,
|
||||||
@ -1025,7 +1123,8 @@ class DouyinPlayVVScraper:
|
|||||||
'series_author': item.get('series_author', ''), # 合集作者/影视工作室
|
'series_author': item.get('series_author', ''), # 合集作者/影视工作室
|
||||||
'desc': item.get('desc', ''), # 合集描述
|
'desc': item.get('desc', ''), # 合集描述
|
||||||
'updated_to_episode': item.get('updated_to_episode', 0), # 合集总集数
|
'updated_to_episode': item.get('updated_to_episode', 0), # 合集总集数
|
||||||
'episode_video_ids': episode_video_ids # 每一集的视频ID列表
|
'episode_video_ids': episode_video_ids, # 每一集的视频ID列表
|
||||||
|
'episode_details': episode_details # 每集的详细信息
|
||||||
}
|
}
|
||||||
documents.append(doc)
|
documents.append(doc)
|
||||||
|
|
||||||
@ -1095,6 +1194,7 @@ class DouyinPlayVVScraper:
|
|||||||
if (
|
if (
|
||||||
'Network.responseReceived' in log['method']
|
'Network.responseReceived' in log['method']
|
||||||
and 'response' in log['params']
|
and 'response' in log['params']
|
||||||
|
and log['params']['response']
|
||||||
and 'url' in log['params']['response']
|
and 'url' in log['params']['response']
|
||||||
and '/web/api/v2/aweme/iteminfo' in log['params']['response']['url']
|
and '/web/api/v2/aweme/iteminfo' in log['params']['response']['url']
|
||||||
):
|
):
|
||||||
@ -1130,6 +1230,11 @@ class DouyinPlayVVScraper:
|
|||||||
Returns:
|
Returns:
|
||||||
list: 按集数排序的视频ID列表
|
list: 按集数排序的视频ID列表
|
||||||
"""
|
"""
|
||||||
|
# 定时器模式下跳过此函数
|
||||||
|
if os.environ.get('TIMER_MODE') == '1':
|
||||||
|
logging.info(f'定时器模式:跳过 get_collection_videos 函数')
|
||||||
|
return []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 检查缓存文件
|
# 检查缓存文件
|
||||||
cache_dir = os.path.join(os.path.dirname(__file__), 'episode_video_ids')
|
cache_dir = os.path.join(os.path.dirname(__file__), 'episode_video_ids')
|
||||||
@ -1273,6 +1378,374 @@ class DouyinPlayVVScraper:
|
|||||||
return [video['video_id'] for video in cached_videos]
|
return [video['video_id'] for video in cached_videos]
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
def get_video_details(self, video_id: str, max_comments: int = 20) -> dict:
    """Fetch like/share/favorite counts and comments for a single video.

    The video page is opened in ``self.driver`` and the driver's
    ``performance`` log is scanned for the video-detail and comment-list API
    responses whose URL contains ``video_id``. If all three counts are still
    0 afterwards, ``self._parse_video_details_from_page`` is used as a
    fallback.

    Args:
        video_id: ID of the video to inspect.
        max_comments: Maximum number of comments to collect in total across
            all captured comment-list responses.

    Returns:
        dict: Keys ``video_id``, ``likes``, ``shares``, ``favorites``,
        ``likes_formatted``, ``shares_formatted``, ``favorites_formatted``,
        ``comments`` (list of dicts with ``text``, ``user_name``,
        ``digg_count``, ``create_time``), ``success`` and ``error``.
        ``success`` is True whenever the page was processed without an
        unhandled exception, even if every count stayed 0. On failure
        ``success`` stays False and ``error`` holds the message.
    """
    video_details = {
        'video_id': video_id,
        'likes': 0,
        'shares': 0,
        'favorites': 0,
        'likes_formatted': '0',
        'shares_formatted': '0',
        'favorites_formatted': '0',
        'comments': [],
        'success': False,
        'error': None
    }
    # URL matching needs a string even if the caller passed a numeric ID.
    id_text = str(video_id)

    try:
        # Make sure a WebDriver exists before navigating.
        if self.driver is None:
            logging.info('Driver未初始化,正在设置...')
            self.setup_driver()
            if self.driver is None:
                raise Exception("无法初始化WebDriver")

        video_url = f'https://www.douyin.com/video/{video_id}'
        logging.info(f'获取视频详细数据: {video_url}')

        self.driver.get(video_url)
        time.sleep(3)

        # Best-effort wait for the <video> element; a failure here is only
        # logged because the network log may already hold the data we need.
        try:
            from selenium.webdriver.support.ui import WebDriverWait
            from selenium.webdriver.support import expected_conditions as EC
            from selenium.webdriver.common.by import By

            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "video"))
            )
        except Exception as e:
            logging.warning(f'等待视频元素超时: {e}')

        # Scan the captured network events for the two API responses.
        for entry in self.driver.get_log('performance'):
            try:
                log = json.loads(entry['message'])['message']
                if 'Network.responseReceived' not in log['method']:
                    continue

                params = log['params']
                url = (params.get('response') or {}).get('url')
                if not url or id_text not in url:
                    continue

                if '/aweme/v1/web/aweme/detail/' in url:
                    self._apply_detail_response(params.get('requestId'), video_id, video_details)
                elif '/aweme/v1/web/comment/list/' in url:
                    self._collect_comment_response(
                        params.get('requestId'), video_id, video_details, max_comments
                    )
            except Exception as e:
                # Malformed log entries are expected; skip them but leave a trace.
                logging.debug(f'跳过无法解析的性能日志条目: {e}')

        # Nothing found in the network log: fall back to parsing the page.
        if video_details['likes'] == 0 and video_details['shares'] == 0 and video_details['favorites'] == 0:
            video_details = self._parse_video_details_from_page(video_id, video_details, max_comments)

        video_details['success'] = True
        return video_details

    except Exception as e:
        error_msg = f'获取视频 {video_id} 详细数据失败: {e}'
        logging.error(error_msg)
        video_details['error'] = error_msg
        return video_details


def _read_cdp_response_json(self, request_id):
    """Return the parsed JSON body of a captured response, or None if it has no body."""
    response_body = self.driver.execute_cdp_cmd(
        'Network.getResponseBody',
        {'requestId': request_id}
    )
    if not response_body or 'body' not in response_body:
        return None

    body = response_body['body']
    # The DevTools protocol flags bodies it had to base64-encode.
    if response_body.get('base64Encoded'):
        import base64
        body = base64.b64decode(body)
    return json.loads(body)


def _apply_detail_response(self, request_id, video_id, video_details: dict) -> None:
    """Copy the counts from a video-detail API response into ``video_details``."""
    try:
        data = self._read_cdp_response_json(request_id)
        aweme_detail = data.get('aweme_detail') if isinstance(data, dict) else None
        if not aweme_detail:
            return

        # ``or`` guards against explicit nulls in the payload.
        statistics = aweme_detail.get('statistics') or {}
        video_details['likes'] = int(statistics.get('digg_count') or 0)
        video_details['shares'] = int(statistics.get('share_count') or 0)
        video_details['favorites'] = int(statistics.get('collect_count') or 0)

        video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
        video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
        video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])

        logging.info(f'视频 {video_id} 互动数据: 点赞={video_details["likes_formatted"]}, 分享={video_details["shares_formatted"]}, 收藏={video_details["favorites_formatted"]}')
    except Exception as e:
        logging.warning(f'解析视频详情API响应失败: {e}')


def _collect_comment_response(self, request_id, video_id, video_details: dict, max_comments: int) -> None:
    """Append comments from a comment-list API response, capped at ``max_comments`` in total."""
    try:
        data = self._read_cdp_response_json(request_id)
        if data is None:
            return

        comments = (data.get('comments') if isinstance(data, dict) else None) or []
        # Enforce the cap across all responses, not per response.
        remaining = max(max_comments - len(video_details['comments']), 0)

        for comment in comments[:remaining]:
            user = comment.get('user') or {}
            video_details['comments'].append({
                'text': comment.get('text', ''),
                'user_name': user.get('nickname', ''),
                'digg_count': int(comment.get('digg_count') or 0),
                'create_time': comment.get('create_time', 0)
            })

        logging.info(f'视频 {video_id} 获取到 {len(video_details["comments"])} 条评论')
    except Exception as e:
        logging.warning(f'解析评论API响应失败: {e}')
|
||||||
|
|
||||||
|
def _parse_video_details_from_page(self, video_id: str, video_details: dict, max_comments: int = 20) -> dict:
    """Fill in video interaction data from the rendered page (fallback path).

    Three best-effort steps are tried on the page currently loaded in
    ``self.driver``:

    1. Look for ``window._SSR_HYDRATED_DATA`` / ``window.RENDER_DATA`` JSON
       in ``<script>`` tags and read the ``statistics`` of the object whose
       ``aweme_id`` equals ``video_id``.
    2. If all counts are still 0, read the counts from on-page elements via
       a list of CSS selectors.
    3. If no comments were collected yet, scroll to the page bottom and read
       comment text from on-page elements.

    Every step logs and swallows its own errors; this method does not raise.

    Args:
        video_id: ID of the video being parsed.
        video_details: Existing details dict; it is updated in place.
        max_comments: Maximum number of comment elements to read.

    Returns:
        dict: The same ``video_details`` object, possibly updated.
    """
    count_keys = ('likes', 'shares', 'favorites')

    try:
        logging.info(f'尝试从页面元素解析视频 {video_id} 的详细数据')

        # Step 1: server-rendered JSON embedded in <script> tags.
        try:
            scripts = self.driver.find_elements("tag name", "script")
        except Exception as e:
            logging.warning(f'解析SSR数据失败: {e}')
            scripts = []

        for script in scripts:
            # Each script is handled on its own so one unparsable blob does
            # not stop the remaining scripts from being examined.
            try:
                data = self._extract_ssr_json(script.get_attribute('innerHTML'))
                if data is None:
                    continue

                video_data = self._find_aweme_by_id(data, video_id)
                if not video_data:
                    continue

                statistics = video_data.get('statistics') or {}
                video_details['likes'] = int(statistics.get('digg_count') or 0)
                video_details['shares'] = int(statistics.get('share_count') or 0)
                video_details['favorites'] = int(statistics.get('collect_count') or 0)

                video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
                video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
                video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])

                logging.info(f'从SSR数据解析到视频 {video_id} 互动数据: 点赞={video_details["likes_formatted"]}, 分享={video_details["shares_formatted"]}, 收藏={video_details["favorites_formatted"]}')
                break
            except Exception as e:
                logging.warning(f'解析SSR数据失败: {e}')

        # Step 2: visible counters, only when step 1 produced nothing.
        if all(video_details[key] == 0 for key in count_keys):
            try:
                selectors = {
                    'likes': [
                        '[data-e2e="video-like-count"]',
                        '[class*="like"] [class*="count"]',
                        '[class*="digg"] [class*="count"]'
                    ],
                    'shares': [
                        '[data-e2e="video-share-count"]',
                        '[class*="share"] [class*="count"]'
                    ],
                    'favorites': [
                        '[data-e2e="video-collect-count"]',
                        '[class*="collect"] [class*="count"]',
                        '[class*="favorite"] [class*="count"]'
                    ]
                }

                for data_type, selector_list in selectors.items():
                    for selector in selector_list:
                        try:
                            elements = self.driver.find_elements("css selector", selector)
                            if not elements:
                                continue
                            count = self._parse_count_text(elements[0].text)
                            if count is not None:
                                video_details[data_type] = count
                                # First selector yielding a number wins.
                                break
                        except Exception:
                            continue

                if any(video_details[key] > 0 for key in count_keys):
                    video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
                    video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
                    video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])

                    logging.info(f'从页面元素解析到视频 {video_id} 互动数据: 点赞={video_details["likes_formatted"]}, 分享={video_details["shares_formatted"]}, 收藏={video_details["favorites_formatted"]}')

            except Exception as e:
                logging.warning(f'CSS选择器解析失败: {e}')

        # Step 3: comment text from the page, only when none were captured.
        if not video_details['comments']:
            try:
                # Scroll to the bottom of the page before looking for comments.
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

                comment_selectors = [
                    '[data-e2e="comment-item"]',
                    '[class*="comment-item"]',
                    '[class*="comment"] [class*="content"]'
                ]

                for selector in comment_selectors:
                    try:
                        comment_elements = self.driver.find_elements("css selector", selector)[:max_comments]
                        if not comment_elements:
                            continue

                        for element in comment_elements:
                            try:
                                comment_text = element.text.strip()
                                if comment_text:
                                    # Only the text is available from the DOM;
                                    # the other fields keep neutral defaults.
                                    video_details['comments'].append({
                                        'text': comment_text,
                                        'user_name': '',
                                        'digg_count': 0,
                                        'create_time': 0
                                    })
                            except Exception:
                                continue

                        if video_details['comments']:
                            logging.info(f'从页面元素获取到视频 {video_id} 的 {len(video_details["comments"])} 条评论')
                            break
                    except Exception:
                        continue

            except Exception as e:
                logging.warning(f'获取评论失败: {e}')

    except Exception as e:
        logging.warning(f'页面解析视频详细数据失败: {e}')

    return video_details


def _extract_ssr_json(self, script_content):
    """Return the object assigned to the SSR/RENDER_DATA global in a script, or None."""
    if not script_content:
        return None

    if 'window._SSR_HYDRATED_DATA' in script_content:
        pattern = r'window\._SSR_HYDRATED_DATA\s*=\s*(?=\{)'
    elif 'RENDER_DATA' in script_content:
        pattern = r'window\.RENDER_DATA\s*=\s*(?=\{)'
    else:
        return None

    match = re.search(pattern, script_content)
    if not match:
        return None

    # raw_decode parses exactly one JSON value starting at the brace, so a
    # "};" inside a string value cannot truncate the document.
    data, _ = json.JSONDecoder().raw_decode(script_content, match.end())
    return data


def _find_aweme_by_id(self, obj, target_id):
    """Depth-first search for the dict whose ``aweme_id`` equals ``target_id``; None if absent."""
    if isinstance(obj, dict):
        for key, value in obj.items():
            if key == 'aweme_id' and str(value) == str(target_id):
                return obj
            if isinstance(value, (dict, list)):
                found = self._find_aweme_by_id(value, target_id)
                if found:
                    return found
    elif isinstance(obj, list):
        for item in obj:
            found = self._find_aweme_by_id(item, target_id)
            if found:
                return found
    return None


def _parse_count_text(self, text):
    """Convert counter text such as "1234", "1.2万" or "3亿" to an int; None if not numeric."""
    match = re.fullmatch(r'(\d+(?:\.\d+)?)([万亿]?)', (text or '').strip())
    if not match:
        return None

    multiplier = {'': 1, '万': 10_000, '亿': 100_000_000}[match.group(2)]
    # round() avoids losing a unit to binary floating-point truncation.
    return int(round(float(match.group(1)) * multiplier))
|
||||||
|
|
||||||
|
def get_collection_video_details(self, episode_video_ids: list, mix_name: str = '', max_comments_per_video: int = 10) -> list:
    """Collect interaction data for every video of a collection.

    Each non-empty ID is passed to ``self.get_video_details``, followed by a
    2-second pause. Empty IDs and IDs whose lookup raises are represented by
    a placeholder record with ``success`` False, so the returned list always
    lines up index-for-index with ``episode_video_ids``.

    Args:
        episode_video_ids: Video IDs in episode order.
        mix_name: Collection name, used only in log messages.
        max_comments_per_video: Comment limit forwarded to
            ``get_video_details``.

    Returns:
        list: One dict per input ID, each carrying ``episode_number``
        (1-based) plus the ``get_video_details`` fields. An empty list is
        returned when the ``TIMER_MODE`` environment variable is ``'1'`` or
        when ``episode_video_ids`` is empty.
    """
    # Timer mode skips detail scraping entirely.
    if os.environ.get('TIMER_MODE') == '1':
        logging.info('定时器模式:跳过 get_collection_video_details 函数')
        return []

    if not episode_video_ids:
        logging.info(f'合集 {mix_name} 没有视频ID,跳过详细数据获取')
        return []

    logging.info(f'开始获取合集 {mix_name} 中 {len(episode_video_ids)} 个视频的详细数据')

    def placeholder(episode_number, video_id, error):
        # Same key set as a get_video_details result, so consumers can read
        # the *_formatted fields without special-casing failures.
        return {
            'episode_number': episode_number,
            'video_id': video_id,
            'likes': 0,
            'shares': 0,
            'favorites': 0,
            'likes_formatted': '0',
            'shares_formatted': '0',
            'favorites_formatted': '0',
            'comments': [],
            'success': False,
            'error': error
        }

    video_details_list = []

    for i, video_id in enumerate(episode_video_ids, 1):
        if not video_id:
            logging.warning(f'合集 {mix_name} 第 {i} 集视频ID为空,跳过')
            video_details_list.append(placeholder(i, '', '视频ID为空'))
            continue

        logging.info(f'获取合集 {mix_name} 第 {i}/{len(episode_video_ids)} 集视频详细数据: {video_id}')

        try:
            video_details = self.get_video_details(video_id, max_comments_per_video)
            video_details['episode_number'] = i
            video_details_list.append(video_details)

            # Pause between videos to avoid issuing requests too quickly.
            time.sleep(2)

        except Exception as e:
            error_msg = f'获取视频 {video_id} 详细数据时出错: {e}'
            logging.error(error_msg)
            video_details_list.append(placeholder(i, video_id, error_msg))

    # Summary statistics for the log.
    success_count = sum(1 for detail in video_details_list if detail.get('success', False))
    total_likes = sum(detail.get('likes', 0) for detail in video_details_list)
    total_comments = sum(len(detail.get('comments', [])) for detail in video_details_list)

    logging.info(f'合集 {mix_name} 视频详细数据获取完成: {success_count}/{len(episode_video_ids)} 成功, 总点赞数={total_likes:,}, 总评论数={total_comments}')

    return video_details_list
|
||||||
|
|
||||||
def get_cookies_dict(self):
|
def get_cookies_dict(self):
|
||||||
"""获取当前页面的cookies"""
|
"""获取当前页面的cookies"""
|
||||||
if not hasattr(self, 'cookies') or not self.cookies:
|
if not hasattr(self, 'cookies') or not self.cookies:
|
||||||
|
|||||||
@ -135,7 +135,8 @@ def format_mix_item(doc):
|
|||||||
"updated_to_episode": doc.get("updated_to_episode", 0),
|
"updated_to_episode": doc.get("updated_to_episode", 0),
|
||||||
"cover_backup_urls": doc.get("cover_backup_urls", []),
|
"cover_backup_urls": doc.get("cover_backup_urls", []),
|
||||||
"mix_id": doc.get("mix_id", ""),
|
"mix_id": doc.get("mix_id", ""),
|
||||||
"episode_video_ids": doc.get("episode_video_ids", [])
|
"episode_video_ids": doc.get("episode_video_ids", []),
|
||||||
|
"episode_details": doc.get("episode_details", [])
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_mix_list(page=1, limit=20, sort_by="playcount"):
|
def get_mix_list(page=1, limit=20, sort_by="playcount"):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user