1. Add guard logic: when launched by the timer, skip certain functions in the main scraper so that the timer can compute play-count deltas correctly.

2. New feature: fetch like, favorite, and share counts plus a list of comment contents (incomplete; still being optimized).

3. Add new database collections: timer runs store raw data in Ranking_storage_list, play-count deltas are computed from the data in Ranking_storage_list, and the results are written to Ranking_storage. Running rank_data_scraper.py standalone stores into Rankings_list.

Rationale (a minimal sketch of this flow follows below):
Rankings_list holds documents with a richer structure;
Ranking_storage_list holds mainly play counts;
Ranking_storage holds the computed play-count deltas.
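As a rough sketch of that flow (assuming pymongo, a local MongoDB, and documents shaped like the scraper's output with mix_name, play_vv, and a sortable string batch_time; compute_play_vv_delta and the connection details are illustrative, not the project's actual API):

import os
from pymongo import MongoClient

db = MongoClient('mongodb://localhost:27017')['douyin']  # placeholder connection/db name

def pick_collection():
    # Timer runs write raw rows to Ranking_storage_list; standalone runs write to Rankings_list
    is_timer_mode = os.environ.get('TIMER_MODE') == '1'
    return db['Ranking_storage_list' if is_timer_mode else 'Rankings_list']

def compute_play_vv_delta():
    # Diff the two most recent batches and store the results in Ranking_storage
    src = db['Ranking_storage_list']
    batches = sorted(src.distinct('batch_time'))
    if len(batches) < 2:
        return  # need at least two batches to compute a delta
    previous, latest = batches[-2], batches[-1]
    prev_vv = {d.get('mix_name', ''): d.get('play_vv', 0)
               for d in src.find({'batch_time': previous})}
    deltas = [{'mix_name': d.get('mix_name', ''),
               'play_vv': d.get('play_vv', 0),
               'play_vv_delta': d.get('play_vv', 0) - prev_vv.get(d.get('mix_name', ''), 0),
               'batch_time': latest}
              for d in src.find({'batch_time': latest})]
    if deltas:
        db['Ranking_storage'].insert_many(deltas)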
Qyir · 2025-10-23 10:04:44 +08:00
commit 2a32b2a8c0 (parent 8b1149da56)
3 changed files with 522 additions and 26 deletions


@@ -61,6 +61,8 @@ class DouyinAutoScheduler:
# 设置环境变量,确保自动模式
os.environ['AUTO_CONTINUE'] = '1'
# 设置定时器模式环境变量,跳过评论抓取等函数
os.environ['TIMER_MODE'] = '1'
# 直接创建并运行 DouyinPlayVVScraper 实例
scraper = DouyinPlayVVScraper(
@@ -89,7 +91,7 @@ class DouyinAutoScheduler:
from datetime import timedelta
# 获取集合
- douyin_collection = db['Rankings_list']  # 使用真实抓取的数据
douyin_collection = db['Ranking_storage_list']  # 使用定时器抓取的数据
rankings_collection = db['Ranking_storage']
today = date.today()
@@ -107,10 +109,20 @@
try:
logging.info("🔄 正在生成时间轴对比榜单...")
- # 获取今天的数据,按短剧名称去重,只保留播放量最高的
- today_videos_raw = list(douyin_collection.find({}).sort("play_vv", -1))
- # 按短剧名称去重,每个短剧只保留播放量最高的一条
# 获取最新批次的数据
latest_batch = douyin_collection.find_one(sort=[("batch_time", -1)])
if not latest_batch:
logging.warning("⚠️ 未找到任何数据")
return False
latest_batch_time = latest_batch.get("batch_time")
logging.info(f"📊 找到最新批次时间: {latest_batch_time}")
# 只获取最新批次的数据
today_videos_raw = list(douyin_collection.find({"batch_time": latest_batch_time}).sort("play_vv", -1))
logging.info(f"📊 最新批次数据数量: {len(today_videos_raw)}")
# 按短剧名称去重(虽然同一批次应该不会有重复,但为了代码健壮性保留此逻辑)
unique_videos = {}
for video in today_videos_raw:
mix_name = video.get("mix_name", "")
@@ -121,26 +133,36 @@
logging.info(f"📊 今日数据去重后:{len(today_videos)} 个独特短剧(原始数据:{len(today_videos_raw)} 条)")
- # 获取昨天的榜单数据(如果存在),取最新的计算结果
- yesterday_ranking = rankings_collection.find_one({
- "date": yesterday_str,
- "type": "comprehensive"
- }, sort=[("calculation_sequence", -1)])
# 获取昨天最后一批次的数据
yesterday_batch = douyin_collection.find_one({
"batch_time": {"$regex": f"^{yesterday_str}"}
}, sort=[("batch_time", -1)])
yesterday_data = {}
- if yesterday_ranking and "data" in yesterday_ranking:
- # 将昨天的数据转换为字典,以短剧名称为键
- for item in yesterday_ranking["data"]:
- title = item.get("title", "")
- if title:
- yesterday_data[title] = {
- "rank": item.get("rank", 0),
- "play_vv": item.get("play_vv", 0),
- "video_id": item.get("video_id", "")
- }
- logging.info(f"📊 找到昨天的榜单数据,共 {len(yesterday_data)} 个短剧")
if yesterday_batch:
# 获取昨天最后一批次的所有数据
yesterday_videos = list(douyin_collection.find({
"batch_time": yesterday_batch["batch_time"]
}).sort("play_vv", -1))
# 按短剧名称去重,保留播放量最高的记录
for video in yesterday_videos:
mix_name = video.get("mix_name", "")
if mix_name and (mix_name not in yesterday_data or video.get("play_vv", 0) > yesterday_data[mix_name].get("play_vv", 0)):
yesterday_data[mix_name] = {
"play_vv": video.get("play_vv", 0),
"video_id": str(video.get("_id", "")),
"rank": 0  # 稍后计算排名
}
# 计算排名
sorted_videos = sorted(yesterday_data.items(), key=lambda x: x[1]["play_vv"], reverse=True)
for rank, (mix_name, data) in enumerate(sorted_videos, 1):
yesterday_data[mix_name]["rank"] = rank
logging.info(f"📊 找到昨天的原始数据,共 {len(yesterday_data)} 个短剧")
else:
- logging.info("📊 未找到昨天的榜单数据,将作为首次生成")
logging.info("📊 未找到昨天的原始数据,将作为首次生成")
if today_videos:
# 先计算所有视频的播放量差值
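Note that the $regex lookup above only works if batch_time is stored as a string prefixed with the date (e.g. "2025-10-23 08:00:00"). A minimal sanity check of that query shape, reusing the db handle from the sketch near the top (the date is a placeholder):

yesterday_str = '2025-10-22'  # placeholder
last_batch_yesterday = db['Ranking_storage_list'].find_one(
    {'batch_time': {'$regex': f'^{yesterday_str}'}},
    sort=[('batch_time', -1)],  # latest batch of that day
)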


@@ -83,11 +83,13 @@ class DouyinPlayVVScraper:
# 使用 database.py 中的连接
self.db = db
- # 设置集合
- mongo_collection = os.environ.get('MONGO_COLLECTION', 'Rankings_list')
# 根据运行模式选择集合
is_timer_mode = os.environ.get('TIMER_MODE') == '1'
mongo_collection = 'Ranking_storage_list' if is_timer_mode else 'Rankings_list'
self.collection = self.db[mongo_collection]
logging.info(f'MongoDB连接成功使用数据库: {self.db.name},集合: {mongo_collection}')
logging.info(f'当前运行模式: {"定时器模式" if is_timer_mode else "普通模式"}')
except Exception as e:
logging.error(f'MongoDB连接失败: {e}')
@@ -448,6 +450,28 @@ class DouyinPlayVVScraper:
return f"{n/10_000:.1f}"
return str(n)
def format_interaction_count(self, n: int) -> str:
"""格式化互动数据数量,返回带单位的字符串
Args:
n: 数量
Returns:
str: 格式化后的字符串,如 27898 -> 2.8W, 1234 -> 1234
"""
if n >= 100_000_000:
result = n / 100_000_000
if result == int(result):
return f"{int(result)}亿"
else:
return f"{result:.1f}亿"
elif n >= 10_000:
result = n / 10_000
if result == int(result):
return f"{int(result)}W"
else:
return f"{result:.1f}W"
else:
return str(n)
def parse_play_vv_from_text(self, text: str, source_url: str, request_id: str = None):
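For reference, the expected behavior of the formatter added above, assuming scraper is an initialized DouyinPlayVVScraper (outputs shown as comments):

# Illustrative outputs of format_interaction_count:
#   1234      -> '1234'   (below 10,000: returned as-is)
#   27898     -> '2.8W'   (2.7898 万, one decimal place)
#   50000     -> '5W'     (integral results drop the decimal)
#   120000000 -> '1.2亿'
for n in (1234, 27898, 50000, 120000000):
    print(n, scraper.format_interaction_count(n))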
@@ -1009,6 +1033,80 @@
)
logging.info(f'合集 {mix_name} 共获取到 {len(episode_video_ids)} 个视频ID')
# 获取每个视频的详细互动数据
logging.info(f'开始获取合集 {mix_name} 的视频详细互动数据')
video_details_list = self.get_collection_video_details(
episode_video_ids=episode_video_ids,
mix_name=mix_name,
max_comments_per_video=10 # 每个视频最多获取10条评论
)
# 构建每集的详细信息,使用获取到的真实数据
episode_details = []
total_episodes = item.get('updated_to_episode', 0)
for i in range(total_episodes):
episode_number = i + 1
video_id = episode_video_ids[i] if i < len(episode_video_ids) else ''
# 查找对应的视频详细数据
video_detail = None
if i < len(video_details_list):
video_detail = video_details_list[i]
if video_detail and video_detail.get('success', False):
# 使用获取到的真实数据
likes = video_detail.get('likes', 0)
shares = video_detail.get('shares', 0)
favorites = video_detail.get('favorites', 0)
episode_info = {
'episode_number': episode_number,
'video_id': video_id,
'likes': likes,
'shares': shares,
'favorites': favorites,
'likes_formatted': self.format_interaction_count(likes),
'shares_formatted': self.format_interaction_count(shares),
'favorites_formatted': self.format_interaction_count(favorites),
'comments': video_detail.get('comments', [])
}
else:
# 使用默认值
episode_info = {
'episode_number': episode_number,
'video_id': video_id,
'likes': 0,
'shares': 0,
'favorites': 0,
'likes_formatted': '0',
'shares_formatted': '0',
'favorites_formatted': '0',
'comments': []
}
episode_details.append(episode_info)
# 统计获取到的数据
total_likes = sum(ep.get('likes', 0) for ep in episode_details)
total_comments = sum(len(ep.get('comments', [])) for ep in episode_details)
logging.info(f'合集 {mix_name} 详细数据统计: 总点赞数={total_likes:,}, 总评论数={total_comments}')
else:
# 如果没有获取到视频ID使用默认的episode_details
episode_details = [
{
'episode_number': i + 1,
'video_id': '',
'likes': 0,
'shares': 0,
'favorites': 0,
'likes_formatted': '0',
'shares_formatted': '0',
'favorites_formatted': '0',
'comments': []
} for i in range(item.get('updated_to_episode', 0))
]
# 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + 新增字段
doc = {
'batch_time': batch_time,
@@ -1025,7 +1123,8 @@
'series_author': item.get('series_author', ''),  # 合集作者/影视工作室
'desc': item.get('desc', ''),  # 合集描述
'updated_to_episode': item.get('updated_to_episode', 0),  # 合集总集数
- 'episode_video_ids': episode_video_ids  # 每一集的视频ID列表
'episode_video_ids': episode_video_ids,  # 每一集的视频ID列表
'episode_details': episode_details  # 每集的详细信息
}
documents.append(doc)
@@ -1095,6 +1194,7 @@
if (
'Network.responseReceived' in log['method']
and 'response' in log['params']
and log['params']['response']
and 'url' in log['params']['response']
and '/web/api/v2/aweme/iteminfo' in log['params']['response']['url']
):
@@ -1130,6 +1230,11 @@
Returns:
list: 按集数排序的视频ID列表
"""
# 定时器模式下跳过此函数
if os.environ.get('TIMER_MODE') == '1':
logging.info(f'定时器模式:跳过 get_collection_videos 函数')
return []
try:
# 检查缓存文件
cache_dir = os.path.join(os.path.dirname(__file__), 'episode_video_ids')
@@ -1273,6 +1378,374 @@
return [video['video_id'] for video in cached_videos]
return []
def get_video_details(self, video_id: str, max_comments: int = 20) -> dict:
"""获取单个视频的详细互动数据
Args:
video_id: 视频ID
max_comments: 最大评论数量默认20条
Returns:
dict: 包含点赞数收藏数转发数评论内容的字典
"""
video_details = {
'video_id': video_id,
'likes': 0,
'shares': 0,
'favorites': 0,
'likes_formatted': '0',
'shares_formatted': '0',
'favorites_formatted': '0',
'comments': [],
'success': False,
'error': None
}
try:
# 确保driver已初始化
if self.driver is None:
logging.info('Driver未初始化正在设置...')
self.setup_driver()
if self.driver is None:
raise Exception("无法初始化WebDriver")
video_url = f'https://www.douyin.com/video/{video_id}'
logging.info(f'获取视频详细数据: {video_url}')
# 导航到视频页面
self.driver.get(video_url)
time.sleep(3)
# 等待页面加载完成
try:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
WebDriverWait(self.driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "video"))
)
except Exception as e:
logging.warning(f'等待视频元素超时: {e}')
# 获取网络请求日志
logs = self.driver.get_log('performance')
# 解析网络日志获取视频详细数据
for entry in logs:
try:
log = json.loads(entry['message'])['message']
if (
'Network.responseReceived' in log['method']
and 'response' in log['params']
and log['params']['response']
and log['params']['response'].get('url')
):
url = log['params']['response']['url']
# 检查是否是视频详情API
if '/aweme/v1/web/aweme/detail/' in url and video_id in url:
try:
# 获取响应体
response_body = self.driver.execute_cdp_cmd(
'Network.getResponseBody',
{'requestId': log['params']['requestId']}
)
if response_body and 'body' in response_body:
data = json.loads(response_body['body'])
aweme_detail = data.get('aweme_detail', {})
if aweme_detail:
# 获取统计数据
statistics = aweme_detail.get('statistics', {})
video_details['likes'] = int(statistics.get('digg_count', 0))
video_details['shares'] = int(statistics.get('share_count', 0))
video_details['favorites'] = int(statistics.get('collect_count', 0))
# 添加格式化字段
video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])
logging.info(f'视频 {video_id} 互动数据: 点赞={video_details["likes_formatted"]}, 分享={video_details["shares_formatted"]}, 收藏={video_details["favorites_formatted"]}')
except Exception as e:
logging.warning(f'解析视频详情API响应失败: {e}')
continue
# 检查是否是评论API
elif '/aweme/v1/web/comment/list/' in url and video_id in url:
try:
# 获取响应体
response_body = self.driver.execute_cdp_cmd(
'Network.getResponseBody',
{'requestId': log['params']['requestId']}
)
if response_body and 'body' in response_body:
data = json.loads(response_body['body'])
comments = data.get('comments', [])
for comment in comments[:max_comments]:
comment_info = {
'text': comment.get('text', ''),
'user_name': comment.get('user', {}).get('nickname', ''),
'digg_count': int(comment.get('digg_count', 0)),
'create_time': comment.get('create_time', 0)
}
video_details['comments'].append(comment_info)
logging.info(f'视频 {video_id} 获取到 {len(video_details["comments"])} 条评论')
except Exception as e:
logging.warning(f'解析评论API响应失败: {e}')
continue
except Exception:
# 忽略无法解析的日志条目
continue
# 如果网络日志没有获取到数据,尝试页面解析
if video_details['likes'] == 0 and video_details['shares'] == 0 and video_details['favorites'] == 0:
video_details = self._parse_video_details_from_page(video_id, video_details, max_comments)
video_details['success'] = True
return video_details
except Exception as e:
error_msg = f'获取视频 {video_id} 详细数据失败: {e}'
logging.error(error_msg)
video_details['error'] = error_msg
return video_details
def _parse_video_details_from_page(self, video_id: str, video_details: dict, max_comments: int = 20) -> dict:
"""从页面元素解析视频详细数据(备用方案)
Args:
video_id: 视频ID
video_details: 现有的视频详细数据字典
max_comments: 最大评论数量
Returns:
dict: 更新后的视频详细数据字典
"""
try:
logging.info(f'尝试从页面元素解析视频 {video_id} 的详细数据')
# 尝试解析页面中的SSR数据
try:
# 查找包含视频数据的script标签
scripts = self.driver.find_elements("tag name", "script")
for script in scripts:
script_content = script.get_attribute('innerHTML')
if script_content and ('window._SSR_HYDRATED_DATA' in script_content or 'RENDER_DATA' in script_content):
# 提取JSON数据
if 'window._SSR_HYDRATED_DATA' in script_content:
match = re.search(r'window\._SSR_HYDRATED_DATA\s*=\s*({.*?});', script_content, re.DOTALL)
else:
match = re.search(r'window\.RENDER_DATA\s*=\s*({.*?});', script_content, re.DOTALL)
if match:
data = json.loads(match.group(1))
# 查找视频详情数据
def find_video_data(obj, target_id):
if isinstance(obj, dict):
for key, value in obj.items():
if key == 'aweme_id' and str(value) == str(target_id):
return obj
elif isinstance(value, (dict, list)):
result = find_video_data(value, target_id)
if result:
return result
elif isinstance(obj, list):
for item in obj:
result = find_video_data(item, target_id)
if result:
return result
return None
video_data = find_video_data(data, video_id)
if video_data:
statistics = video_data.get('statistics', {})
video_details['likes'] = int(statistics.get('digg_count', 0))
video_details['shares'] = int(statistics.get('share_count', 0))
video_details['favorites'] = int(statistics.get('collect_count', 0))
# 添加格式化字段
video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])
logging.info(f'从SSR数据解析到视频 {video_id} 互动数据: 点赞={video_details["likes_formatted"]}, 分享={video_details["shares_formatted"]}, 收藏={video_details["favorites_formatted"]}')
break
except Exception as e:
logging.warning(f'解析SSR数据失败: {e}')
# 如果SSR数据解析失败尝试CSS选择器
if video_details['likes'] == 0 and video_details['shares'] == 0 and video_details['favorites'] == 0:
try:
# 尝试常见的点赞、分享、收藏按钮选择器
selectors = {
'likes': [
'[data-e2e="video-like-count"]',
'[class*="like"] [class*="count"]',
'[class*="digg"] [class*="count"]'
],
'shares': [
'[data-e2e="video-share-count"]',
'[class*="share"] [class*="count"]'
],
'favorites': [
'[data-e2e="video-collect-count"]',
'[class*="collect"] [class*="count"]',
'[class*="favorite"] [class*="count"]'
]
}
for data_type, selector_list in selectors.items():
for selector in selector_list:
try:
elements = self.driver.find_elements("css selector", selector)
if elements:
text = elements[0].text.strip()
if text and text.replace('.', '').replace('万', '').replace('亿', '').isdigit():
# 转换数字格式
if '亿' in text:
video_details[data_type] = int(float(text.replace('亿', '')) * 100000000)
elif '万' in text:
video_details[data_type] = int(float(text.replace('万', '')) * 10000)
else:
video_details[data_type] = int(text)
break
except Exception:
continue
if video_details['likes'] > 0 or video_details['shares'] > 0 or video_details['favorites'] > 0:
# 添加格式化字段
video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])
logging.info(f'从页面元素解析到视频 {video_id} 互动数据: 点赞={video_details["likes_formatted"]}, 分享={video_details["shares_formatted"]}, 收藏={video_details["favorites_formatted"]}')
except Exception as e:
logging.warning(f'CSS选择器解析失败: {e}')
# 尝试获取评论(如果还没有获取到)
if not video_details['comments']:
try:
# 滚动到评论区域
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
# 尝试常见的评论选择器
comment_selectors = [
'[data-e2e="comment-item"]',
'[class*="comment-item"]',
'[class*="comment"] [class*="content"]'
]
for selector in comment_selectors:
try:
comment_elements = self.driver.find_elements("css selector", selector)[:max_comments]
if comment_elements:
for element in comment_elements:
try:
comment_text = element.text.strip()
if comment_text:
comment_info = {
'text': comment_text,
'user_name': '',
'digg_count': 0,
'create_time': 0
}
video_details['comments'].append(comment_info)
except Exception:
continue
if video_details['comments']:
logging.info(f'从页面元素获取到视频 {video_id}{len(video_details["comments"])} 条评论')
break
except Exception:
continue
except Exception as e:
logging.warning(f'获取评论失败: {e}')
except Exception as e:
logging.warning(f'页面解析视频详细数据失败: {e}')
return video_details
def get_collection_video_details(self, episode_video_ids: list, mix_name: str = '', max_comments_per_video: int = 10) -> list:
"""获取合集中所有视频的详细互动数据
Args:
episode_video_ids: 视频ID列表
mix_name: 合集名称用于日志
max_comments_per_video: 每个视频最大评论数量默认10条
Returns:
list: 包含每个视频详细数据的列表
"""
# 定时器模式下跳过此函数
if os.environ.get('TIMER_MODE') == '1':
logging.info(f'定时器模式:跳过 get_collection_video_details 函数')
return []
if not episode_video_ids:
logging.info(f'合集 {mix_name} 没有视频ID跳过详细数据获取')
return []
logging.info(f'开始获取合集 {mix_name}{len(episode_video_ids)} 个视频的详细数据')
video_details_list = []
for i, video_id in enumerate(episode_video_ids, 1):
if not video_id:
logging.warning(f'合集 {mix_name}{i} 集视频ID为空跳过')
video_details_list.append({
'episode_number': i,
'video_id': '',
'likes': 0,
'shares': 0,
'favorites': 0,
'comments': [],
'success': False,
'error': '视频ID为空'
})
continue
logging.info(f'获取合集 {mix_name}{i}/{len(episode_video_ids)} 集视频详细数据: {video_id}')
try:
# 获取单个视频的详细数据
video_details = self.get_video_details(video_id, max_comments_per_video)
video_details['episode_number'] = i
video_details_list.append(video_details)
# 添加延迟避免请求过快
time.sleep(2)
except Exception as e:
error_msg = f'获取视频 {video_id} 详细数据时出错: {e}'
logging.error(error_msg)
video_details_list.append({
'episode_number': i,
'video_id': video_id,
'likes': 0,
'shares': 0,
'favorites': 0,
'comments': [],
'success': False,
'error': error_msg
})
# 统计获取结果
success_count = sum(1 for detail in video_details_list if detail.get('success', False))
total_likes = sum(detail.get('likes', 0) for detail in video_details_list)
total_comments = sum(len(detail.get('comments', [])) for detail in video_details_list)
logging.info(f'合集 {mix_name} 视频详细数据获取完成: {success_count}/{len(episode_video_ids)} 成功, 总点赞数={total_likes:,}, 总评论数={total_comments}')
return video_details_list
def get_cookies_dict(self):
"""获取当前页面的cookies"""
if not hasattr(self, 'cookies') or not self.cookies:
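One caveat on the new get_video_details path: driver.get_log('performance') and Network.getResponseBody only return data when Chrome's performance log is enabled at driver creation. If setup_driver does not already do this, the standard Selenium 4 setup looks like the following sketch:

from selenium import webdriver

options = webdriver.ChromeOptions()
# Expose CDP network events so get_log('performance') returns entries
options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
driver = webdriver.Chrome(options=options)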


@@ -135,7 +135,8 @@ def format_mix_item(doc):
"updated_to_episode": doc.get("updated_to_episode", 0),
"cover_backup_urls": doc.get("cover_backup_urls", []),
"mix_id": doc.get("mix_id", ""),
- "episode_video_ids": doc.get("episode_video_ids", [])
"episode_video_ids": doc.get("episode_video_ids", []),
"episode_details": doc.get("episode_details", [])
}
def get_mix_list(page=1, limit=20, sort_by="playcount"):
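With this change each mix item returned by the API can carry per-episode interaction data. For illustration, a representative episode_details entry using the fields populated by the scraper above (all values made up):

episode_detail_example = {
    'episode_number': 1,
    'video_id': '7300000000000000000',  # made-up ID
    'likes': 27898,
    'shares': 312,
    'favorites': 1045,
    'likes_formatted': '2.8W',
    'shares_formatted': '312',
    'favorites_formatted': '1045',
    'comments': [
        {'text': '好看!', 'user_name': '某用户', 'digg_count': 12, 'create_time': 1729650000},
    ],
}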