Optimize timer functionality
parent 97c5fbe4df
commit e8baaa4ce9
@@ -32,6 +32,12 @@ from handlers.Rankings.rank_data_scraper import DouyinPlayVVScraper

# Function that configures logging
def setup_timer_environment():
    """Set the timer-related environment variables."""
    config.apply_timer_environment()
    for key, value in config.TIMER_ENV_CONFIG.items():
        logging.info(f"Setting environment variable: {key}={value}")


def setup_logging(quiet_mode=False):
    """Configure logging."""
    # Ensure the logs directory exists
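For context, a minimal sketch of the config side this hunk relies on. The names config.apply_timer_environment() and config.TIMER_ENV_CONFIG do appear in the diff, but their contents do not, so the keys below (TIMER_MODE, AUTO_CONTINUE) are assumptions inferred from the flags checked later in this commit.

# Hypothetical sketch only; the real TIMER_ENV_CONFIG values are not visible in this diff.
import os

TIMER_ENV_CONFIG = {
    'TIMER_MODE': '1',      # assumed flag: mark the run as scheduler-driven
    'AUTO_CONTINUE': '1',   # assumed flag: skip interactive/detail steps
}

def apply_timer_environment():
    """Write every configured key into os.environ."""
    for key, value in TIMER_ENV_CONFIG.items():
        os.environ[key] = value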
@@ -101,8 +107,8 @@ class DouyinAutoScheduler:
        try:
            logging.warning("🚀 Starting the Douyin play-count scraping task...")

            # Set environment variables to ensure auto mode
            os.environ['AUTO_CONTINUE'] = '1'
            # Set environment variables to ensure timer mode and auto mode
            setup_timer_environment()

            # Create and run a DouyinPlayVVScraper instance directly
            scraper = DouyinPlayVVScraper(
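Downstream, this commit gates the expensive per-video steps on the same flag. The check that recurs in the scraper hunks below can be read as a single predicate, sketched here for clarity; the helper itself is not part of the repository.

import os

def auto_continue_enabled(instance_flag=False):
    """True when the scraper should skip comment scrolling and detail collection."""
    return os.environ.get('AUTO_CONTINUE') == '1' or instance_flag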
@@ -11,9 +11,13 @@
            {
                "video_id": "7539690162612079872",
                "episode_num": 0
            },
            {
                "video_id": "7565426543275609378",
                "episode_num": 0
            }
        ],
        "total_count": 3,
        "last_update": "2025-10-22T09:55:17.087205",
        "total_count": 4,
        "last_update": "2025-10-27T10:05:06.655628",
        "mix_name": "《小宝穿越|课本古诗文》"
    }
@@ -51,9 +51,13 @@
            {
                "video_id": "7564982296051338534",
                "episode_num": 0
            },
            {
                "video_id": "7565346285362548019",
                "episode_num": 0
            }
        ],
        "total_count": 13,
        "last_update": "2025-10-25T12:53:08.640840",
        "total_count": 14,
        "last_update": "2025-10-27T11:04:23.469116",
        "mix_name": "暗黑神话《葫芦兄弟》大电影"
    }
@@ -31,7 +31,7 @@ import psutil
import random
import threading
import argparse
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ThreadPoolExecutor  # Thread pool for asynchronous scrolling and monitoring

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
@@ -48,7 +48,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
backend_dir = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.insert(0, backend_dir)
from database import db
from tos_client import oss_client
from handlers.Rankings.tos_client import oss_client


# Configure logging
@@ -851,7 +851,6 @@ class DouyinPlayVVScraper:
                # May be base64-encoded
                if body_obj.get('base64Encoded'):
                    try:
                        import base64
                        body_text = base64.b64decode(body_text).decode('utf-8', errors='ignore')
                    except Exception:
                        pass
@@ -964,7 +963,7 @@ class DouyinPlayVVScraper:

    def upload_cover_image(self, cover_url, mix_name):
        """
        Upload the cover image to TOS and return a permanent link (with deduplication)
        Upload the cover image to TOS and return a permanent link (with deduplication and a retry mechanism)

        Args:
            cover_url: temporary cover image URL
@@ -976,57 +975,94 @@ class DouyinPlayVVScraper:
        if not cover_url:
            return cover_url

        try:
            # Extract the image ID
            image_id = self.extract_douyin_image_id(cover_url)

            # If an image ID could be extracted, check the cache
            if image_id:
                if image_id in self.image_cache:
                    cached_url = self.image_cache[image_id]
                    logging.info(f'Using cached image: {image_id} -> {cached_url} (mix: {mix_name})')
                    return cached_url

            # Generate a random filename, keeping the original extension
            file_extension = '.jpg'  # Douyin cover images are usually JPEG

            # Improved extension-detection logic
            url_without_params = cover_url.split('?')[0]
            url_path = url_without_params.split('/')[-1]  # last segment of the URL path

            # Only use it when the last segment contains a dot and the suffix is a common image extension
            if '.' in url_path:
                potential_ext = url_path.split('.')[-1].lower()
                # Check whether it is a common image extension
                if potential_ext in ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp']:
                    file_extension = '.' + potential_ext

            # Generate a unique filename
            random_filename = f"{uuid.uuid4().hex}{file_extension}"
            object_key = f"media/rank/{random_filename}"

            logging.info(f'Starting cover image upload: {mix_name}')
            logging.info(f'Cover image URL: {cover_url}')

            # Upload from the URL to TOS and get the new URL
            oss_url = oss_client.upload_from_url(
                url=cover_url,
                object_key=object_key,
                return_url=True
            )

            logging.info(f'Cover image uploaded successfully: {mix_name} -> {oss_url}')

            # If there is an image ID, cache the result
            if image_id:
                self.image_cache[image_id] = oss_url
                logging.debug(f'Image cache updated: {image_id} -> {oss_url}')

            return oss_url

        except Exception as e:
            logging.error(f'Cover image upload failed: {mix_name} - {str(e)}')
            return cover_url  # Return the original link when the upload fails
        # Extract the image ID
        image_id = self.extract_douyin_image_id(cover_url)

        # If an image ID could be extracted, check the cache
        if image_id:
            if image_id in self.image_cache:
                cached_url = self.image_cache[image_id]
                logging.info(f'Using cached image: {image_id} -> {cached_url} (mix: {mix_name})')
                return cached_url

        # Generate a random filename, keeping the original extension
        file_extension = '.jpg'  # Douyin cover images are usually JPEG

        # Improved extension-detection logic
        url_without_params = cover_url.split('?')[0]
        url_path = url_without_params.split('/')[-1]  # last segment of the URL path

        # Only use it when the last segment contains a dot and the suffix is a common image extension
        if '.' in url_path:
            potential_ext = url_path.split('.')[-1].lower()
            # Check whether it is a common image extension
            if potential_ext in ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp']:
                file_extension = '.' + potential_ext

        # Generate a unique filename
        random_filename = f"{uuid.uuid4().hex}{file_extension}"
        object_key = f"media/rank/{random_filename}"

        # Retry mechanism: at most 3 attempts
        max_retries = 3
        last_error = None

        for attempt in range(max_retries):
            try:
                logging.info(f'Starting cover image upload (attempt {attempt + 1}/{max_retries}): {mix_name}')
                logging.info(f'Cover image URL: {cover_url}')
                logging.info(f'Target object key: {object_key}')

                # Upload from the URL to TOS and get the new URL
                oss_url = oss_client.upload_from_url(
                    url=cover_url,
                    object_key=object_key,
                    return_url=True,
                    timeout=30  # 30-second timeout
                )

                # Verify the upload succeeded: check that the returned URL contains the expected domain
                if not oss_url or not isinstance(oss_url, str):
                    raise Exception(f"Upload returned an invalid URL: {oss_url}")

                # Check whether the URL format is correct
                expected_domain = oss_client.self_domain
                if expected_domain not in oss_url:
                    raise Exception(f"Upload returned a URL with the wrong domain: {oss_url}, expected to contain: {expected_domain}")

                # Check that the URL contains the correct object key
                if object_key not in oss_url:
                    raise Exception(f"Upload returned a URL without the object key: {oss_url}, expected to contain: {object_key}")

                logging.info(f'Cover image uploaded successfully: {mix_name} -> {oss_url}')

                # If there is an image ID, cache the result
                if image_id:
                    self.image_cache[image_id] = oss_url
                    logging.debug(f'Image cache updated: {image_id} -> {oss_url}')

                return oss_url

            except Exception as e:
                last_error = e
                error_msg = str(e)
                logging.warning(f'Cover image upload failed (attempt {attempt + 1}/{max_retries}): {mix_name} - {error_msg}')

                # If this is not the last attempt, wait a while and then retry
                if attempt < max_retries - 1:
                    import time
                    wait_time = (attempt + 1) * 2  # increasing wait: 2s, then 4s
                    logging.info(f'Waiting {wait_time} seconds before retrying...')
                    time.sleep(wait_time)

                    # Generate a new filename for the retry to avoid possible conflicts
                    random_filename = f"{uuid.uuid4().hex}{file_extension}"
                    object_key = f"media/rank/{random_filename}"

        # All retries failed
        logging.error(f'Cover image upload failed after all retries (tried {max_retries} times): {mix_name} - last error: {last_error}')
        logging.error(f'Falling back to the original link: {cover_url}')
        return cover_url  # Return the original link when the upload fails

    def save_to_mongodb(self):
        """Save the data to MongoDB."""
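Stripped of the TOS-specific validation, the retry shape introduced above boils down to the pattern below. This is an illustrative rewrite, not code from the repository; do_upload and fallback are hypothetical parameters.

import logging
import time

def upload_with_retry(do_upload, fallback, max_retries=3):
    """Call do_upload() up to max_retries times, waiting 2s then 4s between attempts."""
    for attempt in range(max_retries):
        try:
            return do_upload()
        except Exception as e:
            logging.warning(f'attempt {attempt + 1}/{max_retries} failed: {e}')
            if attempt < max_retries - 1:
                time.sleep((attempt + 1) * 2)
    return fallback  # mirror the diff's behaviour: fall back instead of raising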
@@ -1049,16 +1085,33 @@ class DouyinPlayVVScraper:

                # Process the cover image
                permanent_cover_url = ''
                upload_success = False

                if original_cover_url:
                    # Upload the cover image to TOS to get a permanent link
                    permanent_cover_url = self.upload_cover_image(original_cover_url, mix_name)

                    # If the upload failed and there is an original link, log a warning but keep saving
                    if permanent_cover_url == original_cover_url:
                        logging.warning(f'Cover image upload failed, using the original link: {mix_name}')
                    # Check whether the upload succeeded
                    if permanent_cover_url != original_cover_url:
                        # Upload succeeded; the URL has changed
                        upload_success = True
                        logging.info(f'Cover image uploaded successfully, permanent link obtained: {mix_name}')
                    else:
                        # Upload failed; fall back to the original link
                        upload_success = False
                        logging.warning(f'Cover image upload failed, falling back to the original link: {mix_name}')
                        logging.warning(f'Original link: {original_cover_url}')

                        # Additional fallback strategies could be added here, for example:
                        # 1. Try a backup image link
                        # 2. Use a default placeholder image
                        # 3. Record the failed link for a later retry

                        # Current strategy: keep the original link but flag the upload status in the database
                else:
                    # No cover image; use an empty string
                    permanent_cover_url = ''
                    upload_success = True  # Having no image does not count as a failure

                # Get all video IDs in the mix
                mix_id = item.get('mix_id', '')
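As an aside, fallback option 3 listed in the comments above could be as small as the sketch below; this helper does not exist in the diff and is shown only to make the comment concrete.

# Illustration only of "record the failed link for a later retry".
failed_cover_uploads = []  # could equally live in MongoDB or a retry queue

def record_failed_cover(mix_name, original_url):
    """Remember a cover that could not be uploaded so a later job can retry it."""
    failed_cover_uploads.append({'mix_name': mix_name, 'cover_url': original_url})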
@@ -1158,6 +1211,7 @@ class DouyinPlayVVScraper:
                    'rank': 0,  # Temporary value; recalculated later
                    'cover_image_url_original': original_cover_url,  # Keep the original temporary link for debugging
                    'cover_image_url': permanent_cover_url,  # Permanent link to the mix cover image
                    'cover_upload_success': upload_success,  # Whether the cover image upload succeeded
                    'cover_backup_urls': item.get('cover_backup_urls', []),  # List of backup cover image links
                    # Newly added fields
                    'series_author': item.get('series_author', ''),  # Mix author / studio
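For reference, the cover-related slice of a saved document now looks roughly like the fragment below. The URLs are placeholders and every other field from the surrounding dict is omitted.

# Placeholder values; only the cover-related fields touched by this commit are shown.
example_doc_fragment = {
    'cover_image_url_original': 'https://p3-sign.douyinpic.com/...',   # temporary CDN link (placeholder)
    'cover_image_url': 'https://<tos-domain>/media/rank/<uuid>.jpg',    # permanent TOS link (placeholder)
    'cover_upload_success': True,
    'cover_backup_urls': [],
}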
@@ -1182,15 +1236,18 @@ class DouyinPlayVVScraper:
            max_play_vv = max(doc['play_vv'] for doc in documents) if documents else 0

            logging.info(f'MongoDB save stats: total plays={total_play_vv:,}, max plays={max_play_vv:,}')
            logging.info(f'Saved fields: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url, series_author, desc, updated_to_episode')
            logging.info(f'Saved fields: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url, cover_upload_success, series_author, desc, updated_to_episode')

            # Cover image processing statistics
            cover_count = sum(1 for doc in documents if doc.get('cover_image_url'))
            original_count = sum(1 for item in self.play_vv_items if item.get('cover_image_url'))
            success_count = sum(1 for doc in documents if doc.get('cover_image_url') and doc.get('cover_image_url') != doc.get('cover_image_url_original', ''))
            upload_success_count = sum(1 for doc in documents if doc.get('cover_upload_success', False))
            upload_failed_count = sum(1 for doc in documents if doc.get('cover_image_url_original') and not doc.get('cover_upload_success', False))

            logging.info(f'Cover image stats: {cover_count}/{len(documents)} mixes have a cover link')
            logging.info(f'Cover upload stats: {success_count}/{original_count} covers uploaded to TOS')
            logging.info(f'Cover upload stats: {upload_success_count}/{original_count} covers uploaded to TOS')
            if upload_failed_count > 0:
                logging.warning(f'Cover upload failures: {upload_failed_count} covers failed to upload; original links were used')
            logging.info(f'Image cache stats: {len(self.image_cache)} image mappings currently cached')

        except Exception as e:
@@ -1432,6 +1489,11 @@ class DouyinPlayVVScraper:
        Returns:
            list: all collected comment data
        """
        # Check the AUTO_CONTINUE environment variable; skip comment scrolling when it is set to '1'
        if os.environ.get('AUTO_CONTINUE') == '1' or self.auto_continue:
            logging.info(f'🚀 AUTO_CONTINUE mode: skipping comment scroll loading for video {video_id}')
            return []

        all_comments = []
        collected_comment_ids = set()
@@ -1446,11 +1508,7 @@ class DouyinPlayVVScraper:

            # Click the comment area to trigger network requests
            self._click_comment_area()

            # Use a thread pool for asynchronous scrolling and monitoring
            from concurrent.futures import ThreadPoolExecutor
            import threading


            # Create a shared-state object for communication between tasks
            shared_state = {
                'scroll_completed': False,
@@ -2107,6 +2165,15 @@ class DouyinPlayVVScraper:
            'error': None
        }

        # Check the AUTO_CONTINUE environment variable; skip detailed data collection when it is set to '1'
        if os.environ.get('AUTO_CONTINUE') == '1' or self.auto_continue:
            logging.info(f'🚀 AUTO_CONTINUE mode: skipping detailed data collection for video {video_id} (likes, favorites, shares, comments)')
            video_details['success'] = True
            video_details['error'] = 'AUTO_CONTINUE mode: skipped detailed data collection'
            return video_details

        logging.info(f'🔍 get_video_details called: video_id={video_id}, max_comments={max_comments}')

        try:
            # Ensure the driver has been initialized
            if self.driver is None:
@@ -2424,9 +2491,9 @@ class DouyinPlayVVScraper:
        Returns:
            list: a list containing detailed data for each video
        """
        # Skip this function in timer mode
        if os.environ.get('TIMER_MODE') == '1':
            logging.info(f'Timer mode: skipping get_collection_video_details')
        # Skip this function in AUTO_CONTINUE mode
        if os.environ.get('AUTO_CONTINUE') == '1' or self.auto_continue:
            logging.info(f'🚀 AUTO_CONTINUE mode: skipping get_collection_video_details (detailed data collection for mix videos)')
            return []

        if not episode_video_ids:
Three further file diffs are suppressed because they are too large.