主代码可以实时更新
定时器由于要进行播放量插值计算,所以要有固定的时间戳,还是统一保存。
This commit is contained in:
parent
e8baaa4ce9
commit
9295e77cf1
@ -18,6 +18,19 @@ LOG_DIR = 'logs'
|
|||||||
# 定时器配置
|
# 定时器配置
|
||||||
SCHEDULER_TIME = "24:00" # 定时器执行时间,格式为 HH:MM (24小时制)
|
SCHEDULER_TIME = "24:00" # 定时器执行时间,格式为 HH:MM (24小时制)
|
||||||
|
|
||||||
|
# 定时器环境变量配置
|
||||||
|
TIMER_ENV_CONFIG = {
|
||||||
|
'TIMER_MODE': '1', # 启用定时器模式,使数据保存到 Ranking_storage_list 集合
|
||||||
|
'AUTO_CONTINUE': '1' # 启用自动模式,跳过详细数据获取以提高性能
|
||||||
|
}
|
||||||
|
|
||||||
|
# 自动模式跳过函数配置
|
||||||
|
AUTO_CONTINUE_SKIP_FUNCTIONS = [
|
||||||
|
'get_collection_video_details', # 跳过合集视频详细数据获取
|
||||||
|
'scroll_comments', # 跳过评论滚动
|
||||||
|
# 可以在这里添加更多需要跳过的函数名
|
||||||
|
]
|
||||||
|
|
||||||
# TOS/火山云对象存储配置
|
# TOS/火山云对象存储配置
|
||||||
TOS_CONFIG = {
|
TOS_CONFIG = {
|
||||||
'access_key_id': os.getenv('TOS_ACCESS_KEY_ID', 'AKLTYjQyYmE1ZDAwZTY5NGZiOWI3ODZkZDhhOWE4MzVjODE'),
|
'access_key_id': os.getenv('TOS_ACCESS_KEY_ID', 'AKLTYjQyYmE1ZDAwZTY5NGZiOWI3ODZkZDhhOWE4MzVjODE'),
|
||||||
@ -39,4 +52,13 @@ API_CONFIG = {
|
|||||||
'OSS_HOST': TOS_CONFIG['self_domain']
|
'OSS_HOST': TOS_CONFIG['self_domain']
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def apply_timer_environment():
    """Copy every entry of TIMER_ENV_CONFIG into this process's environment."""
    # os.environ.update assigns each key/value pair in turn, exactly like an
    # explicit loop over TIMER_ENV_CONFIG.items().
    os.environ.update(TIMER_ENV_CONFIG)
|
||||||
|
|
||||||
|
def get_skip_functions():
    """Return the names of functions to skip in auto-continue mode.

    A new list is returned on every call, so callers may mutate the result
    without affecting AUTO_CONTINUE_SKIP_FUNCTIONS.
    """
    skipped_names = list(AUTO_CONTINUE_SKIP_FUNCTIONS)
    return skipped_names
|
||||||
|
|
||||||
print(f"Successfully loaded configuration for environment: {APP_ENV}")
|
print(f"Successfully loaded configuration for environment: {APP_ENV}")
|
||||||
@ -713,7 +713,7 @@ class DouyinPlayVVScraper:
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
pass # 忽略无法转换为整数的情况
|
pass # 忽略无法转换为整数的情况
|
||||||
|
|
||||||
self.play_vv_items.append({
|
item_data = {
|
||||||
'play_vv': vv,
|
'play_vv': vv,
|
||||||
'formatted': self.format_count(vv),
|
'formatted': self.format_count(vv),
|
||||||
'url': source_url,
|
'url': source_url,
|
||||||
@ -727,7 +727,9 @@ class DouyinPlayVVScraper:
|
|||||||
'desc': desc, # 合集描述
|
'desc': desc, # 合集描述
|
||||||
'updated_to_episode': updated_to_episode, # 合集总集数
|
'updated_to_episode': updated_to_episode, # 合集总集数
|
||||||
'timestamp': datetime.now().isoformat()
|
'timestamp': datetime.now().isoformat()
|
||||||
})
|
}
|
||||||
|
|
||||||
|
self.play_vv_items.append(item_data)
|
||||||
logging.info(f'提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
logging.info(f'提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
||||||
if series_author:
|
if series_author:
|
||||||
logging.info(f' 作者: {series_author}')
|
logging.info(f' 作者: {series_author}')
|
||||||
@ -736,6 +738,14 @@ class DouyinPlayVVScraper:
|
|||||||
if updated_to_episode > 0:
|
if updated_to_episode > 0:
|
||||||
logging.info(f' 总集数: {updated_to_episode}')
|
logging.info(f' 总集数: {updated_to_episode}')
|
||||||
|
|
||||||
|
# 只在非定时器模式下使用实时保存
|
||||||
|
is_timer_mode = os.environ.get('TIMER_MODE') == '1'
|
||||||
|
if not is_timer_mode:
|
||||||
|
logging.info(f'立即保存合集数据: {mix_name}')
|
||||||
|
self.save_single_item_to_mongodb(item_data)
|
||||||
|
else:
|
||||||
|
logging.info(f'定时器模式:暂存合集数据: {mix_name},将在最后批量保存')
|
||||||
|
|
||||||
# 递归搜索子对象
|
# 递归搜索子对象
|
||||||
for key, value in obj.items():
|
for key, value in obj.items():
|
||||||
if isinstance(value, (dict, list)):
|
if isinstance(value, (dict, list)):
|
||||||
@ -766,7 +776,7 @@ class DouyinPlayVVScraper:
|
|||||||
if episodes > 0:
|
if episodes > 0:
|
||||||
logging.info(f"从statis.updated_to_episode提取到集数: {episodes}")
|
logging.info(f"从statis.updated_to_episode提取到集数: {episodes}")
|
||||||
|
|
||||||
self.play_vv_items.append({
|
item_data = {
|
||||||
'play_vv': vv,
|
'play_vv': vv,
|
||||||
'formatted': self.format_count(vv),
|
'formatted': self.format_count(vv),
|
||||||
'url': source_url,
|
'url': source_url,
|
||||||
@ -776,8 +786,18 @@ class DouyinPlayVVScraper:
|
|||||||
'mix_id': mix_id, # 合集ID
|
'mix_id': mix_id, # 合集ID
|
||||||
'updated_to_episode': episodes if episodes > 0 else None, # 从statis.updated_to_episode提取的集数
|
'updated_to_episode': episodes if episodes > 0 else None, # 从statis.updated_to_episode提取的集数
|
||||||
'timestamp': datetime.now().isoformat()
|
'timestamp': datetime.now().isoformat()
|
||||||
})
|
}
|
||||||
|
|
||||||
|
self.play_vv_items.append(item_data)
|
||||||
logging.info(f'正则提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
logging.info(f'正则提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
||||||
|
|
||||||
|
# 只在非定时器模式下使用实时保存
|
||||||
|
is_timer_mode = os.environ.get('TIMER_MODE') == '1'
|
||||||
|
if not is_timer_mode:
|
||||||
|
logging.info(f'立即保存正则提取的合集数据: {mix_name}')
|
||||||
|
self.save_single_item_to_mongodb(item_data)
|
||||||
|
else:
|
||||||
|
logging.info(f'定时器模式:暂存正则提取的合集数据: {mix_name},将在最后批量保存')
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -787,7 +807,7 @@ class DouyinPlayVVScraper:
|
|||||||
vv = int(match)
|
vv = int(match)
|
||||||
# 检查是否已经存在相同的play_vv
|
# 检查是否已经存在相同的play_vv
|
||||||
if not any(item['play_vv'] == vv for item in self.play_vv_items):
|
if not any(item['play_vv'] == vv for item in self.play_vv_items):
|
||||||
self.play_vv_items.append({
|
item_data = {
|
||||||
'play_vv': vv,
|
'play_vv': vv,
|
||||||
'formatted': self.format_count(vv),
|
'formatted': self.format_count(vv),
|
||||||
'url': source_url,
|
'url': source_url,
|
||||||
@ -797,7 +817,18 @@ class DouyinPlayVVScraper:
|
|||||||
'mix_id': '', # 未知mix_id
|
'mix_id': '', # 未知mix_id
|
||||||
'updated_to_episode': None, # 未知集数
|
'updated_to_episode': None, # 未知集数
|
||||||
'timestamp': datetime.now().isoformat()
|
'timestamp': datetime.now().isoformat()
|
||||||
})
|
}
|
||||||
|
|
||||||
|
self.play_vv_items.append(item_data)
|
||||||
|
logging.info(f'兜底提取到播放量: {vv:,}')
|
||||||
|
|
||||||
|
# 只在非定时器模式下使用实时保存
|
||||||
|
is_timer_mode = os.environ.get('TIMER_MODE') == '1'
|
||||||
|
if not is_timer_mode:
|
||||||
|
logging.info(f'立即保存兜底提取的数据: {vv:,} 播放量')
|
||||||
|
self.save_single_item_to_mongodb(item_data)
|
||||||
|
else:
|
||||||
|
logging.info(f'定时器模式:暂存兜底提取的数据: {vv:,} 播放量,将在最后批量保存')
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -1065,7 +1096,11 @@ class DouyinPlayVVScraper:
|
|||||||
return cover_url # 上传失败时返回原链接
|
return cover_url # 上传失败时返回原链接
|
||||||
|
|
||||||
def save_to_mongodb(self):
|
def save_to_mongodb(self):
|
||||||
"""将数据保存到MongoDB"""
|
"""
|
||||||
|
将数据批量保存到MongoDB
|
||||||
|
注意:此方法现在作为备用保留,正常流程使用实时保存功能(save_single_item_to_mongodb)
|
||||||
|
避免重复保存数据
|
||||||
|
"""
|
||||||
if self.collection is None:
|
if self.collection is None:
|
||||||
logging.warning('MongoDB未连接,跳过数据库保存')
|
logging.warning('MongoDB未连接,跳过数据库保存')
|
||||||
return
|
return
|
||||||
@ -1253,6 +1288,119 @@ class DouyinPlayVVScraper:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f'保存到MongoDB时出错: {e}')
|
logging.error(f'保存到MongoDB时出错: {e}')
|
||||||
|
|
||||||
|
def save_single_item_to_mongodb(self, item: dict):
    """Insert one scraped collection record into MongoDB immediately.

    Steps, in order:
      1. Return early (with a warning) when ``self.collection`` is None.
      2. Upload the cover image through ``self.upload_cover_image``.
      3. When the item has a ``mix_id``, fetch its video ids through
         ``self.get_collection_videos`` and build one placeholder entry per
         episode (all interaction counters are zero; no per-video details
         are fetched here).
      4. Compute the rank as 1 + the number of documents stored since
         midnight today whose ``play_vv`` is greater than this item's.
      5. Insert a new document (existing documents are never updated in
         place), then increment ``rank`` on today's documents with a lower
         ``play_vv``.

    Any exception raised along the way is logged and swallowed, so a failed
    save never interrupts the caller.

    Args:
        item: Dict describing one collection. Keys read: ``cover_image_url``,
            ``mix_name``, ``mix_id``, ``updated_to_episode``, ``play_vv``,
            ``video_url``, ``formatted``, ``request_id``,
            ``cover_backup_urls``, ``series_author``, ``desc``.
            ``updated_to_episode`` may be missing or None.
    """
    if self.collection is None:
        logging.warning('MongoDB未连接,跳过单条数据保存')
        return

    try:
        batch_time = datetime.now()
        # Lower bound used by both rank queries below: start of the current day.
        day_start = batch_time.replace(hour=0, minute=0, second=0, microsecond=0)

        original_cover_url = item.get('cover_image_url', '')
        mix_name = item.get('mix_name', '')
        mix_id = item.get('mix_id', '')
        play_vv = item.get('play_vv', 0)

        # Some extraction paths store updated_to_episode=None explicitly, in
        # which case dict.get's default is NOT applied. Normalise to 0 so
        # range() below cannot raise TypeError and silently drop the record.
        episode_count = item.get('updated_to_episode') or 0

        # Cover handling: an item without a cover is not treated as a failure.
        permanent_cover_url = ''
        upload_success = True
        if original_cover_url:
            permanent_cover_url = self.upload_cover_image(original_cover_url, mix_name)
            # Getting the original URL back is treated as "upload failed".
            upload_success = permanent_cover_url != original_cover_url
            if upload_success:
                logging.info(f'封面图片上传成功: {mix_name}')
            else:
                logging.warning(f'封面图片上传失败,使用原始链接: {mix_name}')

        episode_video_ids = []
        episode_details = []

        if mix_id:
            logging.info(f'获取合集 {mix_name} 的视频ID')
            episode_video_ids = self.get_collection_videos(
                mix_id=mix_id,
                mix_name=mix_name,
                current_episode_count=episode_count,
            )

            # NOTE(review): the original indentation was not recoverable, so
            # it is assumed the placeholder episodes are only built when a
            # mix_id is present - confirm against the repository.
            known_ids = len(episode_video_ids)
            episode_details = [
                {
                    'episode_number': index + 1,
                    # Episodes beyond the fetched id list get an empty id.
                    'video_id': episode_video_ids[index] if index < known_ids else '',
                    'likes': 0,  # detailed interaction data is not fetched here
                    'shares': 0,
                    'favorites': 0,
                    'likes_formatted': '0',
                    'shares_formatted': '0',
                    'favorites_formatted': '0',
                    'comments': [],
                }
                for index in range(episode_count)
            ]

        # Rank within today's data: one more than the number of documents
        # with a strictly higher play count.
        higher_count = self.collection.count_documents({
            'play_vv': {'$gt': play_vv},
            'batch_time': {'$gte': day_start},
        })
        current_rank = higher_count + 1

        # Always a brand-new document, so history is preserved.
        # NOTE(review): the item builders visible in this diff set 'url';
        # confirm that 'video_url' is populated elsewhere.
        doc = {
            'batch_time': batch_time,
            'mix_name': mix_name,
            'video_url': item.get('video_url', ''),
            'playcount': item.get('formatted', ''),
            'play_vv': play_vv,
            'request_id': item.get('request_id', ''),
            'rank': current_rank,
            'cover_image_url_original': original_cover_url,
            'cover_image_url': permanent_cover_url,
            'cover_upload_success': upload_success,
            'cover_backup_urls': item.get('cover_backup_urls', []),
            'series_author': item.get('series_author', ''),
            'desc': item.get('desc', ''),
            'updated_to_episode': item.get('updated_to_episode', 0),
            'episode_video_ids': episode_video_ids,
            'episode_details': episode_details,
            'created_at': datetime.now(),
        }

        result = self.collection.insert_one(doc)
        logging.info(f'边抓取边保存新记录: {mix_name} - {play_vv:,} 播放量 (排名: {current_rank})')

        # Push down today's documents that now rank below the new one.
        self.collection.update_many(
            {
                'play_vv': {'$lt': play_vv},
                'batch_time': {'$gte': day_start},
                '_id': {'$ne': result.inserted_id},
            },
            {'$inc': {'rank': 1}},
        )

    except Exception as e:
        # Best-effort save: report the failure but never interrupt scraping.
        logging.error(f'实时保存单条数据到MongoDB时出错: {e}')
|
||||||
|
|
||||||
def get_video_info(self, video_id: str) -> dict:
|
def get_video_info(self, video_id: str) -> dict:
|
||||||
"""获取视频详细信息
|
"""获取视频详细信息
|
||||||
Args:
|
Args:
|
||||||
@ -2569,8 +2717,17 @@ class DouyinPlayVVScraper:
|
|||||||
self.collect_network_bodies()
|
self.collect_network_bodies()
|
||||||
self.parse_ssr_data()
|
self.parse_ssr_data()
|
||||||
self.dedupe()
|
self.dedupe()
|
||||||
self.save_results()
|
|
||||||
logging.info('完成,play_vv数量: %d', len(self.play_vv_items))
|
# 根据模式选择保存方式
|
||||||
|
is_timer_mode = os.environ.get('TIMER_MODE') == '1'
|
||||||
|
if is_timer_mode:
|
||||||
|
# 定时器模式:使用批量保存,所有数据使用相同的batch_time
|
||||||
|
self.save_results()
|
||||||
|
logging.info('定时器模式:完成批量保存,play_vv数量: %d', len(self.play_vv_items))
|
||||||
|
else:
|
||||||
|
# 普通模式:数据已通过实时保存功能保存
|
||||||
|
logging.info('普通模式:完成,play_vv数量: %d', len(self.play_vv_items))
|
||||||
|
logging.info('所有数据已通过实时保存功能保存到数据库')
|
||||||
finally:
|
finally:
|
||||||
if self.driver:
|
if self.driver:
|
||||||
try:
|
try:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user