363 lines
16 KiB
Python
363 lines
16 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
抖音播放量自动抓取定时器 - 跨平台版本
|
||
|
||
功能:
|
||
- 每晚自动执行抖音播放量抓取任务
|
||
- 数据抓取完成后自动生类榜单
|
||
- 支持Windows、macOS、Linux
|
||
- 自动保存数据到MongoDB
|
||
|
||
使用方法:
|
||
- 正常模式:python Timer_worker.py(启动定时器)
|
||
- 测试模式:python Timer_worker.py --test(立即执行一次)
|
||
- 单次执行:python Timer_worker.py --once(立即执行一次并退出)
|
||
- 仅生成榜单:python Timer_worker.py --ranking-only(仅生成榜单,不抓取数据)
|
||
"""
|
||
|
||
import schedule
|
||
import time
|
||
import sys
|
||
import os
|
||
import logging
|
||
import argparse
|
||
from pathlib import Path
|
||
from datetime import datetime, date
|
||
import config
|
||
|
||
# 添加项目路径到 Python 路径
|
||
sys.path.append(os.path.join(os.path.dirname(__file__), 'handlers', 'Rankings'))
|
||
from rank_data_scraper import DouyinPlayVVScraper
|
||
|
||
|
||
|
||
# 配置日志的函数
|
||
def setup_logging():
|
||
"""设置日志配置"""
|
||
# 确保logs目录存在
|
||
import os
|
||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||
logs_dir = os.path.join(script_dir, 'handlers', 'Rankings', 'logs')
|
||
os.makedirs(logs_dir, exist_ok=True)
|
||
|
||
logging.basicConfig(
|
||
level=logging.INFO,
|
||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||
handlers=[
|
||
logging.FileHandler(os.path.join(logs_dir, 'scheduler.log'), encoding='utf-8'),
|
||
logging.StreamHandler()
|
||
]
|
||
)
|
||
|
||
class DouyinAutoScheduler:
|
||
def __init__(self):
|
||
self.is_running = False
|
||
|
||
def run_douyin_scraper(self):
|
||
"""执行抖音播放量抓取任务"""
|
||
try:
|
||
logging.info("🚀 开始执行抖音播放量抓取任务...")
|
||
|
||
# 设置环境变量,确保自动模式
|
||
os.environ['AUTO_CONTINUE'] = '1'
|
||
|
||
# 直接创建并运行 DouyinPlayVVScraper 实例
|
||
scraper = DouyinPlayVVScraper(
|
||
start_url="https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation",
|
||
auto_continue=True,
|
||
duration_s=60
|
||
)
|
||
|
||
logging.info("📁 开始执行抓取任务...")
|
||
scraper.run()
|
||
|
||
logging.info("✅ 抖音播放量抓取任务执行成功")
|
||
|
||
# 数据抓取完成后,自动生成当日榜单
|
||
self.generate_daily_rankings()
|
||
|
||
except Exception as e:
|
||
logging.error(f"💥 执行任务时发生异常: {e}")
|
||
import traceback
|
||
logging.error(f"详细错误信息: {traceback.format_exc()}")
|
||
|
||
def generate_daily_rankings(self):
|
||
"""生成每日榜单数据(基于时间轴对比)"""
|
||
try:
|
||
from database import db
|
||
from datetime import timedelta
|
||
|
||
# 获取集合
|
||
douyin_collection = db['Rankings_list'] # 使用真实抓取的数据
|
||
rankings_collection = db['Ranking_storage']
|
||
|
||
today = date.today()
|
||
yesterday = today - timedelta(days=1)
|
||
today_str = today.strftime('%Y-%m-%d')
|
||
yesterday_str = yesterday.strftime('%Y-%m-%d')
|
||
|
||
logging.info(f"📅 正在生成 {today_str} 的榜单(对比 {yesterday_str})...")
|
||
|
||
# 删除当天已有的榜单数据
|
||
rankings_collection.delete_many({"date": today_str})
|
||
logging.info(f"🗑️ 已清理 {today_str} 的旧榜单数据")
|
||
|
||
# 获取今天和昨天的榜单数据进行对比
|
||
try:
|
||
logging.info("🔄 正在生成时间轴对比榜单...")
|
||
|
||
# 获取今天的数据,按短剧名称去重,只保留播放量最高的
|
||
today_videos_raw = list(douyin_collection.find({}).sort("play_vv", -1))
|
||
|
||
# 按短剧名称去重,每个短剧只保留播放量最高的一条
|
||
unique_videos = {}
|
||
for video in today_videos_raw:
|
||
mix_name = video.get("mix_name", "")
|
||
if mix_name and (mix_name not in unique_videos or video.get("play_vv", 0) > unique_videos[mix_name].get("play_vv", 0)):
|
||
unique_videos[mix_name] = video
|
||
|
||
today_videos = list(unique_videos.values())
|
||
|
||
logging.info(f"📊 今日数据去重后:{len(today_videos)} 个独特短剧(原始数据:{len(today_videos_raw)} 条)")
|
||
|
||
# 获取昨天的榜单数据(如果存在),取最新的计算结果
|
||
yesterday_ranking = rankings_collection.find_one({
|
||
"date": yesterday_str,
|
||
"type": "comprehensive"
|
||
}, sort=[("calculation_sequence", -1)])
|
||
|
||
yesterday_data = {}
|
||
if yesterday_ranking and "data" in yesterday_ranking:
|
||
# 将昨天的数据转换为字典,以短剧名称为键
|
||
for item in yesterday_ranking["data"]:
|
||
title = item.get("title", "")
|
||
if title:
|
||
yesterday_data[title] = {
|
||
"rank": item.get("rank", 0),
|
||
"play_vv": item.get("play_vv", 0),
|
||
"video_id": item.get("video_id", "")
|
||
}
|
||
logging.info(f"📊 找到昨天的榜单数据,共 {len(yesterday_data)} 个短剧")
|
||
else:
|
||
logging.info("📊 未找到昨天的榜单数据,将作为首次生成")
|
||
|
||
if today_videos:
|
||
# 先计算所有视频的播放量差值
|
||
videos_with_growth = []
|
||
for video in today_videos:
|
||
video_id = str(video.get("_id", ""))
|
||
current_play_vv = video.get("play_vv", 0)
|
||
|
||
# 计算与昨天的对比数据
|
||
play_vv_change = 0
|
||
play_vv_change_rate = 0
|
||
is_new = True
|
||
|
||
mix_name = video.get("mix_name", "")
|
||
if mix_name in yesterday_data:
|
||
is_new = False
|
||
yesterday_play_vv = yesterday_data[mix_name]["play_vv"]
|
||
|
||
# 计算播放量变化
|
||
play_vv_change = current_play_vv - yesterday_play_vv
|
||
if yesterday_play_vv > 0:
|
||
play_vv_change_rate = round((play_vv_change / yesterday_play_vv) * 100, 2)
|
||
|
||
# 创建包含增长数据的视频项
|
||
video_with_growth = {
|
||
"video": video,
|
||
"play_vv_change": play_vv_change,
|
||
"play_vv_change_rate": play_vv_change_rate,
|
||
"is_new": is_new,
|
||
"yesterday_data": yesterday_data.get(mix_name, {})
|
||
}
|
||
videos_with_growth.append(video_with_growth)
|
||
|
||
# 按播放量差值降序排序(差值越大排名越靠前)
|
||
videos_with_growth.sort(key=lambda x: x["play_vv_change"], reverse=True)
|
||
|
||
comprehensive_ranking = {
|
||
"date": today_str,
|
||
"type": "comprehensive",
|
||
"name": "播放量增长榜单",
|
||
"description": f"基于 {yesterday_str} 和 {today_str} 播放量差值排序的榜单(差值越大排名越靠前)",
|
||
"comparison_date": yesterday_str,
|
||
"total_videos": len(videos_with_growth),
|
||
"data": []
|
||
}
|
||
|
||
# 生成排序后的榜单数据
|
||
for i, item in enumerate(videos_with_growth, 1):
|
||
video = item["video"]
|
||
video_id = str(video.get("_id", ""))
|
||
current_play_vv = video.get("play_vv", 0)
|
||
mix_name = video.get("mix_name", "")
|
||
|
||
# 计算排名变化(基于昨天的排名)
|
||
rank_change = 0
|
||
if not item["is_new"] and item["yesterday_data"]:
|
||
yesterday_rank = item["yesterday_data"].get("rank", 0)
|
||
rank_change = yesterday_rank - i
|
||
|
||
ranking_item = {
|
||
"rank": i,
|
||
"title": mix_name,
|
||
"play_vv": current_play_vv,
|
||
"author": video.get("author", ""),
|
||
"video_id": video_id,
|
||
"video_url": video.get("video_url", ""),
|
||
"cover_image_url": video.get("cover_image_url", ""),
|
||
"playcount_str": video.get("playcount", ""),
|
||
# 时间轴对比数据
|
||
"timeline_data": {
|
||
"is_new": item["is_new"],
|
||
"rank_change": rank_change,
|
||
"play_vv_change": item["play_vv_change"],
|
||
"play_vv_change_rate": item["play_vv_change_rate"],
|
||
"yesterday_rank": item["yesterday_data"].get("rank", 0) if not item["is_new"] else 0,
|
||
"yesterday_play_vv": item["yesterday_data"].get("play_vv", 0) if not item["is_new"] else 0
|
||
}
|
||
}
|
||
|
||
comprehensive_ranking["data"].append(ranking_item)
|
||
|
||
# 为每次计算添加唯一的时间戳,确保数据唯一性
|
||
current_timestamp = datetime.now()
|
||
comprehensive_ranking["created_at"] = current_timestamp
|
||
comprehensive_ranking["calculation_id"] = f"{today_str}_{current_timestamp.strftime('%H%M%S')}"
|
||
|
||
# 检查今天已有多少次计算
|
||
existing_count = rankings_collection.count_documents({
|
||
"date": today_str,
|
||
"type": "comprehensive"
|
||
})
|
||
|
||
comprehensive_ranking["calculation_sequence"] = existing_count + 1
|
||
|
||
# 总是插入新的榜单记录,保留所有历史计算数据
|
||
rankings_collection.insert_one(comprehensive_ranking)
|
||
logging.info(f"📝 创建了新的今日榜单数据(第{existing_count + 1}次计算,包含最新差值)")
|
||
logging.info(f"🔖 计算ID: {comprehensive_ranking['calculation_id']}")
|
||
|
||
# 统计信息
|
||
new_count = sum(1 for item in comprehensive_ranking["data"] if item["timeline_data"]["is_new"])
|
||
logging.info(f"✅ 时间轴对比榜单生成成功")
|
||
logging.info(f"📊 总计 {len(comprehensive_ranking['data'])} 条记录")
|
||
logging.info(f"🆕 新上榜 {new_count} 条")
|
||
logging.info(f"🔄 对比基准日期: {yesterday_str}")
|
||
|
||
return True
|
||
else:
|
||
logging.warning("⚠️ 榜单生成失败:无今日数据")
|
||
return False
|
||
|
||
except Exception as e:
|
||
logging.error(f"💥 生成时间轴对比榜单时发生异常: {e}")
|
||
import traceback
|
||
logging.error(f"详细错误信息: {traceback.format_exc()}")
|
||
return False
|
||
|
||
except Exception as e:
|
||
logging.error(f"💥 生成榜单时发生异常: {e}")
|
||
import traceback
|
||
logging.error(f"详细错误信息: {traceback.format_exc()}")
|
||
|
||
|
||
|
||
def setup_schedule(self):
|
||
"""设置定时任务"""
|
||
# 从配置文件读取执行时间
|
||
scheduler_time = config.SCHEDULER_TIME
|
||
schedule.every().day.at(scheduler_time).do(self.run_douyin_scraper)
|
||
|
||
logging.info(f"⏰ 定时器已设置:每晚{scheduler_time}执行抖音播放量抓取")
|
||
|
||
def show_next_run(self):
|
||
"""显示下次执行时间"""
|
||
jobs = schedule.get_jobs()
|
||
if jobs:
|
||
next_run = jobs[0].next_run
|
||
logging.info(f"⏰ 下次执行时间: {next_run}")
|
||
|
||
def run_once(self):
|
||
"""立即执行一次"""
|
||
logging.info("🔧 立即执行模式...")
|
||
self.run_douyin_scraper()
|
||
|
||
def run_test(self):
|
||
"""测试模式 - 立即执行一次"""
|
||
logging.info("🧪 测试模式 - 立即执行抖音播放量抓取任务...")
|
||
self.run_douyin_scraper()
|
||
|
||
def run_ranking_only(self):
|
||
"""仅生成榜单(不抓取数据)"""
|
||
logging.info("📊 仅生成榜单模式...")
|
||
self.generate_daily_rankings()
|
||
|
||
def start_scheduler(self):
|
||
"""启动定时器"""
|
||
self.is_running = True
|
||
logging.info("🚀 抖音播放量自动抓取定时器已启动")
|
||
logging.info(f"⏰ 执行时间:每天{config.SCHEDULER_TIME}执行抖音播放量抓取")
|
||
logging.info("📁 目标脚本:rank_data_scraper.py")
|
||
logging.info("💾 数据保存:MongoDB")
|
||
logging.info("⏹️ 按 Ctrl+C 停止定时器")
|
||
|
||
try:
|
||
while self.is_running:
|
||
schedule.run_pending()
|
||
time.sleep(1)
|
||
|
||
# 每分钟显示一次状态
|
||
if int(time.time()) % 60 == 0:
|
||
self.show_next_run()
|
||
|
||
except KeyboardInterrupt:
|
||
logging.info("\n⏹️ 定时器已停止")
|
||
self.is_running = False
|
||
|
||
def main():
|
||
"""主函数"""
|
||
import argparse
|
||
|
||
try:
|
||
parser = argparse.ArgumentParser(description='抖音播放量自动抓取定时器')
|
||
parser.add_argument('--test', action='store_true', help='测试模式 - 立即执行一次')
|
||
parser.add_argument('--once', action='store_true', help='立即执行一次并退出')
|
||
parser.add_argument('--ranking-only', action='store_true', help='仅生成榜单(不抓取数据)')
|
||
|
||
args = parser.parse_args()
|
||
|
||
# 设置日志配置
|
||
setup_logging()
|
||
|
||
print("正在初始化定时器...")
|
||
scheduler = DouyinAutoScheduler()
|
||
|
||
if args.test:
|
||
print("执行测试模式...")
|
||
scheduler.run_test()
|
||
elif args.once:
|
||
print("执行单次模式...")
|
||
scheduler.run_once()
|
||
elif args.ranking_only:
|
||
print("执行榜单生成模式...")
|
||
scheduler.run_ranking_only()
|
||
else:
|
||
print("启动定时器模式...")
|
||
scheduler.setup_schedule()
|
||
scheduler.start_scheduler()
|
||
|
||
print("程序执行完成")
|
||
|
||
except Exception as e:
|
||
print(f"程序执行出错: {e}")
|
||
import traceback
|
||
traceback.print_exc()
|
||
return 1
|
||
|
||
return 0
|
||
|
||
if __name__ == '__main__':
|
||
main() |