#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ 抖音播放量自动抓取定时器 - 跨平台版本 功能: - 每晚自动执行抖音播放量抓取任务 - 数据抓取完成后自动生成各类榜单(播放量榜、增长榜、新晋榜、热门趋势榜) - 支持Windows、macOS、Linux - 自动保存数据到MongoDB 使用方法: - 正常模式:python Timer_worker.py(启动定时器) - 测试模式:python Timer_worker.py --test(立即执行一次) - 单次执行:python Timer_worker.py --once(立即执行一次并退出) - 仅生成榜单:python Timer_worker.py --ranking-only(仅生成榜单,不抓取数据) """ import schedule import time import sys import os import logging import argparse from pathlib import Path from datetime import datetime, date import config # 添加项目路径到 Python 路径 sys.path.append(os.path.join(os.path.dirname(__file__), 'handlers', 'Rankings')) from rank_data_scraper import DouyinPlayVVScraper # 配置日志的函数 def setup_logging(): """设置日志配置""" # 确保logs目录存在 import os script_dir = os.path.dirname(os.path.abspath(__file__)) logs_dir = os.path.join(script_dir, 'handlers', 'Rankings', 'logs') os.makedirs(logs_dir, exist_ok=True) logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler(os.path.join(logs_dir, 'scheduler.log'), encoding='utf-8'), logging.StreamHandler() ] ) class DouyinAutoScheduler: def __init__(self): self.is_running = False def run_douyin_scraper(self): """执行抖音播放量抓取任务""" try: logging.info("🚀 开始执行抖音播放量抓取任务...") # 设置环境变量,确保自动模式 os.environ['AUTO_CONTINUE'] = '1' # 直接创建并运行 DouyinPlayVVScraper 实例 scraper = DouyinPlayVVScraper( start_url="https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation", auto_continue=True, duration_s=60 ) logging.info("📁 开始执行抓取任务...") scraper.run() logging.info("✅ 抖音播放量抓取任务执行成功") # 数据抓取完成后,自动生成当日榜单 self.generate_daily_rankings() except Exception as e: logging.error(f"💥 执行任务时发生异常: {e}") import traceback logging.error(f"详细错误信息: {traceback.format_exc()}") def generate_daily_rankings(self): """生成每日榜单数据(基于时间轴对比)""" try: from database import db from datetime import timedelta # 获取集合 douyin_collection = db['Rankings_list'] # 使用真实抓取的数据 rankings_collection = db['Ranking_storage'] today = date.today() yesterday = today - timedelta(days=1) today_str = today.strftime('%Y-%m-%d') yesterday_str = yesterday.strftime('%Y-%m-%d') logging.info(f"📅 正在生成 {today_str} 的榜单(对比 {yesterday_str})...") # 删除当天已有的榜单数据 rankings_collection.delete_many({"date": today_str}) logging.info(f"🗑️ 已清理 {today_str} 的旧榜单数据") # 获取今天和昨天的榜单数据进行对比 try: logging.info("🔄 正在生成时间轴对比榜单...") # 获取今天的数据,按短剧名称去重,只保留播放量最高的 today_videos_raw = list(douyin_collection.find({}).sort("play_vv", -1)) # 按短剧名称去重,每个短剧只保留播放量最高的一条 unique_videos = {} for video in today_videos_raw: mix_name = video.get("mix_name", "") if mix_name and (mix_name not in unique_videos or video.get("play_vv", 0) > unique_videos[mix_name].get("play_vv", 0)): unique_videos[mix_name] = video today_videos = list(unique_videos.values()) logging.info(f"📊 今日数据去重后:{len(today_videos)} 个独特短剧(原始数据:{len(today_videos_raw)} 条)") # 获取昨天的榜单数据(如果存在),取最新的计算结果 yesterday_ranking = rankings_collection.find_one({ "date": yesterday_str, "type": "comprehensive" }, sort=[("calculation_sequence", -1)]) yesterday_data = {} if yesterday_ranking and "data" in yesterday_ranking: # 将昨天的数据转换为字典,以短剧名称为键 for item in yesterday_ranking["data"]: title = item.get("title", "") if title: yesterday_data[title] = { "rank": item.get("rank", 0), "play_vv": item.get("play_vv", 0), "video_id": item.get("video_id", "") } logging.info(f"📊 找到昨天的榜单数据,共 {len(yesterday_data)} 个短剧") else: logging.info("📊 未找到昨天的榜单数据,将作为首次生成") if today_videos: # 先计算所有视频的播放量差值 videos_with_growth = [] for video in today_videos: video_id = str(video.get("_id", "")) current_play_vv = video.get("play_vv", 0) # 计算与昨天的对比数据 play_vv_change = 0 play_vv_change_rate = 0 is_new = True mix_name = video.get("mix_name", "") if mix_name in yesterday_data: is_new = False yesterday_play_vv = yesterday_data[mix_name]["play_vv"] # 计算播放量变化 play_vv_change = current_play_vv - yesterday_play_vv if yesterday_play_vv > 0: play_vv_change_rate = round((play_vv_change / yesterday_play_vv) * 100, 2) # 创建包含增长数据的视频项 video_with_growth = { "video": video, "play_vv_change": play_vv_change, "play_vv_change_rate": play_vv_change_rate, "is_new": is_new, "yesterday_data": yesterday_data.get(mix_name, {}) } videos_with_growth.append(video_with_growth) # 按播放量差值降序排序(差值越大排名越靠前) videos_with_growth.sort(key=lambda x: x["play_vv_change"], reverse=True) comprehensive_ranking = { "date": today_str, "type": "comprehensive", "name": "播放量增长榜单", "description": f"基于 {yesterday_str} 和 {today_str} 播放量差值排序的榜单(差值越大排名越靠前)", "comparison_date": yesterday_str, "total_videos": len(videos_with_growth), "data": [] } # 生成排序后的榜单数据 for i, item in enumerate(videos_with_growth, 1): video = item["video"] video_id = str(video.get("_id", "")) current_play_vv = video.get("play_vv", 0) mix_name = video.get("mix_name", "") # 计算排名变化(基于昨天的排名) rank_change = 0 if not item["is_new"] and item["yesterday_data"]: yesterday_rank = item["yesterday_data"].get("rank", 0) rank_change = yesterday_rank - i ranking_item = { "rank": i, "title": mix_name, "play_vv": current_play_vv, "author": video.get("author", ""), "video_id": video_id, "video_url": video.get("video_url", ""), "cover_image_url": video.get("cover_image_url", ""), "playcount_str": video.get("playcount", ""), # 时间轴对比数据 "timeline_data": { "is_new": item["is_new"], "rank_change": rank_change, "play_vv_change": item["play_vv_change"], "play_vv_change_rate": item["play_vv_change_rate"], "yesterday_rank": item["yesterday_data"].get("rank", 0) if not item["is_new"] else 0, "yesterday_play_vv": item["yesterday_data"].get("play_vv", 0) if not item["is_new"] else 0 } } comprehensive_ranking["data"].append(ranking_item) # 为每次计算添加唯一的时间戳,确保数据唯一性 current_timestamp = datetime.now() comprehensive_ranking["created_at"] = current_timestamp comprehensive_ranking["calculation_id"] = f"{today_str}_{current_timestamp.strftime('%H%M%S')}" # 检查今天已有多少次计算 existing_count = rankings_collection.count_documents({ "date": today_str, "type": "comprehensive" }) comprehensive_ranking["calculation_sequence"] = existing_count + 1 # 总是插入新的榜单记录,保留所有历史计算数据 rankings_collection.insert_one(comprehensive_ranking) logging.info(f"📝 创建了新的今日榜单数据(第{existing_count + 1}次计算,包含最新差值)") logging.info(f"🔖 计算ID: {comprehensive_ranking['calculation_id']}") # 统计信息 new_count = sum(1 for item in comprehensive_ranking["data"] if item["timeline_data"]["is_new"]) logging.info(f"✅ 时间轴对比榜单生成成功") logging.info(f"📊 总计 {len(comprehensive_ranking['data'])} 条记录") logging.info(f"🆕 新上榜 {new_count} 条") logging.info(f"🔄 对比基准日期: {yesterday_str}") return True else: logging.warning("⚠️ 榜单生成失败:无今日数据") return False except Exception as e: logging.error(f"💥 生成时间轴对比榜单时发生异常: {e}") import traceback logging.error(f"详细错误信息: {traceback.format_exc()}") return False except Exception as e: logging.error(f"💥 生成榜单时发生异常: {e}") import traceback logging.error(f"详细错误信息: {traceback.format_exc()}") def setup_schedule(self): """设置定时任务""" # 从配置文件读取执行时间 scheduler_time = config.SCHEDULER_TIME schedule.every().day.at(scheduler_time).do(self.run_douyin_scraper) logging.info(f"⏰ 定时器已设置:每晚{scheduler_time}执行抖音播放量抓取") def show_next_run(self): """显示下次执行时间""" jobs = schedule.get_jobs() if jobs: next_run = jobs[0].next_run logging.info(f"⏰ 下次执行时间: {next_run}") def run_once(self): """立即执行一次""" logging.info("🔧 立即执行模式...") self.run_douyin_scraper() def run_test(self): """测试模式 - 立即执行一次""" logging.info("🧪 测试模式 - 立即执行抖音播放量抓取任务...") self.run_douyin_scraper() def run_ranking_only(self): """仅生成榜单(不抓取数据)""" logging.info("📊 仅生成榜单模式...") self.generate_daily_rankings() def start_scheduler(self): """启动定时器""" self.is_running = True logging.info("🚀 抖音播放量自动抓取定时器已启动") logging.info(f"⏰ 执行时间:每天{config.SCHEDULER_TIME}执行抖音播放量抓取") logging.info("📁 目标脚本:rank_data_scraper.py") logging.info("💾 数据保存:MongoDB") logging.info("⏹️ 按 Ctrl+C 停止定时器") try: while self.is_running: schedule.run_pending() time.sleep(1) # 每分钟显示一次状态 if int(time.time()) % 60 == 0: self.show_next_run() except KeyboardInterrupt: logging.info("\n⏹️ 定时器已停止") self.is_running = False def main(): """主函数""" import argparse try: parser = argparse.ArgumentParser(description='抖音播放量自动抓取定时器') parser.add_argument('--test', action='store_true', help='测试模式 - 立即执行一次') parser.add_argument('--once', action='store_true', help='立即执行一次并退出') parser.add_argument('--ranking-only', action='store_true', help='仅生成榜单(不抓取数据)') args = parser.parse_args() # 设置日志配置 setup_logging() print("正在初始化定时器...") scheduler = DouyinAutoScheduler() if args.test: print("执行测试模式...") scheduler.run_test() elif args.once: print("执行单次模式...") scheduler.run_once() elif args.ranking_only: print("执行榜单生成模式...") scheduler.run_ranking_only() else: print("启动定时器模式...") scheduler.setup_schedule() scheduler.start_scheduler() print("程序执行完成") except Exception as e: print(f"程序执行出错: {e}") import traceback traceback.print_exc() return 1 return 0 if __name__ == '__main__': main()