diff --git a/.gitignore b/.gitignore index 239095e..05acbec 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,9 @@ scripts/config/chrome_profile/ drivers/* !drivers/chromedriver.exe +# Rankings config directory +handlers/Rankings/config/ + # Environment variables .env .venv diff --git a/Timer_worker.py b/Timer_worker.py index 6aa2d85..51e5b57 100644 --- a/Timer_worker.py +++ b/Timer_worker.py @@ -11,12 +11,16 @@ import schedule import time -import subprocess import sys import os import logging from pathlib import Path from datetime import datetime +import config + +# 添加项目路径到 Python 路径 +sys.path.append(os.path.join(os.path.dirname(__file__), 'handlers', 'Rankings')) +from rank_data_scraper import DouyinPlayVVScraper # 配置日志的函数 def setup_logging(): @@ -48,43 +52,30 @@ class DouyinAutoScheduler: # 设置环境变量,确保自动模式 os.environ['AUTO_CONTINUE'] = '1' - # 构建脚本路径 - 指向Rankings目录中的脚本 - script_path = Path(__file__).parent / 'handlers' / 'Rankings' / 'rank_data_scraper.py' - - if not script_path.exists(): - logging.error(f"❌ 脚本文件不存在: {script_path}") - return - - logging.info(f"📁 执行脚本: {script_path}") - - # 使用subprocess执行脚本 - result = subprocess.run([ - sys.executable, - str(script_path), - '--auto', - '--duration', '60' - ], capture_output=True, text=True, encoding='utf-8', errors='ignore') - - if result.returncode == 0: - logging.info("✅ 抖音播放量抓取任务执行成功") - if result.stdout: - logging.info(f"📄 输出: {result.stdout.strip()}") - else: - logging.error(f"❌ 任务执行失败,返回码: {result.returncode}") - if result.stderr: - logging.error(f"💥 错误信息: {result.stderr.strip()}") - if result.stdout: - logging.info(f"📄 输出: {result.stdout.strip()}") + # 直接创建并运行 DouyinPlayVVScraper 实例 + scraper = DouyinPlayVVScraper( + start_url="https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation", + auto_continue=True, + duration_s=60 + ) + + logging.info("📁 开始执行抓取任务...") + scraper.run() + + logging.info("✅ 抖音播放量抓取任务执行成功") except Exception as e: logging.error(f"💥 执行任务时发生异常: {e}") + import traceback + logging.error(f"详细错误信息: {traceback.format_exc()}") def setup_schedule(self): """设置定时任务""" - # 主执行时间:每晚24:00(午夜) - schedule.every().day.at("00:00").do(self.run_douyin_scraper) + # 从配置文件读取执行时间 + scheduler_time = config.SCHEDULER_TIME + schedule.every().day.at(scheduler_time).do(self.run_douyin_scraper) - logging.info("⏰ 定时器已设置:每晚24:00执行抖音播放量抓取") + logging.info(f"⏰ 定时器已设置:每晚{scheduler_time}执行抖音播放量抓取") def show_next_run(self): """显示下次执行时间""" @@ -107,13 +98,11 @@ class DouyinAutoScheduler: """启动定时器""" self.is_running = True logging.info("🚀 抖音播放量自动抓取定时器已启动") - logging.info("⏰ 执行时间:每晚24:00") + logging.info(f"⏰ 执行时间:每天{config.SCHEDULER_TIME}执行抖音播放量抓取") logging.info("📁 目标脚本:rank_data_scraper.py") logging.info("💾 数据保存:MongoDB") logging.info("⏹️ 按 Ctrl+C 停止定时器") - self.show_next_run() - try: while self.is_running: schedule.run_pending() diff --git a/config.py b/config.py index e6d477e..384ccdc 100644 --- a/config.py +++ b/config.py @@ -3,7 +3,8 @@ import importlib # 数据库配置 MONGO_URI = "mongodb://localhost:27017" -MONGO_DB_NAME = "Rankings" +# MONGO_URI = "mongodb://mongouser:Jdei2243afN@172.16.0.6:27017,172.16.0.4:27017/test?replicaSet=cmgo-r6qkaern_0&authSource=admin" +MONGO_DB_NAME = "jubian" # 应用配置 APP_ENV = os.getenv('APP_ENV', 'development') @@ -13,4 +14,7 @@ DEBUG = APP_ENV == 'development' LOG_LEVEL = 'INFO' LOG_DIR = 'logs' +# 定时器配置 +SCHEDULER_TIME = "20:23" # 定时器执行时间,格式为 HH:MM (24小时制) + print(f"Successfully loaded configuration for environment: {APP_ENV}") \ No newline at end of file diff --git a/handlers/Rankings/rank_data_scraper.py b/handlers/Rankings/rank_data_scraper.py index d7e3bbc..d3d8312 100644 --- a/handlers/Rankings/rank_data_scraper.py +++ b/handlers/Rankings/rank_data_scraper.py @@ -31,8 +31,11 @@ from selenium.webdriver.chrome.options import Options # 保留导入但默认不使用webdriver_manager,避免网络下载卡顿 from webdriver_manager.chrome import ChromeDriverManager # noqa: F401 import chromedriver_autoinstaller -from pymongo import MongoClient -from pymongo.errors import ConnectionFailure +import sys +import os +# 添加项目根目录到 Python 路径 +sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) +from database import db # 配置日志 @@ -60,7 +63,6 @@ class DouyinPlayVVScraper: self.driver = None self.play_vv_items = [] # list of dicts: {play_vv, formatted, url, request_id, mix_name, watched_item} self.captured_responses = [] - self.mongo_client = None self.db = None self.collection = None self._cleanup_old_profiles() @@ -69,33 +71,17 @@ class DouyinPlayVVScraper: def _setup_mongodb(self): """设置MongoDB连接""" try: - # MongoDB连接配置 - mongo_host = os.environ.get('MONGO_HOST', 'localhost') - mongo_port = int(os.environ.get('MONGO_PORT', 27017)) - mongo_db = os.environ.get('MONGO_DB', 'Rankings') + # 使用 database.py 中的连接 + self.db = db + + # 设置集合 mongo_collection = os.environ.get('MONGO_COLLECTION', 'Rankings_list') - - # 创建MongoDB连接 - self.mongo_client = MongoClient(mongo_host, mongo_port, serverSelectionTimeoutMS=5000) - - # 测试连接 - self.mongo_client.admin.command('ping') - - # 设置数据库和集合 - self.db = self.mongo_client[mongo_db] self.collection = self.db[mongo_collection] - logging.info(f'MongoDB连接成功: {mongo_host}:{mongo_port}/{mongo_db}.{mongo_collection}') + logging.info(f'MongoDB连接成功,使用数据库: {self.db.name},集合: {mongo_collection}') - except ConnectionFailure as e: - logging.warning(f'MongoDB连接失败: {e}') - logging.info('将仅保存到本地文件') - self.mongo_client = None - self.db = None - self.collection = None except Exception as e: logging.warning(f'MongoDB设置出错: {e}') - self.mongo_client = None self.db = None self.collection = None @@ -649,10 +635,10 @@ class DouyinPlayVVScraper: elapsed = int(time.time() - start) if elapsed - last_progress >= 5: last_progress = elapsed - logging.info(f'进度: {elapsed}/{duration_s}s, 已发现play_vv候选 {len(self.play_vv_items)}') + logging.info(f'进度: {elapsed}/{duration_s}, 目标数量: {len(self.play_vv_items)}') time.sleep(0.8) - logging.info(f'网络收集完成,共发现 {len(self.play_vv_items)} 个play_vv候选') + logging.info(f'网络收集完成,共发现 {len(self.play_vv_items)} 个目标') def parse_ssr_data(self): @@ -712,7 +698,7 @@ class DouyinPlayVVScraper: def save_to_mongodb(self): """将数据保存到MongoDB""" - if self.mongo_client is None or self.collection is None: + if self.collection is None: logging.warning('MongoDB未连接,跳过数据库保存') return @@ -797,6 +783,5 @@ if __name__ == '__main__': os.environ['AUTO_CONTINUE'] = '1' print('=== Selenium+CDP 抖音play_vv抓取器 ===') - print('将复用本地Chrome配置并抓取网络响应中的play_vv') scraper = DouyinPlayVVScraper(args.url, auto_continue=args.auto, duration_s=args.duration) scraper.run() \ No newline at end of file