整理了一下代码

This commit is contained in:
xbh 2025-10-17 21:58:19 +08:00
parent ab8fd3131d
commit 2af34bd0dc
4 changed files with 44 additions and 63 deletions

3
.gitignore vendored
View File

@ -36,6 +36,9 @@ scripts/config/chrome_profile/
drivers/* drivers/*
!drivers/chromedriver.exe !drivers/chromedriver.exe
# Rankings config directory
handlers/Rankings/config/
# Environment variables # Environment variables
.env .env
.venv .venv

View File

@ -11,12 +11,16 @@
import schedule import schedule
import time import time
import subprocess
import sys import sys
import os import os
import logging import logging
from pathlib import Path from pathlib import Path
from datetime import datetime from datetime import datetime
import config
# 添加项目路径到 Python 路径
sys.path.append(os.path.join(os.path.dirname(__file__), 'handlers', 'Rankings'))
from rank_data_scraper import DouyinPlayVVScraper
# 配置日志的函数 # 配置日志的函数
def setup_logging(): def setup_logging():
@ -48,43 +52,30 @@ class DouyinAutoScheduler:
# 设置环境变量,确保自动模式 # 设置环境变量,确保自动模式
os.environ['AUTO_CONTINUE'] = '1' os.environ['AUTO_CONTINUE'] = '1'
# 构建脚本路径 - 指向Rankings目录中的脚本 # 直接创建并运行 DouyinPlayVVScraper 实例
script_path = Path(__file__).parent / 'handlers' / 'Rankings' / 'rank_data_scraper.py' scraper = DouyinPlayVVScraper(
start_url="https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation",
auto_continue=True,
duration_s=60
)
if not script_path.exists(): logging.info("📁 开始执行抓取任务...")
logging.error(f"❌ 脚本文件不存在: {script_path}") scraper.run()
return
logging.info(f"📁 执行脚本: {script_path}") logging.info("✅ 抖音播放量抓取任务执行成功")
# 使用subprocess执行脚本
result = subprocess.run([
sys.executable,
str(script_path),
'--auto',
'--duration', '60'
], capture_output=True, text=True, encoding='utf-8', errors='ignore')
if result.returncode == 0:
logging.info("✅ 抖音播放量抓取任务执行成功")
if result.stdout:
logging.info(f"📄 输出: {result.stdout.strip()}")
else:
logging.error(f"❌ 任务执行失败,返回码: {result.returncode}")
if result.stderr:
logging.error(f"💥 错误信息: {result.stderr.strip()}")
if result.stdout:
logging.info(f"📄 输出: {result.stdout.strip()}")
except Exception as e: except Exception as e:
logging.error(f"💥 执行任务时发生异常: {e}") logging.error(f"💥 执行任务时发生异常: {e}")
import traceback
logging.error(f"详细错误信息: {traceback.format_exc()}")
def setup_schedule(self): def setup_schedule(self):
"""设置定时任务""" """设置定时任务"""
# 主执行时间每晚24:00午夜 # 从配置文件读取执行时间
schedule.every().day.at("00:00").do(self.run_douyin_scraper) scheduler_time = config.SCHEDULER_TIME
schedule.every().day.at(scheduler_time).do(self.run_douyin_scraper)
logging.info("⏰ 定时器已设置:每晚24:00执行抖音播放量抓取") logging.info(f"⏰ 定时器已设置:每晚{scheduler_time}执行抖音播放量抓取")
def show_next_run(self): def show_next_run(self):
"""显示下次执行时间""" """显示下次执行时间"""
@ -107,13 +98,11 @@ class DouyinAutoScheduler:
"""启动定时器""" """启动定时器"""
self.is_running = True self.is_running = True
logging.info("🚀 抖音播放量自动抓取定时器已启动") logging.info("🚀 抖音播放量自动抓取定时器已启动")
logging.info("⏰ 执行时间每晚24:00") logging.info(f"⏰ 执行时间:每天{config.SCHEDULER_TIME}执行抖音播放量抓取")
logging.info("📁 目标脚本rank_data_scraper.py") logging.info("📁 目标脚本rank_data_scraper.py")
logging.info("💾 数据保存MongoDB") logging.info("💾 数据保存MongoDB")
logging.info("⏹️ 按 Ctrl+C 停止定时器") logging.info("⏹️ 按 Ctrl+C 停止定时器")
self.show_next_run()
try: try:
while self.is_running: while self.is_running:
schedule.run_pending() schedule.run_pending()

View File

@ -3,7 +3,8 @@ import importlib
# 数据库配置 # 数据库配置
MONGO_URI = "mongodb://localhost:27017" MONGO_URI = "mongodb://localhost:27017"
MONGO_DB_NAME = "Rankings" # MONGO_URI = "mongodb://mongouser:Jdei2243afN@172.16.0.6:27017,172.16.0.4:27017/test?replicaSet=cmgo-r6qkaern_0&authSource=admin"
MONGO_DB_NAME = "jubian"
# 应用配置 # 应用配置
APP_ENV = os.getenv('APP_ENV', 'development') APP_ENV = os.getenv('APP_ENV', 'development')
@ -13,4 +14,7 @@ DEBUG = APP_ENV == 'development'
LOG_LEVEL = 'INFO' LOG_LEVEL = 'INFO'
LOG_DIR = 'logs' LOG_DIR = 'logs'
# 定时器配置
SCHEDULER_TIME = "20:23" # 定时器执行时间,格式为 HH:MM (24小时制)
print(f"Successfully loaded configuration for environment: {APP_ENV}") print(f"Successfully loaded configuration for environment: {APP_ENV}")

View File

@ -31,8 +31,11 @@ from selenium.webdriver.chrome.options import Options
# 保留导入但默认不使用webdriver_manager避免网络下载卡顿 # 保留导入但默认不使用webdriver_manager避免网络下载卡顿
from webdriver_manager.chrome import ChromeDriverManager # noqa: F401 from webdriver_manager.chrome import ChromeDriverManager # noqa: F401
import chromedriver_autoinstaller import chromedriver_autoinstaller
from pymongo import MongoClient import sys
from pymongo.errors import ConnectionFailure import os
# 添加项目根目录到 Python 路径
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
from database import db
# 配置日志 # 配置日志
@ -60,7 +63,6 @@ class DouyinPlayVVScraper:
self.driver = None self.driver = None
self.play_vv_items = [] # list of dicts: {play_vv, formatted, url, request_id, mix_name, watched_item} self.play_vv_items = [] # list of dicts: {play_vv, formatted, url, request_id, mix_name, watched_item}
self.captured_responses = [] self.captured_responses = []
self.mongo_client = None
self.db = None self.db = None
self.collection = None self.collection = None
self._cleanup_old_profiles() self._cleanup_old_profiles()
@ -69,33 +71,17 @@ class DouyinPlayVVScraper:
def _setup_mongodb(self): def _setup_mongodb(self):
"""设置MongoDB连接""" """设置MongoDB连接"""
try: try:
# MongoDB连接配置 # 使用 database.py 中的连接
mongo_host = os.environ.get('MONGO_HOST', 'localhost') self.db = db
mongo_port = int(os.environ.get('MONGO_PORT', 27017))
mongo_db = os.environ.get('MONGO_DB', 'Rankings') # 设置集合
mongo_collection = os.environ.get('MONGO_COLLECTION', 'Rankings_list') mongo_collection = os.environ.get('MONGO_COLLECTION', 'Rankings_list')
# 创建MongoDB连接
self.mongo_client = MongoClient(mongo_host, mongo_port, serverSelectionTimeoutMS=5000)
# 测试连接
self.mongo_client.admin.command('ping')
# 设置数据库和集合
self.db = self.mongo_client[mongo_db]
self.collection = self.db[mongo_collection] self.collection = self.db[mongo_collection]
logging.info(f'MongoDB连接成功: {mongo_host}:{mongo_port}/{mongo_db}.{mongo_collection}') logging.info(f'MongoDB连接成功使用数据库: {self.db.name},集合: {mongo_collection}')
except ConnectionFailure as e:
logging.warning(f'MongoDB连接失败: {e}')
logging.info('将仅保存到本地文件')
self.mongo_client = None
self.db = None
self.collection = None
except Exception as e: except Exception as e:
logging.warning(f'MongoDB设置出错: {e}') logging.warning(f'MongoDB设置出错: {e}')
self.mongo_client = None
self.db = None self.db = None
self.collection = None self.collection = None
@ -649,10 +635,10 @@ class DouyinPlayVVScraper:
elapsed = int(time.time() - start) elapsed = int(time.time() - start)
if elapsed - last_progress >= 5: if elapsed - last_progress >= 5:
last_progress = elapsed last_progress = elapsed
logging.info(f'进度: {elapsed}/{duration_s}s, 已发现play_vv候选 {len(self.play_vv_items)}') logging.info(f'进度: {elapsed}/{duration_s}, 目标数量: {len(self.play_vv_items)}')
time.sleep(0.8) time.sleep(0.8)
logging.info(f'网络收集完成,共发现 {len(self.play_vv_items)}play_vv候选') logging.info(f'网络收集完成,共发现 {len(self.play_vv_items)}目标')
def parse_ssr_data(self): def parse_ssr_data(self):
@ -712,7 +698,7 @@ class DouyinPlayVVScraper:
def save_to_mongodb(self): def save_to_mongodb(self):
"""将数据保存到MongoDB""" """将数据保存到MongoDB"""
if self.mongo_client is None or self.collection is None: if self.collection is None:
logging.warning('MongoDB未连接跳过数据库保存') logging.warning('MongoDB未连接跳过数据库保存')
return return
@ -797,6 +783,5 @@ if __name__ == '__main__':
os.environ['AUTO_CONTINUE'] = '1' os.environ['AUTO_CONTINUE'] = '1'
print('=== Selenium+CDP 抖音play_vv抓取器 ===') print('=== Selenium+CDP 抖音play_vv抓取器 ===')
print('将复用本地Chrome配置并抓取网络响应中的play_vv')
scraper = DouyinPlayVVScraper(args.url, auto_continue=args.auto, duration_s=args.duration) scraper = DouyinPlayVVScraper(args.url, auto_continue=args.auto, duration_s=args.duration)
scraper.run() scraper.run()