整理了一下代码
This commit is contained in:
parent
ab8fd3131d
commit
2af34bd0dc
3
.gitignore
vendored
3
.gitignore
vendored
@ -36,6 +36,9 @@ scripts/config/chrome_profile/
|
|||||||
drivers/*
|
drivers/*
|
||||||
!drivers/chromedriver.exe
|
!drivers/chromedriver.exe
|
||||||
|
|
||||||
|
# Rankings config directory
|
||||||
|
handlers/Rankings/config/
|
||||||
|
|
||||||
# Environment variables
|
# Environment variables
|
||||||
.env
|
.env
|
||||||
.venv
|
.venv
|
||||||
|
|||||||
@ -11,12 +11,16 @@
|
|||||||
|
|
||||||
import schedule
|
import schedule
|
||||||
import time
|
import time
|
||||||
import subprocess
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
import config
|
||||||
|
|
||||||
|
# 添加项目路径到 Python 路径
|
||||||
|
sys.path.append(os.path.join(os.path.dirname(__file__), 'handlers', 'Rankings'))
|
||||||
|
from rank_data_scraper import DouyinPlayVVScraper
|
||||||
|
|
||||||
# 配置日志的函数
|
# 配置日志的函数
|
||||||
def setup_logging():
|
def setup_logging():
|
||||||
@ -48,43 +52,30 @@ class DouyinAutoScheduler:
|
|||||||
# 设置环境变量,确保自动模式
|
# 设置环境变量,确保自动模式
|
||||||
os.environ['AUTO_CONTINUE'] = '1'
|
os.environ['AUTO_CONTINUE'] = '1'
|
||||||
|
|
||||||
# 构建脚本路径 - 指向Rankings目录中的脚本
|
# 直接创建并运行 DouyinPlayVVScraper 实例
|
||||||
script_path = Path(__file__).parent / 'handlers' / 'Rankings' / 'rank_data_scraper.py'
|
scraper = DouyinPlayVVScraper(
|
||||||
|
start_url="https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation",
|
||||||
|
auto_continue=True,
|
||||||
|
duration_s=60
|
||||||
|
)
|
||||||
|
|
||||||
if not script_path.exists():
|
logging.info("📁 开始执行抓取任务...")
|
||||||
logging.error(f"❌ 脚本文件不存在: {script_path}")
|
scraper.run()
|
||||||
return
|
|
||||||
|
|
||||||
logging.info(f"📁 执行脚本: {script_path}")
|
logging.info("✅ 抖音播放量抓取任务执行成功")
|
||||||
|
|
||||||
# 使用subprocess执行脚本
|
|
||||||
result = subprocess.run([
|
|
||||||
sys.executable,
|
|
||||||
str(script_path),
|
|
||||||
'--auto',
|
|
||||||
'--duration', '60'
|
|
||||||
], capture_output=True, text=True, encoding='utf-8', errors='ignore')
|
|
||||||
|
|
||||||
if result.returncode == 0:
|
|
||||||
logging.info("✅ 抖音播放量抓取任务执行成功")
|
|
||||||
if result.stdout:
|
|
||||||
logging.info(f"📄 输出: {result.stdout.strip()}")
|
|
||||||
else:
|
|
||||||
logging.error(f"❌ 任务执行失败,返回码: {result.returncode}")
|
|
||||||
if result.stderr:
|
|
||||||
logging.error(f"💥 错误信息: {result.stderr.strip()}")
|
|
||||||
if result.stdout:
|
|
||||||
logging.info(f"📄 输出: {result.stdout.strip()}")
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"💥 执行任务时发生异常: {e}")
|
logging.error(f"💥 执行任务时发生异常: {e}")
|
||||||
|
import traceback
|
||||||
|
logging.error(f"详细错误信息: {traceback.format_exc()}")
|
||||||
|
|
||||||
def setup_schedule(self):
|
def setup_schedule(self):
|
||||||
"""设置定时任务"""
|
"""设置定时任务"""
|
||||||
# 主执行时间:每晚24:00(午夜)
|
# 从配置文件读取执行时间
|
||||||
schedule.every().day.at("00:00").do(self.run_douyin_scraper)
|
scheduler_time = config.SCHEDULER_TIME
|
||||||
|
schedule.every().day.at(scheduler_time).do(self.run_douyin_scraper)
|
||||||
|
|
||||||
logging.info("⏰ 定时器已设置:每晚24:00执行抖音播放量抓取")
|
logging.info(f"⏰ 定时器已设置:每晚{scheduler_time}执行抖音播放量抓取")
|
||||||
|
|
||||||
def show_next_run(self):
|
def show_next_run(self):
|
||||||
"""显示下次执行时间"""
|
"""显示下次执行时间"""
|
||||||
@ -107,13 +98,11 @@ class DouyinAutoScheduler:
|
|||||||
"""启动定时器"""
|
"""启动定时器"""
|
||||||
self.is_running = True
|
self.is_running = True
|
||||||
logging.info("🚀 抖音播放量自动抓取定时器已启动")
|
logging.info("🚀 抖音播放量自动抓取定时器已启动")
|
||||||
logging.info("⏰ 执行时间:每晚24:00")
|
logging.info(f"⏰ 执行时间:每天{config.SCHEDULER_TIME}执行抖音播放量抓取")
|
||||||
logging.info("📁 目标脚本:rank_data_scraper.py")
|
logging.info("📁 目标脚本:rank_data_scraper.py")
|
||||||
logging.info("💾 数据保存:MongoDB")
|
logging.info("💾 数据保存:MongoDB")
|
||||||
logging.info("⏹️ 按 Ctrl+C 停止定时器")
|
logging.info("⏹️ 按 Ctrl+C 停止定时器")
|
||||||
|
|
||||||
self.show_next_run()
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while self.is_running:
|
while self.is_running:
|
||||||
schedule.run_pending()
|
schedule.run_pending()
|
||||||
|
|||||||
@ -3,7 +3,8 @@ import importlib
|
|||||||
|
|
||||||
# 数据库配置
|
# 数据库配置
|
||||||
MONGO_URI = "mongodb://localhost:27017"
|
MONGO_URI = "mongodb://localhost:27017"
|
||||||
MONGO_DB_NAME = "Rankings"
|
# MONGO_URI = "mongodb://mongouser:Jdei2243afN@172.16.0.6:27017,172.16.0.4:27017/test?replicaSet=cmgo-r6qkaern_0&authSource=admin"
|
||||||
|
MONGO_DB_NAME = "jubian"
|
||||||
|
|
||||||
# 应用配置
|
# 应用配置
|
||||||
APP_ENV = os.getenv('APP_ENV', 'development')
|
APP_ENV = os.getenv('APP_ENV', 'development')
|
||||||
@ -13,4 +14,7 @@ DEBUG = APP_ENV == 'development'
|
|||||||
LOG_LEVEL = 'INFO'
|
LOG_LEVEL = 'INFO'
|
||||||
LOG_DIR = 'logs'
|
LOG_DIR = 'logs'
|
||||||
|
|
||||||
|
# 定时器配置
|
||||||
|
SCHEDULER_TIME = "20:23" # 定时器执行时间,格式为 HH:MM (24小时制)
|
||||||
|
|
||||||
print(f"Successfully loaded configuration for environment: {APP_ENV}")
|
print(f"Successfully loaded configuration for environment: {APP_ENV}")
|
||||||
@ -31,8 +31,11 @@ from selenium.webdriver.chrome.options import Options
|
|||||||
# 保留导入但默认不使用webdriver_manager,避免网络下载卡顿
|
# 保留导入但默认不使用webdriver_manager,避免网络下载卡顿
|
||||||
from webdriver_manager.chrome import ChromeDriverManager # noqa: F401
|
from webdriver_manager.chrome import ChromeDriverManager # noqa: F401
|
||||||
import chromedriver_autoinstaller
|
import chromedriver_autoinstaller
|
||||||
from pymongo import MongoClient
|
import sys
|
||||||
from pymongo.errors import ConnectionFailure
|
import os
|
||||||
|
# 添加项目根目录到 Python 路径
|
||||||
|
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
|
||||||
|
from database import db
|
||||||
|
|
||||||
|
|
||||||
# 配置日志
|
# 配置日志
|
||||||
@ -60,7 +63,6 @@ class DouyinPlayVVScraper:
|
|||||||
self.driver = None
|
self.driver = None
|
||||||
self.play_vv_items = [] # list of dicts: {play_vv, formatted, url, request_id, mix_name, watched_item}
|
self.play_vv_items = [] # list of dicts: {play_vv, formatted, url, request_id, mix_name, watched_item}
|
||||||
self.captured_responses = []
|
self.captured_responses = []
|
||||||
self.mongo_client = None
|
|
||||||
self.db = None
|
self.db = None
|
||||||
self.collection = None
|
self.collection = None
|
||||||
self._cleanup_old_profiles()
|
self._cleanup_old_profiles()
|
||||||
@ -69,33 +71,17 @@ class DouyinPlayVVScraper:
|
|||||||
def _setup_mongodb(self):
|
def _setup_mongodb(self):
|
||||||
"""设置MongoDB连接"""
|
"""设置MongoDB连接"""
|
||||||
try:
|
try:
|
||||||
# MongoDB连接配置
|
# 使用 database.py 中的连接
|
||||||
mongo_host = os.environ.get('MONGO_HOST', 'localhost')
|
self.db = db
|
||||||
mongo_port = int(os.environ.get('MONGO_PORT', 27017))
|
|
||||||
mongo_db = os.environ.get('MONGO_DB', 'Rankings')
|
# 设置集合
|
||||||
mongo_collection = os.environ.get('MONGO_COLLECTION', 'Rankings_list')
|
mongo_collection = os.environ.get('MONGO_COLLECTION', 'Rankings_list')
|
||||||
|
|
||||||
# 创建MongoDB连接
|
|
||||||
self.mongo_client = MongoClient(mongo_host, mongo_port, serverSelectionTimeoutMS=5000)
|
|
||||||
|
|
||||||
# 测试连接
|
|
||||||
self.mongo_client.admin.command('ping')
|
|
||||||
|
|
||||||
# 设置数据库和集合
|
|
||||||
self.db = self.mongo_client[mongo_db]
|
|
||||||
self.collection = self.db[mongo_collection]
|
self.collection = self.db[mongo_collection]
|
||||||
|
|
||||||
logging.info(f'MongoDB连接成功: {mongo_host}:{mongo_port}/{mongo_db}.{mongo_collection}')
|
logging.info(f'MongoDB连接成功,使用数据库: {self.db.name},集合: {mongo_collection}')
|
||||||
|
|
||||||
except ConnectionFailure as e:
|
|
||||||
logging.warning(f'MongoDB连接失败: {e}')
|
|
||||||
logging.info('将仅保存到本地文件')
|
|
||||||
self.mongo_client = None
|
|
||||||
self.db = None
|
|
||||||
self.collection = None
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warning(f'MongoDB设置出错: {e}')
|
logging.warning(f'MongoDB设置出错: {e}')
|
||||||
self.mongo_client = None
|
|
||||||
self.db = None
|
self.db = None
|
||||||
self.collection = None
|
self.collection = None
|
||||||
|
|
||||||
@ -649,10 +635,10 @@ class DouyinPlayVVScraper:
|
|||||||
elapsed = int(time.time() - start)
|
elapsed = int(time.time() - start)
|
||||||
if elapsed - last_progress >= 5:
|
if elapsed - last_progress >= 5:
|
||||||
last_progress = elapsed
|
last_progress = elapsed
|
||||||
logging.info(f'进度: {elapsed}/{duration_s}s, 已发现play_vv候选 {len(self.play_vv_items)}')
|
logging.info(f'进度: {elapsed}/{duration_s}, 目标数量: {len(self.play_vv_items)}')
|
||||||
time.sleep(0.8)
|
time.sleep(0.8)
|
||||||
|
|
||||||
logging.info(f'网络收集完成,共发现 {len(self.play_vv_items)} 个play_vv候选')
|
logging.info(f'网络收集完成,共发现 {len(self.play_vv_items)} 个目标')
|
||||||
|
|
||||||
|
|
||||||
def parse_ssr_data(self):
|
def parse_ssr_data(self):
|
||||||
@ -712,7 +698,7 @@ class DouyinPlayVVScraper:
|
|||||||
|
|
||||||
def save_to_mongodb(self):
|
def save_to_mongodb(self):
|
||||||
"""将数据保存到MongoDB"""
|
"""将数据保存到MongoDB"""
|
||||||
if self.mongo_client is None or self.collection is None:
|
if self.collection is None:
|
||||||
logging.warning('MongoDB未连接,跳过数据库保存')
|
logging.warning('MongoDB未连接,跳过数据库保存')
|
||||||
return
|
return
|
||||||
|
|
||||||
@ -797,6 +783,5 @@ if __name__ == '__main__':
|
|||||||
os.environ['AUTO_CONTINUE'] = '1'
|
os.environ['AUTO_CONTINUE'] = '1'
|
||||||
|
|
||||||
print('=== Selenium+CDP 抖音play_vv抓取器 ===')
|
print('=== Selenium+CDP 抖音play_vv抓取器 ===')
|
||||||
print('将复用本地Chrome配置并抓取网络响应中的play_vv')
|
|
||||||
scraper = DouyinPlayVVScraper(args.url, auto_continue=args.auto, duration_s=args.duration)
|
scraper = DouyinPlayVVScraper(args.url, auto_continue=args.auto, duration_s=args.duration)
|
||||||
scraper.run()
|
scraper.run()
|
||||||
Loading…
x
Reference in New Issue
Block a user