架构优化
This commit is contained in:
parent
8d4369ecef
commit
f51278742c
@ -24,8 +24,7 @@ def setup_logging():
|
||||
# 确保logs目录存在
|
||||
import os
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
project_root = os.path.dirname(script_dir)
|
||||
logs_dir = os.path.join(project_root, 'logs')
|
||||
logs_dir = os.path.join(script_dir, 'handlers', 'Rankings', 'logs')
|
||||
os.makedirs(logs_dir, exist_ok=True)
|
||||
|
||||
logging.basicConfig(
|
||||
@ -49,8 +48,8 @@ class DouyinAutoScheduler:
|
||||
# 设置环境变量,确保自动模式
|
||||
os.environ['AUTO_CONTINUE'] = '1'
|
||||
|
||||
# 构建脚本路径 - 现在在同一目录下
|
||||
script_path = Path(__file__).parent / 'douyin_selenium_cdp_play_vv.py'
|
||||
# 构建脚本路径 - 指向Rankings目录中的脚本
|
||||
script_path = Path(__file__).parent / 'handlers' / 'Rankings' / 'rank_data_scraper.py'
|
||||
|
||||
if not script_path.exists():
|
||||
logging.error(f"❌ 脚本文件不存在: {script_path}")
|
||||
@ -109,7 +108,7 @@ class DouyinAutoScheduler:
|
||||
self.is_running = True
|
||||
logging.info("🚀 抖音播放量自动抓取定时器已启动")
|
||||
logging.info("⏰ 执行时间:每天上午9:35")
|
||||
logging.info("📁 目标脚本:douyin_selenium_cdp_play_vv.py")
|
||||
logging.info("📁 目标脚本:rank_data_scraper.py")
|
||||
logging.info("💾 数据保存:MongoDB")
|
||||
logging.info("⏹️ 按 Ctrl+C 停止定时器")
|
||||
|
||||
93
app.py
Normal file
93
app.py
Normal file
@ -0,0 +1,93 @@
|
||||
from flask import Flask
|
||||
from flask_cors import CORS
|
||||
import logging
|
||||
import os
|
||||
|
||||
# Create the Flask application and allow cross-origin requests on every route.
app = Flask(__name__)
CORS(app)

# Logging: make sure handlers/Rankings/logs (relative to this file) exists,
# then log to app.log inside it and to the console.
_app_dir = os.path.dirname(os.path.abspath(__file__))
logs_dir = os.path.join(_app_dir, 'handlers', 'Rankings', 'logs')
os.makedirs(logs_dir, exist_ok=True)

_log_file = os.path.join(logs_dir, 'app.log')
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(_log_file, encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
|
||||
|
||||
# 导入路由
|
||||
from routers.rank_api_routes import api
|
||||
|
||||
# 注册路由
|
||||
@app.route('/')
def index():
    """Return a JSON document describing the service, its endpoints and features."""
    from flask import jsonify

    # Endpoint path -> short human-readable description.
    endpoints = {
        "/api/videos": "获取视频列表 (支持分页和排序)",
        "/api/top": "获取热门视频榜单",
        "/api/search": "搜索视频",
        "/api/detail": "获取视频详情",
        "/api/stats": "获取统计信息",
        "/api/health": "健康检查",
    }
    features = [
        "分页支持",
        "多种排序方式",
        "搜索功能",
        "详情查看",
        "统计分析",
        "小程序优化",
    ]
    payload = {
        "name": "抖音播放量数据API服务",
        "version": "2.0",
        "description": "主程序服务 - 整合小程序API功能",
        "endpoints": endpoints,
        "features": features,
    }
    return jsonify(payload)
|
||||
|
||||
# 注册小程序API路由
|
||||
@app.route('/api/videos')
def get_videos():
    """Serve the video list; query parsing and data access live in ``api.get_videos``."""
    # Serialize explicitly, like index() does, instead of relying on Flask's
    # implicit dict-to-JSON conversion of view return values.
    from flask import jsonify
    return jsonify(api.get_videos())
|
||||
|
||||
@app.route('/api/top')
def get_top():
    """Serve the top-videos ranking produced by ``api.get_top``."""
    # Serialize explicitly, like index() does, instead of relying on Flask's
    # implicit dict-to-JSON conversion of view return values.
    from flask import jsonify
    return jsonify(api.get_top())
|
||||
|
||||
@app.route('/api/search')
def search():
    """Serve keyword search results produced by ``api.search``."""
    # Serialize explicitly, like index() does, instead of relying on Flask's
    # implicit dict-to-JSON conversion of view return values.
    from flask import jsonify
    return jsonify(api.search())
|
||||
|
||||
@app.route('/api/detail')
def get_detail():
    """Serve the detail record for one video, as produced by ``api.get_detail``."""
    # Serialize explicitly, like index() does, instead of relying on Flask's
    # implicit dict-to-JSON conversion of view return values.
    from flask import jsonify
    return jsonify(api.get_detail())
|
||||
|
||||
@app.route('/api/stats')
def get_stats():
    """Serve aggregate statistics produced by ``api.get_stats``."""
    # Serialize explicitly, like index() does, instead of relying on Flask's
    # implicit dict-to-JSON conversion of view return values.
    from flask import jsonify
    return jsonify(api.get_stats())
|
||||
|
||||
@app.route('/api/health')
def health_check():
    """Serve the service/database health report produced by ``api.health_check``."""
    # Serialize explicitly, like index() does, instead of relying on Flask's
    # implicit dict-to-JSON conversion of view return values.
    from flask import jsonify
    return jsonify(api.health_check())
|
||||
|
||||
if __name__ == '__main__':
    # Startup banner listing the available endpoints.
    print("启动主程序服务...")
    print("服务地址: http://localhost:5000")
    print("API接口列表:")
    print(" - GET / 显示API信息")
    print(" - GET /api/videos?page=1&limit=20&sort=playcount 获取视频列表(总播放量排序)")
    print(" - GET /api/videos?page=1&limit=20&sort=growth 获取视频列表(增长排序,默认昨天到今天的差值)")
    print(" - GET /api/videos?page=1&limit=20&sort=growth&start_date=2025-10-16&end_date=2025-10-17 获取视频列表(自定义日期范围增长排序)")
    print(" - GET /api/top?limit=10 获取热门榜单")
    print(" - GET /api/search?q=关键词&page=1&limit=10 搜索视频")
    print(" - GET /api/detail?id=视频ID 获取视频详情")
    print(" - GET /api/stats 获取统计信息")
    print(" - GET /api/health 健康检查")
    print("专为小程序优化:分页、搜索、详情、统计、增长排序、自定义日期范围")

    # The server listens on all interfaces, so the interactive debugger must
    # not be enabled unconditionally. Use the same rule as config.py: debug is
    # on only when APP_ENV is 'development' (still the default when unset).
    debug_enabled = os.getenv('APP_ENV', 'development') == 'development'
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)
|
||||
16
config.py
Normal file
16
config.py
Normal file
@ -0,0 +1,16 @@
|
||||
import os
|
||||
import importlib
|
||||
|
||||
# Database configuration.
# Each value can be overridden through the environment; the variable names
# match the ones the scraper reads (MONGO_HOST / MONGO_PORT / MONGO_DB).
# With nothing set, the values are the previous hard-coded defaults.
MONGO_HOST = os.getenv('MONGO_HOST', 'localhost')
MONGO_PORT = int(os.getenv('MONGO_PORT', '27017'))
MONGO_URI = os.getenv('MONGO_URI', f"mongodb://{MONGO_HOST}:{MONGO_PORT}")
MONGO_DB_NAME = os.getenv('MONGO_DB', "Rankings")

# Application configuration.
APP_ENV = os.getenv('APP_ENV', 'development')
DEBUG = APP_ENV == 'development'

# Logging configuration.
LOG_LEVEL = 'INFO'
LOG_DIR = 'logs'

print(f"Successfully loaded configuration for environment: {APP_ENV}")
|
||||
19
database.py
Normal file
19
database.py
Normal file
@ -0,0 +1,19 @@
|
||||
from pymongo import MongoClient
|
||||
import config
|
||||
# from mongo_listeners import all_event_listeners # 导入监听器(暂时注释掉,因为文件不存在)
|
||||
|
||||
MONGO_URI = config.MONGO_URI
DB_NAME = config.MONGO_DB_NAME

# Pre-bind the handles so that importing `client` / `db` from this module
# never fails with a NameError when MongoClient() itself raises
# (for example on a malformed URI).
client = None
db = None

# Create the MongoDB client connection.
try:
    # 5-second server-selection timeout so an unreachable server fails fast.
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    db = client[DB_NAME]
    # MongoClient connects lazily; ping forces a round-trip to verify the connection now.
    client.admin.command('ping')
    success_message = f"\033[92m成功连接到MongoDB: {DB_NAME}\033[0m"
    print(success_message)
except Exception as e:
    # Best-effort: report the failure without aborting the import, and include
    # the underlying cause instead of discarding it.
    error_message = f"\033[91m数据库连接失败: {MONGO_URI},请检查MongoDB服务是否已启动。\033[0m"
    print(error_message)
    print(f"\033[91m{type(e).__name__}: {e}\033[0m")
|
||||
289
docs/API接口文档.md
289
docs/API接口文档.md
@ -1,289 +0,0 @@
|
||||
# 小程序抖音播放量数据API文档
|
||||
|
||||
## 概述
|
||||
|
||||
抖音短剧播放量数据API,专为小程序优化,支持分页、搜索、增长分析等功能。
|
||||
|
||||
**服务地址**: `http://localhost:5001`
|
||||
**启动命令**: `cd backend && python miniprogram_api_server.py`
|
||||
|
||||
## 接口列表
|
||||
|
||||
### 1. 健康检查
|
||||
`GET /api/health`
|
||||
- 检查服务器和数据库状态
|
||||
|
||||
### 2. 视频列表
|
||||
`GET /api/videos`
|
||||
|
||||
**参数**:
|
||||
- `page`: 页码(默认1)
|
||||
- `limit`: 每页数量(默认20,最大50)
|
||||
- `sort`: 排序方式
|
||||
- `playcount`: 按总播放量排序
|
||||
- `growth`: 按增长排序
|
||||
- `time`: 按时间排序
|
||||
- `start_date`: 开始日期(增长排序用,格式:2025-10-16)
|
||||
- `end_date`: 结束日期(增长排序用,格式:2025-10-17)
|
||||
|
||||
**示例**:
|
||||
```
|
||||
# 总播放量排序
|
||||
/api/videos?page=1&limit=10&sort=playcount
|
||||
|
||||
# 增长排序(昨天到今天)
|
||||
/api/videos?page=1&limit=10&sort=growth
|
||||
|
||||
# 自定义日期增长排序
|
||||
/api/videos?page=1&limit=10&sort=growth&start_date=2025-10-16&end_date=2025-10-17
|
||||
```
|
||||
|
||||
### 3. 热门榜单
|
||||
`GET /api/top`
|
||||
- `limit`: 返回数量(默认10,最大50)
|
||||
|
||||
### 4. 搜索视频
|
||||
`GET /api/search`
|
||||
- `q`: 搜索关键词
|
||||
- `page`: 页码
|
||||
- `limit`: 每页数量
|
||||
|
||||
### 5. 视频详情
|
||||
`GET /api/detail`
|
||||
- `id`: 视频ID
|
||||
|
||||
## 数据字段说明
|
||||
|
||||
### 视频数据字段
|
||||
- `_id`: 视频唯一ID
|
||||
- `mix_name`: 短剧名称
|
||||
- `playcount`: 播放量文本(如"2.1亿")
|
||||
- `play_vv`: 播放量数值
|
||||
- `video_url`: 抖音合集链接
|
||||
- `rank`: 排名
|
||||
- `batch_time`: 数据采集时间
|
||||
- `aweme_ids`: 视频ID数组
|
||||
- `cover_image_url`: 封面图片
|
||||
- `cover_backup_urls`: 备用封面图片
|
||||
- `request_id`: 请求ID
|
||||
|
||||
### 增长排序特有字段
|
||||
- `growth`: 播放量增长值
|
||||
- `start_date`: 开始日期
|
||||
- `end_date`: 结束日期
|
||||
|
||||
## 响应格式
|
||||
|
||||
所有接口返回格式:
|
||||
```json
|
||||
{
|
||||
"success": true/false,
|
||||
"data": [...],
|
||||
"message": "错误信息(仅当success为false时)"
|
||||
}
|
||||
```
|
||||
|
||||
视频列表接口额外包含:
|
||||
- `pagination`: 分页信息
|
||||
- `sort_by`: 排序方式
|
||||
- `date_range`: 日期范围(仅增长排序)
|
||||
- `update_time`: 更新时间
|
||||
|
||||
## 📱 小程序集成示例
|
||||
|
||||
### 微信小程序示例
|
||||
|
||||
```javascript
|
||||
// 获取视频列表
|
||||
wx.request({
|
||||
url: 'http://localhost:5001/api/videos',
|
||||
data: {
|
||||
page: 1,
|
||||
limit: 10,
|
||||
sort: 'playcount'
|
||||
},
|
||||
success: function(res) {
|
||||
if (res.data.success) {
|
||||
const videos = res.data.data;
|
||||
videos.forEach(video => {
|
||||
console.log(`${video.mix_name}: ${video.playcount}`);
|
||||
console.log(`封面: ${video.cover_image_url}`);
|
||||
console.log(`链接: ${video.video_url}`);
|
||||
console.log(`视频数量: ${video.aweme_ids.length}`);
|
||||
});
|
||||
console.log('分页信息:', res.data.pagination);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// 搜索视频
|
||||
wx.request({
|
||||
url: 'http://localhost:5001/api/search',
|
||||
data: {
|
||||
q: '奶团',
|
||||
page: 1,
|
||||
limit: 5
|
||||
},
|
||||
success: function(res) {
|
||||
if (res.data.success) {
|
||||
const results = res.data.data;
|
||||
results.forEach(video => {
|
||||
console.log(`找到: ${video.mix_name}`);
|
||||
console.log(`播放量: ${video.playcount}`);
|
||||
console.log(`数值播放量: ${video.play_vv}`);
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// 获取热门榜单
|
||||
wx.request({
|
||||
url: 'http://localhost:5001/api/top',
|
||||
data: {
|
||||
limit: 10
|
||||
},
|
||||
success: function(res) {
|
||||
if (res.data.success) {
|
||||
console.log('热门榜单:', res.data.data);
|
||||
}
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
### uni-app示例
|
||||
|
||||
```javascript
|
||||
// 封装API请求
|
||||
const API_BASE = 'http://localhost:5001';
|
||||
|
||||
// 获取视频列表
|
||||
export function getVideoList(page = 1, limit = 20, sort = 'playcount') {
|
||||
return uni.request({
|
||||
url: `${API_BASE}/api/videos`,
|
||||
data: { page, limit, sort }
|
||||
});
|
||||
}
|
||||
|
||||
// 搜索视频
|
||||
export function searchVideos(keyword, page = 1, limit = 10) {
|
||||
return uni.request({
|
||||
url: `${API_BASE}/api/search`,
|
||||
data: { q: keyword, page, limit }
|
||||
});
|
||||
}
|
||||
|
||||
// 获取视频详情
|
||||
export function getVideoDetail(id) {
|
||||
return uni.request({
|
||||
url: `${API_BASE}/api/detail`,
|
||||
data: { id }
|
||||
});
|
||||
}
|
||||
|
||||
// 使用示例
|
||||
getVideoList(1, 10).then(([err, res]) => {
|
||||
if (!err && res.data.success) {
|
||||
console.log('视频列表:', res.data.data);
|
||||
}
|
||||
});
|
||||
```
|
||||
|
||||
## 🎯 数据字段说明
|
||||
|
||||
### 视频合集字段
|
||||
- `_id`: 合集唯一标识符(MongoDB ObjectId)
|
||||
- `mix_name`: 合集名称
|
||||
- `playcount`: 播放量文本(如"2.1亿")
|
||||
- `play_vv`: 播放量数值
|
||||
- `video_url`: 合集链接
|
||||
- `rank`: 排名
|
||||
- `batch_time`: 批次时间
|
||||
- `aweme_ids`: 视频ID数组
|
||||
- `cover_image_url`: 封面图片URL
|
||||
- `cover_backup_urls`: 备用封面图片URL数组
|
||||
- `request_id`: 请求ID
|
||||
|
||||
### 分页信息字段
|
||||
- `page`: 当前页码
|
||||
- `limit`: 每页数量
|
||||
- `total`: 总记录数
|
||||
- `pages`: 总页数
|
||||
- `has_next`: 是否有下一页
|
||||
- `has_prev`: 是否有上一页
|
||||
|
||||
## 🔧 技术特性
|
||||
|
||||
### 1. 小程序优化
|
||||
- **轻量级响应**: 精简数据结构,减少传输量
|
||||
- **分页支持**: 避免一次性加载大量数据
|
||||
- **搜索功能**: 支持关键词模糊搜索
|
||||
- **错误处理**: 统一的错误响应格式
|
||||
|
||||
### 2. 性能优化
|
||||
- **数据缓存**: MongoDB查询优化
|
||||
- **分页限制**: 防止过大的数据请求
|
||||
- **连接池**: 数据库连接复用
|
||||
- **日志记录**: 完整的请求日志
|
||||
|
||||
### 3. 安全特性
|
||||
- **参数验证**: 输入参数安全检查
|
||||
- **CORS支持**: 跨域请求支持
|
||||
- **错误隐藏**: 不暴露内部错误信息
|
||||
|
||||
## 📊 测试结果
|
||||
|
||||
最新测试结果(100%通过率):
|
||||
- ✅ API首页: 正常
|
||||
- ✅ 健康检查: 数据库连接正常,35条记录
|
||||
- ✅ 视频列表: 分页功能正常
|
||||
- ✅ 热门榜单: 排序功能正常
|
||||
- ✅ 搜索功能: 关键词搜索正常
|
||||
- ✅ 视频详情: 详情获取正常
|
||||
- ✅ 统计信息: 数据统计正常
|
||||
|
||||
## 🚀 部署建议
|
||||
|
||||
### 开发环境
|
||||
```bash
|
||||
# 启动API服务器
|
||||
python scripts/miniprogram_api_server.py
|
||||
|
||||
# 运行测试
|
||||
python scripts/test_miniprogram_api.py
|
||||
```
|
||||
|
||||
### 生产环境
|
||||
```bash
|
||||
# 使用Gunicorn部署
|
||||
pip install gunicorn
|
||||
gunicorn -w 4 -b 0.0.0.0:5001 scripts.miniprogram_api_server:app
|
||||
|
||||
# 使用Nginx反向代理
|
||||
# 配置SSL证书支持HTTPS
|
||||
```
|
||||
|
||||
## 📝 更新日志
|
||||
|
||||
### v2.0 (2025-10-16)
|
||||
- 🎉 全新的小程序优化API
|
||||
- ✨ 添加分页和搜索功能
|
||||
- 🔧 优化数据结构和响应格式
|
||||
- 📊 增加统计信息接口
|
||||
- 🧪 完整的测试覆盖
|
||||
|
||||
### 与v1.0的主要区别
|
||||
- **更好的分页**: 支持灵活的分页参数
|
||||
- **搜索功能**: 关键词模糊搜索
|
||||
- **详情接口**: 单独的视频详情查看
|
||||
- **统计分析**: 数据统计和分类
|
||||
- **小程序优化**: 专为小程序设计的数据格式
|
||||
|
||||
## 🤝 技术支持
|
||||
|
||||
如有问题,请检查:
|
||||
1. MongoDB服务是否正常运行
|
||||
2. API服务器是否启动成功
|
||||
3. 网络连接是否正常
|
||||
4. 参数格式是否正确
|
||||
|
||||
测试工具会自动生成详细的测试报告,保存在 `api_test_report.json` 文件中。
|
||||
224
docs/README.md
224
docs/README.md
@ -1,224 +0,0 @@
|
||||
# 抖音合集播放量数据抓取系统
|
||||
|
||||
这是一个完整的抖音合集播放量数据抓取和分析系统,包含自动化抓取、定时任务、数据存储和API服务。
|
||||
|
||||
## 🎯 系统概述
|
||||
|
||||
本系统通过Selenium + Chrome DevTools Protocol技术,自动化抓取抖音收藏合集的真实播放量数据,并提供以下功能:
|
||||
|
||||
- **自动化数据抓取**: 每天定时自动抓取抖音收藏合集的播放量数据
|
||||
- **持久化存储**: 数据自动保存到MongoDB数据库
|
||||
- **RESTful API**: 为小程序提供数据接口服务
|
||||
- **多维度分析**: 支持总播放量、增长排序、搜索等功能
|
||||
|
||||
## 📁 项目结构
|
||||
|
||||
```
|
||||
rank_backend/
|
||||
├── scripts/ # 核心脚本目录
|
||||
│ ├── douyin_selenium_cdp_play_vv.py # 主抓取脚本
|
||||
│ ├── douyin_auto_scheduler.py # 定时任务调度器
|
||||
│ ├── miniprogram_api_server.py # 小程序API服务器
|
||||
│ ├── query_mongodb_data.py # 数据库查询工具
|
||||
│ ├── view_latest_data.py # 最新数据查看工具
|
||||
│ ├── check_mongodb.py # MongoDB连接检查
|
||||
│ └── mongodb_quick_view.py # 数据库快速查看
|
||||
├── docs/ # 文档目录
|
||||
│ ├── README.md # 项目说明文档
|
||||
│ └── API接口文档.md # API接口文档
|
||||
├── config/ # 配置文件目录
|
||||
├── drivers/ # Chrome驱动目录
|
||||
├── data/ # 数据文件目录
|
||||
├── logs/ # 日志文件目录
|
||||
├── requirements.txt # Python依赖包
|
||||
├── CHROME_PROFILE_SETUP.md # Chrome配置设置指南
|
||||
└── .gitignore # Git忽略文件
|
||||
```
|
||||
|
||||
## 🚀 快速开始
|
||||
|
||||
### 1. 环境准备
|
||||
|
||||
#### 安装Python依赖
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
#### 安装MongoDB
|
||||
- 下载并安装MongoDB Community Server
|
||||
- 启动MongoDB服务(默认端口27017)
|
||||
|
||||
#### Chrome浏览器
|
||||
- 确保已安装Chrome浏览器
|
||||
- 系统会自动检测并使用合适的ChromeDriver
|
||||
|
||||
### 2. Chrome配置文件
|
||||
```bash
|
||||
# 首次运行需要手动登录
|
||||
python scripts/douyin_selenium_cdp_play_vv.py
|
||||
自动创建
|
||||
```
|
||||
### 3. ChromeDriver说明
|
||||
|
||||
**ChromeDriver已包含在仓库中** (`drivers/chromedriver.exe`),无需额外下载。系统会自动检测并使用该驱动。
|
||||
|
||||
如果遇到驱动版本不匹配问题,可以:
|
||||
1. 删除 `drivers/chromedriver.exe`
|
||||
2. 重新运行脚本,系统会自动下载合适版本的ChromeDriver
|
||||
|
||||
### 3. 运行系统
|
||||
|
||||
#### 方式一:手动运行抓取脚本
|
||||
```bash
|
||||
python scripts/douyin_selenium_cdp_play_vv.py --auto --duration 60
|
||||
```
|
||||
|
||||
#### 方式二:启动定时任务(推荐)
|
||||
```bash
|
||||
python scripts/douyin_auto_scheduler.py
|
||||
```
|
||||
|
||||
#### 方式三:启动API服务器
|
||||
```bash
|
||||
python scripts/miniprogram_api_server.py
|
||||
```
|
||||
|
||||
## ⚙️ 核心功能
|
||||
|
||||
### 1. 数据抓取模块
|
||||
- **技术栈**: Selenium + Chrome DevTools Protocol
|
||||
- **数据源**: 抖音收藏合集页面
|
||||
- **提取字段**:
|
||||
- 合集名称 (mix_name)
|
||||
- 真实播放量 (play_vv)
|
||||
- 合集链接 (video_url)
|
||||
- 合集ID (mix_id)
|
||||
- 视频ID列表 (aweme_ids)
|
||||
- 封面图片 (cover_image_url)
|
||||
|
||||
### 2. 定时任务模块
|
||||
- **执行时间**: 每天上午9:35自动执行
|
||||
- **日志记录**: 完整的执行日志
|
||||
- **错误处理**: 自动重试和异常处理
|
||||
|
||||
### 3. API服务模块
|
||||
- **端口**: 5001
|
||||
- **跨域支持**: 支持小程序调用
|
||||
- **接口功能**:
|
||||
- 视频列表查询(分页、排序)
|
||||
- 热门榜单
|
||||
- 搜索功能
|
||||
- 视频详情
|
||||
- 统计分析
|
||||
|
||||
## 📊 API接口
|
||||
|
||||
### 基础接口
|
||||
|
||||
| 接口 | 方法 | 描述 | 参数 |
|
||||
|------|------|------|------|
|
||||
| `/api/videos` | GET | 获取视频列表 | `page`, `limit`, `sort` |
|
||||
| `/api/top` | GET | 热门榜单 | `limit` |
|
||||
| `/api/search` | GET | 搜索视频 | `q`, `page`, `limit` |
|
||||
| `/api/detail` | GET | 视频详情 | `id` |
|
||||
| `/api/stats` | GET | 统计信息 | - |
|
||||
| `/api/health` | GET | 健康检查 | - |
|
||||
|
||||
### 排序方式
|
||||
- `playcount`: 按总播放量排序(默认)
|
||||
- `growth`: 按增长量排序
|
||||
- `time`: 按时间排序
|
||||
|
||||
### 增长排序参数
|
||||
```
|
||||
/api/videos?sort=growth&start_date=2025-10-16&end_date=2025-10-17
|
||||
```
|
||||
|
||||
## 🔧 配置说明
|
||||
|
||||
### 环境变量
|
||||
```bash
|
||||
# MongoDB配置
|
||||
MONGO_HOST=localhost
|
||||
MONGO_PORT=27017
|
||||
MONGO_DB=douyin_data
|
||||
MONGO_COLLECTION=play_vv_records
|
||||
|
||||
# ChromeDriver配置
|
||||
OVERRIDE_CHROMEDRIVER=/path/to/chromedriver
|
||||
|
||||
# 自动模式
|
||||
AUTO_CONTINUE=1
|
||||
```
|
||||
|
||||
### 定时任务配置
|
||||
在 `scripts/douyin_auto_scheduler.py` 中修改执行时间:
|
||||
```python
|
||||
schedule.every().day.at("09:35").do(self.run_douyin_scraper)
|
||||
```
|
||||
|
||||
## 📈 数据格式
|
||||
|
||||
### MongoDB文档结构
|
||||
```json
|
||||
{
|
||||
"_id": ObjectId,
|
||||
"batch_time": "2025-10-17T09:35:10",
|
||||
"mix_name": "合集名称",
|
||||
"video_url": "https://www.douyin.com/collection/xxx",
|
||||
"playcount": "1.2亿",
|
||||
"play_vv": 120000000,
|
||||
"request_id": "请求ID",
|
||||
"rank": 1,
|
||||
"aweme_ids": ["视频ID1", "视频ID2"],
|
||||
"cover_image_url": "封面图片URL",
|
||||
"cover_backup_urls": ["备用图片URL"]
|
||||
}
|
||||
```
|
||||
|
||||
## 🛠️ 工具脚本
|
||||
|
||||
### 数据库查询
|
||||
```bash
|
||||
python scripts/query_mongodb_data.py
|
||||
```
|
||||
|
||||
### 查看最新数据
|
||||
```bash
|
||||
python scripts/view_latest_data.py
|
||||
```
|
||||
|
||||
### 检查MongoDB连接
|
||||
```bash
|
||||
python scripts/check_mongodb.py
|
||||
```
|
||||
|
||||
## ⚠️ 注意事项
|
||||
|
||||
### 法律合规
|
||||
- 请确保使用符合抖音服务条款和相关法律法规
|
||||
- 数据仅供学习和研究使用,请勿用于商业用途
|
||||
- 避免过于频繁的请求,以免触发反爬虫机制
|
||||
|
||||
### 技术限制
|
||||
- Chrome配置文件需要手动设置登录状态
|
||||
- 抖音页面结构变化可能导致抓取失败
|
||||
- 需要稳定的网络环境
|
||||
|
||||
### 故障排除
|
||||
1. **ChromeDriver问题**: 确保Chrome浏览器版本与ChromeDriver匹配
|
||||
2. **登录状态丢失**: 重新运行手动登录流程
|
||||
3. **MongoDB连接失败**: 检查MongoDB服务是否启动
|
||||
|
||||
## 📝 版本历史
|
||||
|
||||
- **v2.0**: 新增小程序API服务、增长排序功能
|
||||
- **v1.0**: 基础抓取功能和定时任务
|
||||
|
||||
## 🤝 贡献
|
||||
|
||||
欢迎提交Issue和Pull Request来改进这个项目。
|
||||
|
||||
## 📄 许可证
|
||||
|
||||
本项目仅供学习和研究使用。使用者需要遵守相关法律法规和平台服务条款,作者不承担任何法律责任。
|
||||
102
handlers/Rankings/docs/README.md
Normal file
102
handlers/Rankings/docs/README.md
Normal file
@ -0,0 +1,102 @@
|
||||
# 排名系统(Rankings)说明大纲
|
||||
|
||||
## 1. 项目概览
|
||||
- 提供抖音收藏合集真实播放量数据采集与API服务
|
||||
- 抓取脚本写入 MongoDB;API 按播放量与增长榜返回数据
|
||||
|
||||
## 2. 目录速览(关键)
|
||||
- `handlers/Rankings/rank_data_scraper.py` 数据抓取脚本(Selenium+CDP)
|
||||
- `routers/rank_api_routes.py` 小程序 API 数据访问/逻辑模块(由 `app.py` 调用,不独立运行)
|
||||
- `app.py` 主服务入口(Flask应用,注册所有 API 路由)
|
||||
- `Timer_worker.py` 定时任务,每日自动运行抓取
|
||||
|
||||
### 项目结构(简版)
|
||||
```
|
||||
项目根/
|
||||
├── app.py # 主服务入口(5000)
|
||||
├── Timer_worker.py # 定时抓取任务
|
||||
├── config.py # 全局配置
|
||||
├── database.py # 数据库封装
|
||||
├── routers/
|
||||
│ └── rank_api_routes.py # 小程序API逻辑模块
|
||||
├── handlers/
|
||||
│ └── Rankings/
|
||||
│ ├── rank_data_scraper.py # 抓取脚本(Selenium+CDP)
|
||||
│ ├── config/
|
||||
│ │ └── chrome_profile/
|
||||
│ │ └── douyin_persistent/ # 持久化Chrome用户目录(登录态)
|
||||
│ ├── data/ # 数据导出/缓存(可选)
|
||||
│ ├── docs/ # 使用说明与文档
|
||||
│ ├── drivers/ # 浏览器驱动等
|
||||
│ └── logs/ # 运行日志
|
||||
└── 项目启动说明.md
|
||||
```
|
||||
- 核心数据表:`Rankings/Rankings_list`
|
||||
- 日志示例:`handlers/Rankings/logs/douyin_scraper.log`
|
||||
|
||||
## 3. 服务与端口
|
||||
- 单一服务:`app.py`(默认端口 `5000`,包含小程序 API 路由)
|
||||
|
||||
## 4. 一键启动
|
||||
- 启动主服务:
|
||||
```bash
|
||||
python app.py
|
||||
```
|
||||
- 启动定时任务(每日 9:35 自动抓取):
|
||||
```bash
|
||||
python Timer_worker.py
|
||||
```
|
||||
|
||||
## 5. 使用步骤(首次登录与日常)
|
||||
- 安装依赖:
|
||||
```bash
|
||||
pip install -r handlers/Rankings/docs/requirements.txt
|
||||
```
|
||||
- 第一次使用(登录抖音):
|
||||
- 运行抓取脚本:`python handlers/Rankings/rank_data_scraper.py`
|
||||
- 弹出 Chrome 后,完成抖音登录(扫码/账号均可)。
|
||||
- 登录完成后,回到终端提示界面按回车继续抓取。
|
||||
- 后续运行会复用已登录的浏览器配置,免重复登录。
|
||||
|
||||
- 日常流程:
|
||||
- 抓取:`python handlers/Rankings/rank_data_scraper.py`
|
||||
- 服务:`python app.py`(端口 `5000`)
|
||||
- 定时:`python Timer_worker.py`(每日 9:35 自动执行)
|
||||
|
||||
- 验证数据:
|
||||
- MongoDB:数据库 `Rankings`,集合 `Rankings_list`
|
||||
- API 检查:
|
||||
- `http://localhost:5000/api/health`
|
||||
- `http://localhost:5000/api/videos?page=1&limit=20&sort=playcount`
|
||||
- 增长榜:`http://localhost:5000/api/videos?sort=growth&page=1&limit=20`
|
||||
|
||||
## 6. 数据抓取流程(简版)
|
||||
- 复用已登录的 Chrome 配置,滚动/刷新触发请求
|
||||
- 通过 CDP 捕获响应,解析 `play_vv` 与 SSR 数据
|
||||
- 按合集聚合视频,写入 MongoDB 指定集合
|
||||
|
||||
## 7. 数据库与集合
|
||||
- 数据库:`Rankings`
|
||||
- 集合:`Rankings_list`
|
||||
- 连接:`mongodb://localhost:27017/`(可通过环境变量覆盖)
|
||||
|
||||
## 8. API 功能摘要
|
||||
- 视频列表(分页、按播放量/时间排序,仅当日最新数据)
|
||||
- 增长榜(按指定日期区间对比增长量,分页返回)
|
||||
|
||||
## 9. 配置项(环境变量)
|
||||
- `MONGO_HOST` 默认 `localhost`
|
||||
- `MONGO_PORT` 默认 `27017`
|
||||
- `MONGO_DB` 默认 `Rankings`
|
||||
- `MONGO_COLLECTION` 默认 `Rankings_list`
|
||||
|
||||
## 10. 快速排错
|
||||
- MongoDB 连接失败:抓取脚本将仅保存本地文件日志
|
||||
- Chrome 配置:`handlers/Rankings/config/chrome_profile/`
|
||||
- 日志位置:`handlers/Rankings/logs/`
|
||||
|
||||
## 11. 你需要知道的
|
||||
- 当前架构下没有独立的 `5001` 端口;`routers/rank_api_routes.py` 提供逻辑模块,由 `app.py` 注册路由并统一对外服务(`5000`)。
|
||||
- 抓取脚本与 API 使用同一集合,数据结构一致
|
||||
- 小程序 API 专注返回易用字段(封面、播放量、时间、链接)
|
||||
- 可直接在现有数据上新增排序或过滤,保持接口向后兼容
|
||||
@ -39,8 +39,7 @@ from pymongo.errors import ConnectionFailure
|
||||
# 确保logs目录存在
|
||||
import os
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
project_root = os.path.dirname(script_dir)
|
||||
logs_dir = os.path.join(project_root, 'logs')
|
||||
logs_dir = os.path.join(script_dir, 'logs')
|
||||
os.makedirs(logs_dir, exist_ok=True)
|
||||
|
||||
logging.basicConfig(
|
||||
@ -75,8 +74,8 @@ class DouyinPlayVVScraper:
|
||||
# MongoDB连接配置
|
||||
mongo_host = os.environ.get('MONGO_HOST', 'localhost')
|
||||
mongo_port = int(os.environ.get('MONGO_PORT', 27017))
|
||||
mongo_db = os.environ.get('MONGO_DB', 'douyin_data')
|
||||
mongo_collection = os.environ.get('MONGO_COLLECTION', 'play_vv_records')
|
||||
mongo_db = os.environ.get('MONGO_DB', 'Rankings')
|
||||
mongo_collection = os.environ.get('MONGO_COLLECTION', 'Rankings_list')
|
||||
|
||||
# 创建MongoDB连接
|
||||
self.mongo_client = MongoClient(mongo_host, mongo_port, serverSelectionTimeoutMS=5000)
|
||||
@ -105,7 +104,8 @@ class DouyinPlayVVScraper:
|
||||
def _cleanup_old_profiles(self):
|
||||
"""清理超过一天的旧临时Chrome配置文件"""
|
||||
try:
|
||||
profile_base_dir = os.path.abspath(os.path.join('.', 'config', 'chrome_profile'))
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
profile_base_dir = os.path.join(script_dir, 'config', 'chrome_profile')
|
||||
if not os.path.exists(profile_base_dir):
|
||||
return
|
||||
|
||||
@ -135,7 +135,8 @@ class DouyinPlayVVScraper:
|
||||
import psutil
|
||||
|
||||
# 获取当前配置文件路径
|
||||
profile_dir = os.path.abspath(os.path.join('.', 'config', 'chrome_profile', 'douyin_persistent'))
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
profile_dir = os.path.join(script_dir, 'config', 'chrome_profile', 'douyin_persistent')
|
||||
|
||||
# 查找使用该配置文件的Chrome进程
|
||||
killed_processes = []
|
||||
@ -190,7 +191,8 @@ class DouyinPlayVVScraper:
|
||||
chrome_options.add_argument('--start-maximized')
|
||||
chrome_options.add_argument('--lang=zh-CN')
|
||||
# 使用固定的Chrome配置文件目录以保持登录状态
|
||||
profile_dir = os.path.abspath(os.path.join('.', 'config', 'chrome_profile', 'douyin_persistent'))
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
profile_dir = os.path.join(script_dir, 'config', 'chrome_profile', 'douyin_persistent')
|
||||
os.makedirs(profile_dir, exist_ok=True)
|
||||
chrome_options.add_argument(f'--user-data-dir={profile_dir}')
|
||||
logging.info(f'使用持久化Chrome配置文件: {profile_dir}')
|
||||
@ -931,8 +933,14 @@ class DouyinPlayVVScraper:
|
||||
|
||||
def save_results(self):
|
||||
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
|
||||
json_file = f'douyin_cdp_play_vv_{ts}.json'
|
||||
txt_file = f'douyin_cdp_play_vv_{ts}.txt'
|
||||
|
||||
# 创建data文件夹
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
data_dir = os.path.join(script_dir, 'data')
|
||||
os.makedirs(data_dir, exist_ok=True)
|
||||
|
||||
json_file = os.path.join(data_dir, f'douyin_cdp_play_vv_{ts}.json')
|
||||
txt_file = os.path.join(data_dir, f'douyin_cdp_play_vv_{ts}.txt')
|
||||
|
||||
# 保存到JSON文件
|
||||
with open(json_file, 'w', encoding='utf-8') as f:
|
||||
@ -5,34 +5,11 @@
|
||||
优化的数据格式和接口设计,专为小程序使用
|
||||
"""
|
||||
|
||||
from flask import Flask, jsonify, request
|
||||
from flask_cors import CORS
|
||||
from pymongo import MongoClient
|
||||
from datetime import datetime, timedelta
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
|
||||
# 配置日志
|
||||
# 确保logs目录存在
|
||||
import os
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
project_root = os.path.dirname(script_dir)
|
||||
logs_dir = os.path.join(project_root, 'logs')
|
||||
os.makedirs(logs_dir, exist_ok=True)
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.FileHandler(os.path.join(logs_dir, 'miniprogram_api.log'), encoding='utf-8'),
|
||||
logging.StreamHandler()
|
||||
]
|
||||
)
|
||||
|
||||
app = Flask(__name__)
|
||||
CORS(app) # 允许跨域访问,支持小程序调用
|
||||
|
||||
class MiniprogramAPI:
|
||||
def __init__(self):
|
||||
self.client = None
|
||||
@ -47,8 +24,8 @@ class MiniprogramAPI:
|
||||
# 测试连接
|
||||
self.client.admin.command('ping')
|
||||
# 使用数据库与集合
|
||||
self.db = self.client['douyin_data']
|
||||
self.collection = self.db['play_vv_records']
|
||||
self.db = self.client['Rankings']
|
||||
self.collection = self.db['Rankings_list']
|
||||
logging.info("MongoDB连接成功")
|
||||
return True
|
||||
except Exception as e:
|
||||
@ -465,136 +442,70 @@ class MiniprogramAPI:
|
||||
logging.error(f"获取统计信息失败: {e}")
|
||||
return {"success": False, "message": f"获取统计失败: {str(e)}"}
|
||||
|
||||
def get_videos(self):
    """Return the video list for the current request (adapter called by app.py).

    Query parameters read from ``flask.request``:
        page: 1-based page number, default 1, values below 1 are raised to 1.
        limit: page size, default 20, clamped to 1..50.
        sort: ``'growth'`` selects the growth ranking; any other value is
            passed to ``get_video_list`` (default ``'playcount'``).
        start_date / end_date: optional, only used for the growth ranking.

    Returns:
        Whatever ``get_growth_videos`` or ``get_video_list`` returns.
    """
    from flask import request

    # type=int falls back to the default on non-numeric input instead of
    # raising ValueError (which would surface as an HTTP 500).
    page = max(1, request.args.get('page', 1, type=int))
    # Cap the page size so one request cannot pull an unbounded result set.
    limit = min(50, max(1, request.args.get('limit', 20, type=int)))
    sort_by = request.args.get('sort', 'playcount')

    if sort_by == 'growth':
        start_date = request.args.get('start_date')
        end_date = request.args.get('end_date')
        return self.get_growth_videos(page, limit, start_date, end_date)
    return self.get_video_list(page, limit, sort_by)
|
||||
|
||||
def get_top(self):
    """Return the top-videos ranking for the current request (adapter called by app.py).

    Query parameters read from ``flask.request``:
        limit: number of entries, default 10, clamped to 1..50.

    Returns:
        Whatever ``get_top_videos`` returns.
    """
    from flask import request

    # type=int falls back to the default on non-numeric input instead of
    # raising ValueError; the clamp bounds the result size.
    limit = min(50, max(1, request.args.get('limit', 10, type=int)))
    return self.get_top_videos(limit)
|
||||
|
||||
def search(self):
    """Run a keyword search for the current request (adapter called by app.py).

    Query parameters read from ``flask.request``:
        q: search keyword; surrounding whitespace is stripped.
        page: 1-based page number, default 1, values below 1 are raised to 1.
        limit: page size, default 10, clamped to 1..30.

    Returns:
        Whatever ``search_videos`` returns.
    """
    from flask import request

    keyword = request.args.get('q', '').strip()
    # type=int falls back to the default on non-numeric input instead of
    # raising ValueError (which would surface as an HTTP 500).
    page = max(1, request.args.get('page', 1, type=int))
    # Search pages are capped lower than list pages (30 instead of 50).
    limit = min(30, max(1, request.args.get('limit', 10, type=int)))
    return self.search_videos(keyword, page, limit)
|
||||
|
||||
def get_detail(self):
    """Return the detail record for one video (adapter called by app.py).

    Query parameters read from ``flask.request``:
        id: video identifier; surrounding whitespace is stripped.

    Returns:
        A failure dict when no id is supplied, otherwise whatever
        ``get_video_detail`` returns.
    """
    from flask import request

    video_id = request.args.get('id', '').strip()
    # Reject a missing/blank id up front rather than passing it to the lookup.
    if not video_id:
        return {"success": False, "message": "请提供视频ID"}
    return self.get_video_detail(video_id)
|
||||
|
||||
def get_stats(self):
    """Return aggregate statistics (adapter called by app.py); delegates to ``get_statistics``."""
    statistics = self.get_statistics()
    return statistics
|
||||
|
||||
def health_check(self):
    """Report service and database health (adapter called by app.py).

    Returns:
        ``{"success": True, "message": ..., "data": {...}}`` with the total
        record count and a timestamp when the database answers a ping;
        ``{"success": False, "message": ...}`` when no client is set or any
        step raises.
    """
    try:
        # Compare with None explicitly rather than relying on the truthiness
        # of the driver's client object.
        if self.client is None:
            return {"success": False, "message": "数据库未连接"}

        # Round-trip to the server to prove the connection is alive.
        self.client.admin.command('ping')

        # Total number of documents in the collection.
        total_count = self.collection.count_documents({})

        return {
            "success": True,
            "message": "服务正常",
            "data": {
                "database": "连接正常",
                "total_records": total_count,
                "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }
        }
    except Exception as e:
        # Health checks must never raise: log the cause and report it as a failure.
        logging.error(f"健康检查失败: {e}")
        return {"success": False, "message": f"服务异常: {str(e)}"}
|
||||
|
||||
# 创建API实例
|
||||
api = MiniprogramAPI()
|
||||
|
||||
# API路由定义
|
||||
@app.route('/')
|
||||
def index():
|
||||
"""API首页"""
|
||||
return jsonify({
|
||||
"name": "小程序抖音播放量数据API",
|
||||
"version": "2.0",
|
||||
"description": "专为小程序优化的抖音播放量数据接口",
|
||||
"endpoints": {
|
||||
"/api/videos": "获取视频列表 (支持分页和排序)",
|
||||
"/api/top": "获取热门视频榜单",
|
||||
"/api/search": "搜索视频",
|
||||
"/api/detail": "获取视频详情",
|
||||
"/api/stats": "获取统计信息",
|
||||
"/api/health": "健康检查"
|
||||
},
|
||||
"features": [
|
||||
"分页支持",
|
||||
"多种排序方式",
|
||||
"搜索功能",
|
||||
"详情查看",
|
||||
"统计分析",
|
||||
"小程序优化"
|
||||
]
|
||||
})
|
||||
|
||||
@app.route('/api/videos')
|
||||
def get_videos():
|
||||
"""获取视频列表"""
|
||||
page = request.args.get('page', 1, type=int)
|
||||
limit = request.args.get('limit', 20, type=int)
|
||||
sort_by = request.args.get('sort', 'playcount') # playcount, time, 或 growth
|
||||
start_date = request.args.get('start_date', None)
|
||||
end_date = request.args.get('end_date', None)
|
||||
|
||||
# 限制参数范围
|
||||
page = max(1, page)
|
||||
limit = min(50, max(1, limit)) # 限制每页最多50条
|
||||
|
||||
if sort_by == "growth":
|
||||
# 增长排序需要特殊处理,支持日期参数
|
||||
result = api.get_growth_videos(page, limit, start_date, end_date)
|
||||
else:
|
||||
result = api.get_video_list(page, limit, sort_by)
|
||||
|
||||
return jsonify(result)
|
||||
|
||||
@app.route('/api/top')
|
||||
def get_top():
|
||||
"""获取热门视频榜单"""
|
||||
limit = request.args.get('limit', 10, type=int)
|
||||
limit = min(50, max(1, limit)) # 限制最多50条
|
||||
|
||||
result = api.get_top_videos(limit)
|
||||
return jsonify(result)
|
||||
|
||||
@app.route('/api/search')
|
||||
def search():
|
||||
"""搜索视频"""
|
||||
keyword = request.args.get('q', '').strip()
|
||||
page = request.args.get('page', 1, type=int)
|
||||
limit = request.args.get('limit', 10, type=int)
|
||||
|
||||
# 限制参数范围
|
||||
page = max(1, page)
|
||||
limit = min(30, max(1, limit)) # 搜索结果限制每页最多30条
|
||||
|
||||
result = api.search_videos(keyword, page, limit)
|
||||
return jsonify(result)
|
||||
|
||||
@app.route('/api/detail')
|
||||
def get_detail():
|
||||
"""获取视频详情"""
|
||||
video_id = request.args.get('id', '').strip()
|
||||
|
||||
if not video_id:
|
||||
return jsonify({"success": False, "message": "请提供视频ID"})
|
||||
|
||||
result = api.get_video_detail(video_id)
|
||||
return jsonify(result)
|
||||
|
||||
@app.route('/api/stats')
|
||||
def get_stats():
|
||||
"""获取统计信息"""
|
||||
result = api.get_statistics()
|
||||
return jsonify(result)
|
||||
|
||||
@app.route('/api/health')
|
||||
def health_check():
|
||||
"""健康检查"""
|
||||
try:
|
||||
# 检查MongoDB连接
|
||||
api.client.admin.command('ping')
|
||||
|
||||
# 获取基本信息
|
||||
total_count = api.collection.count_documents({})
|
||||
|
||||
return jsonify({
|
||||
"success": True,
|
||||
"status": "healthy",
|
||||
"mongodb": "connected",
|
||||
"total_records": total_count,
|
||||
"server_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"api_version": "2.0"
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
"success": False,
|
||||
"status": "unhealthy",
|
||||
"mongodb": "disconnected",
|
||||
"error": str(e),
|
||||
"server_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
})
|
||||
|
||||
if __name__ == '__main__':
|
||||
print("启动小程序专用抖音播放量API服务器...")
|
||||
print("API地址: http://localhost:5001")
|
||||
print("小程序API接口列表:")
|
||||
print(" - GET /api/videos?page=1&limit=20&sort=playcount 获取视频列表(总播放量排序)")
|
||||
print(" - GET /api/videos?page=1&limit=20&sort=growth 获取视频列表(增长排序,默认昨天到今天的差值)")
|
||||
print(" - GET /api/videos?page=1&limit=20&sort=growth&start_date=2025-10-16&end_date=2025-10-17 获取视频列表(自定义日期范围增长排序)")
|
||||
print(" - GET /api/top?limit=10 获取热门榜单")
|
||||
print(" - GET /api/search?q=关键词&page=1&limit=10 搜索视频")
|
||||
print(" - GET /api/detail?id=视频ID 获取视频详情")
|
||||
print(" - GET /api/stats 获取统计信息")
|
||||
print(" - GET /api/health 健康检查")
|
||||
print("专为小程序优化:分页、搜索、详情、统计、增长排序、自定义日期范围")
|
||||
|
||||
app.run(host='0.0.0.0', port=5001, debug=True)
|
||||
@ -1,65 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
检查MongoDB数据保存状态
|
||||
"""
|
||||
|
||||
from pymongo import MongoClient
|
||||
from datetime import datetime
|
||||
import sys
|
||||
|
||||
def check_mongodb():
    """Check the MongoDB connection and print a summary of the stored data.

    Connects to ``localhost:27017``, pings the server, then reports on the
    ``douyin_data.play_vv_records`` collection: total document count, the
    number of documents whose ``batch_time`` is today, the five newest
    documents and the field types of one sample document.

    Returns:
        bool: ``True`` when every step ran without raising, ``False`` when
        any exception occurred (the error is printed, not re-raised).
    """
    client = None
    try:
        # Same connection parameters as the main scraper script.
        client = MongoClient('localhost', 27017, serverSelectionTimeoutMS=5000)

        # The ping raises if no server answers within the timeout.
        client.admin.command('ping')
        print("MongoDB连接成功")

        db = client['douyin_data']
        collection = db['play_vv_records']

        total_count = collection.count_documents({})
        print(f"总记录数: {total_count}")

        # Documents saved since local midnight.
        today_start = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        today_count = collection.count_documents({'batch_time': {'$gte': today_start}})
        print(f"今天的数据记录数: {today_count}")

        # Five newest documents, newest first.
        print("\n最新5条记录(按时间倒序排列):")
        print("-" * 60)
        for doc in collection.find().sort('batch_time', -1).limit(5):
            print(f"合集名称: {doc.get('mix_name', '未知')}")
            print(f"播放量: {doc.get('play_vv', 0):,} ({doc.get('playcount', '')})")
            print(f"合集链接: {doc.get('video_url', '')}")
            print(f"保存时间: {doc.get('batch_time', '')}")
            print(f"视频ID数: {len(doc.get('aweme_ids', []))}")
            print(f"封面图片: {'有' if doc.get('cover_image_url') else '无'}")
            print("-" * 60)

        # Field names and Python types of one sample document.
        if total_count > 0:
            sample = collection.find_one()
            print("\n文档字段结构:")
            for key, value in sample.items():
                print(f" - {key}: {type(value).__name__}")

    except Exception as e:
        print(f"检查MongoDB时出错: {e}")
        return False
    finally:
        # Always release the connection pool, on success and on failure.
        if client is not None:
            client.close()

    return True
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the check and exit with status 1 on failure.
    print("=== MongoDB数据检查 ===")
    if not check_mongodb():
        print("\n检查失败")
        sys.exit(1)
    print("\n检查完成")
|
||||
@ -1,294 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
MongoDB数据库快速查看工具
|
||||
一次性显示数据库结构、统计信息和最新数据
|
||||
"""
|
||||
|
||||
import pymongo
|
||||
from pymongo import MongoClient
|
||||
from datetime import datetime
|
||||
import json
|
||||
from collections import defaultdict
|
||||
|
||||
def connect_mongodb(connection_string='mongodb://localhost:27017/'):
    """Open a MongoDB connection and verify it with a ping.

    Args:
        connection_string: MongoDB URI to connect to.

    Returns:
        The connected ``MongoClient``, or ``None`` when the connection or
        the ping fails (the error is printed, not raised).
    """
    client = None
    try:
        client = MongoClient(connection_string, serverSelectionTimeoutMS=5000)
        # The ping raises if no server answers within the 5 s timeout.
        client.admin.command('ping')
        print(f"✅ 成功连接到MongoDB: {connection_string}")
        return client
    except Exception as e:
        print(f"❌ 连接MongoDB失败: {e}")
        # Do not leak a half-initialised client when the ping failed.
        if client is not None:
            client.close()
        return None
|
||||
|
||||
def analyze_document_schema(document):
    """Describe the fields of one document.

    Args:
        document: Mapping of field name to value; a falsy value (``None``
            or an empty dict) yields an empty schema.

    Returns:
        dict: One entry per field.  Each entry has a ``'type'`` label and an
        ``'example'``; arrays also carry ``'length'`` and objects carry
        ``'keys'`` (at most the first five key names).
    """
    if not document:
        return {}

    schema = {}
    for key, value in document.items():
        if key == '_id':
            schema[key] = {'type': 'ObjectId', 'example': str(value)}
        elif isinstance(value, str):
            # Long strings are cut to 50 characters plus an ellipsis marker.
            schema[key] = {'type': 'string', 'example': value[:50] + '...' if len(value) > 50 else value}
        elif isinstance(value, bool):
            # bool is a subclass of int, so it must be tested before int;
            # otherwise booleans are reported as 'integer'.
            schema[key] = {'type': 'boolean', 'example': value}
        elif isinstance(value, int):
            schema[key] = {'type': 'integer', 'example': value}
        elif isinstance(value, float):
            schema[key] = {'type': 'float', 'example': value}
        elif isinstance(value, datetime):
            schema[key] = {'type': 'datetime', 'example': value.strftime('%Y-%m-%d %H:%M:%S')}
        elif isinstance(value, list):
            schema[key] = {
                'type': 'array',
                'length': len(value),
                # At most three items; '...' marks that more were omitted.
                'example': value[:3] if len(value) <= 3 else value[:3] + ['...'],
            }
        elif isinstance(value, dict):
            schema[key] = {
                'type': 'object',
                'keys': list(value.keys())[:5],
                'example': dict(list(value.items())[:2]),
            }
        else:
            schema[key] = {'type': type(value).__name__, 'example': str(value)[:50]}

    return schema
|
||||
|
||||
def display_database_info(client):
    """Print every non-system database with its collections and sample schema.

    For each collection the document count is printed; non-empty
    collections also get the field structure of one sample document, as
    computed by ``analyze_document_schema``.  Any exception is printed
    instead of being raised.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("📊 MongoDB 数据库结构分析")
    print(rule)

    try:
        for db_name in client.list_database_names():
            # Skip MongoDB's built-in databases.
            if db_name in ('admin', 'local', 'config'):
                continue

            db = client[db_name]
            collections = db.list_collection_names()

            print(f"\n🗄️  数据库: {db_name}")
            print(f" 集合数量: {len(collections)}")

            for coll_name in collections:
                collection = db[coll_name]
                count = collection.count_documents({})

                print(f"\n 📁 集合: {coll_name}")
                print(f" 文档数量: {count:,}")

                if not count > 0:
                    print(" ⚠️  集合为空")
                    continue

                # One sample document stands in for the whole collection.
                schema = analyze_document_schema(collection.find_one())
                if not schema:
                    continue

                print(" 📋 字段结构:")
                for field_name, field_info in schema.items():
                    print(f" • {field_name}: {field_info['type']}")
                    if 'example' not in field_info:
                        continue
                    example = field_info['example']
                    if isinstance(example, str) and len(example) > 100:
                        example = example[:100] + "..."
                    print(f" 示例: {example}")

    except Exception as e:
        print(f"❌ 获取数据库信息失败: {e}")
|
||||
|
||||
def display_statistics(client, db_name='douyin_data', collection_name='play_vv_records'):
    """Print summary statistics for one collection.

    Reports the total document count, the time range of the first
    timestamp field that exists, sum/avg/max/min of the first numeric
    play-count field that exists, and the five documents with the highest
    ``play_vv``.  Any exception is printed instead of being raised.
    """
    try:
        collection = client[db_name][collection_name]

        print(f"\n📊 统计信息 ({db_name}.{collection_name})")
        print("-" * 50)

        total_count = collection.count_documents({})
        print(f"📈 总文档数: {total_count:,}")
        if total_count == 0:
            print("⚠️  集合为空,无法显示统计信息")
            return

        # Time range: use the first candidate field present in any document.
        stamp_format = '%Y-%m-%d %H:%M:%S'
        time_field = next(
            (
                name
                for name in ('batch_time', 'created_at', 'timestamp')
                if collection.find_one({name: {'$exists': True}})
            ),
            None,
        )
        if time_field is not None:
            range_pipeline = [
                {'$group': {
                    '_id': None,
                    'min_time': {'$min': f'${time_field}'},
                    'max_time': {'$max': f'${time_field}'},
                }},
            ]
            time_rows = list(collection.aggregate(range_pipeline))
            if time_rows:
                earliest = time_rows[0]['min_time']
                latest = time_rows[0]['max_time']
                print(f"📅 时间范围 ({time_field}):")
                print(f" 最早: {earliest.strftime(stamp_format)}")
                print(f" 最新: {latest.strftime(stamp_format)}")

        # Play counts: use the first candidate field holding a number.
        play_field = next(
            (
                name
                for name in ('play_vv', 'playcount', 'play_count', 'views')
                if collection.find_one({name: {'$exists': True, '$type': 'number'}})
            ),
            None,
        )
        if play_field is not None:
            play_pipeline = [
                {'$group': {
                    '_id': None,
                    'total_plays': {'$sum': f'${play_field}'},
                    'avg_plays': {'$avg': f'${play_field}'},
                    'max_plays': {'$max': f'${play_field}'},
                    'min_plays': {'$min': f'${play_field}'},
                }},
            ]
            play_rows = list(collection.aggregate(play_pipeline))
            if play_rows:
                stats = play_rows[0]
                print(f"🎬 播放量统计 ({play_field}):")
                print(f" 总播放量: {stats['total_plays']:,}")
                print(f" 平均播放量: {stats['avg_plays']:,.0f}")
                print(f" 最高播放量: {stats['max_plays']:,}")
                print(f" 最低播放量: {stats['min_plays']:,}")

        # Top five documents by play_vv, only when mix_name is in use.
        if collection.find_one({'mix_name': {'$exists': True}}):
            print("\n🔥 热门内容 (按播放量排序):")
            top_pipeline = [
                {'$match': {'play_vv': {'$exists': True, '$type': 'number'}}},
                {'$sort': {'play_vv': -1}},
                {'$limit': 5},
                {'$project': {'mix_name': 1, 'play_vv': 1, 'batch_time': 1}},
            ]
            for position, entry in enumerate(collection.aggregate(top_pipeline), 1):
                title = entry.get('mix_name', '未知')
                plays = entry.get('play_vv', 0)
                # Falls back to the current time when batch_time is absent.
                when = entry.get('batch_time', datetime.now()).strftime('%m-%d %H:%M')
                print(f" {position}. {title}: {plays:,} ({when})")

    except Exception as e:
        print(f"❌ 获取统计信息失败: {e}")
|
||||
|
||||
def display_recent_data(client, db_name='douyin_data', collection_name='play_vv_records', limit=3):
    """Print the newest ``limit`` documents of one collection.

    Documents are sorted descending by the first of ``batch_time``,
    ``created_at``, ``timestamp`` or ``_id`` that exists in the collection;
    if none exists they are taken unsorted.  Each document is rendered with
    ``display_document``.  Any exception is printed instead of being raised.
    """
    try:
        collection = client[db_name][collection_name]

        print(f"\n📈 最近 {limit} 条数据 ({db_name}.{collection_name})")
        print("-" * 80)

        # Pick the first candidate sort key that some document carries.
        sort_field = next(
            (
                name
                for name in ('batch_time', 'created_at', 'timestamp', '_id')
                if collection.find_one({name: {'$exists': True}})
            ),
            None,
        )

        cursor = collection.find()
        if sort_field:
            cursor = cursor.sort(sort_field, -1)
        recent_docs = list(cursor.limit(limit))

        if not recent_docs:
            print("⚠️  没有找到数据")
            return

        for number, doc in enumerate(recent_docs, 1):
            print(f"\n📄 记录 {number}:")
            display_document(doc)

    except Exception as e:
        print(f"❌ 获取最近数据失败: {e}")
|
||||
|
||||
def display_document(doc, indent=2):
    """Print one document, one field per line, with a type-specific icon.

    Args:
        doc: Mapping of field name to value.
        indent: Number of leading spaces for every printed line.

    Strings longer than 100 characters are truncated.  Lists and dicts with
    more than three entries show only the first two plus a count of the
    rest; their items are truncated to 50 characters.
    """
    pad = " " * indent
    play_keys = ('playcount', 'play_count', 'views', 'play_vv')

    def clip(text, width):
        # Cut text to ``width`` characters and mark the cut with "...".
        return text[:width] + "..." if len(text) > width else text

    for key, value in doc.items():
        if key == '_id':
            print(f"{pad}🆔 {key}: {value}")
            continue

        if isinstance(value, datetime):
            print(f"{pad}📅 {key}: {value.strftime('%Y-%m-%d %H:%M:%S')}")
            continue

        if isinstance(value, str):
            print(f"{pad}📝 {key}: {clip(value, 100)}")
            continue

        if isinstance(value, (int, float)):
            if key in play_keys:
                # Play counts get thousands separators.
                print(f"{pad}📊 {key}: {value:,}")
            else:
                print(f"{pad}🔢 {key}: {value}")
            continue

        if isinstance(value, list):
            size = len(value)
            print(f"{pad}📋 {key}: [{size} 项]")
            truncated = size > 3
            for item in (value[:2] if truncated else value[:3]):
                print(f"{pad} - {clip(str(item), 50)}")
            if truncated:
                print(f"{pad} ... 还有 {size - 2} 项")
            continue

        if isinstance(value, dict):
            print(f"{pad}📦 {key}: {{对象}}")
            pairs = list(value.items())
            truncated = len(value) > 3
            for k, v in (pairs[:2] if truncated else pairs):
                print(f"{pad} {k}: {clip(str(v), 50)}")
            if truncated:
                print(f"{pad} ... 还有 {len(value) - 2} 个字段")
            continue

        print(f"{pad}❓ {key}: {value}")
|
||||
|
||||
def main():
    """Run the quick viewer: structure, statistics, then recent data.

    Connects with ``connect_mongodb``; returns immediately when that yields
    no client.  The client is closed on exit, including after Ctrl+C.
    """
    print("🚀 MongoDB 数据库快速查看工具")
    started = datetime.now()
    print(f"⏰ 查看时间: {started:%Y-%m-%d %H:%M:%S}")

    client = connect_mongodb()
    if not client:
        return

    try:
        # The three report sections, in display order.
        for section in (display_database_info, display_statistics, display_recent_data):
            section(client)

        print("\n" + "=" * 80)
        print("✅ 数据库查看完成!")
        print("💡 提示: 运行 'python scripts/mongodb_viewer.py' 可以使用交互式查看器")
        print("🔄 提示: 重新运行此脚本可以查看最新数据")

    except KeyboardInterrupt:
        print("\n👋 程序被用户中断")
    finally:
        if client:
            client.close()
|
||||
|
||||
if __name__ == "__main__":
    # Run the quick viewer only when executed as a script.
    main()
|
||||
@ -1,142 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
查询MongoDB中的抖音播放量数据
|
||||
"""
|
||||
|
||||
from pymongo import MongoClient
|
||||
from pymongo.errors import ConnectionFailure
|
||||
from datetime import datetime
|
||||
|
||||
def connect_mongodb():
    """Connect to the local MongoDB and return the play-count collection.

    Returns:
        tuple: ``(client, collection)`` on success, ``(None, None)`` when
        the connection or the ping fails (the error is printed, not raised).
    """
    client = None
    try:
        client = MongoClient('mongodb://localhost:27017/', serverSelectionTimeoutMS=5000)
        # The ping raises if no server answers within the 5 s timeout.
        client.admin.command('ping')
        # NOTE(review): this database name contains two em dashes; other
        # scripts in this commit use 'douyin_data'. Possibly a typo —
        # confirm before relying on this tool. Left unchanged here.
        db = client['douyin——data']
        collection = db['playcounts']
        print("MongoDB连接成功")
        return client, collection
    except ConnectionFailure:
        print("MongoDB连接失败,请确保MongoDB服务已启动")
    except Exception as e:
        print(f"MongoDB连接出错: {e}")

    # Failure path: release the client instead of leaking it.
    if client is not None:
        client.close()
    return None, None
|
||||
|
||||
def query_latest_batches(collection, limit=5):
    """Print the most recent scrape batches and the records in each.

    Batches are grouped by ``batch_id`` and ordered by ``batch_time``,
    newest first.  Within a batch, records are sorted by ``rank`` when the
    first record has one, otherwise by ``playcount_number`` descending when
    the first record has that field.  Any exception is printed instead of
    being raised.

    Args:
        collection: Collection holding the play-count records.
        limit: Maximum number of batches to show.
    """
    try:
        batch_pipeline = [
            {"$group": {
                "_id": "$batch_id",
                "batch_time": {"$first": "$batch_time"},
                "count": {"$sum": 1},
            }},
            {"$sort": {"batch_time": -1}},
            {"$limit": limit},
        ]
        batches = list(collection.aggregate(batch_pipeline))

        if not batches:
            print("暂无数据")
            return

        print(f"\n===== 最近 {len(batches)} 个批次 =====")
        for batch in batches:
            stamp = batch['batch_time'].strftime("%Y-%m-%d %H:%M:%S")
            print(f"批次ID: {batch['_id']}, 时间: {stamp}, 数据条数: {batch['count']}")

            rows = list(collection.find(
                {"batch_id": batch['_id']},
                {"name": 1, "playcount": 1, "rank": 1, "playcount_number": 1, "_id": 0},
            ))

            # The first record decides which sort key the batch uses.
            if rows and 'rank' in rows[0]:
                rows.sort(key=lambda row: row.get('rank', 999))
            elif rows and 'playcount_number' in rows[0]:
                rows.sort(key=lambda row: row.get('playcount_number', 0), reverse=True)

            for position, row in enumerate(rows, 1):
                if 'rank' in row:
                    prefix = f"[第{row.get('rank', position)}名] "
                else:
                    prefix = ""
                print(f" {prefix}{row['name']}")
                print(f" 播放量: {row['playcount']}")
            print()

    except Exception as e:
        print(f"查询数据失败: {e}")
|
||||
|
||||
def query_by_name(collection, name_keyword):
    """Print every record whose ``name`` contains the given keyword.

    The keyword is matched literally and case-insensitively.  Results are
    ordered by ``batch_time``, newest first.  Any exception is printed
    instead of being raised.

    Args:
        collection: Collection holding the play-count records.
        name_keyword: Text to look for inside the ``name`` field.
    """
    import re  # local import: only this function needs it

    try:
        # Escape the keyword so characters such as "(" or "." are searched
        # for literally instead of being read as regex syntax, which would
        # either fail the query or match unintended names.
        query = {"name": {"$regex": re.escape(name_keyword), "$options": "i"}}
        results = list(collection.find(query).sort("batch_time", -1))

        if not results:
            print(f"未找到包含'{name_keyword}'的剧本")
            return

        print(f"\n===== 包含'{name_keyword}'的剧本 =====")
        for result in results:
            batch_time = result['batch_time'].strftime("%Y-%m-%d %H:%M:%S")
            print(f"剧本: {result['name']}")
            print(f"播放量: {result['playcount']}")
            print(f"抓取时间: {batch_time}")
            print(f"批次ID: {result['batch_id']}")
            print("-" * 30)

    except Exception as e:
        print(f"查询失败: {e}")
|
||||
|
||||
def main():
    """Run the interactive query menu until the user exits.

    Option 1 lists recent batches, option 2 searches by name, option 3
    quits.  Returns immediately when ``connect_mongodb`` yields no
    collection.  The client is closed on exit, including after Ctrl+C.
    """
    print("抖音播放量数据查询工具")
    print("=" * 40)

    client, collection = connect_mongodb()
    if collection is None:
        return

    try:
        while True:
            print("\n请选择操作:")
            print("1. 查看最近的批次数据")
            print("2. 根据剧本名称搜索")
            print("3. 退出")

            choice = input("请输入选项 (1-3): ").strip()

            if choice == '3':
                break

            if choice == '1':
                answer = input("显示最近几个批次? (默认5): ").strip()
                # Blank or non-numeric input falls back to 5 batches.
                try:
                    limit = int(answer) if answer else 5
                except ValueError:
                    limit = 5
                query_latest_batches(collection, limit)
                continue

            if choice == '2':
                keyword = input("请输入剧本名称关键词: ").strip()
                if not keyword:
                    print("关键词不能为空")
                else:
                    query_by_name(collection, keyword)
                continue

            print("无效选项,请重新选择")

    except KeyboardInterrupt:
        print("\n用户中断操作")
    finally:
        if client:
            client.close()
            print("已断开MongoDB连接")
|
||||
|
||||
if __name__ == "__main__":
    # Start the interactive query tool only when executed as a script.
    main()
|
||||
@ -1,55 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
查看MongoDB最新数据 - 始终按时间倒序排列
|
||||
"""
|
||||
|
||||
from pymongo import MongoClient
|
||||
from datetime import datetime
|
||||
|
||||
def view_latest_data(limit=20):
    """Print the newest records of ``douyin_data.play_vv_records``.

    Records are ordered by ``batch_time``, newest first, followed by a
    short summary (total count, today's count, newest record time).  Any
    exception is printed instead of being raised.

    Args:
        limit: Maximum number of records to show.
    """
    client = None
    try:
        client = MongoClient('localhost', 27017)
        db = client['douyin_data']
        collection = db['play_vv_records']

        print("=== 抖音播放量最新数据 ===")
        print(f"显示最新 {limit} 条记录(按时间倒序排列)")
        print("=" * 80)

        latest_docs = list(collection.find().sort('batch_time', -1).limit(limit))

        if not latest_docs:
            print("没有找到数据")
            return

        for i, doc in enumerate(latest_docs, 1):
            print(f"\n记录 #{i}")
            print("-" * 50)
            print(f"合集名称: {doc.get('mix_name', '未知')}")
            print(f"播放量: {doc.get('play_vv', 0):,} ({doc.get('playcount', '')})")
            print(f"合集链接: {doc.get('video_url', '')}")
            print(f"保存时间: {doc.get('batch_time', '')}")
            print(f"视频ID数: {len(doc.get('aweme_ids', []))}")
            print(f"封面图片: {'有' if doc.get('cover_image_url') else '无'}")

        # Summary: overall count and documents saved since local midnight.
        total_count = collection.count_documents({})
        today_start = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        today_count = collection.count_documents({'batch_time': {'$gte': today_start}})

        print("\n" + "=" * 80)
        print("统计信息:")
        print(f"- 总记录数: {total_count}")
        print(f"- 今天记录数: {today_count}")
        print(f"- 最新记录时间: {latest_docs[0].get('batch_time')}")

    except Exception as e:
        print(f"查看数据时出错: {e}")
    finally:
        # Always release the connection, including on the early return.
        if client is not None:
            client.close()
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Optional first command-line argument: number of records to show.
    cli_args = sys.argv[1:]
    limit = int(cli_args[0]) if cli_args else 20
    view_latest_data(limit)
|
||||
Loading…
x
Reference in New Issue
Block a user