rank_backend/scripts/mongodb_quick_view.py
Qyir 53160420d1 Initial commit: Douyin play count tracking system
Features:
- Douyin play count scraper using Selenium + Chrome DevTools Protocol
- Automated scheduler for daily data collection
- MongoDB data storage
- Mini-program API server
- Data analysis and visualization tools

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-17 10:48:52 +08:00

294 lines
11 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
MongoDB数据库快速查看工具
一次性显示数据库结构、统计信息和最新数据
"""
import pymongo
from pymongo import MongoClient
from datetime import datetime
import json
from collections import defaultdict
def connect_mongodb(connection_string='mongodb://localhost:27017/'):
    """Open a MongoDB connection and verify it with a ping.

    Args:
        connection_string: MongoDB URI handed to ``MongoClient``.

    Returns:
        The ``MongoClient`` once the ``ping`` command has succeeded, or
        ``None`` if creating the client or pinging the server raised. The
        error is printed to stdout rather than propagated.
    """
    # NOTE(review): the URI is echoed verbatim below; a URI that embeds
    # credentials would be printed to stdout.
    client = None
    try:
        # Cap server selection at 5 s so an unreachable server fails quickly.
        client = MongoClient(connection_string, serverSelectionTimeoutMS=5000)
        # Force a round trip so connection problems surface here, not later.
        client.admin.command('ping')
        print(f"✅ 成功连接到MongoDB: {connection_string}")
        return client
    except Exception as e:
        print(f"❌ 连接MongoDB失败: {e}")
        # The client object may already exist when the ping fails; close it
        # so it is not left open after we report failure.
        if client is not None:
            client.close()
        return None
def analyze_document_schema(document):
    """Describe the top-level fields of a single document.

    Args:
        document: Mapping of field name to value. ``None`` or an empty
            mapping is accepted.

    Returns:
        A dict mapping each field name to a descriptor dict containing a
        ``'type'`` label and an ``'example'`` value. Array descriptors also
        carry ``'length'``; object descriptors also carry ``'keys'`` (at most
        the first five). Returns ``{}`` when ``document`` is falsy.
    """
    if not document:
        return {}

    schema = {}
    for key, value in document.items():
        if key == '_id':
            # Labelled 'ObjectId' regardless of the value's actual Python type.
            schema[key] = {'type': 'ObjectId', 'example': str(value)}
        elif isinstance(value, str):
            # Long strings are cut to 50 characters plus an ellipsis.
            example = value[:50] + '...' if len(value) > 50 else value
            schema[key] = {'type': 'string', 'example': example}
        elif isinstance(value, bool):
            # Must be tested before int: bool is a subclass of int, so the
            # int branch would otherwise claim every True/False value.
            schema[key] = {'type': 'boolean', 'example': value}
        elif isinstance(value, int):
            schema[key] = {'type': 'integer', 'example': value}
        elif isinstance(value, float):
            schema[key] = {'type': 'float', 'example': value}
        elif isinstance(value, datetime):
            schema[key] = {
                'type': 'datetime',
                'example': value.strftime('%Y-%m-%d %H:%M:%S'),
            }
        elif isinstance(value, list):
            # Show at most three items; mark truncation with a '...' entry.
            preview = value[:3] if len(value) <= 3 else value[:3] + ['...']
            schema[key] = {
                'type': 'array',
                'length': len(value),
                'example': preview,
            }
        elif isinstance(value, dict):
            schema[key] = {
                'type': 'object',
                'keys': list(value.keys())[:5],
                'example': dict(list(value.items())[:2]),
            }
        else:
            # Anything else is labelled with its Python type name.
            schema[key] = {
                'type': type(value).__name__,
                'example': str(value)[:50],
            }
    return schema
def display_database_info(client):
    """Print every non-system database with its collections and a sampled schema.

    The databases named ``admin``, ``local`` and ``config`` are skipped. For
    each remaining database the collection count is printed, then for each
    collection its document count. When a collection is non-empty, one
    document is fetched with ``find_one`` and described through
    ``analyze_document_schema``; otherwise an "empty" notice is printed.
    Any exception raised along the way is caught and reported on stdout.

    Args:
        client: Connected client exposing ``list_database_names()`` and
            item access by database name.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("📊 MongoDB 数据库结构分析")
    print(banner)

    system_dbs = ('admin', 'local', 'config')
    try:
        user_dbs = [
            name for name in client.list_database_names()
            if name not in system_dbs
        ]
        for db_name in user_dbs:
            database = client[db_name]
            coll_names = database.list_collection_names()
            print(f"\n🗄️ 数据库: {db_name}")
            print(f" 集合数量: {len(coll_names)}")

            for coll_name in coll_names:
                coll = database[coll_name]
                doc_total = coll.count_documents({})
                print(f"\n 📁 集合: {coll_name}")
                print(f" 文档数量: {doc_total:,}")

                if not doc_total > 0:
                    print(" ⚠️ 集合为空")
                    continue

                # One sample document stands in for the whole collection.
                fields = analyze_document_schema(coll.find_one())
                if not fields:
                    continue

                print(" 📋 字段结构:")
                for field_name, info in fields.items():
                    print(f"{field_name}: {info['type']}")
                    if 'example' not in info:
                        continue
                    sample = info['example']
                    # Keep very long string examples to 100 characters.
                    if isinstance(sample, str) and len(sample) > 100:
                        sample = sample[:100] + "..."
                    print(f" 示例: {sample}")
    except Exception as e:
        print(f"❌ 获取数据库信息失败: {e}")
def _format_time(value, fmt='%Y-%m-%d %H:%M:%S'):
    """Render a stored time value without assuming it is a ``datetime``."""
    if isinstance(value, datetime):
        return value.strftime(fmt)
    if value is None:
        return '未知'
    return str(value)


def display_statistics(client, db_name='douyin_data', collection_name='play_vv_records'):
    """Print summary statistics for one collection.

    Three sections are printed, each only when the relevant field is found:

    * the min/max of the first existing field among ``batch_time``,
      ``created_at`` and ``timestamp``;
    * sum/avg/max/min of the first numeric field among ``play_vv``,
      ``playcount``, ``play_count`` and ``views``;
    * the five documents with the highest numeric ``play_vv`` (only when some
      document has a ``mix_name`` field).

    Args:
        client: Connected client supporting item access by database name.
        db_name: Database to inspect.
        collection_name: Collection to inspect.

    Returns:
        None. All output goes to stdout; any exception is caught and reported
        there as well.
    """
    try:
        db = client[db_name]
        collection = db[collection_name]
        print(f"\n📊 统计信息 ({db_name}.{collection_name})")
        print("-" * 50)

        # Basic count; nothing else is meaningful on an empty collection.
        total_count = collection.count_documents({})
        print(f"📈 总文档数: {total_count:,}")
        if total_count == 0:
            print("⚠️ 集合为空,无法显示统计信息")
            return

        # Time range, using the first candidate field that exists.
        for field in ('batch_time', 'created_at', 'timestamp'):
            if not collection.find_one({field: {'$exists': True}}):
                continue
            pipeline = [
                {'$group': {
                    '_id': None,
                    'min_time': {'$min': f'${field}'},
                    'max_time': {'$max': f'${field}'},
                }},
            ]
            result = list(collection.aggregate(pipeline))
            if result:
                print(f"📅 时间范围 ({field}):")
                # The stored values may be strings, numbers or null rather
                # than datetimes; format defensively so one odd value does
                # not abort the remaining sections.
                print(f" 最早: {_format_time(result[0]['min_time'])}")
                print(f" 最新: {_format_time(result[0]['max_time'])}")
            break

        # Play-count statistics, using the first candidate with numeric data.
        for field in ('play_vv', 'playcount', 'play_count', 'views'):
            numeric_filter = {field: {'$exists': True, '$type': 'number'}}
            if not collection.find_one(numeric_filter):
                continue
            pipeline = [
                # Aggregate over the same numeric-only subset that the probe
                # above matched; otherwise $max/$min could return a
                # non-numeric value that the ',' format spec rejects.
                {'$match': numeric_filter},
                {'$group': {
                    '_id': None,
                    'total_plays': {'$sum': f'${field}'},
                    'avg_plays': {'$avg': f'${field}'},
                    'max_plays': {'$max': f'${field}'},
                    'min_plays': {'$min': f'${field}'},
                }},
            ]
            result = list(collection.aggregate(pipeline))
            if result:
                stats = result[0]
                print(f"🎬 播放量统计 ({field}):")
                print(f" 总播放量: {stats['total_plays']:,}")
                print(f" 平均播放量: {stats['avg_plays']:,.0f}")
                print(f" 最高播放量: {stats['max_plays']:,}")
                print(f" 最低播放量: {stats['min_plays']:,}")
            break

        # Top five entries by play_vv.
        if collection.find_one({'mix_name': {'$exists': True}}):
            print("\n🔥 热门内容 (按播放量排序):")
            pipeline = [
                {'$match': {'play_vv': {'$exists': True, '$type': 'number'}}},
                {'$sort': {'play_vv': -1}},
                {'$limit': 5},
                {'$project': {'mix_name': 1, 'play_vv': 1, 'batch_time': 1}},
            ]
            for i, content in enumerate(collection.aggregate(pipeline), 1):
                name = content.get('mix_name', '未知')
                plays = content.get('play_vv', 0)
                # A missing batch_time is shown as unknown instead of being
                # silently replaced by the current time.
                time_str = _format_time(content.get('batch_time'), '%m-%d %H:%M')
                print(f" {i}. {name}: {plays:,} ({time_str})")
    except Exception as e:
        print(f"❌ 获取统计信息失败: {e}")
def display_recent_data(client, db_name='douyin_data', collection_name='play_vv_records', limit=3):
    """Print the most recent documents of one collection.

    The sort key is the first of ``batch_time``, ``created_at``,
    ``timestamp`` and ``_id`` for which ``find_one`` reports an existing
    field; documents are sorted descending on it. When no candidate matches,
    the documents are read unsorted. Each document is printed through
    ``display_document``. Any exception is caught and reported on stdout.

    Args:
        client: Connected client supporting item access by database name.
        db_name: Database to read from.
        collection_name: Collection to read from.
        limit: Value passed to the cursor's ``limit``.
    """
    try:
        coll = client[db_name][collection_name]
        print(f"\n📈 最近 {limit} 条数据 ({db_name}.{collection_name})")
        print("-" * 80)

        # Probe the candidates in priority order; stop at the first hit.
        candidates = ('batch_time', 'created_at', 'timestamp', '_id')
        order_by = next(
            (name for name in candidates
             if coll.find_one({name: {'$exists': True}})),
            None,
        )

        cursor = coll.find()
        if order_by is not None:
            cursor = cursor.sort(order_by, -1)
        latest = list(cursor.limit(limit))

        if len(latest) == 0:
            print("⚠️ 没有找到数据")
            return

        for position, record in enumerate(latest, start=1):
            print(f"\n📄 记录 {position}:")
            display_document(record)
    except Exception as e:
        print(f"❌ 获取最近数据失败: {e}")
def display_document(doc, indent=2):
    """Print one document, one line per field, with a type-specific prefix.

    Strings longer than 100 characters are truncated. Values under the keys
    ``playcount``, ``play_count``, ``views`` and ``play_vv`` are printed with
    thousands separators. Lists and dicts get a header line followed by a
    short preview: every entry when there are at most three, otherwise the
    first two plus a "remaining" line. Preview entries are cut to 50
    characters.

    Args:
        doc: Mapping of field name to value.
        indent: Number of spaces placed before every printed line.
    """
    pad = " " * indent
    play_keys = ('playcount', 'play_count', 'views', 'play_vv')

    def clip(obj, limit=50):
        # Stringify and shorten to `limit` characters with an ellipsis.
        text = str(obj)
        return text[:limit] + "..." if len(text) > limit else text

    for key, value in doc.items():
        if key == '_id':
            print(f"{pad}🆔 {key}: {value}")
            continue

        if isinstance(value, datetime):
            stamp = value.strftime('%Y-%m-%d %H:%M:%S')
            print(f"{pad}📅 {key}: {stamp}")
            continue

        if isinstance(value, str):
            print(f"{pad}📝 {key}: {clip(value, 100)}")
            continue

        if isinstance(value, (int, float)):
            if key in play_keys:
                print(f"{pad}📊 {key}: {value:,}")
            else:
                print(f"{pad}🔢 {key}: {value}")
            continue

        if isinstance(value, list):
            print(f"{pad}📋 {key}: [{len(value)} 项]")
            overflow = len(value) > 3
            for item in (value[:2] if overflow else value):
                print(f"{pad} - {clip(item)}")
            if overflow:
                print(f"{pad} ... 还有 {len(value)-2}")
            continue

        if isinstance(value, dict):
            print(f"{pad}📦 {key}: {{对象}}")
            pairs = list(value.items())
            overflow = len(pairs) > 3
            for sub_key, sub_value in (pairs[:2] if overflow else pairs):
                print(f"{pad} {sub_key}: {clip(sub_value)}")
            if overflow:
                print(f"{pad} ... 还有 {len(value)-2} 个字段")
            continue

        # Any other type is printed as-is.
        print(f"{pad}{key}: {value}")
def main():
    """Run the quick view: connect, print the three reports, then disconnect.

    Prints a banner with the current time, connects via ``connect_mongodb``
    and returns early when no client is available. Otherwise it prints the
    database structure, the statistics and the recent documents in that
    order, followed by closing hints. A ``KeyboardInterrupt`` during the
    reports is reported instead of propagated, and the client is always
    closed.
    """
    print("🚀 MongoDB 数据库快速查看工具")
    print(f"⏰ 查看时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    client = connect_mongodb()
    if not client:
        return

    try:
        # Structure first, then statistics, then the latest documents.
        report_steps = (
            display_database_info,
            display_statistics,
            display_recent_data,
        )
        for step in report_steps:
            step(client)

        print("\n" + "=" * 80)
        print("✅ 数据库查看完成!")
        print("💡 提示: 运行 'python scripts/mongodb_viewer.py' 可以使用交互式查看器")
        print("🔄 提示: 重新运行此脚本可以查看最新数据")
    except KeyboardInterrupt:
        print("\n👋 程序被用户中断")
    finally:
        # client already passed the truthiness check above.
        client.close()


if __name__ == "__main__":
    main()