Features:
- Douyin play count scraper using Selenium + Chrome DevTools Protocol
- Automated scheduler for daily data collection
- MongoDB data storage
- Mini-program API server
- Data analysis and visualization tools

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
294 lines
11 KiB
Python
294 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
MongoDB数据库快速查看工具
|
|
一次性显示数据库结构、统计信息和最新数据
|
|
"""
|
|
|
|
import pymongo
|
|
from pymongo import MongoClient
|
|
from datetime import datetime
|
|
import json
|
|
from collections import defaultdict
|
|
|
|
def connect_mongodb(connection_string='mongodb://localhost:27017/'):
    """Open a MongoDB connection and verify it with a ping.

    Args:
        connection_string: MongoDB URI to connect to.

    Returns:
        The connected client, or ``None`` when creating the client or the
        ping fails. Failures are printed rather than raised.
    """
    client = None
    try:
        client = MongoClient(connection_string, serverSelectionTimeoutMS=5000)
        # Round-trip to the server so an unreachable host is reported here
        # (within the 5 s selection timeout) instead of on first real use.
        client.admin.command('ping')
        print(f"✅ 成功连接到MongoDB: {connection_string}")
        return client
    except Exception as e:
        print(f"❌ 连接MongoDB失败: {e}")
        # The client may already exist when the ping fails; release it so a
        # failed attempt does not leak its resources.
        if client is not None:
            client.close()
        return None
|
|
|
|
def analyze_document_schema(document):
    """Describe the top-level fields of one sample document.

    Args:
        document: Mapping of field name to value; ``None`` or an empty
            mapping is accepted.

    Returns:
        A dict keyed by field name. Each entry has a ``'type'`` label and an
        ``'example'`` (shortened for long strings, lists and dicts); arrays
        also carry ``'length'`` and objects carry up to five ``'keys'``.
        An empty dict is returned for a falsy ``document``.
    """
    if not document:
        return {}

    schema = {}
    for key, value in document.items():
        if key == '_id':
            schema[key] = {'type': 'ObjectId', 'example': str(value)}
        elif isinstance(value, bool):
            # bool is a subclass of int, so it has to be checked before the
            # integer branch or every boolean would be labelled 'integer'.
            schema[key] = {'type': 'boolean', 'example': value}
        elif isinstance(value, str):
            example = value[:50] + '...' if len(value) > 50 else value
            schema[key] = {'type': 'string', 'example': example}
        elif isinstance(value, int):
            schema[key] = {'type': 'integer', 'example': value}
        elif isinstance(value, float):
            schema[key] = {'type': 'float', 'example': value}
        elif isinstance(value, datetime):
            schema[key] = {
                'type': 'datetime',
                'example': value.strftime('%Y-%m-%d %H:%M:%S'),
            }
        elif isinstance(value, list):
            # Show at most three items; a trailing '...' marks truncation.
            example = value[:3] if len(value) <= 3 else value[:3] + ['...']
            schema[key] = {
                'type': 'array',
                'length': len(value),
                'example': example,
            }
        elif isinstance(value, dict):
            schema[key] = {
                'type': 'object',
                'keys': list(value.keys())[:5],
                'example': dict(list(value.items())[:2]),
            }
        else:
            schema[key] = {
                'type': type(value).__name__,
                'example': str(value)[:50],
            }

    return schema
|
|
|
|
def display_database_info(client):
    """Print every non-system database with its collections and field layout.

    For each collection the document count is printed; when the collection
    is not empty, one sample document is fetched and its fields are listed
    with their detected type and an example value. Any exception is caught
    and reported on stdout.

    Args:
        client: Connected MongoDB client.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("📊 MongoDB 数据库结构分析")
    print(banner)

    system_dbs = ('admin', 'local', 'config')

    try:
        user_dbs = [
            name for name in client.list_database_names()
            if name not in system_dbs
        ]

        for db_name in user_dbs:
            database = client[db_name]
            coll_names = database.list_collection_names()

            print(f"\n🗄️ 数据库: {db_name}")
            print(f" 集合数量: {len(coll_names)}")

            for coll_name in coll_names:
                coll = database[coll_name]
                doc_total = coll.count_documents({})

                print(f"\n 📁 集合: {coll_name}")
                print(f" 文档数量: {doc_total:,}")

                if not doc_total > 0:
                    print(" ⚠️ 集合为空")
                    continue

                # A single sample document stands in for the whole collection.
                layout = analyze_document_schema(coll.find_one())
                if not layout:
                    continue

                print(" 📋 字段结构:")
                for field_name, details in layout.items():
                    print(f" • {field_name}: {details['type']}")
                    if 'example' not in details:
                        continue
                    sample = details['example']
                    if isinstance(sample, str) and len(sample) > 100:
                        sample = sample[:100] + "..."
                    print(f" 示例: {sample}")

    except Exception as e:
        print(f"❌ 获取数据库信息失败: {e}")
|
|
|
|
def _format_time_value(value, fmt='%Y-%m-%d %H:%M:%S'):
    """Render a time-like value for display without assuming it is a datetime."""
    if isinstance(value, datetime):
        return value.strftime(fmt)
    # Missing values are shown as "unknown"; anything else (e.g. a string or
    # numeric timestamp) is shown as stored.
    return '未知' if value is None else str(value)


def display_statistics(client, db_name='douyin_data', collection_name='play_vv_records'):
    """Print summary statistics for one collection.

    Prints, in order: the total document count, the min/max of the first
    time field found, sum/avg/max/min of the first numeric play-count field
    found, and the five documents with the highest ``play_vv`` (only when
    some document has ``mix_name``). Any exception is caught and reported
    on stdout.

    Args:
        client: Connected MongoDB client.
        db_name: Database to read from.
        collection_name: Collection to summarise.
    """
    try:
        collection = client[db_name][collection_name]

        print(f"\n📊 统计信息 ({db_name}.{collection_name})")
        print("-" * 50)

        total_count = collection.count_documents({})
        print(f"📈 总文档数: {total_count:,}")

        if total_count == 0:
            print("⚠️ 集合为空,无法显示统计信息")
            return

        # Time range: only the first candidate field that exists is used.
        for field in ('batch_time', 'created_at', 'timestamp'):
            if not collection.find_one({field: {'$exists': True}}):
                continue
            pipeline = [
                {'$group': {
                    '_id': None,
                    'min_time': {'$min': f'${field}'},
                    'max_time': {'$max': f'${field}'},
                }},
            ]
            result = list(collection.aggregate(pipeline))
            if result:
                # The stored values are not guaranteed to be datetimes, so
                # format defensively instead of calling .strftime directly.
                earliest = _format_time_value(result[0]['min_time'])
                latest = _format_time_value(result[0]['max_time'])
                print(f"📅 时间范围 ({field}):")
                print(f" 最早: {earliest}")
                print(f" 最新: {latest}")
            break

        # Play counts: only the first candidate field holding a number is used.
        for field in ('play_vv', 'playcount', 'play_count', 'views'):
            numeric_only = {field: {'$exists': True, '$type': 'number'}}
            if not collection.find_one(numeric_only):
                continue
            pipeline = [
                # Restrict to numeric values so $min/$max cannot return a
                # value of another type that the ':,' format would reject.
                {'$match': numeric_only},
                {'$group': {
                    '_id': None,
                    'total_plays': {'$sum': f'${field}'},
                    'avg_plays': {'$avg': f'${field}'},
                    'max_plays': {'$max': f'${field}'},
                    'min_plays': {'$min': f'${field}'},
                }},
            ]
            result = list(collection.aggregate(pipeline))
            if result:
                stats = result[0]
                print(f"🎬 播放量统计 ({field}):")
                print(f" 总播放量: {stats['total_plays']:,}")
                print(f" 平均播放量: {stats['avg_plays']:,.0f}")
                print(f" 最高播放量: {stats['max_plays']:,}")
                print(f" 最低播放量: {stats['min_plays']:,}")
            break

        # Top content by play count.
        if collection.find_one({'mix_name': {'$exists': True}}):
            print(f"\n🔥 热门内容 (按播放量排序):")
            pipeline = [
                {'$match': {'play_vv': {'$exists': True, '$type': 'number'}}},
                {'$sort': {'play_vv': -1}},
                {'$limit': 5},
                {'$project': {'mix_name': 1, 'play_vv': 1, 'batch_time': 1}},
            ]
            for i, content in enumerate(collection.aggregate(pipeline), 1):
                name = content.get('mix_name', '未知')
                plays = content.get('play_vv', 0)
                # A missing batch_time is reported as unknown rather than
                # being replaced by the current time.
                time_str = _format_time_value(
                    content.get('batch_time'), '%m-%d %H:%M'
                )
                print(f" {i}. {name}: {plays:,} ({time_str})")

    except Exception as e:
        print(f"❌ 获取统计信息失败: {e}")
|
|
|
|
def display_recent_data(client, db_name='douyin_data', collection_name='play_vv_records', limit=3):
    """Print the newest documents of a collection.

    Documents are sorted in descending order by the first of ``batch_time``,
    ``created_at``, ``timestamp`` or ``_id`` that exists on some document;
    when none is found the query's default order is used. Any exception is
    caught and reported on stdout.

    Args:
        client: Connected MongoDB client.
        db_name: Database to read from.
        collection_name: Collection to read from.
        limit: Maximum number of documents to print.
    """
    try:
        coll = client[db_name][collection_name]

        print(f"\n📈 最近 {limit} 条数据 ({db_name}.{collection_name})")
        print("-" * 80)

        # Probe candidates lazily, in priority order, stopping at the first hit.
        candidates = ('batch_time', 'created_at', 'timestamp', '_id')
        order_by = next(
            (name for name in candidates
             if coll.find_one({name: {'$exists': True}})),
            None,
        )

        cursor = coll.find()
        if order_by:
            cursor = cursor.sort(order_by, -1)
        latest = list(cursor.limit(limit))

        if not latest:
            print("⚠️ 没有找到数据")
            return

        for position, record in enumerate(latest, 1):
            print(f"\n📄 记录 {position}:")
            display_document(record)

    except Exception as e:
        print(f"❌ 获取最近数据失败: {e}")
|
|
|
|
def display_document(doc, indent=2):
    """Print one document, one line per field, with a type-specific icon.

    Long strings are cut at 100 characters; list items and dict values are
    cut at 50. Lists and dicts with more than three entries show only the
    first two followed by a count of the remainder.

    Args:
        doc: Mapping of field name to value.
        indent: Number of spaces placed before every printed line.
    """
    pad = " " * indent
    play_keys = ('playcount', 'play_count', 'views', 'play_vv')

    def clip(obj, width=50):
        # Stringify and shorten to `width` characters, marking the cut.
        text = str(obj)
        if len(text) > width:
            return text[:width] + "..."
        return text

    for key, value in doc.items():
        if key == '_id':
            print(f"{pad}🆔 {key}: {value}")
            continue

        if isinstance(value, datetime):
            stamp = value.strftime('%Y-%m-%d %H:%M:%S')
            print(f"{pad}📅 {key}: {stamp}")
            continue

        if isinstance(value, str):
            print(f"{pad}📝 {key}: {clip(value, 100)}")
            continue

        if isinstance(value, (int, float)):
            # Play-count fields get thousands separators.
            if key in play_keys:
                print(f"{pad}📊 {key}: {value:,}")
            else:
                print(f"{pad}🔢 {key}: {value}")
            continue

        if isinstance(value, list):
            total = len(value)
            print(f"{pad}📋 {key}: [{total} 项]")
            truncated = total > 3
            for item in (value[:2] if truncated else value):
                print(f"{pad} - {clip(item)}")
            if truncated:
                print(f"{pad} ... 还有 {total - 2} 项")
            continue

        if isinstance(value, dict):
            print(f"{pad}📦 {key}: {{对象}}")
            pairs = list(value.items())
            truncated = len(pairs) > 3
            for sub_key, sub_value in (pairs[:2] if truncated else pairs):
                print(f"{pad} {sub_key}: {clip(sub_value)}")
            if truncated:
                print(f"{pad} ... 还有 {len(pairs) - 2} 个字段")
            continue

        print(f"{pad}❓ {key}: {value}")
|
|
|
|
def main():
    """Run the viewer: connect, print structure, statistics and recent data.

    Returns early when the connection cannot be established. The client is
    closed on exit, including when the user interrupts with Ctrl+C.
    """
    print("🚀 MongoDB 数据库快速查看工具")
    print(f"⏰ 查看时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    client = connect_mongodb()
    if not client:
        return

    try:
        # Report sections, in display order: structure, statistics, recent data.
        display_database_info(client)
        display_statistics(client)
        display_recent_data(client)

        closing_lines = (
            "\n" + "=" * 80,
            "✅ 数据库查看完成!",
            "💡 提示: 运行 'python scripts/mongodb_viewer.py' 可以使用交互式查看器",
            "🔄 提示: 重新运行此脚本可以查看最新数据",
        )
        for line in closing_lines:
            print(line)

    except KeyboardInterrupt:
        print("\n👋 程序被用户中断")
    finally:
        if client:
            client.close()
|
|
|
|
# Run the viewer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()