rank_backend/scripts/query_mongodb_data.py
Qyir 53160420d1 Initial commit: Douyin play count tracking system
Features:
- Douyin play count scraper using Selenium + Chrome DevTools Protocol
- Automated scheduler for daily data collection
- MongoDB data storage
- Mini-program API server
- Data analysis and visualization tools

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-17 10:48:52 +08:00

142 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
查询MongoDB中的抖音播放量数据
"""
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from datetime import datetime
def connect_mongodb():
    """Connect to the local MongoDB server and open the play-count collection.

    The connection is verified with a ``ping`` command so that an unreachable
    server is reported here rather than on the first query.

    Returns:
        tuple: ``(client, collection)`` on success, ``(None, None)`` if the
        server cannot be reached or any other error occurs. On failure the
        partially created client is closed before returning.
    """
    client = None
    try:
        client = MongoClient('mongodb://localhost:27017/', serverSelectionTimeoutMS=5000)
        client.admin.command('ping')
        # NOTE(review): the database name contains two non-ASCII dash
        # characters rather than an underscore -- confirm it matches the name
        # the writer side uses before "fixing" it.
        db = client['douyin——data']
        collection = db['playcounts']
        print("MongoDB连接成功")
        return client, collection
    except ConnectionFailure:
        print("MongoDB连接失败请确保MongoDB服务已启动")
    except Exception as e:
        print(f"MongoDB连接出错: {e}")

    # Only reached on failure: release the client created above so it does
    # not linger after we report (None, None) to the caller.
    if client is not None:
        client.close()
    return None, None
def query_latest_batches(collection, limit=5):
    """Print a summary of the newest batches followed by each batch's records.

    Args:
        collection: Object exposing ``aggregate(pipeline)`` and
            ``find(filter, projection)`` (presumably a PyMongo collection).
        limit: Maximum number of batches to show; passed to ``$limit``.

    Returns:
        None. Results are printed; any exception raised while querying or
        formatting is caught and reported on stdout.
    """
    try:
        # One summary row per batch_id.
        # NOTE(review): "$first" takes batch_time from whichever document the
        # group sees first; presumably every document in a batch shares the
        # same batch_time -- confirm.
        group_stage = {"$group": {
            "_id": "$batch_id",
            "batch_time": {"$first": "$batch_time"},
            "count": {"$sum": 1}
        }}
        newest_first = {"$sort": {"batch_time": -1}}
        cap = {"$limit": limit}
        recent = list(collection.aggregate([group_stage, newest_first, cap]))

        if not recent:
            print("暂无数据")
            return

        print(f"\n===== 最近 {len(recent)} 个批次 =====")
        wanted_fields = {"name": 1, "playcount": 1, "rank": 1, "playcount_number": 1, "_id": 0}
        for summary in recent:
            stamp = summary['batch_time'].strftime("%Y-%m-%d %H:%M:%S")
            print(f"批次ID: {summary['_id']}, 时间: {stamp}, 数据条数: {summary['count']}")

            rows = list(collection.find({"batch_id": summary['_id']}, wanted_fields))

            # The first record decides the ordering: by rank ascending when it
            # carries one (records without a rank sort as 999), otherwise by
            # numeric play count descending.
            if rows:
                head = rows[0]
                if 'rank' in head:
                    rows = sorted(rows, key=lambda row: row.get('rank', 999))
                elif 'playcount_number' in head:
                    rows = sorted(rows, key=lambda row: row.get('playcount_number', 0), reverse=True)

            for row in rows:
                prefix = f"[第{row['rank']}名] " if 'rank' in row else ""
                print(f" {prefix}{row['name']}")
                print(f" 播放量: {row['playcount']}")
            # Blank separator after each batch.
            # NOTE(review): placement inferred -- confirm against the original layout.
            print()
    except Exception as e:
        print(f"查询数据失败: {e}")
def query_by_name(collection, name_keyword):
    """Print every record whose name contains the given keyword.

    The keyword is matched case-insensitively as literal text: regex
    metacharacters in it (``(``, ``.``, ``*`` ...) are escaped first, so a
    title such as ``"a.b(c"`` is searched verbatim instead of being
    interpreted as a pattern or triggering a regex error.

    Args:
        collection: Object exposing ``find(filter)`` whose result supports
            ``.sort(key, direction)`` (presumably a PyMongo collection).
        name_keyword: Substring to look for in the ``name`` field.

    Returns:
        None. Matches are printed newest first (sorted by ``batch_time``
        descending); any exception is caught and reported on stdout.
    """
    import re  # function-local so this block does not depend on file-level imports

    try:
        # Escape the user input so it is treated as a plain substring.
        literal = re.escape(name_keyword)
        query = {"name": {"$regex": literal, "$options": "i"}}
        results = list(collection.find(query).sort("batch_time", -1))
        if not results:
            print(f"未找到包含'{name_keyword}'的剧本")
            return

        print(f"\n===== 包含'{name_keyword}'的剧本 =====")
        for result in results:
            batch_time = result['batch_time'].strftime("%Y-%m-%d %H:%M:%S")
            print(f"剧本: {result['name']}")
            print(f"播放量: {result['playcount']}")
            print(f"抓取时间: {batch_time}")
            print(f"批次ID: {result['batch_id']}")
            print("-" * 30)
    except Exception as e:
        print(f"查询失败: {e}")
def main():
    """Run the interactive console menu for browsing stored play-count data.

    Connects via ``connect_mongodb()``, then loops over a three-option menu
    (recent batches, search by name, quit) until the user exits, presses
    Ctrl-C, or standard input is closed. The client is always closed on the
    way out.
    """
    print("抖音播放量数据查询工具")
    print("=" * 40)
    client, collection = connect_mongodb()
    # connect_mongodb() returns (None, None) on failure and has already
    # printed the reason.
    if collection is None:
        return

    try:
        while True:
            print("\n请选择操作:")
            print("1. 查看最近的批次数据")
            print("2. 根据剧本名称搜索")
            print("3. 退出")
            choice = input("请输入选项 (1-3): ").strip()

            if choice == '1':
                raw_limit = input("显示最近几个批次? (默认5): ").strip()
                try:
                    limit = int(raw_limit) if raw_limit else 5
                except ValueError:
                    limit = 5
                # The value ends up in a "$limit" stage, which only accepts
                # positive integers; fall back to the default for 0/negatives.
                if limit < 1:
                    limit = 5
                query_latest_batches(collection, limit)
            elif choice == '2':
                keyword = input("请输入剧本名称关键词: ").strip()
                if keyword:
                    query_by_name(collection, keyword)
                else:
                    print("关键词不能为空")
            elif choice == '3':
                break
            else:
                print("无效选项,请重新选择")
    except (KeyboardInterrupt, EOFError):
        # EOFError: input() hit end-of-file (closed/piped stdin); treat it
        # like Ctrl-C instead of ending with a traceback.
        print("\n用户中断操作")
    finally:
        if client is not None:
            client.close()
            print("已断开MongoDB连接")
# Start the interactive menu only when this file is executed as a script.
if __name__ == '__main__':
    main()