# rank_backend/scripts/query_mongodb_data.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
查询MongoDB中的抖音播放量数据
"""

from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
from datetime import datetime

def connect_mongodb():
    """Connect to the local MongoDB server and open the play-count collection.

    The connection is verified with a ``ping`` command so that an unreachable
    server is reported here rather than on the first query.

    Returns:
        A ``(client, collection)`` tuple on success, or ``(None, None)`` when
        the connection could not be established. On failure the partially
        created client is closed before returning.
    """
    client = None
    try:
        client = MongoClient('mongodb://localhost:27017/', serverSelectionTimeoutMS=5000)
        # Constructing the client does not prove the server is reachable;
        # the ping forces a round trip to the server.
        client.admin.command('ping')
        # NOTE(review): the database name contains two em-dash characters
        # ("——") rather than an underscore - confirm this matches the name
        # used by the code that writes the data.
        db = client['douyin——data']
        collection = db['playcounts']
        print("MongoDB连接成功")
        return client, collection
    except ConnectionFailure:
        print("MongoDB连接失败，请确保MongoDB服务已启动")
    except Exception as e:
        print(f"MongoDB连接出错: {e}")

    # Only reached on failure: release the client so it is not leaked.
    if client is not None:
        client.close()
    return None, None

def _format_batch_time(value):
    """Render a batch timestamp for display, tolerating non-datetime values."""
    if value is None:
        return "未知"
    if hasattr(value, "strftime"):
        return value.strftime("%Y-%m-%d %H:%M:%S")
    return str(value)

def _sort_batch_items(items):
    """Order a batch's entries by rank if any entry has one, else by play count."""
    items = list(items)

    if any(item.get('rank') is not None for item in items):
        def rank_key(item):
            rank = item.get('rank')
            # Entries without a rank sort after all ranked entries.
            return (rank is None, 0 if rank is None else rank)
        return sorted(items, key=rank_key)

    if any(item.get('playcount_number') is not None for item in items):
        return sorted(
            items,
            key=lambda item: item.get('playcount_number') or 0,
            reverse=True,
        )

    return items

def query_latest_batches(collection, limit=5):
    """Print the most recent batches together with the entries of each batch.

    Documents are grouped by ``batch_id``; the groups are sorted by
    ``batch_time`` descending and the first ``limit`` groups are shown. For
    every batch the entries are listed by ``rank`` ascending when any entry
    carries a rank (unranked entries last), otherwise by ``playcount_number``
    descending, otherwise in the order returned by the query.

    Entries with missing fields are still printed (with a placeholder)
    instead of aborting the whole listing.

    Args:
        collection: Collection-like object providing ``aggregate`` and ``find``.
        limit: Maximum number of batches to display.

    Returns:
        None. Results and errors are printed to standard output.
    """
    try:
        pipeline = [
            {"$group": {
                "_id": "$batch_id",
                "batch_time": {"$first": "$batch_time"},
                "count": {"$sum": 1}
            }},
            {"$sort": {"batch_time": -1}},
            {"$limit": limit}
        ]

        batches = list(collection.aggregate(pipeline))

        if not batches:
            print("暂无数据")
            return

        print(f"\n===== 最近 {len(batches)} 个批次 =====")
        for batch in batches:
            batch_time = _format_batch_time(batch.get('batch_time'))
            print(f"批次ID: {batch['_id']}, 时间: {batch_time}, 数据条数: {batch['count']}")

            # Fetch only the fields needed for display and ordering.
            batch_data = _sort_batch_items(collection.find(
                {"batch_id": batch['_id']},
                {"name": 1, "playcount": 1, "rank": 1, "playcount_number": 1, "_id": 0}
            ))

            for item in batch_data:
                rank = item.get('rank')
                rank_info = f"[第{rank}名] " if rank is not None else ""
                print(f"  {rank_info}{item.get('name', '未知')}")
                print(f"    播放量: {item.get('playcount', '未知')}")
            print()

    except Exception as e:
        print(f"查询数据失败: {e}")

def query_by_name(collection, name_keyword):
    """Print every record whose name contains the given keyword.

    The match is a case-insensitive substring search. The keyword is escaped
    before being used as a ``$regex`` pattern, so characters such as ``(``,
    ``*`` or ``.`` are matched literally instead of being interpreted as
    regular-expression syntax.

    Args:
        collection: Collection-like object whose ``find`` result supports
            ``sort``.
        name_keyword: Text to look for inside the ``name`` field.

    Returns:
        None. Results and errors are printed to standard output.
    """
    # Imported locally so this function does not depend on a module-level import.
    import re

    try:
        query = {"name": {"$regex": re.escape(name_keyword), "$options": "i"}}
        # Newest records first.
        results = list(collection.find(query).sort("batch_time", -1))

        if not results:
            print(f"未找到包含'{name_keyword}'的剧本")
            return

        print(f"\n===== 包含'{name_keyword}'的剧本 =====")
        for result in results:
            batch_time = result['batch_time'].strftime("%Y-%m-%d %H:%M:%S")
            print(f"剧本: {result['name']}")
            print(f"播放量: {result['playcount']}")
            print(f"抓取时间: {batch_time}")
            print(f"批次ID: {result['batch_id']}")
            print("-" * 30)

    except Exception as e:
        print(f"查询失败: {e}")

def main():
    """Run the interactive query menu until the user chooses to exit.

    Connects to MongoDB, then repeatedly prompts for one of three actions:
    show the latest batches, search by name keyword, or quit. The MongoDB
    client is always closed on exit, including on Ctrl+C or end of input.
    """
    print("抖音播放量数据查询工具")
    print("=" * 40)

    client, collection = connect_mongodb()
    if collection is None:
        return

    try:
        while True:
            print("\n请选择操作:")
            print("1. 查看最近的批次数据")
            print("2. 根据剧本名称搜索")
            print("3. 退出")

            choice = input("请输入选项 (1-3): ").strip()

            if choice == '1':
                raw_limit = input("显示最近几个批次? (默认5): ").strip()
                try:
                    limit = int(raw_limit) if raw_limit else 5
                except ValueError:
                    limit = 5
                # The aggregation $limit stage only accepts positive values,
                # so treat zero/negative input like any other invalid input.
                if limit < 1:
                    limit = 5
                query_latest_batches(collection, limit)

            elif choice == '2':
                keyword = input("请输入剧本名称关键词: ").strip()
                if keyword:
                    query_by_name(collection, keyword)
                else:
                    print("关键词不能为空")

            elif choice == '3':
                break

            else:
                print("无效选项，请重新选择")

    except (KeyboardInterrupt, EOFError):
        # EOFError is raised by input() when stdin is closed (e.g. piped
        # input runs out); handle it like an interrupt instead of crashing.
        print("\n用户中断操作")
    finally:
        if client is not None:
            client.close()
        print("已断开MongoDB连接")

# Start the interactive tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()