Features: - Douyin play count scraper using Selenium + Chrome DevTools Protocol - Automated scheduler for daily data collection - MongoDB data storage - Mini-program API server - Data analysis and visualization tools 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
142 lines
4.9 KiB
Python
142 lines
4.9 KiB
Python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Query Douyin play-count data stored in MongoDB.
"""
|
||
|
||
import re
from datetime import datetime

from pymongo import MongoClient
from pymongo.errors import ConnectionFailure
|
||
|
||
def connect_mongodb():
    """Connect to the local MongoDB server and return the play-count collection.

    The connection is verified with a ``ping`` command so that an unreachable
    server is detected here rather than on the first query.

    Returns:
        A ``(client, collection)`` tuple on success, or ``(None, None)`` when
        the server cannot be reached or any other error occurs. The outcome
        is also reported on stdout.
    """
    client = None
    try:
        client = MongoClient('mongodb://localhost:27017/', serverSelectionTimeoutMS=5000)
        # Constructing the client does not fail on an unreachable server;
        # the ping forces server selection within the 5 s timeout above.
        client.admin.command('ping')
        # NOTE(review): the database name contains two em-dash characters;
        # confirm it matches the name used by the code that writes the data
        # (possibly 'douyin_data' was intended).
        db = client['douyin——data']
        collection = db['playcounts']
        print("MongoDB连接成功")
        return client, collection
    except ConnectionFailure:
        print("MongoDB连接失败,请确保MongoDB服务已启动")
    except Exception as e:
        print(f"MongoDB连接出错: {e}")

    # Failure path: release the client created above so it is not left open.
    if client is not None:
        client.close()
    return None, None
|
||
|
||
def query_latest_batches(collection, limit=5):
    """Print the most recent batches together with the items in each batch.

    Documents are grouped by ``batch_id``; the groups are ordered by
    ``batch_time`` (newest first) and truncated to ``limit``. For every batch
    the matching documents are fetched and printed, ordered by ``rank`` when
    the first document carries that field, otherwise by ``playcount_number``
    in descending order when the first document carries that one.

    Args:
        collection: Object exposing ``aggregate(pipeline)`` and
            ``find(filter, projection)`` (a pymongo collection in this tool).
        limit: Maximum number of batches to show.

    Returns:
        None. All output, including any error, is written to stdout.
    """
    try:
        # Newest batches first, one summary row per batch_id.
        pipeline = [
            {"$group": {
                "_id": "$batch_id",
                "batch_time": {"$first": "$batch_time"},
                "count": {"$sum": 1}
            }},
            {"$sort": {"batch_time": -1}},
            {"$limit": limit}
        ]

        batches = list(collection.aggregate(pipeline))

        if not batches:
            print("暂无数据")
            return

        print(f"\n===== 最近 {len(batches)} 个批次 =====")
        for batch in batches:
            batch_time = batch['batch_time'].strftime("%Y-%m-%d %H:%M:%S")
            print(f"批次ID: {batch['_id']}, 时间: {batch_time}, 数据条数: {batch['count']}")

            batch_data = list(collection.find(
                {"batch_id": batch['_id']},
                {"name": 1, "playcount": 1, "rank": 1, "playcount_number": 1, "_id": 0}
            ))

            # A stored null would make the sort compare None with int and
            # raise TypeError, so missing and null values share one fallback:
            # 999 (sorts last) for rank, 0 for playcount_number.
            if batch_data and 'rank' in batch_data[0]:
                batch_data.sort(
                    key=lambda doc: 999 if doc.get('rank') is None else doc['rank']
                )
            elif batch_data and 'playcount_number' in batch_data[0]:
                batch_data.sort(
                    key=lambda doc: doc.get('playcount_number') or 0,
                    reverse=True,
                )

            for item in batch_data:
                rank = item.get('rank')
                # Only label items that actually have a rank value.
                rank_info = f"[第{rank}名] " if rank is not None else ""
                print(f" {rank_info}{item['name']}")
                print(f" 播放量: {item['playcount']}")
            print()

    except Exception as e:
        print(f"查询数据失败: {e}")
|
||
|
||
def query_by_name(collection, name_keyword):
    """Print every record whose ``name`` contains the given keyword.

    The keyword is matched literally and case-insensitively: it is escaped
    before being used as a ``$regex`` pattern, so characters such as ``(``,
    ``.`` or ``*`` typed by the user match themselves instead of being
    interpreted as regular-expression syntax.

    Args:
        collection: Object whose ``find(query)`` result supports
            ``sort(key, direction)`` (a pymongo collection in this tool).
        name_keyword: Text to look for inside the ``name`` field.

    Returns:
        None. Matches (newest ``batch_time`` first) or an error message are
        written to stdout.
    """
    try:
        # Escape user input so it is treated as plain text, not as a pattern.
        query = {"name": {"$regex": re.escape(name_keyword), "$options": "i"}}
        results = list(collection.find(query).sort("batch_time", -1))

        if not results:
            print(f"未找到包含'{name_keyword}'的剧本")
            return

        print(f"\n===== 包含'{name_keyword}'的剧本 =====")
        for result in results:
            batch_time = result['batch_time'].strftime("%Y-%m-%d %H:%M:%S")
            print(f"剧本: {result['name']}")
            print(f"播放量: {result['playcount']}")
            print(f"抓取时间: {batch_time}")
            print(f"批次ID: {result['batch_id']}")
            print("-" * 30)

    except Exception as e:
        print(f"查询失败: {e}")
|
||
|
||
def main():
    """Run the interactive console menu of the query tool.

    Connects to MongoDB, then loops over a three-option menu: list the most
    recent batches, search by name keyword, or exit. The loop ends on option
    3, on Ctrl-C, or when stdin is closed; the MongoDB client is closed in
    every case.
    """
    print("抖音播放量数据查询工具")
    print("=" * 40)

    client, collection = connect_mongodb()
    if collection is None:
        return

    try:
        while True:
            print("\n请选择操作:")
            print("1. 查看最近的批次数据")
            print("2. 根据剧本名称搜索")
            print("3. 退出")

            choice = input("请输入选项 (1-3): ").strip()

            if choice == '1':
                raw_limit = input("显示最近几个批次? (默认5): ").strip()
                try:
                    limit = int(raw_limit) if raw_limit else 5
                except ValueError:
                    limit = 5
                # The value ends up in a $limit stage, which only accepts
                # positive integers; fall back to the default otherwise.
                if limit < 1:
                    limit = 5
                query_latest_batches(collection, limit)

            elif choice == '2':
                keyword = input("请输入剧本名称关键词: ").strip()
                if keyword:
                    query_by_name(collection, keyword)
                else:
                    print("关键词不能为空")

            elif choice == '3':
                break

            else:
                print("无效选项,请重新选择")

    except (KeyboardInterrupt, EOFError):
        # EOFError: input() hit end-of-file (closed or exhausted stdin);
        # treat it like an interrupt instead of crashing with a traceback.
        print("\n用户中断操作")
    finally:
        if client is not None:
            client.close()
            print("已断开MongoDB连接")
|
||
|
||
# Start the interactive tool only when executed as a script, not on import.
if __name__ == '__main__':
    main()