Compare commits

a726e4d8b3f90c0f912408848407d12756673e62..36be77948fa2121dbae4fb6354936260838ccfcf

No commits in common. "a726e4d8b3f90c0f912408848407d12756673e62" and "36be77948fa2121dbae4fb6354936260838ccfcf" have entirely different histories.

4 changed files with 104 additions and 196 deletions

View File

@@ -66,8 +66,6 @@ def setup_logging(quiet_mode=False):
 class DouyinAutoScheduler:
     def __init__(self):
         self.is_running = False
-        # Create a logger instance
-        self.logger = logging.getLogger(__name__)
 
     def _normalize_play_vv(self, play_vv):
         """Normalize the play-count data type, converting strings to numbers"""
@@ -84,33 +82,23 @@ class DouyinAutoScheduler:
         """Deduplicate by series name, keeping the record with the highest play count"""
         unique_data = {}
         for video in videos:
-            mix_name = video.get("mix_name", "").strip()
-
-            # Filter out empty or invalid mix_name values
-            if not mix_name or mix_name == "" or mix_name.lower() == "null":
-                self.logger.warning(f"Skipping record with empty or invalid mix_name: {video.get('_id', 'unknown')}")
-                continue
-
-            # Normalize the play-count data type
-            play_vv = self._normalize_play_vv(video.get("play_vv", 0))
-
-            # Ensure the play count is greater than 0; filter out invalid data
-            if play_vv <= 0:
-                self.logger.warning(f"Skipping record with zero or invalid play count: mix_name={mix_name}, play_vv={video.get('play_vv', 0)}")
-                continue
-
-            if mix_name not in unique_data or play_vv > unique_data[mix_name].get("play_vv", 0):
-                if include_rank:
-                    # Format used for yesterday's data
-                    unique_data[mix_name] = {
-                        "play_vv": play_vv,
-                        "video_id": str(video.get("_id", "")),
-                        "rank": 0  # rank is computed later
-                    }
-                else:
-                    # Format used for today's data; update the original video object in place
-                    video["play_vv"] = play_vv
-                    unique_data[mix_name] = video
+            mix_name = video.get("mix_name", "")
+            if mix_name:
+                # Normalize the play-count data type
+                play_vv = self._normalize_play_vv(video.get("play_vv", 0))
+
+                if mix_name not in unique_data or play_vv > unique_data[mix_name].get("play_vv", 0):
+                    if include_rank:
+                        # Format used for yesterday's data
+                        unique_data[mix_name] = {
+                            "play_vv": play_vv,
+                            "video_id": str(video.get("_id", "")),
+                            "rank": 0  # rank is computed later
+                        }
+                    else:
+                        # Format used for today's data; update the original video object in place
+                        video["play_vv"] = play_vv
+                        unique_data[mix_name] = video
 
         return unique_data
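Note: both sides implement the same keep-the-maximum dedupe; the new side merely drops the validation, so values such as "null" or records with a play_vv of 0 now pass through. A minimal standalone sketch of the shared pattern, with a hypothetical function name (the same pattern recurs in the two hunks below):

```python
from typing import Any

def dedupe_keep_max(videos: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
    """Keep, per mix_name, only the record with the highest play_vv."""
    unique: dict[str, dict[str, Any]] = {}
    for video in videos:
        name = video.get("mix_name", "")
        if not name:  # new side checks truthiness only; "null" strings are no longer rejected
            continue
        if name not in unique or video.get("play_vv", 0) > unique[name].get("play_vv", 0):
            unique[name] = video
    return unique

# dedupe_keep_max([{"mix_name": "A", "play_vv": 5}, {"mix_name": "A", "play_vv": 9}])
# -> {"A": {"mix_name": "A", "play_vv": 9}}
```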
@@ -193,21 +181,10 @@ class DouyinAutoScheduler:
                 logging.info(f"📊 Latest batch record count: {len(today_videos_raw)}")
 
                 # Deduplicate by series name, keeping only the highest-play-count record per series
-                # 🚫 Filter out records with an empty/invalid mix_name or a play count of 0
                 unique_videos = {}
                 for video in today_videos_raw:
-                    mix_name = video.get("mix_name", "").strip()
-                    play_vv = video.get("play_vv", 0)
-
-                    # Filter out empty or invalid mix_name values
-                    if not mix_name or mix_name == "" or mix_name.lower() == "null":
-                        continue
-
-                    # Filter out records with a zero or invalid play count
-                    if play_vv <= 0:
-                        continue
-
-                    if mix_name not in unique_videos or play_vv > unique_videos[mix_name].get("play_vv", 0):
+                    mix_name = video.get("mix_name", "")
+                    if mix_name and (mix_name not in unique_videos or video.get("play_vv", 0) > unique_videos[mix_name].get("play_vv", 0)):
                         unique_videos[mix_name] = video
 
                 today_videos = list(unique_videos.values())
@@ -237,21 +214,10 @@ class DouyinAutoScheduler:
                 }).sort("play_vv", -1))
 
                 # Deduplicate by series name, keeping only the highest-play-count record per series
-                # 🚫 Filter out records with an empty/invalid mix_name or a play count of 0
                 unique_yesterday_videos = {}
                 for video in yesterday_videos_raw:
-                    mix_name = video.get("mix_name", "").strip()
-                    play_vv = video.get("play_vv", 0)
-
-                    # Filter out empty or invalid mix_name values
-                    if not mix_name or mix_name == "" or mix_name.lower() == "null":
-                        continue
-
-                    # Filter out records with a zero or invalid play count
-                    if play_vv <= 0:
-                        continue
-
-                    if mix_name not in unique_yesterday_videos or play_vv > unique_yesterday_videos[mix_name].get("play_vv", 0):
+                    mix_name = video.get("mix_name", "")
+                    if mix_name and (mix_name not in unique_yesterday_videos or video.get("play_vv", 0) > unique_yesterday_videos[mix_name].get("play_vv", 0)):
                         unique_yesterday_videos[mix_name] = video
 
                 # Convert yesterday's data into a dict keyed by series name
@@ -315,44 +281,24 @@ class DouyinAutoScheduler:
             rankings_management_collection = db['Rankings_management']
 
             # Generate the sorted ranking data
-            rank = 1  # use an independent rank counter
-            for item in videos_with_growth:
+            for i, item in enumerate(videos_with_growth, 1):
                 video = item["video"]
                 video_id = str(video.get("_id", ""))
                 current_play_vv = video.get("play_vv", 0)
-                mix_name = video.get("mix_name", "").strip()
-
-                # 🚫 Skip invalid data: ensure mix_name is non-empty and the play count is positive
-                # Note: these records should already have been filtered out during deduplication; this is a second safety net
-                if not mix_name or mix_name == "" or mix_name.lower() == "null":
-                    self.logger.warning(f"Skipping record with empty mix_name, video_id: {video_id}")
-                    continue
-                if current_play_vv <= 0:
-                    self.logger.warning(f"Skipping record with invalid play count: mix_name={mix_name}, play_vv={current_play_vv}")
-                    continue
+                mix_name = video.get("mix_name", "")
 
                 # Compute the rank change (based on yesterday's rank)
                 rank_change = 0
                 if not item["is_new"] and item["yesterday_data"]:
                     yesterday_rank = item["yesterday_data"].get("rank", 0)
-                    rank_change = yesterday_rank - rank  # use the current rank counter
+                    rank_change = yesterday_rank - i
 
-                # 🔍 Fetch details from Rankings_management, querying by date and mix_name
-                today_str = datetime.now().strftime('%Y-%m-%d')
-                management_data = rankings_management_collection.find_one({
-                    "mix_name": mix_name,
-                    "$or": [
-                        {"created_at": {"$gte": datetime.strptime(today_str, '%Y-%m-%d'),
-                                        "$lt": datetime.strptime(today_str, '%Y-%m-%d') + timedelta(days=1)}},
-                        {"last_updated": {"$gte": datetime.strptime(today_str, '%Y-%m-%d'),
-                                          "$lt": datetime.strptime(today_str, '%Y-%m-%d') + timedelta(days=1)}}
-                    ]
-                })
+                # 🔍 Fetch details from Rankings_management
+                management_data = rankings_management_collection.find_one({"mix_name": mix_name})
 
                 ranking_item = {
                     # 🎯 Core ranking fields
-                    "rank": rank,  # use the rank counter
+                    "rank": i,
                     "title": mix_name,
                     "mix_name": mix_name,  # make sure the mix_name field is included, for syncing
                     "play_vv": current_play_vv,
@@ -398,7 +344,6 @@ class DouyinAutoScheduler:
                 }
 
                 comprehensive_ranking["data"].append(ranking_item)
-                rank += 1  # increment the rank counter
 
             # Add a unique timestamp to each run to guarantee data uniqueness
             current_timestamp = datetime.now()
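Note: the deleted lookup scoped Rankings_management to documents created or updated on the current day; the replacement matches on mix_name alone and can therefore return a stale document from an earlier day. A sketch of the removed day-window pattern, assuming a PyMongo collection (helper name hypothetical):

```python
from datetime import datetime, timedelta

def find_today(collection, mix_name):
    """Restrict find_one to documents created or updated today."""
    day_start = datetime.combine(datetime.now().date(), datetime.min.time())
    day_end = day_start + timedelta(days=1)
    return collection.find_one({
        "mix_name": mix_name,
        "$or": [
            {"created_at": {"$gte": day_start, "$lt": day_end}},
            {"last_updated": {"$gte": day_start, "$lt": day_end}},
        ],
    })
```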

View File

@@ -769,16 +769,6 @@ class DouyinPlayVVScraper:
                         play_vv = statis.get('play_vv')
                         if isinstance(play_vv, (int, str)) and str(play_vv).isdigit():
                             vv = int(play_vv)
-
-                            # Data validation: ensure the play count is positive and the collection name is non-empty
-                            if vv <= 0:
-                                logging.warning(f"Skipping invalid play-count data: mix_name={mix_name}, play_vv={vv}")
-                                return
-                            if not mix_name or mix_name.strip() == "":
-                                logging.warning(f"Skipping data missing a collection name: play_vv={vv}")
-                                return
 
                             # Build the collection URL
                             video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
@@ -982,15 +972,6 @@ class DouyinPlayVVScraper:
                             vv = int(match.group(3))
                             episodes = int(match.group(4))
-
-                            # Data validation: ensure the play count is positive and the collection name is non-empty
-                            if vv <= 0:
-                                logging.warning(f"Regex extraction: skipping invalid play-count data: mix_name={mix_name}, play_vv={vv}")
-                                continue
-                            if not mix_name or mix_name.strip() == "":
-                                logging.warning(f"Regex extraction: skipping data missing a collection name: play_vv={vv}")
-                                continue
 
                             # Build the collection URL
                             video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
@@ -1025,17 +1006,27 @@ class DouyinPlayVVScraper:
             for match in re.findall(r'"play_vv"\s*:\s*(\d+)', text):
                 try:
                     vv = int(match)
-
-                    # Data validation: skip invalid play-count data
-                    if vv <= 0:
-                        logging.warning(f"Skipping invalid play-count data: play_vv={vv}")
-                        continue
 
                     # Check whether an identical play_vv already exists
                     if not any(item['play_vv'] == vv for item in self.play_vv_items):
-                        # Full collection info is unavailable here, so skip these incomplete records
-                        # to avoid producing invalid records with an empty mix_name
-                        logging.warning(f"Skipping incomplete record: play_vv={vv}, missing collection name")
-                        continue
+                        # Build the collection record
+                        item_data = {
+                            'play_vv': vv,
+                            'formatted': self.format_count(vv),
+                            'url': source_url,
+                            'request_id': request_id,
+                            'mix_name': '',  # collection name unknown
+                            'video_url': '',  # URL unknown
+                            'mix_id': '',  # mix_id unknown
+                            'updated_to_episode': None,  # episode count unknown
+                            'timestamp': datetime.now().isoformat()
+                        }
+
+                        # Append to the list (preserving the original logic)
+                        self.play_vv_items.append(item_data)
+
+                        # Save to the database in real time (arguably unnecessary for unknown collections, but kept for consistency)
+                        if self.realtime_save_enabled:
+                            self.save_single_item_realtime(item_data)
                 except Exception:
                     continue
@@ -1138,17 +1129,25 @@ class DouyinPlayVVScraper:
             for m in re.findall(r'"statis"\s*:\s*\{[^}]*"play_vv"\s*:\s*(\d+)[^}]*\}', page_source):
                 try:
                     vv = int(m)
-
-                    # Data validation: skip invalid play-count data
-                    if vv <= 0:
-                        logging.warning(f"Skipping invalid play-count data: play_vv={vv}")
-                        continue
 
                     # Check whether an identical play_vv already exists
                     if not any(item['play_vv'] == vv for item in self.play_vv_items):
-                        # Full collection info cannot be recovered from statis, so skip these incomplete records
-                        # to avoid producing invalid records with an empty mix_name
-                        logging.warning(f"Skipping incomplete record: play_vv={vv}, from statis but missing collection name")
-                        continue
+                        # Build the collection record
+                        item_data = {
+                            'play_vv': vv,
+                            'formatted': self.format_count(vv),
+                            'url': 'page_source_statis',
+                            'request_id': None,
+                            'mix_name': '',  # collection name is not recoverable from statis
+                            'video_url': '',  # URL is not recoverable from statis
+                            'timestamp': datetime.now().isoformat()
+                        }
+
+                        # Append to the list (preserving the original logic)
+                        self.play_vv_items.append(item_data)
+
+                        # Save to the database in real time
+                        if self.realtime_save_enabled:
+                            self.save_single_item_realtime(item_data)
                 except Exception:
                     pass
         except Exception:
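Note: with this change the statis fallback stores records whose only populated field is play_vv. The regex it iterates over can be checked in isolation; a toy run against an invented sample payload:

```python
import re

# Invented sample, not a real Douyin payload
page_source = '{"statis": {"digg_count": 7, "play_vv": 123456}}'

pattern = r'"statis"\s*:\s*\{[^}]*"play_vv"\s*:\s*(\d+)[^}]*\}'
print([int(m) for m in re.findall(pattern, page_source)])  # -> [123456]
```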

View File

@@ -1200,18 +1200,8 @@ def update_content_classification():
         }
         field_name = field_mapping[classification_type]
 
-        # First fetch the series' mix_id from Rankings_management, using today's date
-        today = datetime.now().date()
-        start_of_day = datetime.combine(today, datetime.min.time())
-        end_of_day = datetime.combine(today, datetime.max.time())
-        mgmt_doc = rankings_management_collection.find_one({
-            "mix_name": mix_name,
-            "$or": [
-                {"created_at": {"$gte": start_of_day, "$lte": end_of_day}},
-                {"last_updated": {"$gte": start_of_day, "$lte": end_of_day}}
-            ]
-        })
+        # First fetch the series' mix_id from Rankings_management
+        mgmt_doc = rankings_management_collection.find_one({"mix_name": mix_name})
 
         if not mgmt_doc:
             return jsonify({"success": False, "message": f"Series not found: {mix_name}"})
@@ -1296,14 +1286,8 @@ def update_content_classification():
         logging.info(f"Classification update: {message}, Rankings_management({result_mgmt.modified_count}), Ranking_storage({result_storage.modified_count})")
 
-        # Fetch the post-update classification status, using today's date
-        updated_mgmt_doc = rankings_management_collection.find_one({
-            "mix_name": mix_name,
-            "$or": [
-                {"created_at": {"$gte": start_of_day, "$lte": end_of_day}},
-                {"last_updated": {"$gte": start_of_day, "$lte": end_of_day}}
-            ]
-        })
+        # Fetch the post-update classification status
+        updated_mgmt_doc = rankings_management_collection.find_one({"mix_name": mix_name})
 
         classification_status = {
             'novel': mix_id in updated_mgmt_doc.get('Novel_IDs', []) if updated_mgmt_doc else False,
             'anime': mix_id in updated_mgmt_doc.get('Anime_IDs', []) if updated_mgmt_doc else False,
@@ -1537,84 +1521,41 @@ def sync_ranking_storage_fields(target_date=None, force_update=False, max_retrie
             # Iterate over every item in the data array
             for data_item in data_array:
                 try:
-                    mix_name = data_item.get('mix_name', '').strip()
-
-                    # 🚫 Skip invalid data: ensure mix_name is non-empty
-                    if not mix_name or mix_name == "" or mix_name.lower() == "null":
-                        logging.warning(f"Skipping record with empty or invalid mix_name: {data_item.get('_id', 'unknown')}")
-                        continue  # skip without adding to updated_data_array
+                    mix_name = data_item.get('mix_name')
 
                     # 🔧 Enhanced logic: if mix_name is empty, try to locate the matching data another way
                     source_data = None
 
-                    # Build the date query - look up that day's data
-                    start_of_day = datetime.combine(target_date_obj, datetime.min.time())
-                    end_of_day = datetime.combine(target_date_obj, datetime.max.time())
-                    date_query = {
-                        "$or": [
-                            {"created_at": {"$gte": start_of_day, "$lte": end_of_day}},
-                            {"last_updated": {"$gte": start_of_day, "$lte": end_of_day}}
-                        ]
-                    }
-
                     if mix_name:
-                        # Prefer lookup by mix_name - fetch from Rankings_management, with a date filter
-                        query = {"mix_name": mix_name}
-                        query.update(date_query)
-                        source_data = rankings_management_collection.find_one(query)
+                        # Prefer lookup by mix_name - fetch from Rankings_management
+                        source_data = rankings_management_collection.find_one({"mix_name": mix_name})
 
                     # If nothing was found via mix_name, or mix_name is empty, try other matching strategies
                     if not source_data:
                         # Method 1: match by mix_id (if present)
                         mix_id = data_item.get('mix_id')
                         if mix_id:
-                            query = {"mix_id": mix_id}
-                            query.update(date_query)
-                            source_data = rankings_management_collection.find_one(query)
+                            source_data = rankings_management_collection.find_one({"mix_id": mix_id})
                             if source_data:
                                 logging.info(f"Found data via mix_id: {mix_id} -> {source_data.get('mix_name', 'N/A')}")
 
                         # Method 2: if still not found, try matching by title
                         if not source_data:
                             title = data_item.get('title')
-                            if title and title.strip():
-                                query = {"mix_name": title.strip()}
-                                query.update(date_query)
-                                source_data = rankings_management_collection.find_one(query)
+                            if title:
+                                source_data = rankings_management_collection.find_one({"mix_name": title})
                                 if source_data:
                                     logging.info(f"Found data via title: {title} -> {source_data.get('mix_name', 'N/A')}")
 
                     # If source data was found, fill in mix_name (when it was originally empty)
                     if source_data and not mix_name:
-                        mix_name = source_data.get('mix_name', '').strip()
-                        if mix_name:
-                            data_item['mix_name'] = mix_name
-                            logging.info(f"Repaired empty mix_name: {data_item.get('title', 'N/A')} -> {mix_name}")
-                        else:
-                            logging.warning(f"mix_name in the source data is also empty; skipping this record")
-                            continue  # skip the invalid record
+                        mix_name = source_data.get('mix_name', '')
+                        data_item['mix_name'] = mix_name
+                        logging.info(f"Repaired empty mix_name: {data_item.get('title', 'N/A')} -> {mix_name}")
 
-                    # If source data is still missing, check whether locked fields need protecting
+                    # If source data is still missing, keep the original item unchanged
                     if not source_data:
                         logging.warning(f"Could not find matching source data: mix_name={mix_name}, mix_id={data_item.get('mix_id')}, title={data_item.get('title')}")
-
-                        # Check for locked fields; if any are locked, keep the original data unchanged
-                        field_lock_status = ranking_doc.get('field_lock_status', {})
-                        has_locked_fields = any([
-                            field_lock_status.get('Manufacturing_Field_locked', False),
-                            field_lock_status.get('Copyright_field_locked', False),
-                            field_lock_status.get('Novel_IDs_locked', False),
-                            field_lock_status.get('Anime_IDs_locked', False),
-                            field_lock_status.get('Drama_IDs_locked', False)
-                        ])
-
-                        if has_locked_fields:
-                            logging.info(f"Keeping locked fields unchanged: {mix_name} (no source data, but locked fields present)")
-                            updated_data_array.append(data_item)
-                        else:
-                            # Keep the record only when mix_name is valid and there are no locked fields
-                            if mix_name and mix_name.strip():
-                                updated_data_array.append(data_item)
+                        updated_data_array.append(data_item)
                         continue
 
                     # Check whether an update is needed - covers all Rankings_management fields
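Note: the deleted branch was the only guard for manually locked fields; the new side appends every unmatched item unconditionally. A condensed sketch of the removed lock check (tuple name hypothetical):

```python
LOCK_FLAGS = (
    'Manufacturing_Field_locked',
    'Copyright_field_locked',
    'Novel_IDs_locked',
    'Anime_IDs_locked',
    'Drama_IDs_locked',
)

def has_locked_fields(ranking_doc: dict) -> bool:
    """True when any per-field lock flag is set on the ranking document."""
    status = ranking_doc.get('field_lock_status', {})
    return any(status.get(flag, False) for flag in LOCK_FLAGS)
```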

View File

@@ -253,9 +253,9 @@ const getRankBadgeClass = (rank) => {
 }
 
 //
-// const goToAdmin = () => {
-//   router.push('/admin')
-// }
+const goToAdmin = () => {
+  router.push('/admin')
+}
 //
 
 onMounted(() => {
@@ -275,6 +275,7 @@ onMounted(() => {
         <div class="header-section">
           <div class="title-wrapper">
             <h1 class="main-title">AI棒榜</h1>
+            <button class="admin-btn" @click="goToAdmin">Admin</button>
           </div>
         </div>
@@ -460,7 +461,10 @@ onMounted(() => {
   gap: 10px;
   position: relative;
 }
+
+.logo-icon {
+  width: 40px;
+  height: 40px;
+}
 
 .main-title {
   font-size: 24px;
   font-weight: bold;
@@ -469,6 +473,25 @@ onMounted(() => {
   font-family: Alatsi, 'PingFang SC', 'Hiragino Sans GB', 'Microsoft YaHei', SimHei, Arial, Helvetica, sans-serif;
 }
 
+.admin-btn {
+  position: absolute;
+  right: 20px;
+  top: 50%;
+  transform: translateY(-50%);
+  background: #4a90e2;
+  color: white;
+  border: none;
+  border-radius: 6px;
+  padding: 6px 12px;
+  font-size: 12px;
+  cursor: pointer;
+  transition: background-color 0.3s ease;
+}
+
+.admin-btn:hover {
+  background: #357abd;
+}
 
 /* Banner section */
 .banner-section {
   margin: 20px 16px;