rank_backend/backend/handlers/Rankings/rank_data_scraper.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Selenium + Chrome DevTools Protocol 抓取抖音收藏合集真实播放量(play_vv)
核心能力:
- 启用CDP网络事件获取响应体并解析play_vv
- 复用本地Chrome用户数据绕过登录障碍
- 自动滚动与刷新触发更多API请求
- 同时解析页面中的SSR数据(window._SSR_HYDRATED_DATA/RENDER_DATA)
使用方法:
1) 默认复用 `config/chrome_profile_scraper`(定时器模式为 `config/chrome_profile_timer`)下的已登录Chrome配置。
2) 若仍需登录,请在弹出的Chrome中完成登录,回到终端后按提示输入 'ok' 确认。
3) 程序会滚动和刷新自动收集网络数据并提取play_vv。
"""
import json
import re
import subprocess
import time
import logging
import os
import shutil
from datetime import datetime
import requests
import base64
import uuid
import sys
import psutil
from typing import Dict, List, Optional, Set
import random
import threading
import argparse
from concurrent.futures import ThreadPoolExecutor # 使用线程池实现异步滑动和监控
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
# 保留导入但默认不使用webdriver_manager避免网络下载卡顿
from webdriver_manager.chrome import ChromeDriverManager # noqa: F401
# 添加项目根目录到 Python 路径
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..'))
# 确保能找到backend目录下的模块
backend_dir = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.insert(0, backend_dir)
from database import db
from handlers.Rankings.tos_client import oss_client
import config
# 配置日志
# 确保logs目录存在
script_dir = os.path.dirname(os.path.abspath(__file__))
logs_dir = os.path.join(script_dir, 'logs')
os.makedirs(logs_dir, exist_ok=True)
logging.basicConfig(
level=logging.INFO,
format='[%(levelname)s] %(message)s',
handlers=[
logging.FileHandler(os.path.join(logs_dir, 'douyin_scraper.log'), encoding='utf-8'),
logging.StreamHandler()
]
)
class UnifiedDataCollector:
"""统一数据收集器 - 解决数据重复和抓取不全问题"""
def __init__(self, driver, duration_s: int = 60):
self.driver = driver
self.duration_s = duration_s
# 统一数据存储 - 按mix_id去重
self.collected_items: Dict[str, dict] = {}
# 数据源统计
self.source_stats = {
'network': 0,
'ssr': 0,
'page': 0,
'filtered': 0
}
# 已知请求ID集合用于去重
self.known_request_ids: Set[str] = set()
# 目标关键词(收藏/合集/视频)
self.url_keywords = ['aweme', 'mix', 'collection', 'favorite', 'note', 'api']
# 是否在网络收集过程中周期性触发滚动加载(默认关闭以避免浪费时间)
self.enable_network_scroll: bool = False
logging.info('统一数据收集器初始化完成')
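# 用法示意(最小示例,假设 driver 已完成初始化并打开收藏合集页面):
#   collector = UnifiedDataCollector(driver, duration_s=60)
#   items = collector.collect_all_data()  # 返回按 mix_id 去重后的合集数据列表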
def collect_all_data(self) -> List[dict]:
"""统一的数据收集入口 - 整合所有数据源"""
logging.info('开始统一数据收集')
# 重置统计
self.source_stats = {'network': 0, 'ssr': 0, 'page': 0, 'filtered': 0}
# 按优先级收集数据
self._collect_from_network()
self._collect_from_ssr()
self._collect_from_page()
# 输出统计信息
self._log_collection_stats()
return list(self.collected_items.values())
def _collect_from_network(self):
"""从网络API监控收集数据"""
logging.info('开始网络API数据收集')
start_time = time.time()
last_scroll_time = start_time
while time.time() - start_time < self.duration_s:
try:
logs = self.driver.get_log('performance')
except Exception as e:
logging.warning(f'获取性能日志失败: {e}')
time.sleep(1)
continue
for entry in logs:
try:
message = json.loads(entry['message'])['message']
method = message.get('method')
params = message.get('params', {})
# 响应到达,尝试获取响应体
if method == 'Network.responseReceived':
req_id = params.get('requestId')
url = params.get('response', {}).get('url', '')
type_ = params.get('type') # XHR, Fetch, Document
if req_id and req_id not in self.known_request_ids:
self.known_request_ids.add(req_id)
# 仅处理XHR/Fetch
if type_ in ('XHR', 'Fetch') and any(k in url for k in self.url_keywords):
try:
body_obj = self.driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': req_id})
body_text = body_obj.get('body', '')
# 可能是base64编码
if body_obj.get('base64Encoded'):
try:
body_text = base64.b64decode(body_text).decode('utf-8', errors='ignore')
except Exception:
pass
# 解析数据
self._parse_and_add_item(body_text, url, req_id, 'network')
except Exception:
# 某些响应不可获取或过大
pass
except Exception:
continue
# 在收集过程中定期触发数据加载(默认关闭)
if self.enable_network_scroll:
current_time = time.time()
if current_time - last_scroll_time > 15: # 降低频率每15秒
# 若检测到底部则不再滚动
if not self._check_no_more_content():
self._trigger_mini_scroll()
last_scroll_time = current_time
time.sleep(0.8)
logging.info(f'网络API数据收集完成,发现 {self.source_stats["network"]} 个有效项')
def _trigger_mini_scroll(self):
"""在数据收集过程中触发滚动加载数据 - 增强版滚动机制"""
try:
logging.info('开始触发滚动加载数据...')
# 方式1强力滚动策略 - 模拟真实用户行为
try:
# 强力滚动:多次大幅度滚动确保触发懒加载
for i in range(5):
# 计算滚动距离,递增以确保效果
scroll_distance = 800 + (i * 300)
# 执行强力滚动
self.driver.execute_script(f"""
// 1. 强制滚动页面
window.scrollBy(0, {scroll_distance});
document.documentElement.scrollTop += {scroll_distance};
document.body.scrollTop += {scroll_distance};
// 2. 滚动到页面底部(触发懒加载)
window.scrollTo(0, document.body.scrollHeight);
// 3. 查找并滚动所有可能的容器
const containers = document.querySelectorAll('[data-e2e="comment-list"], .comment-list, [class*="comment"], [class*="scroll"], [role="main"]');
containers.forEach(container => {{
if (container.scrollTop !== undefined) {{
container.scrollTop = container.scrollHeight;
container.dispatchEvent(new Event('scroll', {{ bubbles: true }}));
}}
}});
// 4. 触发所有相关事件
['scroll', 'wheel', 'touchmove', 'resize'].forEach(eventType => {{
window.dispatchEvent(new Event(eventType, {{ bubbles: true }}));
document.dispatchEvent(new Event(eventType, {{ bubbles: true }}));
}});
// 5. 模拟用户交互
document.body.click();
console.log('执行强力滚动:', {scroll_distance}, 'px');
""")
logging.info(f'{i+1}次强力滚动,距离: {scroll_distance}px')
time.sleep(2) # 等待数据加载
# 检查是否有新数据加载
current_height = self.driver.execute_script("return document.body.scrollHeight;")
logging.info(f'当前页面高度: {current_height}px')
# 检查是否到达底部
if self._check_no_more_content():
logging.info('检测到页面底部,停止滚动')
break
return
except Exception as e:
logging.debug(f'强力滚动失败: {e}')
# 方式2尝试滚动到特定元素
try:
# 查找可能的加载更多按钮或元素
load_more_selectors = [
"[data-e2e='load-more']",
"[class*='load-more']",
"[class*='loadmore']",
"[class*='more']",
"button",
"[role='button']"
]
for selector in load_more_selectors:
try:
elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
for element in elements:
if element.is_displayed():
# 滚动到元素
self.driver.execute_script("arguments[0].scrollIntoView();", element)
logging.info(f'滚动到元素: {selector}')
time.sleep(2)
# 尝试点击
try:
element.click()
logging.info(f'点击加载更多按钮: {selector}')
time.sleep(3)
except Exception:
pass
return
except Exception:
continue
except Exception as e:
logging.debug(f'滚动到元素失败: {e}')
# 方式3渐进式滚动
try:
current_position = self.driver.execute_script("return window.pageYOffset;")
page_height = self.driver.execute_script("return document.body.scrollHeight;")
window_height = self.driver.execute_script("return window.innerHeight;")
logging.info(f'当前位置: {current_position}px, 页面高度: {page_height}px, 窗口高度: {window_height}px')
# 如果页面高度很小,说明没有数据,需要触发加载
if page_height < 2000:
# 多次滚动触发数据加载
for i in range(5):
self.driver.execute_script(f"window.scrollTo(0, {500 * (i+1)});")
logging.info(f'渐进滚动 {i+1}: {500 * (i+1)}px')
time.sleep(2)
else:
# 正常滚动
scroll_distance = min(1000, page_height - current_position - window_height)
if scroll_distance > 100:
new_position = current_position + scroll_distance
self.driver.execute_script(f'window.scrollTo(0, {new_position});')
logging.info(f'滚动到位置: {new_position}px')
time.sleep(2)
return
except Exception as e:
logging.debug(f'渐进式滚动失败: {e}')
# 方式4检查是否已显示"暂时没有更多了"
if self._check_no_more_content():
logging.info('已到达页面底部:暂时没有更多了')
return
logging.info('滚动完成,等待数据加载...')
except Exception as e:
logging.error(f'滚动触发失败: {e}')
def _check_no_more_content(self) -> bool:
"""检查是否已到达页面底部,没有更多内容"""
try:
# 检查多种可能的底部标识文本
bottom_indicators = [
"暂时没有更多了",
"没有更多内容",
"已加载全部",
"加载完毕"
]
for indicator in bottom_indicators:
try:
result = self.driver.execute_script(f"""
var elements = document.querySelectorAll('*');
for (var i = 0; i < elements.length; i++) {{
var text = elements[i].textContent || elements[i].innerText;
if (text.includes('{indicator}')) {{
return true;
}}
}}
return false;
""")
if result:
logging.debug(f'检测到页面底部标识: "{indicator}"')
return True
except Exception:
continue
return False
except Exception as e:
logging.debug(f'检查页面底部失败: {e}')
return False
def _trigger_scroll_during_collection(self):
"""在数据收集过程中触发数据加载 - 简化版,仅使用滚动"""
logging.info('在数据收集过程中触发滚动加载')
try:
# 获取初始数据量
initial_count = len(self.collected_items)
logging.info(f'滚动前数据量: {initial_count} 个短剧')
# 仅使用强力滚动策略,不进行不必要的刷新和按钮点击
self._trigger_mini_scroll()
# 检查是否有新数据加载
final_count = len(self.collected_items)
total_new = final_count - initial_count
logging.info(f'滚动加载完成: 初始 {initial_count} → 最终 {final_count} 个短剧 (总共新增: {total_new} 个)')
except Exception as e:
logging.warning(f'滚动加载过程中出错: {e}')
def _collect_from_ssr(self):
"""从SSR数据收集数据"""
logging.info('开始SSR数据收集')
# 尝试直接从window对象获取
keys = ['_SSR_HYDRATED_DATA', 'RENDER_DATA']
for key in keys:
try:
data = self.driver.execute_script(f'return window.{key}')
if data:
text = json.dumps(data, ensure_ascii=False)
self._parse_and_add_item(text, f'page_{key}', None, 'ssr')
logging.info(f'{key} 中解析完成')
except Exception:
continue
logging.info(f'SSR数据收集完成,发现 {self.source_stats["ssr"]} 个有效项')
def _collect_from_page(self):
"""从页面解析收集数据(兜底方案)"""
logging.info('开始页面数据收集(兜底方案)')
try:
page_source = self.driver.page_source
self._parse_and_add_item(page_source, 'page_source', None, 'page')
# 同时尝试识别statis结构中的play_vv
for m in re.findall(r'"statis"\s*:\s*\{[^}]*"play_vv"\s*:\s*(\d+)[^}]*\}', page_source):
try:
vv = int(m)
# 从页面源码中无法获取完整的合集信息,跳过这些不完整的数据
logging.debug(f'从页面源码statis中发现播放量: {vv},但缺少完整信息,跳过')
except Exception:
pass
except Exception:
pass
logging.info(f'页面数据收集完成,发现 {self.source_stats["page"]} 个有效项')
def _parse_and_add_item(self, text: str, source_url: str, request_id: str, source_type: str):
"""解析文本数据并添加到统一存储"""
try:
# 尝试解析JSON数据
if text.strip().startswith('{') or text.strip().startswith('['):
try:
data = json.loads(text)
self._extract_from_json_data(data, source_url, request_id, source_type)
return
except json.JSONDecodeError:
pass
# 如果不是JSON使用正则表达式查找
self._extract_from_text_regex(text, source_url, request_id, source_type)
except Exception as e:
logging.debug(f'解析 {source_type} 数据时出错: {e}')
def _extract_from_json_data(self, data, source_url: str, request_id: str, source_type: str):
"""从JSON数据中递归提取合集信息"""
def extract_mix_info(obj, path=""):
if isinstance(obj, dict):
# 检查是否包含有效的合集信息
if self._is_valid_collection_data(obj):
item_data = self._build_item_data(obj, source_url, request_id, source_type)
if item_data:
self._add_item_with_validation(item_data, source_type)
# 递归搜索子对象
for key, value in obj.items():
if isinstance(value, (dict, list)):
extract_mix_info(value, f"{path}.{key}" if path else key)
elif isinstance(obj, list):
for i, item in enumerate(obj):
if isinstance(item, (dict, list)):
extract_mix_info(item, f"{path}[{i}]" if path else f"[{i}]")
extract_mix_info(data)
def _extract_from_text_regex(self, text: str, source_url: str, request_id: str, source_type: str):
"""使用正则表达式从文本中提取信息"""
# 查找包含完整合集信息的JSON片段
mix_pattern = r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)[^{}]*\}[^{}]*\}'
for match in re.finditer(mix_pattern, text):
try:
mix_id = match.group(1)
mix_name = match.group(2)
vv = int(match.group(3))
# 构建基础数据
item_data = {
'mix_id': mix_id,
'mix_name': mix_name,
'play_vv': vv,
'url': source_url,
'request_id': request_id,
'source_type': source_type,
'timestamp': datetime.now().isoformat()
}
# 验证并添加
if self._validate_item(item_data):
self._add_item_with_validation(item_data, source_type)
except Exception:
continue
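# 示例:上面的 mix_pattern 能匹配类似下面的片段(取值为假设,且要求字段按 mix_id、mix_name、statis 顺序出现):
#   {"mix_id": "7123456789", "mix_name": "示例短剧", "statis": {"play_vv": 123456}}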
def _is_valid_collection_data(self, obj: dict) -> bool:
"""检查是否为有效的收藏合集数据"""
# 必须有mix_id和statis字段
if 'mix_id' not in obj or 'statis' not in obj:
return False
# statis必须是字典且包含play_vv
statis = obj.get('statis', {})
if not isinstance(statis, dict) or 'play_vv' not in statis:
return False
# play_vv必须是有效数字
play_vv = statis.get('play_vv')
if not isinstance(play_vv, (int, str)):
return False
try:
vv = int(play_vv)
# 收藏合集的短剧播放量不可能为0
if vv <= 0:
return False
except (ValueError, TypeError):
return False
return True
def _build_item_data(self, obj: dict, source_url: str, request_id: str, source_type: str) -> Optional[dict]:
"""构建标准化的数据项"""
try:
mix_id = obj.get('mix_id', '')
mix_name = obj.get('mix_name', '')
# 获取播放量与_is_valid_collection_data方法保持一致
play_vv = 0
# 方式1从statis字段获取
if 'statis' in obj and isinstance(obj['statis'], dict):
statis = obj['statis']
if 'play_vv' in statis:
play_vv = statis['play_vv']
# 方式2直接从对象中获取play_vv
if play_vv == 0 and 'play_vv' in obj:
play_vv = obj['play_vv']
# 方式3从其他可能的字段获取
if play_vv == 0:
for field in ['play_count', 'view_count', 'vv']:
if field in obj:
play_vv = obj[field]
break
# 转换为整数
if isinstance(play_vv, str) and play_vv.isdigit():
play_vv = int(play_vv)
# 数据验证
if not mix_id or play_vv <= 0:
return None
# 如果mix_name为空使用mix_id作为名称
if not mix_name or mix_name.strip() == "":
mix_name = f"短剧_{mix_id}"
logging.warning(f"⚠️ mix_name为空使用mix_id作为名称: {mix_name}")
# 构建合集链接
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
# 构建标准数据项
item_data = {
'mix_id': mix_id,
'mix_name': mix_name,
'play_vv': play_vv,
'formatted': self._format_count(play_vv),
'url': source_url,
'request_id': request_id,
'video_url': video_url,
'source_type': source_type,
'timestamp': datetime.now().isoformat()
}
# 提取额外字段
self._extract_additional_fields(obj, item_data)
return item_data
except Exception as e:
logging.debug(f'构建数据项失败: {e}')
return None
def _extract_additional_fields(self, obj: dict, item_data: dict):
"""提取额外的字段信息"""
# 提取合集封面图片URL
cover_image_url = ""
cover_image_backup_urls = []
# 查找封面图片字段
for field in ['cover', 'cover_url', 'image', 'pic']:
if field in obj:
field_data = obj[field]
if isinstance(field_data, dict) and 'url_list' in field_data and field_data['url_list']:
cover_image_url = field_data['url_list'][0]
cover_image_backup_urls = field_data['url_list'][1:] if len(field_data['url_list']) > 1 else []
break
elif isinstance(field_data, str):
cover_image_url = field_data
break
item_data['cover_image_url'] = cover_image_url
item_data['cover_backup_urls'] = cover_image_backup_urls
# 提取合集作者/影视工作室
series_author = ""
for author_field in ['author', 'creator', 'user']:
if author_field in obj:
author_data = obj[author_field]
if isinstance(author_data, dict):
series_author = (author_data.get('nickname') or
author_data.get('unique_id') or
author_data.get('short_id') or
author_data.get('name') or '')
break
elif isinstance(author_data, str):
series_author = author_data
break
item_data['series_author'] = series_author
# 提取合集描述
desc = ""
if 'desc' in obj and obj['desc']:
desc_value = str(obj['desc']).strip()
if desc_value:
desc = desc_value
item_data['desc'] = desc
# 提取合集总集数
updated_to_episode = 0
if 'statis' in obj and isinstance(obj['statis'], dict):
statis = obj['statis']
if 'updated_to_episode' in statis:
try:
episodes = int(statis['updated_to_episode'])
if episodes > 0:
updated_to_episode = episodes
except ValueError:
pass
item_data['updated_to_episode'] = updated_to_episode
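# 经过 _build_item_data 和 _extract_additional_fields 处理后,item_data 大致形如
# (字段来自上面的构建逻辑,具体取值为假设示例):
#   {
#     'mix_id': '7123456789', 'mix_name': '示例短剧', 'play_vv': 1234, 'formatted': '1234',
#     'url': 'https://www.douyin.com/aweme/...', 'request_id': '12345.67',
#     'video_url': 'https://www.douyin.com/collection/7123456789',
#     'source_type': 'network', 'timestamp': '2024-01-01T00:00:00',
#     'cover_image_url': 'https://p3-sign.douyinpic.com/...', 'cover_backup_urls': [],
#     'series_author': '示例工作室', 'desc': '示例描述', 'updated_to_episode': 80
#   }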
def _validate_item(self, item_data: dict) -> bool:
"""验证数据项的有效性"""
# 基本字段验证
mix_id = item_data.get('mix_id', '')
mix_name = item_data.get('mix_name', '')
play_vv = item_data.get('play_vv', 0)
# 必须有mix_id和mix_name
if not mix_id or not mix_name:
return False
# 播放量必须大于0收藏合集的短剧不可能为0
if play_vv <= 0:
return False
# 排除占位名称
if mix_name.startswith('短剧_') or '未知' in mix_name:
return False
return True
def _add_item_with_validation(self, item_data: dict, source_type: str):
"""验证并添加数据项,包含实时去重"""
if not self._validate_item(item_data):
self.source_stats['filtered'] += 1
return
mix_id = item_data.get('mix_id')
# 实时去重:保留播放量最大的版本
if mix_id in self.collected_items:
existing = self.collected_items[mix_id]
current_play_vv = item_data.get('play_vv', 0)
existing_play_vv = existing.get('play_vv', 0)
if current_play_vv > existing_play_vv:
# 当前数据更好,替换
self.collected_items[mix_id] = item_data
logging.info(f'🔄 更新重复短剧: {item_data.get("mix_name")} (播放量: {existing_play_vv:,}{current_play_vv:,})')
else:
# 已有数据更好,跳过
logging.info(f'⏭️ 跳过重复短剧: {item_data.get("mix_name")} (当前: {current_play_vv:,}, 已有: {existing_play_vv:,})')
# 记录去重统计
logging.debug(f'去重统计: mix_id={mix_id}, 已有播放量={existing_play_vv:,}, 新播放量={current_play_vv:,}, 是否更新={current_play_vv > existing_play_vv}')
else:
# 新数据,直接添加
self.collected_items[mix_id] = item_data
self.source_stats[source_type] += 1
logging.info(f'✅ 添加新短剧: {item_data.get("mix_name")} - {item_data.get("play_vv", 0):,} 播放量')
def _format_count(self, n: int) -> str:
"""格式化数字显示"""
if n >= 100_000_000:
return f"{n/100_000_000:.1f}亿"
if n >= 10_000:
return f"{n/10_000:.1f}"
return str(n)
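# 示例(按上面的实现):123456789 -> "1.2亿",1234 -> "1234"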
def _log_collection_stats(self):
"""输出收集统计信息"""
logging.info('=' * 60)
logging.info('统一数据收集统计:')
logging.info(f' - 网络API: {self.source_stats["network"]}')
logging.info(f' - SSR数据: {self.source_stats["ssr"]}')
logging.info(f' - 页面解析: {self.source_stats["page"]}')
logging.info(f' - 过滤无效: {self.source_stats["filtered"]}')
logging.info(f' - 最终结果: {len(self.collected_items)} 个唯一短剧')
logging.info('=' * 60)
class DouyinPlayVVScraper:
def __init__(self, start_url: str = None, auto_continue: bool = False, duration_s: int = 60):
self.start_url = start_url or "https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation"
self.auto_continue = auto_continue
self.duration_s = duration_s
self.driver = None
self.play_vv_items = [] # list of dicts: {play_vv, formatted, url, request_id, mix_name, watched_item}
self.captured_responses = []
self.db = None
self.collection = None
self.image_cache = {} # 图片ID到TOS链接的缓存映射 {image_id: tos_url}
# 实时存储相关属性
self.batch_id = str(uuid.uuid4()) # 每次运行的唯一标识
self.batch_time = datetime.now() # 批次时间
self.item_sequence = 0 # 数据项序号
self.saved_items = set() # 已保存的数据项(用于去重)
# 根据运行模式自动选择存储方式
is_timer_mode = os.environ.get('TIMER_MODE') == '1'
self.realtime_save_enabled = not is_timer_mode # 定时器模式使用批量存储,普通模式使用实时存储
if self.realtime_save_enabled:
logging.info(f'[普通模式] 启用实时存储批次ID: {self.batch_id}')
else:
logging.info('[定时器模式] 使用批量存储')
self._cleanup_old_profiles()
# 智能清理Chrome缓存仅当超过50MB时
self._cleanup_chrome_cache_smart()
self._setup_mongodb()
self._load_image_cache()
def _setup_mongodb(self):
"""设置MongoDB连接"""
try:
# 使用 database.py 中的连接
self.db = db
# 根据运行模式选择集合
is_timer_mode = os.environ.get('TIMER_MODE') == '1'
mongo_collection = 'Ranking_storage_list' if is_timer_mode else 'Rankings_management'
self.collection = self.db[mongo_collection]
# 新增设置Rankings_management集合每天替换的数据库
self.management_collection = self.db['Rankings_management']
logging.info(f'MongoDB连接成功,使用数据库: {self.db.name}')
logging.info(f'主集合: {mongo_collection}(只增不删)')
logging.info(f'管理集合: Rankings_management(每天替换)')
logging.info(f'当前运行模式: {"定时器模式" if is_timer_mode else "普通模式"}')
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': 'MongoDB连接设置'
}
logging.error(f'MongoDB连接失败: {error_details["error_type"]} - {error_details["error_message"]}')
logging.error(f'详细错误信息: {error_details["traceback"]}')
logging.error(f'错误上下文: {error_details["context"]}')
self.db = None
self.collection = None
self.management_collection = None
def _load_image_cache(self):
"""从数据库加载已存在的图片ID到TOS链接的映射"""
target_collection = self.collection # 使用根据模式选择的集合
if target_collection is None:
return
try:
# 查询所有有封面图片的记录
cursor = target_collection.find(
{
'cover_image_url_original': {'$exists': True, '$ne': ''},
'cover_image_url': {'$exists': True, '$ne': ''}
},
{'cover_image_url_original': 1, 'cover_image_url': 1}
)
cache_count = 0
for doc in cursor:
original_url = doc.get('cover_image_url_original', '')
tos_url = doc.get('cover_image_url', '')
if original_url and tos_url and original_url != tos_url:
# 提取图片ID
image_id = self.extract_douyin_image_id(original_url)
if image_id:
self.image_cache[image_id] = tos_url
cache_count += 1
logging.info(f'从数据库加载图片缓存: {cache_count} 个图片映射')
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': '从数据库加载图片缓存'
}
logging.error(f'加载图片缓存失败: {error_details["error_type"]} - {error_details["error_message"]}')
logging.error(f'详细错误信息: {error_details["traceback"]}')
logging.error(f'错误上下文: {error_details["context"]}')
def _cleanup_old_profiles(self):
"""清理超过一天的旧临时Chrome配置文件"""
try:
script_dir = os.path.dirname(os.path.abspath(__file__))
# 清理两个配置目录的旧文件
profile_dirs = [
os.path.join(script_dir, 'config', 'chrome_profile_scraper'),
os.path.join(script_dir, 'config', 'chrome_profile_timer')
]
for profile_base_dir in profile_dirs:
if not os.path.exists(profile_base_dir):
continue
current_time = time.time()
one_day_ago = current_time - 24 * 60 * 60 # 24小时前
for item in os.listdir(profile_base_dir):
if item.startswith('run_'):
item_path = os.path.join(profile_base_dir, item)
if os.path.isdir(item_path):
try:
# 提取时间戳
timestamp = int(item.split('_')[1])
if timestamp < one_day_ago:
shutil.rmtree(item_path, ignore_errors=True)
logging.info(f'清理旧配置文件: {item}')
except (ValueError, IndexError):
# 如果无法解析时间戳,跳过
continue
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': '清理超过一天的旧临时Chrome配置文件'
}
logging.warning(f'清理旧配置文件时出错: {error_details["error_type"]} - {error_details["error_message"]}')
logging.warning(f'详细错误信息: {error_details["traceback"]}')
logging.warning(f'错误上下文: {error_details["context"]}')
def _cleanup_chrome_processes(self):
"""清理可能占用配置文件的Chrome进程"""
try:
# 获取当前配置文件路径
script_dir = os.path.dirname(os.path.abspath(__file__))
profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_scraper', 'douyin_persistent')
# 查找使用该配置文件的Chrome进程
killed_processes = []
for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
try:
if proc.info['name'] and 'chrome' in proc.info['name'].lower():
cmdline = proc.info['cmdline']
if cmdline and any(profile_dir in arg for arg in cmdline):
proc.terminate()
killed_processes.append(proc.info['pid'])
logging.info(f'终止占用配置文件的Chrome进程: PID {proc.info["pid"]}')
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
continue
# 等待进程终止
if killed_processes:
time.sleep(2)
return len(killed_processes) > 0
except ImportError:
# 如果没有psutil使用系统命令
try:
result = subprocess.run(['taskkill', '/f', '/im', 'chrome.exe'],
capture_output=True, text=True, timeout=10)
if result.returncode == 0:
logging.info('使用taskkill清理Chrome进程')
time.sleep(2)
return True
except Exception as e:
logging.warning(f'清理Chrome进程失败: {e}')
return False
except Exception as e:
logging.warning(f'清理Chrome进程时出错: {e}')
return False
def _get_directory_size(self, directory_path):
"""计算目录大小MB"""
total_size = 0
try:
for dirpath, dirnames, filenames in os.walk(directory_path):
for filename in filenames:
filepath = os.path.join(dirpath, filename)
try:
total_size += os.path.getsize(filepath)
except (OSError, FileNotFoundError):
continue
except Exception as e:
logging.warning(f'计算目录大小时出错: {e}')
return total_size / (1024 * 1024) # 转换为MB
def _cleanup_chrome_cache_smart(self, size_threshold_mb=50):
"""智能清理Chrome配置文件缓存
Args:
size_threshold_mb (int): 触发清理的大小阈值(MB),默认50MB
"""
try:
script_dir = os.path.dirname(os.path.abspath(__file__))
# 根据运行模式选择对应的配置目录
is_timer_mode = os.environ.get('TIMER_MODE') == '1'
if is_timer_mode:
profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent')
else:
profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_scraper', 'douyin_persistent')
if not os.path.exists(profile_dir):
logging.info('Chrome配置文件目录不存在跳过缓存清理')
return False
# 计算当前配置文件大小
current_size_mb = self._get_directory_size(profile_dir)
logging.info(f'Chrome配置文件当前大小: {current_size_mb:.2f} MB')
# 检查是否超过阈值
if current_size_mb <= size_threshold_mb:
logging.info(f'配置文件大小 ({current_size_mb:.2f} MB) 未超过阈值 ({size_threshold_mb} MB),跳过清理')
return False
logging.info(f'配置文件大小 ({current_size_mb:.2f} MB) 超过阈值 ({size_threshold_mb} MB),开始清理缓存...')
# 定义需要清理的缓存目录和文件
cache_items = [
'Default/Cache',
'Default/Code Cache',
'Default/GPUCache',
'Default/Service Worker/CacheStorage',
'Default/Service Worker/ScriptCache',
'Default/IndexedDB',
'Default/Local Storage',
'Default/Session Storage',
'Default/Web Data-journal',
'Default/History-journal',
'Default/Favicons-journal',
'GrShaderCache',
'optimization_guide_model_store',
'BrowserMetrics'
]
cleaned_size = 0
cleaned_items = 0
for cache_item in cache_items:
cache_path = os.path.join(profile_dir, cache_item)
if os.path.exists(cache_path):
try:
# 计算要删除的大小
if os.path.isdir(cache_path):
item_size = self._get_directory_size(cache_path)
shutil.rmtree(cache_path)
else:
item_size = os.path.getsize(cache_path) / (1024 * 1024)
os.remove(cache_path)
cleaned_size += item_size
cleaned_items += 1
logging.debug(f'已清理: {cache_item} ({item_size:.2f} MB)')
except Exception as e:
logging.warning(f'清理 {cache_item} 时出错: {e}')
# 计算清理后的大小
final_size_mb = self._get_directory_size(profile_dir)
logging.info(f'缓存清理完成:')
logging.info(f' - 清理前大小: {current_size_mb:.2f} MB')
logging.info(f' - 清理后大小: {final_size_mb:.2f} MB')
logging.info(f' - 释放空间: {cleaned_size:.2f} MB')
logging.info(f' - 清理项目: {cleaned_items}')
return True
except Exception as e:
logging.error(f'智能缓存清理失败: {e}')
return False
def setup_driver(self):
logging.info('初始化Chrome WebDriver (启用CDP网络日志)')
# 清理可能占用配置文件的Chrome进程
self._cleanup_chrome_processes()
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--remote-allow-origins=*')
chrome_options.add_argument('--remote-debugging-port=0')
chrome_options.add_argument('--start-maximized')
chrome_options.add_argument('--lang=zh-CN')
# 根据运行模式选择不同的Chrome配置文件目录
script_dir = os.path.dirname(os.path.abspath(__file__))
is_timer_mode = os.environ.get('TIMER_MODE') == '1'
if is_timer_mode:
# 定时器模式使用独立的配置目录
profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent')
logging.info(f'[定时器模式] 使用独立Chrome配置文件: {profile_dir}')
else:
# 普通模式使用原有的配置目录
profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_scraper', 'douyin_persistent')
logging.info(f'[普通模式] 使用独立Chrome配置文件: {profile_dir}')
os.makedirs(profile_dir, exist_ok=True)
chrome_options.add_argument(f'--user-data-dir={profile_dir}')
logging.info(f'使用持久化Chrome配置文件: {profile_dir}')
# 明确设置Chrome二进制路径32位Chrome常见安装位置
possible_chrome_bins = [
r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
r"C:\Program Files\Google\Chrome\Application\chrome.exe"
]
for bin_path in possible_chrome_bins:
if os.path.exists(bin_path):
chrome_options.binary_location = bin_path
logging.info(f'使用Chrome二进制路径: {bin_path}')
break
# 性能日志Network事件
chrome_options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
# 仅使用本地或PATH中的chromedriver避免网络下载依赖
driver_ready = False
candidates = []
# 可通过环境变量强制覆盖驱动路径
env_override = os.environ.get('OVERRIDE_CHROMEDRIVER')
if env_override:
candidates.append(env_override)
logging.info(f'检测到环境变量 OVERRIDE_CHROMEDRIVER优先使用: {env_override}')
# 脚本所在目录的drivers路径优先
script_dir = os.path.dirname(os.path.abspath(__file__))
script_driver_path = os.path.join(script_dir, 'drivers', 'chromedriver.exe')
candidates.append(script_driver_path)
logging.info(f'优先尝试脚本目录路径: {script_driver_path}')
# 项目根目录的drivers路径
user_driver_path = os.path.join(os.getcwd(), 'drivers', 'chromedriver.exe')
candidates.append(user_driver_path)
logging.info(f'尝试项目根目录路径: {user_driver_path}')
# 项目根目录
candidates.append(os.path.join(os.getcwd(), 'chromedriver.exe'))
# 其他可能目录
candidates.append(os.path.join(os.getcwd(), 'drivers', 'chromedriver'))
# PATH 中的chromedriver
which_path = shutil.which('chromedriver')
if which_path:
candidates.append(which_path)
if not driver_ready:
for p in candidates:
try:
if p and os.path.exists(p):
logging.info(f'尝试使用chromedriver: {p}')
service = Service(p)
self.driver = webdriver.Chrome(service=service, options=chrome_options)
driver_ready = True
logging.info(f'使用chromedriver启动成功: {p}')
try:
caps = self.driver.capabilities
browser_ver = caps.get('browserVersion') or caps.get('version')
cdver = caps.get('chrome', {}).get('chromedriverVersion')
logging.info(f'Chrome版本: {browser_ver}, ChromeDriver版本: {cdver}')
except Exception:
pass
break
else:
logging.info(f'候选路径不存在: {p}')
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': f'尝试使用ChromeDriver路径: {p}'
}
logging.warning(f'尝试使用 {p} 启动失败: {error_details["error_type"]} - {error_details["error_message"]}')
logging.warning(f'详细错误信息: {error_details["traceback"]}')
logging.warning(f'错误上下文: {error_details["context"]}')
if not driver_ready:
# 最终回退使用webdriver-manager可能需要网络
try:
service = Service(ChromeDriverManager().install())
self.driver = webdriver.Chrome(service=service, options=chrome_options)
driver_ready = True
logging.info('使用webdriver-manager成功启动ChromeDriver')
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': '使用webdriver-manager启动ChromeDriver'
}
logging.error(f'webdriver-manager启动失败: {error_details["error_type"]} - {error_details["error_message"]}')
logging.error(f'详细错误信息: {error_details["traceback"]}')
logging.error(f'错误上下文: {error_details["context"]}')
raise RuntimeError(f'未能启动ChromeDriver。请手动下载匹配版本的chromedriver到项目根目录或PATH或检查网络以允许webdriver-manager下载。错误类型: {error_details["error_type"]}, 错误信息: {error_details["error_message"]}')
# 反检测
try:
self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
except Exception:
pass
# 启用CDP Network
try:
self.driver.execute_cdp_cmd('Network.enable', {})
logging.info('已启用CDP Network')
except Exception as e:
logging.warning(f'启用CDP Network失败: {e}')
def navigate(self):
logging.info(f'导航到: {self.start_url}')
self.driver.get(self.start_url)
time.sleep(8) # 增加页面加载等待时间
def ensure_login(self):
"""确保用户已登录并导航到收藏合集页面"""
logging.info("检测登录状态和页面位置...")
# 首先检查是否已经登录并在正确页面
if self._check_login_and_page():
logging.info("检测到已登录且在收藏合集页面,跳过手动确认")
return
# 如果未登录或不在正确页面,进行手动登录流程
logging.info("请在弹出的浏览器中手动完成登录。")
if self.auto_continue:
logging.info('自动继续模式,假设已登录并跳过手动等待...')
# 在自动模式下尝试导航到起始URL
try:
logging.info(f"自动模式导航到起始URL: {self.start_url}")
self.driver.get(self.start_url)
time.sleep(3) # 等待页面加载
logging.info("自动模式:假设登录成功,继续执行...")
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': f'自动模式导航到起始URL: {self.start_url}'
}
logging.warning(f"自动模式导航失败: {error_details['error_type']} - {error_details['error_message']},继续执行...")
logging.warning(f'详细错误信息: {error_details["traceback"]}')
logging.warning(f'错误上下文: {error_details["context"]}')
return
# 定时器模式下的登录检查
is_timer_mode = os.environ.get('TIMER_MODE') == '1'
if is_timer_mode:
logging.info("定时器模式:检查浏览器登录状态...")
# 在定时器模式下,浏览器已经启动并导航到页面,现在检查登录状态
if not self._check_login_and_page():
logging.warning("定时器模式:检测到未登录状态,需要手动登录")
print("⚠️ 定时器浏览器未登录")
print(" 请在浏览器中完成抖音登录,并导航到【我的】→【收藏】→【合集】页面")
print(" 完成后按回车键继续...")
input()
# 重新检查登录状态
if not self._check_login_and_page():
logging.warning("定时器模式:登录确认后仍然未登录,继续执行...")
else:
logging.info("定时器模式:浏览器已登录,继续执行...")
return
logging.info("进入手动登录确认循环...")
while True:
# 要求用户输入特定文本确认
logging.info("等待用户输入确认...")
user_input = input("请在浏览器中完成登录,并导航到【我的】→【收藏】→【合集】页面。操作完成后,请在此处输入 'ok' 并按回车: ")
if user_input.strip().lower() != 'ok':
logging.warning("请输入 'ok' 确认您已完成登录并导航到【我的】→【收藏】→【合集】页面。")
continue
logging.info("用户已确认,检查当前页面...")
try:
current_url = self.driver.current_url
logging.info(f"当前页面URL: {current_url}")
if ("douyin.com/user/self" in current_url and
("favorite_collection" in current_url or "compilation" in current_url)):
logging.info(f"已确认您位于收藏合集列表页面: {current_url}")
logging.info("脚本将继续执行...")
break
else:
# 用户确认了,但页面不正确,继续循环等待
logging.warning(f"检测到当前页面 ({current_url}) 并非收藏合集列表页面。请确保已导航至【我的】→【收藏】→【合集】页面。")
except Exception as e:
if "browser has been closed" in str(e) or "no such window" in str(e) or "target window already closed" in str(e):
logging.error("浏览器窗口已关闭,脚本无法继续。")
raise RuntimeError("浏览器窗口已关闭")
logging.warning(f"检测URL时出错: {e}。请重试。")
time.sleep(1)
def _check_login_and_page(self, timeout: int = 600) -> bool:
"""检查是否已登录并在正确页面"""
try:
current_url = self.driver.current_url
logging.info(f"当前页面URL: {current_url}")
# 检查是否在收藏合集页面
if ("douyin.com/user/self" in current_url and
("favorite_collection" in current_url or "compilation" in current_url)):
# 进一步检查登录状态
return self._detect_login_status(timeout)
else:
# 如果不在正确页面,尝试导航到收藏合集页面
if self._detect_login_status(timeout):
logging.info("已登录但不在收藏合集页面,自动导航...")
self.driver.get(self.start_url)
time.sleep(3)
return True
return False
except Exception as e:
logging.warning(f"检查登录状态时出错: {e}")
return False
def _detect_login_status(self, timeout: int = 30) -> bool:
"""自动检测是否已登录"""
try:
start = time.time()
attempt = 0
while time.time() - start < timeout:
attempt += 1
logging.info(f"登录检测尝试 {attempt}...")
time.sleep(2)
# 检查登录状态的多个选择器
selectors = [
'[data-e2e="user-avatar"]',
'.user-avatar',
'[class*="avatar"]',
'[class*="Avatar"]',
'.avatar',
'img[alt*="头像"]',
'img[alt*="avatar"]'
]
for selector in selectors:
try:
elements = self.driver.find_elements("css selector", selector)
if elements:
logging.info(f"检测到用户头像 (选择器: {selector}),确认已登录")
return True
except Exception as e:
logging.debug(f"选择器 {selector} 检测失败: {e}")
continue
# 检查是否有登录按钮(表示未登录)
login_selectors = [
'[data-e2e="login-button"]',
'button[class*="login"]',
'a[href*="login"]',
'.login-button'
]
for selector in login_selectors:
try:
elements = self.driver.find_elements("css selector", selector)
if elements:
logging.info(f"检测到登录按钮 (选择器: {selector}),用户未登录")
return False
except Exception as e:
logging.debug(f"登录按钮选择器 {selector} 检测失败: {e}")
continue
# 添加页面源码检查
try:
page_source = self.driver.page_source
if "登录" in page_source and "头像" not in page_source:
logging.info("页面源码显示需要登录")
return False
elif any(keyword in page_source for keyword in ["我的", "收藏", "合集"]):
logging.info("页面源码显示已登录")
return True
except Exception as e:
logging.debug(f"页面源码检查失败: {e}")
logging.warning(f"登录状态检测超时 ({timeout}秒),假设已登录并继续")
return True # 改为假设已登录,避免卡住
except Exception as e:
logging.warning(f"登录状态检测出错: {e},假设已登录并继续")
return True # 改为假设已登录,避免卡住
def trigger_loading(self):
logging.info('触发数据加载:强力滚动直到"暂时没有更多了"')
# 等待页面完全加载
logging.info('等待页面完全加载...')
time.sleep(10)
# 强力滚动策略 - 模拟真实用户行为,直到看到"暂时没有更多了"
max_scroll_attempts = 50 # 最大滚动尝试次数
scroll_count = 0
no_more_content_found = False
while scroll_count < max_scroll_attempts and not no_more_content_found:
try:
scroll_count += 1
logging.info(f'{scroll_count}次强力滚动...')
# 强力滚动:多次大幅度滚动确保触发懒加载
scroll_distance = 800 + (scroll_count * 200)
# 执行强力滚动JavaScript
self.driver.execute_script(f"""
// 1. 强制滚动页面
window.scrollBy(0, {scroll_distance});
document.documentElement.scrollTop += {scroll_distance};
document.body.scrollTop += {scroll_distance};
// 2. 滚动到页面底部(触发懒加载)
window.scrollTo(0, document.body.scrollHeight);
// 3. 查找并滚动所有可能的容器
const containers = document.querySelectorAll('[data-e2e="comment-list"], .comment-list, [class*="comment"], [class*="scroll"], [role="main"], [class*="collection"], [class*="favorite"]');
containers.forEach(container => {{
if (container.scrollTop !== undefined) {{
container.scrollTop = container.scrollHeight;
container.dispatchEvent(new Event('scroll', {{ bubbles: true }}));
}}
}});
// 4. 触发所有相关事件
['scroll', 'wheel', 'touchmove', 'resize'].forEach(eventType => {{
window.dispatchEvent(new Event(eventType, {{ bubbles: true }}));
document.dispatchEvent(new Event(eventType, {{ bubbles: true }}));
}});
// 5. 模拟用户交互
document.body.click();
console.log('执行强力滚动:', {scroll_distance}, 'px');
""")
# 等待数据加载
time.sleep(3)
# 检查是否有新数据加载
current_height = self.driver.execute_script("return document.body.scrollHeight;")
logging.info(f'当前页面高度: {current_height}px')
# 检查是否到达底部 - 看到"暂时没有更多了"
no_more_content_found = self._check_no_more_content()
if no_more_content_found:
logging.info('✅ 检测到页面底部:"暂时没有更多了",停止滚动')
break
# 检查页面高度是否不再增加(说明没有新内容加载)
if scroll_count > 5:
previous_height = current_height
time.sleep(2)
new_height = self.driver.execute_script("return document.body.scrollHeight;")
if new_height == previous_height:
logging.info('页面高度不再增加,可能已加载全部内容')
break
except Exception as e:
logging.error(f'滚动过程中出错: {e}')
time.sleep(2)
if no_more_content_found:
logging.info('🎉 成功滚动到页面底部,所有内容已加载完成')
else:
logging.info(f'达到最大滚动次数 {max_scroll_attempts},停止滚动')
# 最终检查一次是否还有更多内容
final_check = self._check_no_more_content()
if not final_check:
logging.info('⚠️ 最终检查:可能还有更多内容未加载')
def _check_no_more_content(self) -> bool:
"""检查是否已到达页面底部,没有更多内容"""
try:
# 检查多种可能的底部标识文本
bottom_indicators = [
"暂时没有更多了",
"没有更多内容",
"已加载全部",
"加载完毕",
"no more content",
"end of content"
]
for indicator in bottom_indicators:
try:
result = self.driver.execute_script(f"""
var elements = document.querySelectorAll('*');
for (var i = 0; i < elements.length; i++) {{
var text = elements[i].textContent || elements[i].innerText;
if (text.includes('{indicator}')) {{
return true;
}}
}}
return false;
""")
if result:
logging.info(f'✅ 检测到页面底部标识: "{indicator}"')
return True
except Exception:
continue
return False
except Exception as e:
logging.debug(f'检查页面底部失败: {e}')
return False
def format_count(self, n: int) -> str:
if n >= 100_000_000:
return f"{n/100_000_000:.1f}亿"
if n >= 10_000:
return f"{n/10_000:.1f}"
return str(n)
def format_interaction_count(self, n: int) -> str:
"""格式化互动数据数量,返回带单位的字符串
Args:
n: 数量
Returns:
str: 格式化后的字符串,如 27898 -> 2.8W, 1234 -> 1234
"""
if n >= 100_000_000:
result = n / 100_000_000
if result == int(result):
return f"{int(result)}亿"
else:
return f"{result:.1f}亿"
elif n >= 10_000:
result = n / 10_000
if result == int(result):
return f"{int(result)}W"
else:
return f"{result:.1f}W"
else:
return str(n)
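# 示例(按上面的实现):27898 -> "2.8W",10000 -> "1W",123456789 -> "1.2亿",999 -> "999"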
def save_comments_to_file(self, comments: list, video_id: str = None):
"""简单保存评论数据到JSON文件"""
try:
if not comments:
return None
# 创建保存目录
script_dir = os.path.dirname(os.path.abspath(__file__))
save_dir = os.path.join(script_dir, 'saved_comments')
os.makedirs(save_dir, exist_ok=True)
# 生成文件名
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'comments_{video_id}_{timestamp}.json' if video_id else f'comments_{timestamp}.json'
file_path = os.path.join(save_dir, filename)
# 保存数据
save_data = {
'timestamp': datetime.now().isoformat(),
'video_id': video_id,
'total_comments': len(comments),
'comments': comments
}
with open(file_path, 'w', encoding='utf-8') as f:
json.dump(save_data, f, ensure_ascii=False, indent=2)
logging.info(f'保存 {len(comments)} 条评论到: {file_path}')
return file_path
except Exception as e:
logging.error(f'保存评论失败: {e}')
return None
def dedupe(self):
# 🔧 修复按mix_id去重保留播放量最大的那个
# 原来的逻辑会导致播放量相同的不同短剧被误删
unique_dict = {} # 使用字典存储key是identifiervalue是item
for item in self.play_vv_items:
mix_id = item.get('mix_id', '')
# 如果没有mix_id使用mix_name作为备用标识
if not mix_id:
mix_name = item.get('mix_name', '')
identifier = f"name_{mix_name}"
else:
identifier = f"id_{mix_id}"
# 如果是第一次遇到这个identifier直接添加
if identifier not in unique_dict:
unique_dict[identifier] = item
else:
# 如果已经存在,比较播放量,保留播放量大的
existing_play_vv = unique_dict[identifier].get('play_vv', 0)
current_play_vv = item.get('play_vv', 0)
if current_play_vv > existing_play_vv:
# 当前数据的播放量更大,替换
logging.info(f'去重:发现重复短剧 {item.get("mix_name", "未知")},保留播放量更大的数据 ({existing_play_vv:,}{current_play_vv:,})')
unique_dict[identifier] = item
else:
# 已有数据的播放量更大或相等,跳过当前数据
logging.debug(f'去重:跳过重复的短剧 {item.get("mix_name", "未知")} (mix_id: {mix_id})')
# 转换回列表
unique = list(unique_dict.values())
removed_count = len(self.play_vv_items) - len(unique)
if removed_count > 0:
logging.info(f'去重完成:移除 {removed_count} 个重复项,保留 {len(unique)} 个唯一短剧')
else:
logging.info(f'去重完成:没有重复项,保留 {len(unique)} 个唯一短剧')
self.play_vv_items = unique
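# 示例(假设数据):若 play_vv_items 中有两条 mix_id 相同的记录,播放量分别为 100,000 和 250,000,
# 去重后只保留播放量为 250,000 的那一条;没有 mix_id 时退化为按 mix_name 去重。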
def save_results(self):
if self.realtime_save_enabled:
# 🔧 修复:在数据收集完成后,统一进行实时保存
logging.info(f'[实时保存] 开始保存 {len(self.play_vv_items)} 个合集的数据')
logging.info(f'[实时保存] 批次ID: {self.batch_id}')
# 先保存所有合集的基础信息(不获取详细内容)
for item_data in self.play_vv_items:
try:
logging.info(f'[实时保存] 保存合集基础信息: {item_data.get("mix_name", "未知")}')
self.save_collection_basic_info(item_data)
except Exception as e:
logging.error(f'[实时保存] 保存合集基础信息失败: {item_data.get("mix_name", "未知")} - {e}')
# 更新排名
try:
self.update_ranks_for_batch()
except Exception as e:
logging.error(f'[实时保存] 更新排名失败: {e}')
# 然后逐个获取详细内容(如果需要)
logging.info(f'[实时保存] 基础信息保存完成,开始获取详细内容')
for item_data in self.play_vv_items:
try:
mix_id = item_data.get('mix_id', '')
mix_name = item_data.get('mix_name', '')
current_episode_count = item_data.get('updated_to_episode', 0)
if mix_id and current_episode_count > 0:
# 查找已保存的文档ID
target_collection = self.collection
if target_collection is not None:
existing_doc = target_collection.find_one({'mix_id': mix_id}, {'_id': 1})
if existing_doc:
document_id = existing_doc['_id']
logging.info(f'[实时保存] 开始获取详细内容: {mix_name}')
# 获取视频ID列表
episode_video_ids = self.update_collection_video_ids(
document_id, mix_id, mix_name, current_episode_count
)
# 获取视频详细数据
if episode_video_ids:
self.update_video_details_incrementally(
document_id, episode_video_ids, mix_name, mix_id
)
except Exception as e:
logging.error(f'[实时保存] 获取详细内容失败: {item_data.get("mix_name", "未知")} - {e}')
logging.info(f'[实时保存] 所有数据处理完成,共 {len(self.saved_items)} 个合集')
else:
# 传统批量保存模式
self.save_to_mongodb()
logging.info('结果已保存到MongoDB')
def update_ranks_for_batch(self):
"""为当前批次的数据更新排名"""
target_collection = self.collection # 使用根据模式选择的集合
if target_collection is None:
logging.warning('[实时保存] 数据库集合未初始化,跳过排名更新')
return
if not self.saved_items:
logging.warning('[实时保存] 没有已保存的数据,跳过排名更新')
return
try:
# 获取当前批次的所有数据,按播放量排序
cursor = target_collection.find(
{'batch_id': self.batch_id},
{'_id': 1, 'play_vv': 1, 'mix_name': 1}
).sort('play_vv', -1)
batch_items = list(cursor)
if not batch_items:
logging.warning(f'[实时保存] 未找到批次 {self.batch_id} 的数据')
return
# 批量更新排名
from pymongo import UpdateOne
bulk_operations = []
for rank, item in enumerate(batch_items, 1):
bulk_operations.append(
UpdateOne(
{'_id': item['_id']},
{'$set': {'rank': rank}}
)
)
if bulk_operations:
result = target_collection.bulk_write(bulk_operations)
logging.info(f'[实时保存] 成功更新 {result.modified_count} 个合集的排名')
# 输出排名统计
total_play_vv = sum(item['play_vv'] for item in batch_items)
max_play_vv = batch_items[0]['play_vv'] if batch_items else 0
logging.info(f'[实时保存] 排名统计: 总播放量={total_play_vv:,}, 最高播放量={max_play_vv:,}')
except Exception as e:
logging.error(f'[实时保存] 更新排名失败: {e}')
def extract_douyin_image_id(self, cover_url):
"""
从抖音图片URL中提取唯一的图片ID
Args:
cover_url (str): 抖音图片URL
Returns:
str: 图片ID,如果提取失败返回空字符串
"""
if not cover_url:
return ''
try:
# 抖音图片URL格式支持两种:
# 1. https://p{数字}-sign.douyinpic.com/obj/tos-cn-i-dy/{图片ID}?{参数}
# 2. https://p{数字}-sign.douyinpic.com/obj/douyin-user-image-file/{图片ID}?{参数}
# 使用正则表达式提取图片ID
patterns = [
r'/obj/tos-cn-i-dy/([a-f0-9]+)',
r'/obj/douyin-user-image-file/([a-f0-9]+)'
]
for pattern in patterns:
match = re.search(pattern, cover_url)
if match:
image_id = match.group(1)
logging.debug(f'提取图片ID成功: {image_id} from {cover_url}')
return image_id
logging.warning(f'无法从URL中提取图片ID: {cover_url}')
return ''
except Exception as e:
logging.error(f'提取图片ID异常: {cover_url}, 错误: {e}')
return ''
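# 用法示意(URL 为假设示例):
#   self.extract_douyin_image_id(
#       'https://p3-sign.douyinpic.com/obj/tos-cn-i-dy/0a1b2c3d4e5f?x-expires=123&x-signature=xxx'
#   )  # -> '0a1b2c3d4e5f'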
def upload_cover_image(self, cover_url, mix_name):
"""
上传封面图片到TOS并返回永久链接(带去重功能和重试机制)
Args:
cover_url: 临时封面图片链接
mix_name: 合集名称,用于生成文件名
Returns:
str: 永久链接URL如果上传失败则返回原链接
"""
if not cover_url:
return cover_url
# 提取图片ID
image_id = self.extract_douyin_image_id(cover_url)
# 如果能提取到图片ID检查缓存
if image_id:
if image_id in self.image_cache:
cached_url = self.image_cache[image_id]
logging.info(f'使用缓存图片: {image_id} -> {cached_url} (合集: {mix_name})')
return cached_url
# 生成随机文件名,保持原有的扩展名
file_extension = '.jpg' # 抖音封面图片通常是jpg格式
# 改进的扩展名检测逻辑
url_without_params = cover_url.split('?')[0]
url_path = url_without_params.split('/')[-1] # 获取URL路径的最后一部分
# 只有当最后一部分包含点且点后面的内容是常见图片扩展名时才使用
if '.' in url_path:
potential_ext = url_path.split('.')[-1].lower()
# 检查是否为常见的图片扩展名
if potential_ext in ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp']:
file_extension = '.' + potential_ext
# 生成唯一文件名
random_filename = f"{uuid.uuid4().hex}{file_extension}"
object_key = f"media/rank/{random_filename}"
# 重试机制最多尝试3次
max_retries = 3
last_error = None
for attempt in range(max_retries):
try:
logging.info(f'开始上传封面图片 (尝试 {attempt + 1}/{max_retries}): {mix_name}')
logging.info(f'封面图片URL: {cover_url}')
logging.info(f'目标对象键: {object_key}')
# 从URL上传到TOS并获取新的URL
oss_url = oss_client.upload_from_url(
url=cover_url,
object_key=object_key,
return_url=True,
timeout=30 # 30秒超时
)
# 验证上传是否成功检查返回的URL是否包含预期的域名
if not oss_url or not isinstance(oss_url, str):
raise Exception(f"上传返回了无效的URL: {oss_url}")
# 检查URL格式是否正确
expected_domain = oss_client.self_domain
if expected_domain not in oss_url:
raise Exception(f"上传返回的URL域名不正确: {oss_url}, 期望包含: {expected_domain}")
# 检查URL是否包含正确的对象键
if object_key not in oss_url:
raise Exception(f"上传返回的URL不包含对象键: {oss_url}, 期望包含: {object_key}")
logging.info(f'封面图片上传成功: {mix_name} -> {oss_url}')
# 如果有图片ID将结果缓存
if image_id:
self.image_cache[image_id] = oss_url
logging.debug(f'图片缓存已更新: {image_id} -> {oss_url}')
return oss_url
except Exception as e:
last_error = e
error_msg = str(e)
logging.warning(f'封面图片上传失败 (尝试 {attempt + 1}/{max_retries}): {mix_name} - {error_msg}')
# 如果不是最后一次尝试,等待一段时间后重试
if attempt < max_retries - 1:
wait_time = (attempt + 1) * 2 # 递增等待时间:2秒、4秒(最后一次失败后不再等待)
logging.info(f'等待 {wait_time} 秒后重试...')
time.sleep(wait_time)
# 为重试生成新的文件名,避免可能的冲突
random_filename = f"{uuid.uuid4().hex}{file_extension}"
object_key = f"media/rank/{random_filename}"
# 所有重试都失败了
logging.error(f'封面图片上传彻底失败 (已尝试 {max_retries} 次): {mix_name} - 最后错误: {last_error}')
logging.error(f'将使用原始链接作为回退: {cover_url}')
return cover_url # 上传失败时返回原链接
def save_to_mongodb(self):
"""将数据保存到MongoDB"""
# 如果启用了实时保存,跳过批量保存
if self.realtime_save_enabled:
logging.info('[批量保存] 实时保存模式已启用,跳过批量保存')
return
if self.collection is None:
logging.warning('MongoDB未连接跳过数据库保存')
return
if not self.play_vv_items:
logging.info('没有数据需要保存到MongoDB')
return
try:
batch_time = datetime.now()
documents = []
for item in self.play_vv_items:
# 获取原始封面图片URL
original_cover_url = item.get('cover_image_url', '')
mix_name = item.get('mix_name', '')
# 处理封面图片
permanent_cover_url = ''
upload_success = False
if original_cover_url:
# 上传封面图片到TOS获取永久链接
permanent_cover_url = self.upload_cover_image(original_cover_url, mix_name)
# 检查上传是否成功
if permanent_cover_url != original_cover_url:
# 上传成功URL已经改变
upload_success = True
logging.info(f'封面图片上传成功,已获得永久链接: {mix_name}')
else:
# 上传失败,使用原始链接作为回退
upload_success = False
logging.warning(f'封面图片上传失败,回退使用原始链接: {mix_name}')
logging.warning(f'原始链接: {original_cover_url}')
# 可以在这里添加额外的回退策略,比如:
# 1. 尝试使用备用的图片链接
# 2. 设置一个默认的占位图片
# 3. 记录失败的链接以便后续重试
# 当前策略:保持原始链接,但在数据库中标记上传状态
else:
# 没有封面图片,使用空字符串
permanent_cover_url = ''
upload_success = True # 没有图片不算失败
# 获取合集中的所有视频ID
mix_id = item.get('mix_id', '')
episode_video_ids = []
if mix_id:
logging.info(f'获取合集 {mix_name} 的所有视频ID')
current_episode_count = item.get('updated_to_episode', 0)
episode_video_ids = self.get_collection_videos(
mix_id=mix_id,
mix_name=mix_name,
current_episode_count=current_episode_count
)
logging.info(f'合集 {mix_name} 共获取到 {len(episode_video_ids)} 个视频ID')
# 获取每个视频的详细互动数据
logging.info(f'开始获取合集 {mix_name} 的视频详细互动数据')
video_details_list = self.get_collection_video_details(
episode_video_ids=episode_video_ids,
mix_name=mix_name,
mix_id=mix_id
)
# 构建每集的详细信息,使用获取到的真实数据
episode_details = []
total_episodes = item.get('updated_to_episode', 0)
for i in range(total_episodes):
episode_number = i + 1
video_id = episode_video_ids[i] if i < len(episode_video_ids) else ''
# 查找对应的视频详细数据
video_detail = None
if i < len(video_details_list):
video_detail = video_details_list[i]
if video_detail and video_detail.get('success', False):
# 使用获取到的真实数据
likes = video_detail.get('likes', 0)
shares = video_detail.get('shares', 0)
favorites = video_detail.get('favorites', 0)
episode_info = {
'episode_number': episode_number,
'video_id': video_id,
'likes': likes,
'shares': shares,
'favorites': favorites,
'likes_formatted': self.format_interaction_count(likes),
'shares_formatted': self.format_interaction_count(shares),
'favorites_formatted': self.format_interaction_count(favorites),
'comments': video_detail.get('comments', [])
}
else:
# 使用默认值
episode_info = {
'episode_number': episode_number,
'video_id': video_id,
'likes': 0,
'shares': 0,
'favorites': 0,
'likes_formatted': '0',
'shares_formatted': '0',
'favorites_formatted': '0',
'comments': []
}
episode_details.append(episode_info)
# 统计获取到的数据
total_likes = sum(ep.get('likes', 0) for ep in episode_details)
total_comments = sum(len(ep.get('comments', [])) for ep in episode_details)
logging.info(f'合集 {mix_name} 详细数据统计: 总点赞数={total_likes:,}, 总评论数={total_comments}')
else:
# 如果没有获取到视频ID使用默认的episode_details
episode_details = [
{
'episode_number': i + 1,
'video_id': '',
'likes': 0,
'shares': 0,
'favorites': 0,
'likes_formatted': '0',
'shares_formatted': '0',
'favorites_formatted': '0',
'comments': []
} for i in range(item.get('updated_to_episode', 0))
]
# 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + 新增字段
doc = {
'batch_time': batch_time,
'mix_name': mix_name,
'mix_id': item.get('mix_id', ''), # 合集ID
'video_url': item.get('video_url', ''),
'playcount': item.get('formatted', ''),
'play_vv': item.get('play_vv', 0),
'request_id': item.get('request_id', ''),
'rank': 0, # 临时设置,后面会重新计算
'cover_image_url_original': original_cover_url, # 保存原始临时链接用于调试
'cover_image_url': permanent_cover_url, # 合集封面图片永久链接
'cover_upload_success': upload_success, # 封面图片上传是否成功
'cover_backup_urls': item.get('cover_backup_urls', []), # 封面图片备用链接列表
# 新增的字段
'series_author': item.get('series_author', ''), # 合集作者/影视工作室
'desc': item.get('desc', ''), # 合集描述
'updated_to_episode': item.get('updated_to_episode', 0), # 合集总集数
'episode_video_ids': episode_video_ids, # 每一集的视频ID列表
'episode_details': episode_details, # 每集的详细信息
'Manufacturing_Field': item.get('Manufacturing_Field', ''), # 承制信息
'Copyright_field': item.get('Copyright_field', ''), # 版权信息
}
documents.append(doc)
# 按播放量降序排序并添加排名
documents.sort(key=lambda x: x['play_vv'], reverse=True)
for i, doc in enumerate(documents, 1):
doc['rank'] = i
# 批量插入到目标集合(根据模式选择)
target_collection = self.collection # 使用根据模式选择的集合
result = target_collection.insert_many(documents)
logging.info(f'成功保存 {len(result.inserted_ids)} 条记录到MongoDB')
# 输出统计信息
total_play_vv = sum(doc['play_vv'] for doc in documents)
max_play_vv = max(doc['play_vv'] for doc in documents) if documents else 0
logging.info(f'MongoDB保存统计: 总播放量={total_play_vv:,}, 最高播放量={max_play_vv:,}')
logging.info(f'保存的字段: batch_time, mix_name, mix_id, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url, cover_upload_success, series_author, Manufacturing_Field, Copyright_field, desc, updated_to_episode')
# 统计封面图片处理情况
cover_count = sum(1 for doc in documents if doc.get('cover_image_url'))
original_count = sum(1 for item in self.play_vv_items if item.get('cover_image_url'))
upload_success_count = sum(1 for doc in documents if doc.get('cover_upload_success', False))
upload_failed_count = sum(1 for doc in documents if doc.get('cover_image_url_original') and not doc.get('cover_upload_success', False))
logging.info(f'封面图片统计: {cover_count}/{len(documents)} 个合集有封面链接')
logging.info(f'封面上传统计: {upload_success_count}/{original_count} 个封面成功上传到TOS')
if upload_failed_count > 0:
logging.warning(f'封面上传失败: {upload_failed_count} 个封面上传失败,使用原始链接')
logging.info(f'图片缓存统计: 当前缓存 {len(self.image_cache)} 个图片映射')
except Exception as e:
logging.error(f'保存到MongoDB时出错: {e}')
def save_collection_basic_info(self, item_data: dict):
"""立即保存合集基础信息到数据库(第一阶段保存)"""
logging.info(f'[立即保存] 保存合集基础信息: {item_data.get("mix_name", "未知")}')
if not self.realtime_save_enabled or self.collection is None:
logging.warning(f'[立即保存] 跳过保存 - 实时保存未启用或数据库未连接')
return None
try:
# 生成唯一标识用于去重(只使用mix_id,不包含播放量)
mix_id = item_data.get('mix_id', '')
item_key = mix_id
if item_key in self.saved_items:
logging.warning(f'[立即保存] 短剧已存在,跳过重复保存: {item_data.get("mix_name", "")} (mix_id: {mix_id})')
return None
# 增加序号
self.item_sequence += 1
# 获取基础信息
mix_name = item_data.get('mix_name', '')
mix_id = item_data.get('mix_id', '')
original_cover_url = item_data.get('cover_image_url', '')
current_episode_count = item_data.get('updated_to_episode', 0)
# 处理封面图片(如果有的话)
permanent_cover_url = ''
upload_success = False
if original_cover_url:
permanent_cover_url = self.upload_cover_image(original_cover_url, mix_name)
upload_success = permanent_cover_url != original_cover_url
if upload_success:
logging.info(f'[立即保存] 封面图片上传成功: {mix_name}')
else:
upload_success = True # 没有图片不算失败
# 创建基础的episode_details结构
episode_details = [
{
'episode_number': i + 1,
'video_id': '', # 稍后更新
'likes': 0, # 稍后更新
'shares': 0, # 稍后更新
'favorites': 0, # 稍后更新
'likes_formatted': '0',
'shares_formatted': '0',
'favorites_formatted': '0',
'comments': [], # 稍后更新
'data_status': 'pending' # 标记数据状态
} for i in range(current_episode_count)
]
# 构建基础文档数据
doc = {
'batch_id': self.batch_id,
'batch_time': self.batch_time,
'item_sequence': self.item_sequence,
'mix_name': mix_name,
'mix_id': mix_id,
'video_url': item_data.get('video_url', ''),
'playcount': item_data.get('formatted', ''),
'play_vv': item_data.get('play_vv', 0),
'request_id': item_data.get('request_id', ''),
'rank': 0,
'cover_image_url_original': original_cover_url,
'cover_image_url': permanent_cover_url,
'cover_upload_success': upload_success,
'cover_backup_urls': item_data.get('cover_backup_urls', []),
'series_author': item_data.get('series_author', ''),
'Manufacturing_Field': item_data.get('Manufacturing_Field', ''),
'Copyright_field': item_data.get('Copyright_field', ''),
'desc': item_data.get('desc', ''),
'updated_to_episode': current_episode_count,
'episode_video_ids': [], # 稍后更新
'episode_details': episode_details,
'data_status': 'basic_saved', # 标记为基础信息已保存
'realtime_saved': True,
'created_at': datetime.now(),
'last_updated': datetime.now()
}
# 根据运行模式选择数据库集合
target_collection = self.collection # 使用根据模式选择的集合
document_id = None
# 保存到目标数据库(根据模式:定时器模式->Ranking_storage_list,普通模式->Rankings_management)
if target_collection is not None:
try:
# 为目标数据库准备文档数据
target_doc = doc.copy()
target_doc['last_updated'] = datetime.now()
# 检查是否已存在该短剧的记录
existing_doc = target_collection.find_one({'mix_id': mix_id})
# 准备更新字段(不包含锁定字段,锁定字段将在后面单独处理)
set_fields = {
# 按照用户指定的字段顺序设置
'batch_id': target_doc.get('batch_id', ''),
'batch_time': target_doc.get('batch_time', datetime.now()),
'item_sequence': target_doc.get('item_sequence', 0),
'mix_name': target_doc.get('mix_name', ''),
'mix_id': mix_id,
'video_url': target_doc.get('video_url', ''),
'playcount': target_doc.get('playcount', ''),
'play_vv': target_doc.get('play_vv', 0),
'request_id': target_doc.get('request_id', ''),
'rank': target_doc.get('rank', 0),
'cover_image_url_original': target_doc.get('cover_image_url_original', ''),
'cover_image_url': target_doc.get('cover_image_url', ''),
'cover_upload_success': target_doc.get('cover_upload_success', True),
'cover_backup_urls': target_doc.get('cover_backup_urls', []),
'series_author': target_doc.get('series_author', ''),
'desc': target_doc.get('desc', ''),
'updated_to_episode': target_doc.get('updated_to_episode', 0),
'episode_video_ids': target_doc.get('episode_video_ids', []),
'episode_details': target_doc.get('episode_details', []),
'data_status': target_doc.get('data_status', ''),
'realtime_saved': target_doc.get('realtime_saved', True),
'created_at': target_doc.get('created_at', datetime.now()),
'last_updated': target_doc['last_updated']
# 注意:分类字段 Novel_IDs, Anime_IDs, Drama_IDs 不在此处设置
# 因为爬虫数据不包含这些用户手动设置的分类信息
# 这些字段只在保护逻辑中处理,避免覆盖现有数据
}
# 锁定字段保护逻辑检查field_lock_status来决定是否更新锁定字段
# 规则如果字段被用户锁定field_lock_status中包含该字段则跳过更新
# 如果字段未被锁定,且现有记录中这些字段有值,则跳过更新(保持原值)
# 如果字段未被锁定,且现有记录中这些字段为空,且新数据有值,则更新
# 如果是新记录,则使用新数据的值
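# 决策示意(以 Manufacturing_Field 为例,其余受保护字段 Copyright_field / Novel_IDs / Anime_IDs / Drama_IDs 同理):
#   字段已被锁定(field_lock_status)      -> 不更新
#   未锁定,现有值非空                     -> 不更新(保持原值)
#   未锁定,现有值为空,新值非空            -> 更新为新值
#   未锁定,现有值为空,新值也为空          -> 不设置该字段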
if existing_doc:
# 记录已存在,检查锁定字段保护
existing_field_lock_status = existing_doc.get('field_lock_status', {})
existing_manufacturing = existing_doc.get('Manufacturing_Field', '')
existing_copyright = existing_doc.get('Copyright_field', '')
existing_novel_ids = existing_doc.get('Novel_IDs', [])
existing_anime_ids = existing_doc.get('Anime_IDs', [])
existing_drama_ids = existing_doc.get('Drama_IDs', [])
new_manufacturing = target_doc.get('Manufacturing_Field', '')
new_copyright = target_doc.get('Copyright_field', '')
# 注意不从target_doc获取分类字段因为爬虫数据不包含这些字段
# 分类字段只能由用户手动设置,爬虫不应该更新它们
new_novel_ids = [] # 爬虫数据不包含此字段
new_anime_ids = [] # 爬虫数据不包含此字段
new_drama_ids = [] # 爬虫数据不包含此字段
# Manufacturing_Field 保护逻辑
if existing_field_lock_status.get('Manufacturing_Field_locked', False):
# 字段被用户锁定,跳过更新
logging.info(f'[锁定字段] 跳过Manufacturing_Field更新: {mix_name} -> 字段已被用户锁定')
elif existing_manufacturing:
# 现有字段有值跳过更新不添加到set_fields中
logging.info(f'[锁定字段] 跳过Manufacturing_Field更新: {mix_name} -> 保持现有值 "{existing_manufacturing}"')
elif new_manufacturing:
# 现有字段为空,且新数据有值,则更新
set_fields['Manufacturing_Field'] = new_manufacturing
logging.info(f'[锁定字段] 更新Manufacturing_Field: {mix_name} -> "{new_manufacturing}"')
# 如果现有为空且新数据也为空,则不设置该字段(保持为空)
# Copyright_field 保护逻辑
if existing_field_lock_status.get('Copyright_field_locked', False):
# 字段被用户锁定,跳过更新
logging.info(f'[锁定字段] 跳过Copyright_field更新: {mix_name} -> 字段已被用户锁定')
elif existing_copyright:
# 现有字段有值跳过更新不添加到set_fields中
logging.info(f'[锁定字段] 跳过Copyright_field更新: {mix_name} -> 保持现有值 "{existing_copyright}"')
elif new_copyright:
# 现有字段为空,且新数据有值,则更新
set_fields['Copyright_field'] = new_copyright
logging.info(f'[锁定字段] 更新Copyright_field: {mix_name} -> "{new_copyright}"')
# 如果现有为空且新数据也为空,则不设置该字段(保持为空)
# Novel_IDs 保护逻辑
if existing_field_lock_status.get('Novel_IDs_locked', False):
# 字段被用户锁定,跳过更新
logging.info(f'[锁定字段] 跳过Novel_IDs更新: {mix_name} -> 字段已被用户锁定')
elif existing_novel_ids and len(existing_novel_ids) > 0:
# 现有字段有值跳过更新不添加到set_fields中
logging.info(f'[锁定字段] 跳过Novel_IDs更新: {mix_name} -> 保持现有值 {existing_novel_ids}')
elif new_novel_ids and len(new_novel_ids) > 0:
# 现有字段为空,且新数据有值,则更新
set_fields['Novel_IDs'] = new_novel_ids
logging.info(f'[锁定字段] 更新Novel_IDs: {mix_name} -> {new_novel_ids}')
# 如果现有为空且新数据也为空,则不设置该字段(保持为空)
# Anime_IDs 保护逻辑
if existing_field_lock_status.get('Anime_IDs_locked', False):
# 字段被用户锁定,跳过更新
logging.info(f'[锁定字段] 跳过Anime_IDs更新: {mix_name} -> 字段已被用户锁定')
elif existing_anime_ids and len(existing_anime_ids) > 0:
# 现有字段有值跳过更新不添加到set_fields中
logging.info(f'[锁定字段] 跳过Anime_IDs更新: {mix_name} -> 保持现有值 {existing_anime_ids}')
elif new_anime_ids and len(new_anime_ids) > 0:
# 现有字段为空,且新数据有值,则更新
set_fields['Anime_IDs'] = new_anime_ids
logging.info(f'[锁定字段] 更新Anime_IDs: {mix_name} -> {new_anime_ids}')
# 如果现有为空且新数据也为空,则不设置该字段(保持为空)
# Drama_IDs 保护逻辑
if existing_field_lock_status.get('Drama_IDs_locked', False):
# 字段被用户锁定,跳过更新
logging.info(f'[锁定字段] 跳过Drama_IDs更新: {mix_name} -> 字段已被用户锁定')
elif existing_drama_ids and len(existing_drama_ids) > 0:
# 现有字段有值跳过更新不添加到set_fields中
logging.info(f'[锁定字段] 跳过Drama_IDs更新: {mix_name} -> 保持现有值 {existing_drama_ids}')
elif new_drama_ids and len(new_drama_ids) > 0:
# 现有字段为空,且新数据有值,则更新
set_fields['Drama_IDs'] = new_drama_ids
logging.info(f'[锁定字段] 更新Drama_IDs: {mix_name} -> {new_drama_ids}')
# 如果现有为空且新数据也为空,则不设置该字段(保持为空)
else:
# 新记录,只设置非分类字段
set_fields['Manufacturing_Field'] = target_doc.get('Manufacturing_Field', '')
set_fields['Copyright_field'] = target_doc.get('Copyright_field', '')
# 注意:不设置分类字段 Novel_IDs, Anime_IDs, Drama_IDs
# 因为爬虫数据不包含这些用户手动设置的分类信息
# 新记录的分类字段将保持为空,等待用户手动设置
logging.info(f'[锁定字段] 新记录,设置初始非分类字段: {mix_name}')
# 使用upsert操作如果存在则更新不存在则插入
upsert_result = target_collection.update_one(
{'mix_id': mix_id}, # 查询条件
{
# 所有字段均在 $set 中设置;不要传入空的 '$setOnInsert': {}MongoDB 会拒绝空的更新操作符
'$set': set_fields
},
upsert=True # 如果不存在则插入
)
if upsert_result.upserted_id:
# 新插入的文档
document_id = upsert_result.upserted_id
logging.info(f'[数据保存] ✅ 新短剧添加: {mix_name} - 文档ID: {document_id}')
else:
# 更新的现有文档
existing_doc = target_collection.find_one({'mix_id': mix_id}, {'_id': 1})
document_id = existing_doc['_id'] if existing_doc else None
logging.info(f'[数据保存] 🔄 已有短剧更新: {mix_name} - 文档ID: {document_id}')
except Exception as e:
logging.error(f'[数据保存] 目标数据库操作失败: {mix_name} - 错误: {e}')
# 记录已保存的项目
self.saved_items.add(item_key)
logging.info(f'[数据保存] 🎯 合集基础信息保存完成: {mix_name} (播放量: {item_data.get("play_vv", 0):,})')
return document_id
except Exception as e:
logging.error(f'[立即保存] 保存合集基础信息失败: {item_data.get("mix_name", "未知")} - 错误: {e}')
return None
def update_collection_video_ids(self, document_id, mix_id: str, mix_name: str, current_episode_count: int):
"""更新合集的视频ID列表第二阶段更新"""
target_collection = self.collection # 使用根据模式选择的集合
if not self.realtime_save_enabled or target_collection is None or not document_id:
return False
try:
logging.info(f'[增量更新] 开始获取合集 {mix_name} 的视频ID列表')
# 获取视频ID列表
episode_video_ids = self.get_collection_videos(
mix_id=mix_id,
mix_name=mix_name,
current_episode_count=current_episode_count
)
if episode_video_ids:
# 管理数据库更新逻辑
update_data = {
'$set': {
'episode_video_ids': episode_video_ids,
'data_status': 'video_ids_updated',
'last_updated': datetime.now()
}
}
# 更新目标数据库
try:
# 根据mix_id查找目标数据库中的文档
update_result = target_collection.update_one(
{'mix_id': mix_id},
update_data
)
if update_result.modified_count > 0:
logging.info(f'[数据更新] ✅ 视频ID列表更新完成: {mix_name} - 共 {len(episode_video_ids)} 个视频')
else:
# modified_count 为 0 通常表示列表与库中数据一致,不视为失败,仍返回已获取的ID列表
logging.info(f'[数据更新] 视频ID列表无变化: {mix_name} - 共 {len(episode_video_ids)} 个视频')
return episode_video_ids
except Exception as e:
logging.error(f'[数据更新] 视频ID更新失败: {mix_name} - 错误: {e}')
else:
logging.warning(f'[增量更新] 未获取到视频ID: {mix_name}')
return []
except Exception as e:
logging.error(f'[增量更新] 更新视频ID列表失败: {mix_name} - 错误: {e}')
return []
def update_single_video_details(self, document_id, episode_number: int, video_id: str, video_details: dict, mix_name: str):
"""更新单个视频的详细数据(第三阶段增量更新)"""
target_collection = self.collection # 使用根据模式选择的集合
if not self.realtime_save_enabled or target_collection is None or not document_id:
return False
# 确保 episode_number 是整数类型
try:
episode_number = int(episode_number)
except (ValueError, TypeError):
logging.error(f'update_single_video_details: episode_number 类型转换失败: {episode_number}, 类型: {type(episode_number)}')
return False
try:
# 构建更新的视频详细信息
episode_info = {
'episode_number': episode_number,
'video_id': video_id,
'likes': video_details.get('likes', 0),
'shares': video_details.get('shares', 0),
'favorites': video_details.get('favorites', 0),
'likes_formatted': self.format_interaction_count(video_details.get('likes', 0)),
'shares_formatted': self.format_interaction_count(video_details.get('shares', 0)),
'favorites_formatted': self.format_interaction_count(video_details.get('favorites', 0)),
'comments': video_details.get('comments', []),
'data_status': 'completed'
}
# 双数据库更新逻辑
update_data = {
'$set': {
f'episode_details.{episode_number - 1}': episode_info,
'last_updated': datetime.now()
}
}
# 更新目标数据库
if target_collection is not None:
try:
# 直接使用document_id查找目标数据库中的文档
update_result = target_collection.update_one(
{'_id': document_id},
update_data
)
if update_result.modified_count > 0:
logging.info(f'[数据更新] ✅ 第 {episode_number} 集详细数据更新完成: {mix_name} - 点赞: {video_details.get("likes", 0):,}, 评论: {len(video_details.get("comments", []))}')
return True
else:
logging.warning(f'[数据更新] 第 {episode_number} 集详细数据更新失败: {mix_name}')
return False
except Exception as e:
logging.error(f'[数据更新] 第 {episode_number} 集详细数据更新失败: {mix_name} - 错误: {e}')
return False
else:
logging.warning(f'[数据更新] 目标数据库第 {episode_number} 集详细数据更新失败: {mix_name}')
return False
except Exception as e:
logging.error(f'[增量更新] 更新第 {episode_number} 集详细数据失败: {mix_name} - 错误: {e}')
return False
def update_video_comments_realtime(self, document_id, episode_number: int, new_comments: list = None, mix_name: str = '', mix_id: str = '', interaction_data: dict = None):
"""实时更新视频评论和互动数据(第四阶段实时更新)"""
target_collection = self.collection # 使用根据模式选择的集合
if not self.realtime_save_enabled or target_collection is None or not document_id:
return False
# 确保 episode_number 是整数类型
try:
episode_number = int(episode_number)
except (ValueError, TypeError):
logging.error(f'episode_number 类型转换失败: {episode_number}, 类型: {type(episode_number)}')
return False
# 检查是否有数据需要更新
if not new_comments and not interaction_data:
return False
try:
# 构建更新操作
update_operations = {}
episode_prefix = f'episode_details.{episode_number - 1}'
# 处理评论更新
if new_comments:
update_operations['$push'] = {
f'{episode_prefix}.comments': {'$each': new_comments}
}
# 处理互动数据更新
set_fields = {'last_updated': datetime.now()}
if interaction_data:
# 更新点赞数据
if 'likes' in interaction_data:
set_fields[f'{episode_prefix}.likes'] = interaction_data['likes']
set_fields[f'{episode_prefix}.likes_formatted'] = interaction_data.get('likes_formatted', self.format_interaction_count(interaction_data['likes']))
# 更新分享数据
if 'shares' in interaction_data:
set_fields[f'{episode_prefix}.shares'] = interaction_data['shares']
set_fields[f'{episode_prefix}.shares_formatted'] = interaction_data.get('shares_formatted', self.format_interaction_count(interaction_data['shares']))
# 更新收藏数据
if 'favorites' in interaction_data:
set_fields[f'{episode_prefix}.favorites'] = interaction_data['favorites']
set_fields[f'{episode_prefix}.favorites_formatted'] = interaction_data.get('favorites_formatted', self.format_interaction_count(interaction_data['favorites']))
update_operations['$set'] = set_fields
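# 生成的更新文档示例(示意,非真实数据,episode_number=3 时下标为 2
#   {'$push': {'episode_details.2.comments': {'$each': [...]}},
#    '$set': {'episode_details.2.likes': 123, 'episode_details.2.likes_formatted': '123', 'last_updated': ...}}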
# 目标数据库更新逻辑
if target_collection is not None:
try:
# 直接使用document_id查找目标数据库中的文档
update_result = target_collection.update_one(
{'_id': document_id},
update_operations
)
if update_result.modified_count > 0:
# 构建日志信息
log_parts = []
if new_comments:
log_parts.append(f"追加 {len(new_comments)} 条评论")
if interaction_data:
interaction_summary = []
if 'likes' in interaction_data:
interaction_summary.append(f"点赞={interaction_data.get('likes_formatted', interaction_data['likes'])}")
if 'shares' in interaction_data:
interaction_summary.append(f"分享={interaction_data.get('shares_formatted', interaction_data['shares'])}")
if 'favorites' in interaction_data:
interaction_summary.append(f"收藏={interaction_data.get('favorites_formatted', interaction_data['favorites'])}")
if interaction_summary:
log_parts.append(f"更新互动数据({', '.join(interaction_summary)})")
logging.info(f'[目标数据库] ✅ 第 {episode_number} 集评论/互动数据更新完成: {mix_name} - {", ".join(log_parts)}')
return True
else:
logging.warning(f'[目标数据库] 第 {episode_number} 集评论/互动数据更新失败: {mix_name}')
return False
except Exception as e:
logging.error(f'[目标数据库] 第 {episode_number} 集评论/互动数据更新失败: {mix_name} - 错误: {e}')
return False
else:
logging.error(f'[目标数据库] 目标数据库未初始化')
return False
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': f'实时更新视频评论,合集: {mix_name}, 第 {episode_number} 集, 文档ID: {document_id}, 新评论数: {len(new_comments) if new_comments else 0}'
}
logging.error(f'[实时更新] 更新失败: {mix_name}{episode_number} 集 - {error_details["error_type"]}: {error_details["error_message"]}')
logging.error(f'详细错误信息: {error_details["traceback"]}')
logging.error(f'错误上下文: {error_details["context"]}')
return False
def save_single_item_realtime(self, item_data: dict):
"""分阶段实时保存合集数据(新版本)"""
logging.info(f'[分阶段保存] 开始处理合集: {item_data.get("mix_name", "未知")}')
# 第一阶段:立即保存基础信息
document_id = self.save_collection_basic_info(item_data)
if not document_id:
return False
# 第二阶段获取并更新视频ID列表
mix_id = item_data.get('mix_id', '')
mix_name = item_data.get('mix_name', '')
current_episode_count = item_data.get('updated_to_episode', 0)
if mix_id and current_episode_count > 0:
episode_video_ids = self.update_collection_video_ids(document_id, mix_id, mix_name, current_episode_count)
# 第三阶段:逐个获取并更新视频详细数据
if episode_video_ids:
self.update_video_details_incrementally(document_id, episode_video_ids, mix_name, mix_id)
# 🔄 第四阶段触发字段同步到Ranking_storage如果存在对应的榜单数据
try:
if mix_name: # 只有当mix_name存在时才尝试同步
logging.info(f'[字段同步] 检查是否需要同步字段到Ranking_storage: {mix_name}')
# 导入同步函数(延迟导入避免循环依赖)
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'routers'))
from rank_api_routes import sync_ranking_storage_fields
# 获取今天的日期
today_str = datetime.now().strftime('%Y-%m-%d')
# 检查Ranking_storage中是否存在该短剧的今日数据
ranking_storage_collection = db['Ranking_storage']
existing_ranking = ranking_storage_collection.find_one({
"date": today_str,
"mix_name": mix_name
})
if existing_ranking:
# 存在对应的榜单数据,触发同步
logging.info(f'[字段同步] 发现对应的榜单数据,开始同步: {mix_name}')
sync_result = sync_ranking_storage_fields(target_date=today_str, force_update=False)
if sync_result.get("success", False):
logging.info(f'[字段同步] ✅ 同步成功: {sync_result.get("message", "")}')
else:
logging.info(f'[字段同步] ⚠️ 同步完成: {sync_result.get("message", "")}')
else:
logging.info(f'[字段同步] 未找到对应的榜单数据,跳过同步: {mix_name}')
except Exception as sync_error:
logging.warning(f'[字段同步] 同步失败,但不影响数据保存: {mix_name} - {sync_error}')
# 同步失败不影响数据保存的成功状态
return True
def update_video_details_incrementally(self, document_id, episode_video_ids: list, mix_name: str, mix_id: str = ''):
"""增量更新视频详细数据"""
logging.info(f'[增量更新] 开始逐个获取视频详细数据: {mix_name}')
for i, video_id in enumerate(episode_video_ids, 1):
if not video_id:
logging.warning(f'[增量更新] 第 {i} 集视频ID为空跳过: {mix_name}')
continue
try:
# 获取单个视频的详细数据
logging.info(f'[增量更新] 获取第 {i}/{len(episode_video_ids)} 集视频详细数据: {mix_name}')
video_details = self.get_video_details(video_id, mix_name, mix_id, document_id, i)
if video_details and video_details.get('success', False):
# 立即更新到数据库
self.update_single_video_details(document_id, i, video_id, video_details, mix_name)
else:
logging.warning(f'[增量更新] 第 {i} 集视频详细数据获取失败: {mix_name}')
# 添加随机延迟避免请求过快,模拟人类行为
if i < len(episode_video_ids): # 不是最后一个视频时才延迟
random_delay = self.anti_detection.get_human_like_delay()
logging.info(f'🕐 [增量更新] 视频间隔等待时间: {random_delay:.1f} 秒')
time.sleep(random_delay)
except Exception as e:
logging.error(f'[增量更新] 处理第 {i} 集视频时出错: {mix_name} - {e}')
continue
def get_video_info(self, video_id: str) -> dict:
"""获取视频详细信息
Args:
video_id: 视频ID
Returns:
dict: 包含视频详细信息的字典
"""
video_url = f'https://www.douyin.com/video/{video_id}'
logging.info(f'获取视频信息: {video_url}')
# 清除之前的网络日志
self.driver.execute_cdp_cmd('Network.clearBrowserCache', {})
self.driver.execute_cdp_cmd('Network.clearBrowserCookies', {})
self.driver.get(video_url)
time.sleep(3)
# 等待页面加载完成
try:
WebDriverWait(self.driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "video"))
)
except Exception as e:
logging.warning(f'等待视频元素超时: {e}')
# 获取网络请求日志
logs = self.driver.get_log('performance')
video_info = {}
for entry in logs:
try:
log = json.loads(entry['message'])['message']
if (
'Network.responseReceived' in log['method']
and 'response' in log['params']
and log['params']['response']
and 'url' in log['params']['response']
and '/web/api/v2/aweme/iteminfo' in log['params']['response']['url']
):
request_id = log['params']['requestId']
response = self.driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
if response and 'body' in response:
data = json.loads(response['body'])
if 'item_list' in data and len(data['item_list']) > 0:
item = data['item_list'][0]
video_info = {
'video_id': item.get('aweme_id'),
'create_time': item.get('create_time'),
'desc': item.get('desc'),
'duration': item.get('duration'),
'mix_info': {
'mix_id': item.get('mix_info', {}).get('mix_id'),
'mix_name': item.get('mix_info', {}).get('mix_name'),
'total': item.get('mix_info', {}).get('total')
}
}
break
except Exception as e:
logging.warning(f'解析日志条目时出错: {e}')
return video_info
def get_collection_videos(self, mix_id: str, mix_name: str = '', current_episode_count: int = 0) -> list:
"""获取合集中的所有视频ID列表支持增量更新
Args:
mix_id: 合集ID
mix_name: 合集名称,用于日志
current_episode_count: 当前已知的集数
Returns:
list: 按集数排序的视频ID列表
"""
# 定时器模式下跳过此函数
if os.environ.get('TIMER_MODE') == '1':
logging.info(f'定时器模式:跳过 get_collection_videos 函数')
return []
try:
# 检查缓存文件
cache_dir = os.path.join(os.path.dirname(__file__), 'episode_video_ids')
# 确保缓存目录存在
os.makedirs(cache_dir, exist_ok=True)
cache_file = os.path.join(cache_dir, f'video_ids_{mix_id}.json')
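# 缓存文件内容示例(示意,video_id 为假设值):
#   {"episodes": [{"video_id": "7123...", "episode_num": 1}, ...],
#    "total_count": 80, "last_update": "2024-01-01T12:00:00", "mix_name": "某短剧"}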
cached_videos = []
try:
if os.path.exists(cache_file):
with open(cache_file, 'r', encoding='utf-8') as f:
cache_data = json.load(f)
cached_videos = cache_data.get('episodes', [])
last_update = cache_data.get('last_update')
# 如果缓存的集数等于当前集数,直接返回缓存的结果
if len(cached_videos) == current_episode_count:
logging.info(f"使用缓存的视频列表: {mix_name} (ID: {mix_id})")
return [video['video_id'] for video in cached_videos]
except Exception as e:
logging.warning(f"读取缓存文件失败: {e}")
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Referer': 'https://www.douyin.com/',
}
params = {
'device_platform': 'webapp',
'aid': '6383',
'channel': 'channel_pc_web',
'pc_client_type': '1',
'version_code': '170400',
'version_name': '17.4.0',
'cookie_enabled': 'true',
'platform': 'PC',
'downlink': '10',
'mix_id': mix_id,
'cursor': '0',
'count': '30',
'screen_width': '1920',
'screen_height': '1080',
'browser_language': 'zh-CN',
'browser_platform': 'Win32',
'browser_name': 'Chrome',
'browser_version': '120.0.0.0',
'browser_online': 'true',
'engine_name': 'Blink',
'engine_version': '120.0.0.0',
'os_name': 'Windows',
'os_version': '10',
'cpu_core_num': '16',
'device_memory': '8',
'effective_type': '4g',
'round_trip_time': '50',
}
all_videos = []
# 使用服务端提供的游标进行分页,而不是使用 len(all_videos)
cursor = 0
seen_cursors = set()
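# 预期的分页响应结构(示意,字段名在不同接口版本间可能不同,下方已做兼容处理):
#   {"aweme_list": [...], "has_more": 1, "cursor": 30}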
while True:
# 将当前游标设置到请求参数(字符串以兼容部分接口)
params['cursor'] = str(cursor)
response = requests.get(
'https://www.douyin.com/aweme/v1/web/mix/aweme/',
params=params,
cookies=self.get_cookies_dict(),
headers=headers
)
if response.status_code != 200:
logging.error(f"请求失败: {response.status_code}")
logging.error(f"响应内容: {response.text}")
break
try:
data = response.json()
# 兼容可能的列表字段名
aweme_list = data.get('aweme_list') or data.get('mix_aweme_list') or []
if not aweme_list:
logging.info("当前页无视频,结束分页")
break
for aweme in aweme_list:
video_id = aweme.get('aweme_id')
if video_id:
all_videos.append({
'video_id': video_id,
'episode_num': int(aweme.get('episode_num', 0))
})
# 读取服务端分页标识
has_more = data.get('has_more') or data.get('hasMore') or False
next_cursor = (
data.get('cursor') or
data.get('next_cursor') or
data.get('max_cursor') or
data.get('min_cursor')
)
logging.info(f"分页: cursor={cursor}, next_cursor={next_cursor}, has_more={has_more}, 本页视频={len(aweme_list)}, 累计={len(all_videos)}")
# 退出条件:没有更多或没有有效下一游标
if not has_more or not next_cursor:
break
# 防止重复游标导致的死循环
if next_cursor in seen_cursors:
logging.warning(f"检测到重复游标 {next_cursor},停止分页以避免死循环")
break
seen_cursors.add(next_cursor)
cursor = next_cursor
time.sleep(1)
except json.JSONDecodeError as e:
logging.error(f"JSON解析错误: {e}")
logging.error(f"响应内容: {response.text}")
break
if not all_videos:
if cached_videos:
logging.warning(f"获取视频列表失败,使用缓存数据: {mix_name} (ID: {mix_id})")
return [video['video_id'] for video in cached_videos]
return []
logging.info(f"获取到 {len(all_videos)} 个视频ID")
# 按集数排序
all_videos.sort(key=lambda x: x['episode_num'])
# 整理视频ID和集数信息
episode_info = []
for video in all_videos:
episode_info.append({
'video_id': video['video_id'],
'episode_num': video['episode_num']
})
# 检查是否有新增视频
if len(episode_info) > len(cached_videos):
logging.info(f"发现新增视频: {mix_name} (ID: {mix_id}), 新增 {len(episode_info) - len(cached_videos)}")
# 保存到缓存文件
with open(cache_file, 'w', encoding='utf-8') as f:
json.dump({
'episodes': episode_info,
'total_count': len(episode_info),
'last_update': datetime.now().isoformat(),
'mix_name': mix_name
}, f, ensure_ascii=False, indent=2)
# 返回视频ID列表
return [video['video_id'] for video in all_videos]
except Exception as e:
logging.error(f"获取合集视频时出错: {e}")
# 如果出错且有缓存,返回缓存的结果
if cached_videos:
logging.warning(f"使用缓存的视频列表: {mix_name} (ID: {mix_id})")
return [video['video_id'] for video in cached_videos]
return []
def _simulate_comment_scrolling(self, video_id: str, max_scroll_attempts: int = 10, scroll_delay: float = 2.0,
document_id=None, episode_number: int = 0, mix_name: str = '', mix_id: str = '',
max_comments: int = 100) -> list:
"""
模拟用户异步滑动机制,向上滑动加载更多评论
Args:
video_id: 视频ID
max_scroll_attempts: 最大滑动尝试次数,默认10次
scroll_delay: 每次滑动后的延迟时间,默认2秒
max_comments: 每集最大评论数量限制,默认100条
Returns:
list: 收集到的所有评论数据
"""
# 确保 episode_number 是整数类型
try:
episode_number = int(episode_number)
except (ValueError, TypeError):
logging.error(f'_simulate_comment_scrolling: episode_number 类型转换失败: {episode_number}, 类型: {type(episode_number)}')
episode_number = 0
# 检查是否应该跳过评论滑动(仅在定时器模式下跳过)
if should_skip_function('scroll_comments'):
logging.info(f'🚀 定时器模式:跳过视频 {video_id} 的评论滑动加载')
return []
# 首先检查视频是否真的没有评论(检测"抢首评"按钮)
if self._check_first_comment_button():
logging.info(f'检测到视频 {video_id} 没有评论(存在"抢首评"按钮),跳过评论抓取')
return []
all_comments = []
collected_comment_ids = set()
try:
logging.info(f'开始为视频 {video_id} 执行评论滑动加载机制')
# 等待页面加载完成
time.sleep(3)
# 定位评论区域
self._scroll_to_comment_section()
# 点击评论区域以触发网络请求
self._click_comment_area()
# 创建共享状态对象,用于任务间通信
shared_state = {
'scroll_completed': False,
'lock': threading.Lock()
}
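# 并发模型说明:滑动任务与监控任务通过 shared_state['scroll_completed'] 协调,
# 任一方将其置为 True(滑到底部、出错或评论数达到上限)后,另一方随之退出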
with ThreadPoolExecutor(max_workers=2) as executor:
# 提交滑动任务
scroll_future = executor.submit(self._async_scroll_task_with_state, max_scroll_attempts, scroll_delay, shared_state)
# 同时提交监控任务 - 监控任务会检测滑动任务状态5小时超时
monitor_future = executor.submit(self._async_monitor_task_with_state, video_id, collected_comment_ids, shared_state, 18000,
document_id, episode_number, mix_name, mix_id, max_comments)
# 等待两个任务完成
scroll_result = scroll_future.result()
monitor_comments = monitor_future.result()
all_comments.extend(monitor_comments)
logging.info(f'评论滑动加载完成,共收集到 {len(all_comments)} 条评论')
# 保存评论到文件
if all_comments:
self.save_comments_to_file(all_comments, video_id)
# 添加随机停留时间,防止网页被爬取崩溃
rest_time = random.uniform(10, 20) # 10-20秒随机停留
logging.info(f'评论抓取完成,停留 {rest_time:.1f} 秒以保护网页稳定性...')
time.sleep(rest_time)
return all_comments
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': f'评论滑动加载机制视频ID: {video_id}, 最大滑动次数: {max_scroll_attempts}'
}
logging.error(f'评论滑动加载机制执行失败: {error_details["error_type"]} - {error_details["error_message"]}')
logging.error(f'详细错误信息: {error_details["traceback"]}')
logging.error(f'错误上下文: {error_details["context"]}')
return all_comments
def _async_scroll_task_with_state(self, max_attempts: int, scroll_delay: float, shared_state: dict):
"""带状态的异步滑动任务 - 无限滑动直到检测到"暂时没有更多评论"文本"""
try:
consecutive_no_progress = 0 # 连续无进展次数
attempt = 0
logging.info('开始无限滑动,直到检测到"暂时没有更多评论"')
while True: # 无限循环,直到检测到底部文本
attempt += 1
logging.info(f'{attempt} 次向上滑动')
# 检查监控任务是否通知停止
with shared_state['lock']:
if shared_state['scroll_completed']:
logging.info('收到监控任务停止信号,滑动任务结束')
break
# 记录滑动前的位置
current_position = self.driver.execute_script("return window.pageYOffset;")
# 执行向上滑动(加载更多评论)
self._execute_upward_scroll(attempt)
# 等待新内容加载
time.sleep(scroll_delay)
# 优先检查是否到达底部(检测到"暂时没有更多评论"文本)
if self._check_comment_section_bottom():
logging.info('检测到"暂时没有更多评论",停止滑动')
break
# 检查滑动是否有效果
new_position = self.driver.execute_script("return window.pageYOffset;")
if abs(new_position - current_position) < 50: # 滑动距离太小
consecutive_no_progress += 1
logging.debug(f'滑动进展较小,连续无进展次数: {consecutive_no_progress}')
# 如果连续多次无进展,增加滑动力度
if consecutive_no_progress >= 5:
logging.info('连续多次滑动无进展,增加滑动力度')
self._execute_force_scroll()
consecutive_no_progress = 0 # 重置计数器
time.sleep(scroll_delay * 2) # 增加等待时间
# 再次检查是否到达底部
if self._check_comment_section_bottom():
logging.info('强制滑动后检测到底部,停止滑动')
break
else:
consecutive_no_progress = 0
# 每50次滑动输出一次进度信息
if attempt % 50 == 0:
logging.info(f'已完成 {attempt} 次滑动,继续寻找"暂时没有更多评论"文本')
# 安全机制:如果滑动次数过多,暂停一下
if attempt % 200 == 0:
logging.info(f'已滑动 {attempt} 次,暂停5秒以避免过度请求')
time.sleep(5)
# 滑动任务完成,通知监控任务
with shared_state['lock']:
shared_state['scroll_completed'] = True
logging.info('滑动任务已完成,通知监控任务结束')
except Exception as e:
logging.warning(f'滑动任务出错: {e}')
# 即使出错也要通知监控任务结束
with shared_state['lock']:
shared_state['scroll_completed'] = True
def _execute_force_scroll(self):
"""执行强制滑动,用于突破可能的滑动阻塞"""
try:
logging.info('执行强制滑动以突破阻塞')
# 执行多重强制滑动策略
self.driver.execute_script("""
// 1. 多次大幅度滑动
for (let i = 0; i < 5; i++) {
window.scrollBy(0, 1000);
document.documentElement.scrollTop += 1000;
document.body.scrollTop += 1000;
}
// 2. 滑动到页面最底部
window.scrollTo(0, document.body.scrollHeight);
// 3. 强制滚动所有容器
const containers = document.querySelectorAll('[data-e2e="comment-list"], .comment-list, [class*="comment"], [class*="scroll"]');
containers.forEach(container => {
if (container.scrollTop !== undefined) {
container.scrollTop = container.scrollHeight;
container.dispatchEvent(new Event('scroll', { bubbles: true }));
}
});
// 4. 触发所有滚动相关事件
['scroll', 'wheel', 'touchmove', 'resize', 'load'].forEach(eventType => {
window.dispatchEvent(new Event(eventType, { bubbles: true }));
document.dispatchEvent(new Event(eventType, { bubbles: true }));
});
// 5. 模拟用户交互
document.body.click();
console.log('执行强制滑动完成');
""")
time.sleep(3) # 增加等待时间
# 再次滑动到底部确保效果
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(1)
logging.debug('强制滑动操作完成')
except Exception as e:
logging.warning(f'执行强制滑动失败: {e}')
def _execute_upward_scroll(self, attempt: int):
"""执行向上滑动操作 - 使用强力滑动策略确保有效触发懒加载"""
try:
# 记录滑动前状态
before_state = self.driver.execute_script("""
return {
scrollTop: window.pageYOffset,
commentCount: document.querySelectorAll('[data-e2e="comment-item"], [class*="comment"], .comment-item').length,
pageHeight: document.documentElement.scrollHeight
};
""")
logging.debug(f'滑动前状态: 位置={before_state["scrollTop"]}px, 评论数={before_state["commentCount"]}')
# 计算滑动距离,递增以确保效果
scroll_distance = 800 + (attempt * 300)
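# 例如:第1次滑动约 1100px第5次约 2300px,距离递增以持续触发懒加载(数值仅为示意)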
# 执行强力滚动 - 参考111.py的实现
self.driver.execute_script(f"""
// 1. 强制滚动页面
window.scrollBy(0, {scroll_distance});
document.documentElement.scrollTop += {scroll_distance};
document.body.scrollTop += {scroll_distance};
// 2. 滚动到页面底部(触发懒加载)
window.scrollTo(0, document.body.scrollHeight);
// 3. 查找并滚动所有可能的评论容器
const containers = document.querySelectorAll('[data-e2e="comment-list"], .comment-list, [class*="comment"], [class*="scroll"], [role="main"]');
containers.forEach(container => {{
if (container.scrollTop !== undefined) {{
container.scrollTop = container.scrollHeight;
container.dispatchEvent(new Event('scroll', {{ bubbles: true }}));
}}
}});
// 4. 触发所有相关事件
['scroll', 'wheel', 'touchmove', 'resize'].forEach(eventType => {{
window.dispatchEvent(new Event(eventType, {{ bubbles: true }}));
document.dispatchEvent(new Event(eventType, {{ bubbles: true }}));
}});
// 5. 模拟用户交互
document.body.click();
console.log('执行强力滚动:', {scroll_distance}, 'px');
""")
time.sleep(2) # 等待页面响应
# 尝试点击加载更多按钮(如果存在)
try:
button_clicked = self.driver.execute_script("""
const selectors = [
'[data-e2e="comment-load-more"]',
'[class*="load-more"]',
'[class*="more-comment"]',
'button[class*="load"]',
'div[class*="load"]'
];
for (let selector of selectors) {
const buttons = document.querySelectorAll(selector);
for (let button of buttons) {
if (button.offsetParent !== null && !button.disabled) {
button.click();
console.log('点击了加载更多按钮:', selector);
return true;
}
}
}
return false;
""")
if button_clicked:
logging.debug('成功点击了加载更多按钮')
time.sleep(1) # 等待按钮响应
except Exception as e:
logging.debug(f'点击加载更多按钮失败: {e}')
# 每隔几次使用真实手势滑动
if attempt % 3 == 0:
self._simulate_real_swipe()
logging.debug(f'执行强力滑动,距离: {scroll_distance}px')
except Exception as e:
logging.warning(f'执行滑动操作失败: {e}')
def _simulate_real_swipe(self):
"""模拟真实向上滑动手势 - 手指从下往上移动"""
try:
window_size = self.driver.get_window_size()
width = window_size['width']
height = window_size['height']
# 向上滑动手势:手指从屏幕下方往上方移动
start_x = width // 2 + random.randint(-20, 20) # 增加随机性
start_y = height * 4 // 5 # 从更靠下的位置开始4/5处
end_y = height // 5 # 到更靠上的位置结束1/5处
# 使用ActionChains模拟真实向上滑动手势
actions = ActionChains(self.driver)
actions.w3c_actions.pointer_action\
.move_to_location(start_x, start_y)\
.pointer_down()\
.pause(0.1)\
.move_to_location(start_x, end_y)\
.pause(0.1)\
.pointer_up()
actions.perform()
logging.debug(f'执行真实向上滑动手势: 从({start_x}, {start_y})到({start_x}, {end_y})')
except Exception as e:
logging.debug(f'真实手势滑动失败: {e}')
def _async_monitor_task(self, video_id: str, collected_comment_ids: set, timeout: float) -> list:
"""异步监控任务"""
all_comments = []
start_time = time.time()
while time.time() - start_time < timeout:
try:
# 从网络日志获取新评论
new_comments = self._extract_comments_from_network_logs(video_id)
# 去重并添加新评论
for comment in new_comments:
comment_id = f"{comment.get('text', '')}_{comment.get('user_name', '')}"
if comment_id not in collected_comment_ids:
collected_comment_ids.add(comment_id)
all_comments.append(comment)
if new_comments:
logging.info(f'监控到 {len(new_comments)} 条新评论,总计 {len(all_comments)}')
# 短暂等待后继续监控
time.sleep(1)
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': f'异步监控评论任务视频ID: {video_id}, 超时时间: {timeout}'
}
logging.warning(f'监控任务出错: {error_details["error_type"]} - {error_details["error_message"]}')
logging.warning(f'详细错误信息: {error_details["traceback"]}')
logging.warning(f'错误上下文: {error_details["context"]}')
time.sleep(2)
return all_comments
def _async_monitor_task_with_state(self, video_id: str, collected_comment_ids: set, shared_state: dict, timeout: float,
document_id=None, episode_number: int = 0, mix_name: str = '', mix_id: str = '',
max_comments: int = 100) -> list:
"""带状态的异步监控任务 - 监控评论并检测滑动任务状态"""
# 确保 episode_number 是整数类型
try:
episode_number = int(episode_number)
except (ValueError, TypeError):
logging.error(f'_async_monitor_task_with_state: episode_number 类型转换失败: {episode_number}, 类型: {type(episode_number)}')
episode_number = 0
all_comments = []
start_time = time.time()
last_comment_count = 0
no_new_comments_count = 0
logging.info('开始监控评论,将持续到滑动任务完成')
while time.time() - start_time < timeout:
try:
# 检查滑动任务是否完成
with shared_state['lock']:
scroll_completed = shared_state['scroll_completed']
if scroll_completed:
logging.info('检测到滑动任务已完成,监控任务即将结束')
# 滑动完成后再监控5秒确保收集到最后的评论
final_start = time.time()
while time.time() - final_start < 5:
try:
new_comments = self._extract_comments_from_network_logs(video_id)
for comment in new_comments:
comment_id = f"{comment.get('text', '')}_{comment.get('user_name', '')}"
if comment_id not in collected_comment_ids:
collected_comment_ids.add(comment_id)
all_comments.append(comment)
time.sleep(0.5)
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': f'最终监控阶段视频ID: {video_id}, 剩余监控时间: {5 - (time.time() - final_start):.1f} 秒'
}
logging.warning(f'最终监控阶段出错: {error_details["error_type"]} - {error_details["error_message"]}')
logging.warning(f'详细错误信息: {error_details["traceback"]}')
logging.warning(f'错误上下文: {error_details["context"]}')
break
# 从网络日志获取新评论
new_comments = self._extract_comments_from_network_logs(video_id)
# 去重并添加新评论
new_comments_to_save = []
for comment in new_comments:
comment_id = f"{comment.get('text', '')}_{comment.get('user_name', '')}"
if comment_id not in collected_comment_ids:
collected_comment_ids.add(comment_id)
all_comments.append(comment)
new_comments_to_save.append(comment)
# 实时保存新评论到数据库
if new_comments_to_save and document_id and episode_number > 0:
self.update_video_comments_realtime(document_id, episode_number, new_comments_to_save, mix_name, mix_id)
# 检查是否有新评论
current_comment_count = len(all_comments)
if current_comment_count > last_comment_count:
logging.info(f'监控到 {current_comment_count - last_comment_count} 条新评论,总计 {current_comment_count}')
last_comment_count = current_comment_count
no_new_comments_count = 0
else:
no_new_comments_count += 1
# 每30秒输出一次状态
if no_new_comments_count % 30 == 0:
logging.info(f'监控中...当前总计 {current_comment_count} 条评论,等待滑动任务完成')
# 检查是否达到评论数量限制
if current_comment_count >= max_comments:
logging.info(f'已收集到 {current_comment_count} 条评论,达到限制数量 {max_comments},通知滑动任务停止')
with shared_state['lock']:
shared_state['scroll_completed'] = True
break
# 短暂等待后继续监控
time.sleep(1)
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': f'带状态的异步监控评论任务视频ID: {video_id}, 超时时间: {timeout}秒, 文档ID: {document_id}, 集数: {episode_number}'
}
logging.warning(f'监控任务出错: {error_details["error_type"]} - {error_details["error_message"]}')
logging.warning(f'详细错误信息: {error_details["traceback"]}')
logging.warning(f'错误上下文: {error_details["context"]}')
time.sleep(2)
logging.info(f'监控任务结束,共收集到 {len(all_comments)} 条评论')
# 确保只返回前max_comments条评论
return all_comments[:max_comments]
def _scroll_to_comment_section(self):
"""滚动到评论区域"""
try:
comment_section_selectors = [
'[data-e2e="comment-list"]',
'[class*="comment-list"]',
'[class*="comment-container"]',
]
for selector in comment_section_selectors:
try:
elements = self.driver.find_elements("css selector", selector)
if elements:
self.driver.execute_script(
"arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});",
elements[0]
)
time.sleep(2)
logging.info(f'成功定位到评论区域: {selector}')
return
except Exception:
continue
# 备用方案:滚动到页面底部
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
logging.info('使用备用方案:滚动到页面底部')
except Exception as e:
logging.warning(f'定位评论区域失败: {e}')
def _click_comment_area(self):
"""
点击评论区域以触发网络请求,确保能够获取到评论数据
"""
try:
# 多种评论区域选择器
comment_selectors = [
'[data-e2e="comment-list"]',
'[class*="comment"]',
'[class*="Comment"]',
'.comment-list',
'.comment-container',
'[data-e2e="comment-item"]',
'[class*="comment-item"]',
'div[class*="comment"]',
# 抖音特有的评论区域选择器
'div[data-e2e="comment-list"]',
'div[class*="CommentList"]',
'div[class*="comment-list"]'
]
clicked = False
for selector in comment_selectors:
try:
elements = self.driver.find_elements("css selector", selector)
if elements:
for element in elements:
try:
if element.is_displayed() and element.is_enabled():
# 滚动到元素可见
self.driver.execute_script("arguments[0].scrollIntoView(true);", element)
time.sleep(0.5)
# 点击元素
element.click()
logging.info(f'成功点击评论区域: {selector}')
clicked = True
time.sleep(1) # 等待网络请求触发
break
except Exception as e:
logging.debug(f'点击元素失败: {e}')
continue
if clicked:
break
except Exception as e:
logging.debug(f'使用选择器 {selector} 查找评论区域失败: {e}')
continue
if not clicked:
# 如果没有找到特定的评论区域,尝试点击页面中部区域
try:
window_size = self.driver.get_window_size()
center_x = window_size['width'] // 2
center_y = window_size['height'] // 2
# 使用JavaScript点击页面中部
self.driver.execute_script(f"""
var element = document.elementFromPoint({center_x}, {center_y});
if (element) {{
element.click();
}}
""")
logging.info('点击页面中部区域以触发评论加载')
time.sleep(1)
except Exception as e:
logging.debug(f'点击页面中部失败: {e}')
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': f'点击评论区域,尝试激活评论加载'
}
logging.warning(f'点击评论区域失败: {error_details["error_type"]} - {error_details["error_message"]}')
logging.warning(f'详细错误信息: {error_details["traceback"]}')
logging.warning(f'错误上下文: {error_details["context"]}')
def _check_comment_section_bottom(self) -> bool:
"""
检测是否已经到达评论区底部
只有检测到"暂时没有更多评论"文本时才停止滑动,确保无限滑动直到真正到达底部
Returns:
bool: True表示已到达底部False表示还可以继续加载
"""
try:
# 目标文本:只有检测到这个文本才认为到达底部
target_text = "暂时没有更多评论"
logging.debug(f'正在检测评论区底部标识文本: "{target_text}"')
# 方法1: 使用XPath检测包含文本的元素
xpath_selectors = [
f"//*[contains(text(), '{target_text}')]",
f"//div[contains(text(), '{target_text}')]",
f"//span[contains(text(), '{target_text}')]",
f"//p[contains(text(), '{target_text}')]",
f"//*[text()='{target_text}']"
]
for xpath in xpath_selectors:
try:
elements = self.driver.find_elements("xpath", xpath)
if elements:
# 检查元素是否可见
for element in elements:
try:
if element.is_displayed():
logging.info(f'检测到评论区底部标识文本: "{target_text}" (通过XPath: {xpath})')
return True
except Exception:
continue
except Exception as e:
logging.debug(f'XPath检测失败 {xpath}: {e}')
continue
# 方法2: 使用JavaScript在页面中搜索文本
try:
js_result = self.driver.execute_script(f"""
// 搜索页面中所有包含目标文本的元素
var targetText = '{target_text}';
var walker = document.createTreeWalker(
document.body,
NodeFilter.SHOW_TEXT,
null,
false
);
var node;
while (node = walker.nextNode()) {{
if (node.textContent.includes(targetText)) {{
var element = node.parentElement;
if (element && element.offsetParent !== null) {{
return {{
found: true,
text: node.textContent.trim(),
tagName: element.tagName,
className: element.className
}};
}}
}}
}}
return {{found: false}};
""")
if js_result and js_result.get('found'):
logging.info(f'通过JavaScript检测到评论区底部标识文本: "{target_text}"')
logging.debug(f'元素信息: 标签={js_result.get("tagName")}, 类名={js_result.get("className")}, 文本="{js_result.get("text")}"')
return True
except Exception as e:
logging.debug(f'JavaScript文本检测失败: {e}')
# 方法3: 检查页面源码中是否包含完整的目标文本
try:
page_source = self.driver.page_source
if target_text in page_source:
# 进一步验证:使用正则表达式确保是完整的文本匹配
pattern = re.escape(target_text)
if re.search(pattern, page_source):
logging.info(f'在页面源码中检测到完整的底部标识文本: "{target_text}"')
return True
except Exception as e:
logging.debug(f'页面源码检测失败: {e}')
# 检查页面滚动位置(仅用于调试信息)
try:
current_position = self.driver.execute_script("return window.pageYOffset;")
page_height = self.driver.execute_script("return document.body.scrollHeight;")
window_height = self.driver.execute_script("return window.innerHeight;")
distance_to_bottom = page_height - (current_position + window_height)
logging.debug(f'滚动状态: 当前位置={current_position}, 页面高度={page_height}, 窗口高度={window_height}, 距离底部={distance_to_bottom}px')
# 即使滚动到底部,也不停止滑动,除非检测到目标文本
if distance_to_bottom <= 10:
logging.debug(f'已滚动到页面底部,但未检测到"{target_text}"文本,继续滑动')
except Exception as e:
logging.debug(f'检查滚动位置失败: {e}')
# 只有检测到"暂时没有更多评论"文本才返回True否则继续滑动
logging.debug(f'未检测到"{target_text}"文本,继续滑动')
return False
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': f'检测评论区底部,目标文本: "暂时没有更多评论"'
}
logging.warning(f'检测评论区底部失败: {error_details["error_type"]} - {error_details["error_message"]}')
logging.warning(f'详细错误信息: {error_details["traceback"]}')
logging.warning(f'错误上下文: {error_details["context"]}')
return False
def _extract_comments_from_network_logs(self, video_id: str) -> list:
"""
从网络日志中提取评论数据
Args:
video_id: 视频ID
Returns:
list: 评论数据列表
"""
comments = []
try:
# 获取网络请求日志
logs = self.driver.get_log('performance')
for entry in logs:
try:
log = json.loads(entry['message'])['message']
if (
'Network.responseReceived' in log['method']
and 'response' in log['params']
and log['params']['response']
and log['params']['response'].get('url')
):
url = log['params']['response']['url']
# 检查是否是评论API
if '/aweme/v1/web/comment/list/' in url and video_id in url:
try:
# 获取响应体
response_body = self.driver.execute_cdp_cmd(
'Network.getResponseBody',
{'requestId': log['params']['requestId']}
)
if response_body and 'body' in response_body:
data = json.loads(response_body['body'])
api_comments = data.get('comments', [])
for comment in api_comments:
comment_info = {
'text': comment.get('text', ''),
'user_name': comment.get('user', {}).get('nickname', ''),
'digg_count': int(comment.get('digg_count', 0)),
'create_time': comment.get('create_time', 0)
}
comments.append(comment_info)
# 记录API URL信息用于调试
if api_comments:
logging.debug(f'从API获取到 {len(api_comments)} 条评论: {url}')
except Exception as e:
logging.debug(f'解析评论API响应失败: {e}')
continue
except Exception as e:
continue
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': f'提取网络日志评论数据视频ID: {video_id}, 已处理评论数: {len(comments)}'
}
logging.warning(f'提取网络日志评论数据失败: {error_details["error_type"]} - {error_details["error_message"]}')
logging.warning(f'详细错误信息: {error_details["traceback"]}')
logging.warning(f'错误上下文: {error_details["context"]}')
return comments
def get_video_details(self, video_id: str, mix_name: str = '', mix_id: str = '', document_id=None, episode_number: int = 0) -> dict:
"""获取单个视频的详细互动数据
Args:
video_id: 视频ID
Returns:
dict: 包含点赞数、收藏数、转发数、评论内容的字典
"""
# 确保 episode_number 是整数类型
try:
episode_number = int(episode_number)
except (ValueError, TypeError):
logging.error(f'get_video_details: episode_number 类型转换失败: {episode_number}, 类型: {type(episode_number)}')
episode_number = 0
video_details = {
'video_id': video_id,
'likes': 0,
'shares': 0,
'favorites': 0,
'likes_formatted': '0',
'shares_formatted': '0',
'favorites_formatted': '0',
'comments': [],
'success': False,
'error': None
}
# 添加互动数据保存标记,避免重复保存
interaction_data_saved = False
# 检查是否应该跳过详细数据获取(仅在定时器模式下跳过)
if os.environ.get('AUTO_CONTINUE') == '1':
logging.info(f'🚀 定时器模式:跳过视频 {video_id} 的详细数据获取(点赞、收藏、分享、评论)')
video_details['success'] = True
video_details['error'] = '定时器模式:跳过详细数据获取'
return video_details
logging.info(f'🔍 get_video_details 被调用: video_id={video_id}')
try:
# 确保driver已初始化
if self.driver is None:
logging.info('Driver未初始化正在设置...')
self.setup_driver()
if self.driver is None:
raise Exception("无法初始化WebDriver")
video_url = f'https://www.douyin.com/video/{video_id}'
logging.info(f'获取视频详细数据: {video_url}')
# 导航到视频页面
self.driver.get(video_url)
time.sleep(3)
# 等待页面加载完成
try:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
WebDriverWait(self.driver, 10).until(
EC.presence_of_element_located((By.TAG_NAME, "video"))
)
except Exception as e:
logging.warning(f'等待视频元素超时: {e}')
# 首先获取页面加载时的网络请求日志(关键修复)
logging.info(f'获取页面加载时的网络日志以捕获视频详情API')
initial_logs = self.driver.get_log('performance')
# 解析初始网络日志获取视频详细数据cc
for entry in initial_logs:
try:
log = json.loads(entry['message'])['message']
if (
'Network.responseReceived' in log['method']
and 'response' in log['params']
and log['params']['response']
and log['params']['response'].get('url')
):
url = log['params']['response']['url']
# 检查是否是视频详情API
if '/aweme/v1/web/aweme/detail/' in url and video_id in url:
try:
# 获取响应体
response_body = self.driver.execute_cdp_cmd(
'Network.getResponseBody',
{'requestId': log['params']['requestId']}
)
if response_body and 'body' in response_body:
data = json.loads(response_body['body'])
aweme_detail = data.get('aweme_detail', {})
if aweme_detail:
# 获取统计数据
statistics = aweme_detail.get('statistics', {})
video_details['likes'] = int(statistics.get('digg_count', 0))
video_details['shares'] = int(statistics.get('share_count', 0))
video_details['favorites'] = int(statistics.get('collect_count', 0))
# 添加格式化字段
video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])
logging.info(f'从初始网络日志获取视频 {video_id} 互动数据: 点赞={video_details["likes_formatted"]}, 分享={video_details["shares_formatted"]}, 收藏={video_details["favorites_formatted"]}')
# 实时保存互动数据(仅在首次获取时保存)
if document_id and episode_number and not interaction_data_saved:
interaction_data = {
'likes': video_details['likes'],
'likes_formatted': video_details['likes_formatted'],
'shares': video_details['shares'],
'shares_formatted': video_details['shares_formatted'],
'favorites': video_details['favorites'],
'favorites_formatted': video_details['favorites_formatted']
}
self.update_video_comments_realtime(document_id, episode_number, None, mix_name, mix_id, interaction_data)
interaction_data_saved = True
break
except Exception as e:
logging.warning(f'解析初始视频详情API响应失败: {e}')
continue
except Exception as e:
continue
# 启动滑动机制加载更多评论
logging.info(f'开始为视频 {video_id} 启动滑动机制加载评论')
scrolled_comments = self._simulate_comment_scrolling(video_id, max_scroll_attempts=15, scroll_delay=2.0,
document_id=document_id, episode_number=episode_number, mix_name=mix_name, mix_id=mix_id, max_comments=100)
# 如果滑动机制获取到评论,直接使用
if scrolled_comments:
video_details['comments'] = scrolled_comments
logging.info(f'滑动机制成功获取 {len(video_details["comments"])} 条评论')
# 获取滑动后的网络请求日志(用于评论数据)
logs = self.driver.get_log('performance')
# 解析滑动后的网络日志获取评论数据(作为滑动机制的补充)
for entry in logs:
try:
log = json.loads(entry['message'])['message']
if (
'Network.responseReceived' in log['method']
and 'response' in log['params']
and log['params']['response']
and log['params']['response'].get('url')
):
url = log['params']['response']['url']
# 只处理评论API视频详情API已在初始阶段处理
if '/aweme/v1/web/comment/list/' in url and video_id in url and not video_details['comments']:
try:
# 获取响应体
response_body = self.driver.execute_cdp_cmd(
'Network.getResponseBody',
{'requestId': log['params']['requestId']}
)
if response_body and 'body' in response_body:
data = json.loads(response_body['body'])
comments = data.get('comments', [])
# 只有在滑动机制没有获取到评论时才使用这个方法
if not video_details['comments']:
for comment in comments:
comment_info = {
'text': comment.get('text', ''),
'user_name': comment.get('user', {}).get('nickname', ''),
'digg_count': int(comment.get('digg_count', 0)),
'create_time': comment.get('create_time', 0)
}
video_details['comments'].append(comment_info)
logging.info(f'备用方案获取到 {len(comments)} 条评论')
logging.info(f'评论API URL: {url}')
except Exception as e:
logging.warning(f'解析评论API响应失败: {e}')
continue
except Exception as e:
continue
# 如果网络日志没有获取到数据,尝试页面解析
if video_details['likes'] == 0 and video_details['shares'] == 0 and video_details['favorites'] == 0:
video_details = self._parse_video_details_from_page(video_id, video_details, document_id, episode_number, mix_name, mix_id, interaction_data_saved)
video_details['success'] = True
return video_details
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': f'获取视频详细数据视频ID: {video_id}'
}
error_msg = f'获取视频 {video_id} 详细数据失败: {error_details["error_type"]} - {error_details["error_message"]}'
logging.error(error_msg)
logging.error(f'详细错误信息: {error_details["traceback"]}')
logging.error(f'错误上下文: {error_details["context"]}')
video_details['error'] = error_msg
return video_details
def _parse_video_details_from_page(self, video_id: str, video_details: dict, document_id: str = None, episode_number: int = 0, mix_name: str = "", mix_id: str = "", interaction_data_saved: bool = False) -> dict:
"""从页面元素解析视频详细数据(备用方案)
Args:
video_id: 视频ID
video_details: 现有的视频详细数据字典
document_id: 文档ID
episode_number: 集数
mix_name: 合集名称
mix_id: 合集ID,用于实时保存互动数据
interaction_data_saved: 互动数据是否已保存
Returns:
dict: 更新后的视频详细数据字典
"""
try:
logging.info(f'尝试从页面元素解析视频 {video_id} 的详细数据')
# 尝试解析页面中的SSR数据
try:
# 查找包含视频数据的script标签
scripts = self.driver.find_elements("tag name", "script")
for script in scripts:
script_content = script.get_attribute('innerHTML')
if script_content and ('window._SSR_HYDRATED_DATA' in script_content or 'RENDER_DATA' in script_content):
# 提取JSON数据
if 'window._SSR_HYDRATED_DATA' in script_content:
match = re.search(r'window\._SSR_HYDRATED_DATA\s*=\s*({.*?});', script_content, re.DOTALL)
else:
match = re.search(r'window\.RENDER_DATA\s*=\s*({.*?});', script_content, re.DOTALL)
if match:
data = json.loads(match.group(1))
# 查找视频详情数据
def find_video_data(obj, target_id):
if isinstance(obj, dict):
for key, value in obj.items():
if key == 'aweme_id' and str(value) == str(target_id):
return obj
elif isinstance(value, (dict, list)):
result = find_video_data(value, target_id)
if result:
return result
elif isinstance(obj, list):
for item in obj:
result = find_video_data(item, target_id)
if result:
return result
return None
video_data = find_video_data(data, video_id)
if video_data:
statistics = video_data.get('statistics', {})
video_details['likes'] = int(statistics.get('digg_count', 0))
video_details['shares'] = int(statistics.get('share_count', 0))
video_details['favorites'] = int(statistics.get('collect_count', 0))
# 添加格式化字段
video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])
logging.info(f'从SSR数据解析到视频 {video_id} 互动数据: 点赞={video_details["likes_formatted"]}, 分享={video_details["shares_formatted"]}, 收藏={video_details["favorites_formatted"]}')
# 实时保存互动数据(仅在首次获取时保存)
if document_id and episode_number and not interaction_data_saved:
interaction_data = {
'likes': video_details['likes'],
'likes_formatted': video_details['likes_formatted'],
'shares': video_details['shares'],
'shares_formatted': video_details['shares_formatted'],
'favorites': video_details['favorites'],
'favorites_formatted': video_details['favorites_formatted']
}
self.update_video_comments_realtime(document_id, episode_number, None, mix_name, mix_id, interaction_data)
interaction_data_saved = True
break
except Exception as e:
logging.warning(f'解析SSR数据失败: {e}')
# 如果SSR数据解析失败尝试CSS选择器
if video_details['likes'] == 0 and video_details['shares'] == 0 and video_details['favorites'] == 0:
try:
# 尝试常见的点赞、分享、收藏按钮选择器
selectors = {
'likes': [
'[data-e2e="video-like-count"]',
'[class*="like"] [class*="count"]',
'[class*="digg"] [class*="count"]'
],
'shares': [
'[data-e2e="video-share-count"]',
'[class*="share"] [class*="count"]'
],
'favorites': [
'[data-e2e="video-collect-count"]',
'[class*="collect"] [class*="count"]',
'[class*="favorite"] [class*="count"]'
]
}
for data_type, selector_list in selectors.items():
for selector in selector_list:
try:
elements = self.driver.find_elements("css selector", selector)
if elements:
text = elements[0].text.strip()
if text and text.replace('.', '').replace('万', '').replace('亿', '').isdigit():
# 转换数字格式
if '亿' in text:
video_details[data_type] = int(float(text.replace('亿', '')) * 100000000)
elif '万' in text:
video_details[data_type] = int(float(text.replace('万', '')) * 10000)
else:
video_details[data_type] = int(text)
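# 换算示例(示意):'1.2万' -> 12000'3亿' -> 300000000'6789' -> 6789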
break
except Exception:
continue
if video_details['likes'] > 0 or video_details['shares'] > 0 or video_details['favorites'] > 0:
# 添加格式化字段
video_details['likes_formatted'] = self.format_interaction_count(video_details['likes'])
video_details['shares_formatted'] = self.format_interaction_count(video_details['shares'])
video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites'])
logging.info(f'从页面元素解析到视频 {video_id} 互动数据: 点赞={video_details["likes_formatted"]}, 分享={video_details["shares_formatted"]}, 收藏={video_details["favorites_formatted"]}')
# 实时保存互动数据(仅在首次获取时保存)
if document_id and episode_number and not interaction_data_saved:
interaction_data = {
'likes': video_details['likes'],
'likes_formatted': video_details['likes_formatted'],
'shares': video_details['shares'],
'shares_formatted': video_details['shares_formatted'],
'favorites': video_details['favorites'],
'favorites_formatted': video_details['favorites_formatted']
}
self.update_video_comments_realtime(document_id, episode_number, None, mix_name, mix_id, interaction_data)
interaction_data_saved = True
except Exception as e:
import traceback
error_details = {
'error_type': type(e).__name__,
'error_message': str(e),
'traceback': traceback.format_exc(),
'context': f'CSS选择器解析视频互动数据视频ID: {video_id}'
}
logging.warning(f'CSS选择器解析失败: {error_details["error_type"]} - {error_details["error_message"]}')
logging.warning(f'详细错误信息: {error_details["traceback"]}')
logging.warning(f'错误上下文: {error_details["context"]}')
# 尝试获取评论(如果还没有获取到)
if not video_details['comments']:
try:
# 滚动到评论区域
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
# 尝试常见的评论选择器
comment_selectors = [
'[data-e2e="comment-item"]',
'[class*="comment-item"]',
'[class*="comment"] [class*="content"]'
]
for selector in comment_selectors:
try:
comment_elements = self.driver.find_elements("css selector", selector)
if comment_elements:
for element in comment_elements:
try:
comment_text = element.text.strip()
if comment_text:
comment_info = {
'text': comment_text,
'user_name': '',
'digg_count': 0,
'create_time': 0
}
video_details['comments'].append(comment_info)
except Exception:
continue
if video_details['comments']:
logging.info(f'从页面元素获取到视频 {video_id}{len(video_details["comments"])} 条评论')
break
except Exception:
continue
except Exception as e:
logging.warning(f'获取评论失败: {e}')
except Exception as e:
logging.warning(f'页面解析视频详细数据失败: {e}')
return video_details
def get_collection_video_details(self, episode_video_ids: list, mix_name: str = '', mix_id: str = '') -> list:
"""获取合集中所有视频的详细互动数据
Args:
episode_video_ids: 视频ID列表
mix_name: 合集名称,用于日志
Returns:
list: 包含每个视频详细数据的列表
"""
# 检查是否应该跳过此函数(仅在定时器模式下跳过)
if should_skip_function('get_collection_video_details'):
logging.info(f'🚀 定时器模式:跳过 get_collection_video_details 函数(合集视频详细数据获取)')
return []
if not episode_video_ids:
logging.info(f'合集 {mix_name} 没有视频ID跳过详细数据获取')
return []
logging.info(f'开始获取合集 {mix_name}{len(episode_video_ids)} 个视频的详细数据')
video_details_list = []
for i, video_id in enumerate(episode_video_ids, 1):
if not video_id:
logging.warning(f'合集 {mix_name}{i} 集视频ID为空跳过')
video_details_list.append({
'episode_number': i,
'video_id': '',
'likes': 0,
'shares': 0,
'favorites': 0,
'comments': [],
'success': False,
'error': '视频ID为空'
})
continue
logging.info(f'获取合集 {mix_name}{i}/{len(episode_video_ids)} 集视频详细数据: {video_id}')
try:
# 获取单个视频的详细数据
video_details = self.get_video_details(video_id, mix_name, mix_id)
video_details['episode_number'] = i
video_details_list.append(video_details)
# 添加随机延迟避免请求过快,模拟人类行为
random_delay = self.anti_detection.get_human_like_delay()
logging.info(f'🕐 视频间隔等待时间: {random_delay:.1f} 秒')
time.sleep(random_delay)
# exit(0)
except Exception as e:
error_msg = f'获取视频 {video_id} 详细数据时出错: {e}'
logging.error(error_msg)
video_details_list.append({
'episode_number': i,
'video_id': video_id,
'likes': 0,
'shares': 0,
'favorites': 0,
'comments': [],
'success': False,
'error': error_msg
})
# 统计获取结果
success_count = sum(1 for detail in video_details_list if detail.get('success', False))
total_likes = sum(detail.get('likes', 0) for detail in video_details_list)
total_comments = sum(len(detail.get('comments', [])) for detail in video_details_list)
logging.info(f'合集 {mix_name} 视频详细数据获取完成: {success_count}/{len(episode_video_ids)} 成功, 总点赞数={total_likes:,}, 总评论数={total_comments}')
return video_details_list
def get_cookies_dict(self):
"""获取当前页面的cookies"""
if not hasattr(self, 'cookies') or not self.cookies:
self.cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
return self.cookies
def _click_comment_area(self):
"""
点击评论区域,触发评论加载
"""
try:
# 尝试多种方式点击评论区域
comment_selectors = [
'[data-e2e="comment-list"]',
'[class*="comment-list"]',
'[class*="comment-container"]',
'[class*="comment-area"]',
'[class*="comment-section"]'
]
for selector in comment_selectors:
try:
elements = self.driver.find_elements("css selector", selector)
if elements and elements[0].is_displayed():
self.driver.execute_script("arguments[0].click();", elements[0])
logging.debug(f'成功点击评论区域: {selector}')
return
except:
continue
# 如果没有找到特定的评论区域,尝试点击页面中部
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.6);")
time.sleep(0.5)
except Exception as e:
logging.debug(f'点击评论区域失败: {e}')
    def _check_first_comment_button(self) -> bool:
        """
        Detect the "抢首评" (be-the-first-to-comment) prompt; if it is present, the video genuinely has no comments.
        Returns:
            bool: True if the prompt was detected (the video has no comments); False if it was not detected.
        """
        try:
            # Text snippets that indicate the "first comment" prompt
            first_comment_indicators = [
                "抢首评", "首评"
            ]
            logging.debug('Checking for the "抢首评" (first comment) prompt...')
            # Method 1: look for elements containing the indicator text
            for indicator in first_comment_indicators:
                try:
                    # XPath queries for elements containing the indicator text
                    xpath_selectors = [
                        f"//*[contains(text(), '{indicator}')]",
                        f"//button[contains(text(), '{indicator}')]",
                        f"//div[contains(text(), '{indicator}')]",
                        f"//span[contains(text(), '{indicator}')]"
                    ]
                    for xpath in xpath_selectors:
                        elements = self.driver.find_elements("xpath", xpath)
                        if elements:
                            for element in elements:
                                try:
                                    if element.is_displayed() and element.text.strip():
                                        logging.debug(f'Found first-comment text: "{element.text.strip()}"')
                                        return True
                                except Exception:
                                    continue
                except Exception:
                    continue
            # Method 2: check the placeholder text of the comment input box
            try:
                comment_input_selectors = [
                    'input[placeholder*="抢首评"]',
                    'textarea[placeholder*="抢首评"]',
                    '[data-e2e="comment-input"]',
                    '[class*="comment-input"]'
                ]
                for selector in comment_input_selectors:
                    elements = self.driver.find_elements("css selector", selector)
                    if elements:
                        for element in elements:
                            try:
                                placeholder = element.get_attribute('placeholder') or ''
                                if any(indicator in placeholder for indicator in first_comment_indicators):
                                    logging.debug(f'Found first-comment placeholder: "{placeholder}"')
                                    return True
                            except Exception:
                                continue
            except Exception:
                pass
            # Method 3: check whether the comment area shows an empty state
            try:
                empty_comment_selectors = [
                    '[class*="empty"]',
                    '[class*="no-comment"]',
                    '[class*="comment-empty"]',
                    '[data-e2e="comment-empty"]'
                ]
                for selector in empty_comment_selectors:
                    elements = self.driver.find_elements("css selector", selector)
                    if elements:
                        for element in elements:
                            try:
                                if element.is_displayed():
                                    text = element.text.strip()
                                    if any(indicator in text for indicator in first_comment_indicators):
                                        logging.debug(f'Found empty-comment state: "{text}"')
                                        return True
                            except Exception:
                                continue
            except Exception:
                pass
            logging.debug('No first-comment prompt or related marker detected')
            return False
        except Exception as e:
            logging.debug(f'Error while checking for the first-comment prompt: {e}')
            return False
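    # Sketch of the pattern shared by the three detection passes above: probe a set of selectors
    # and report whether any visible match satisfies a predicate. Purely illustrative; the
    # production code keeps the passes separate so their debug messages stay distinct.
    def _any_visible_match_example(self, selectors, predicate) -> bool:
        for selector in selectors:
            try:
                for element in self.driver.find_elements("css selector", selector):
                    if element.is_displayed() and predicate(element):
                        return True
            except Exception:
                continue
        return False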
    def cleanup_old_management_data(self, days_to_keep: int = 7):
        """Clean up stale documents in the target database (Rankings_management): based on the
        last_updated field, keep only documents updated within the last `days_to_keep` days."""
        target_collection = self.collection  # collection chosen according to the current mode
        if target_collection is None:
            logging.warning('[cleanup] Target collection not initialized, skipping cleanup')
            return False
        try:
            # Compute the earliest timestamp to keep
            from datetime import timedelta
            cutoff_datetime = datetime.now() - timedelta(days=days_to_keep)
            # Count the documents that would be removed (based on last_updated)
            old_data_count = target_collection.count_documents({
                'last_updated': {'$lt': cutoff_datetime}
            })
            if old_data_count == 0:
                logging.info(f'[cleanup] Nothing to clean: no documents left un-updated for more than {days_to_keep} days')
                return True
            # Delete the stale documents
            delete_result = target_collection.delete_many({
                'last_updated': {'$lt': cutoff_datetime}
            })
            if delete_result.deleted_count > 0:
                logging.info(f'[cleanup] ✅ Removed {delete_result.deleted_count} stale documents from Rankings_management (kept data updated within the last {days_to_keep} days)')
                return True
            else:
                logging.warning('[cleanup] Cleanup ran but no documents were deleted')
                return False
        except Exception as e:
            logging.error(f'[cleanup] Failed to clean up stale Rankings_management data: {e}')
            return False
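    # Design note (sketch, not wired into run()): the same retention policy could be delegated
    # to MongoDB with a TTL index on last_updated, letting documents expire automatically instead
    # of being swept on every run. Whether that suits the deployment is an open assumption.
    def ensure_ttl_index_example(self, days_to_keep: int = 7):
        if self.collection is None:
            return
        # MongoDB removes expired documents in a background task once this index exists
        self.collection.create_index('last_updated', expireAfterSeconds=days_to_keep * 24 * 3600)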
    def run(self):
        try:
            # Clean up stale data (keep the most recent 7 days) before scraping
            self.cleanup_old_management_data(days_to_keep=7)
            self.setup_driver()
            self.navigate()
            self.ensure_login()
            self.trigger_loading()
            logging.info('=' * 60)
            logging.info('Starting unified data collection')
            logging.info('=' * 60)
            # Use the unified data collector
            collector = UnifiedDataCollector(self.driver, self.duration_s)
            collected_data = collector.collect_all_data()
            # Convert the collected items into the legacy format
            self.play_vv_items = []
            for item in collected_data:
                self.play_vv_items.append({
                    'play_vv': item.get('play_vv', 0),
                    'formatted': item.get('formatted', ''),
                    'url': item.get('url', ''),
                    'request_id': item.get('request_id', ''),
                    'mix_name': item.get('mix_name', ''),
                    'video_url': item.get('video_url', ''),
                    'mix_id': item.get('mix_id', ''),
                    'cover_image_url': item.get('cover_image_url', ''),
                    'cover_backup_urls': item.get('cover_backup_urls', []),
                    'series_author': item.get('series_author', ''),
                    'desc': item.get('desc', ''),
                    'updated_to_episode': item.get('updated_to_episode', 0),
                    'timestamp': item.get('timestamp', '')
                })
            logging.info(f'✅ Unified data collection finished: {len(self.play_vv_items)} collections')
            # The unified collector already deduplicates in real time, so no extra pass is needed
            logging.info('=' * 60)
            logging.info('Deduplication complete (handled in real time by the unified collector)')
            logging.info('=' * 60)
            logging.info('=' * 60)
            logging.info('Saving results')
            logging.info('=' * 60)
            self.save_results()
            logging.info('=' * 60)
            logging.info(f'✅ All done! Processed {len(self.play_vv_items)} collections')
            logging.info('=' * 60)
        except Exception as e:
            import traceback
            error_details = {
                'error_type': type(e).__name__,
                'error_message': str(e),
                'traceback': traceback.format_exc(),
                'context': 'main flow of the Douyin play_vv scraping task'
            }
            logging.error(f'Scraping task failed: {error_details["error_type"]} - {error_details["error_message"]}')
            logging.error(f'Traceback: {error_details["traceback"]}')
            logging.error(f'Context: {error_details["context"]}')
            raise  # re-raise so the caller can handle the failure
        finally:
            if self.driver:
                try:
                    self.driver.quit()
                except Exception:
                    pass
def apply_timer_config():
    """Apply the environment variables defined by the timer configuration."""
    try:
        # Apply the timer environment variables from config
        config.apply_timer_environment()
        # Log each environment variable that was set
        for key, value in config.TIMER_ENV_CONFIG.items():
            logging.info(f'Set environment variable: {key}={value}')
    except Exception as e:
        logging.warning(f'Failed to apply timer configuration: {e}')
def should_skip_function(function_name):
    """Check whether the given function should be skipped - only enabled in timer mode."""
    try:
        # The skip logic only applies when running in timer mode
        if os.environ.get('TIMER_MODE') == '1' and os.environ.get('AUTO_CONTINUE') == '1':
            skip_functions = config.get_skip_functions()
            return function_name in skip_functions
    except Exception as e:
        logging.warning(f'Failed to read skip-function configuration: {e}')
    return False
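# Usage sketch: how a caller would typically gate an optional step on should_skip_function().
# The step name passed in is illustrative only; the real skip list comes from
# config.get_skip_functions() and is not assumed here.
def run_step_unless_skipped_example(step_name, step_callable):
    if should_skip_function(step_name):
        logging.info(f'Skipping step {step_name} (timer-mode skip list)')
        return None
    return step_callable()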
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Selenium+CDP Douyin play_vv scraper')
    parser.add_argument('--url', default='https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation', help='URL of the favorites/collections list page')
    parser.add_argument('--auto', action='store_true', help='Continue automatically, skipping the press-Enter wait')
    parser.add_argument('--duration', type=int, default=60, help='How long to collect network responses (seconds)')
    parser.add_argument('--driver', help='Override the chromedriver path')
    parser.add_argument('--timer', action='store_true', help='Enable timer mode and apply the timer configuration from config.py')
    args = parser.parse_args()
    # Apply the timer configuration if timer mode is enabled
    if args.timer:
        apply_timer_config()
    if args.driver:
        os.environ['OVERRIDE_CHROMEDRIVER'] = args.driver
    # Note: the AUTO_CONTINUE environment variable is only set via apply_timer_config() in timer mode.
    # In normal mode no environment variables are set and every function runs as usual.
    print('=== Selenium+CDP Douyin play_vv scraper ===')
    scraper = DouyinPlayVVScraper(args.url, auto_continue=args.auto, duration_s=args.duration)
    scraper.run()
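# Example invocations (values are illustrative):
#   python rank_data_scraper.py --auto --duration 90
#   python rank_data_scraper.py --timer --driver /usr/local/bin/chromedriver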