Merge commit 'acad6baadeaae0d5c5aa6836a32e7129f62c3d5e'
This commit is contained in:
commit
d15eb6997c
@ -3,9 +3,9 @@ import importlib
|
|||||||
|
|
||||||
# 数据库配置
|
# 数据库配置
|
||||||
MONGO_URI = "mongodb://localhost:27017"
|
MONGO_URI = "mongodb://localhost:27017"
|
||||||
# MONGO_DB_NAME = "Rankings"
|
MONGO_DB_NAME = "Rankings"
|
||||||
# MONGO_URI = "mongodb://mongouser:Jdei2243afN@172.16.0.6:27017,172.16.0.4:27017/test?replicaSet=cmgo-r6qkaern_0&authSource=admin"
|
# MONGO_URI = "mongodb://mongouser:Jdei2243afN@172.16.0.6:27017,172.16.0.4:27017/test?replicaSet=cmgo-r6qkaern_0&authSource=admin"
|
||||||
MONGO_DB_NAME = "kemeng_media"
|
# MONGO_DB_NAME = "kemeng_media"
|
||||||
|
|
||||||
# 应用配置
|
# 应用配置
|
||||||
APP_ENV = os.getenv('APP_ENV', 'development')
|
APP_ENV = os.getenv('APP_ENV', 'development')
|
||||||
|
|||||||
@ -2017,6 +2017,11 @@ class DouyinPlayVVScraper:
|
|||||||
logging.info(f'🚀 定时器模式:跳过视频 {video_id} 的评论滑动加载')
|
logging.info(f'🚀 定时器模式:跳过视频 {video_id} 的评论滑动加载')
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
# 首先检查视频是否真的没有评论(检测"抢首评"按钮)
|
||||||
|
if self._check_first_comment_button():
|
||||||
|
logging.info(f'检测到视频 {video_id} 没有评论(存在"抢首评"按钮),跳过评论抓取')
|
||||||
|
return []
|
||||||
|
|
||||||
all_comments = []
|
all_comments = []
|
||||||
collected_comment_ids = set()
|
collected_comment_ids = set()
|
||||||
|
|
||||||
@ -2059,42 +2064,43 @@ class DouyinPlayVVScraper:
|
|||||||
# 很可能存在页面上可见但未被网络日志捕获的评论
|
# 很可能存在页面上可见但未被网络日志捕获的评论
|
||||||
|
|
||||||
# 智能判断是否需要执行补丁机制
|
# 智能判断是否需要执行补丁机制
|
||||||
|
# 只在评论数量真正过少时才启用补丁机制
|
||||||
should_apply_patch = False
|
should_apply_patch = False
|
||||||
|
|
||||||
# 条件1: 评论数量较少(少于20条)
|
# 只有当评论数量少于10条时才启用补丁机制
|
||||||
if len(all_comments) < 20:
|
if len(all_comments) < 10:
|
||||||
should_apply_patch = True
|
should_apply_patch = True
|
||||||
logging.debug(f'评论数量较少({len(all_comments)}条),启用补丁机制')
|
logging.debug(f'评论数量过少({len(all_comments)}条),启用补丁机制')
|
||||||
|
|
||||||
# 条件2: 检查页面上是否有更多可见的评论元素
|
# 对于评论数量在10-50条之间的情况,检查是否可能遗漏了评论
|
||||||
try:
|
elif len(all_comments) <= 50:
|
||||||
visible_comment_count = self.driver.execute_script("""
|
try:
|
||||||
var selectors = [
|
visible_comment_count = self.driver.execute_script("""
|
||||||
'[data-e2e="comment-item"]',
|
var selectors = [
|
||||||
'[class*="comment-item"]',
|
'[data-e2e="comment-item"]',
|
||||||
'[class*="comment-content"]'
|
'[class*="comment-item"]',
|
||||||
];
|
'[class*="comment-content"]'
|
||||||
var totalCount = 0;
|
];
|
||||||
selectors.forEach(function(selector) {
|
var totalCount = 0;
|
||||||
var elements = document.querySelectorAll(selector);
|
selectors.forEach(function(selector) {
|
||||||
elements.forEach(function(element) {
|
var elements = document.querySelectorAll(selector);
|
||||||
if (element.offsetParent !== null && element.textContent.trim().length > 2) {
|
elements.forEach(function(element) {
|
||||||
totalCount++;
|
if (element.offsetParent !== null && element.textContent.trim().length > 2) {
|
||||||
}
|
totalCount++;
|
||||||
|
}
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
return totalCount;
|
||||||
return totalCount;
|
""")
|
||||||
""")
|
|
||||||
|
|
||||||
if visible_comment_count > len(all_comments):
|
|
||||||
should_apply_patch = True
|
|
||||||
logging.debug(f'页面可见评论({visible_comment_count}条) > 已获取评论({len(all_comments)}条),启用补丁机制')
|
|
||||||
|
|
||||||
except Exception as e:
|
# 只有当页面可见评论数量明显大于已获取数量时才启用补丁
|
||||||
logging.debug(f'检查页面可见评论数量失败: {e}')
|
if visible_comment_count > len(all_comments) * 2:
|
||||||
# 如果检查失败,对于少量评论的情况仍然执行补丁
|
should_apply_patch = True
|
||||||
if len(all_comments) < 10:
|
logging.debug(f'页面可见评论({visible_comment_count}条) >> 已获取评论({len(all_comments)}条),启用补丁机制')
|
||||||
should_apply_patch = True
|
|
||||||
|
except Exception as e:
|
||||||
|
logging.debug(f'检查页面可见评论数量失败: {e}')
|
||||||
|
# 检查失败时不启用补丁机制
|
||||||
|
|
||||||
patch_comments = []
|
patch_comments = []
|
||||||
if should_apply_patch:
|
if should_apply_patch:
|
||||||
@ -3221,12 +3227,12 @@ class DouyinPlayVVScraper:
|
|||||||
|
|
||||||
def _extract_comments_patch(self, video_id: str) -> list:
|
def _extract_comments_patch(self, video_id: str) -> list:
|
||||||
"""
|
"""
|
||||||
评论抓取补丁机制 - 强制从页面元素中提取所有可见评论
|
评论补丁机制 - 更仔细地重新从网络日志获取评论
|
||||||
针对评论较少但获取不全的情况,确保不遗漏任何页面上可见的评论
|
不再抓取页面元素,而是重新触发评论加载并从API获取
|
||||||
"""
|
"""
|
||||||
comments = []
|
comments = []
|
||||||
try:
|
try:
|
||||||
logging.info(f'开始执行评论抓取补丁机制,视频ID: {video_id}')
|
logging.info(f'启动补丁机制,重新仔细获取视频 {video_id} 的评论...')
|
||||||
|
|
||||||
# 首先检查是否存在"抢首评"按钮,如果存在说明视频确实没有评论
|
# 首先检查是否存在"抢首评"按钮,如果存在说明视频确实没有评论
|
||||||
if self._check_first_comment_button():
|
if self._check_first_comment_button():
|
||||||
@ -3234,189 +3240,165 @@ class DouyinPlayVVScraper:
|
|||||||
return comments
|
return comments
|
||||||
|
|
||||||
# 等待页面稳定
|
# 等待页面稳定
|
||||||
time.sleep(1)
|
time.sleep(2)
|
||||||
|
|
||||||
# 滚动到评论区域确保评论完全加载
|
# 滚动到评论区域确保评论完全加载
|
||||||
self._scroll_to_comment_section()
|
self._scroll_to_comment_section()
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
# 针对少量评论优化的选择器列表
|
# 点击评论区域,触发评论加载
|
||||||
comment_selectors = [
|
try:
|
||||||
# 抖音常用的评论选择器(优先级最高)
|
self._click_comment_area()
|
||||||
'[data-e2e="comment-item"]',
|
time.sleep(1)
|
||||||
'[data-e2e="comment-list"] > div',
|
except:
|
||||||
'[class*="comment-item"]',
|
pass
|
||||||
'[class*="comment-content"]',
|
|
||||||
|
|
||||||
# 针对少量评论的特殊选择器
|
|
||||||
'[class*="comment"] [class*="content"]',
|
|
||||||
'[class*="Comment"] [class*="content"]',
|
|
||||||
'[class*="comment-text"]',
|
|
||||||
'[class*="user-comment"]',
|
|
||||||
|
|
||||||
# 更通用的评论选择器
|
|
||||||
'div[class*="comment"]:not([class*="input"]):not([class*="button"]):not([class*="header"])',
|
|
||||||
'li[class*="comment"]',
|
|
||||||
'[role="listitem"][class*="comment"]',
|
|
||||||
|
|
||||||
# 备用选择器(用于兜底)
|
|
||||||
'.comment-list > div',
|
|
||||||
'.comment-container > div',
|
|
||||||
'.comment-wrapper > div',
|
|
||||||
|
|
||||||
# 针对少量评论的深度选择器
|
|
||||||
'div[class*="comment"] p',
|
|
||||||
'div[class*="comment"] span:not([class*="button"]):not([class*="icon"])',
|
|
||||||
'[data-e2e*="comment"] div[class*="text"]'
|
|
||||||
]
|
|
||||||
|
|
||||||
collected_texts = set() # 用于去重
|
# 清理旧的网络日志
|
||||||
|
self.driver.get_log('performance')
|
||||||
|
|
||||||
for selector in comment_selectors:
|
# 轻微滚动,触发更多评论加载
|
||||||
|
for i in range(3):
|
||||||
|
self.driver.execute_script("window.scrollBy(0, 200);")
|
||||||
|
time.sleep(0.5)
|
||||||
|
self.driver.execute_script("window.scrollBy(0, -100);")
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
|
# 等待网络请求完成
|
||||||
|
time.sleep(3)
|
||||||
|
|
||||||
|
# 重新从网络日志中提取评论(更仔细的方式)
|
||||||
|
patch_comments = self._extract_comments_from_network_logs_detailed(video_id)
|
||||||
|
|
||||||
|
if patch_comments:
|
||||||
|
logging.info(f'补丁机制成功重新获取 {len(patch_comments)} 条评论')
|
||||||
|
comments.extend(patch_comments)
|
||||||
|
else:
|
||||||
|
logging.info('补丁机制未找到额外评论')
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f'评论补丁机制执行失败: {e}')
|
||||||
|
|
||||||
|
return comments
|
||||||
|
|
||||||
|
def _extract_comments_from_network_logs_detailed(self, video_id: str) -> list:
|
||||||
|
"""
|
||||||
|
更详细地从网络日志中提取评论数据(补丁机制专用)
|
||||||
|
Args:
|
||||||
|
video_id: 视频ID
|
||||||
|
Returns:
|
||||||
|
list: 评论数据列表
|
||||||
|
"""
|
||||||
|
comments = []
|
||||||
|
try:
|
||||||
|
# 获取网络请求日志
|
||||||
|
logs = self.driver.get_log('performance')
|
||||||
|
|
||||||
|
for entry in logs:
|
||||||
try:
|
try:
|
||||||
logging.debug(f'尝试选择器: {selector}')
|
log = json.loads(entry['message'])['message']
|
||||||
elements = self.driver.find_elements("css selector", selector)
|
if (
|
||||||
|
'Network.responseReceived' in log['method']
|
||||||
if elements:
|
and 'response' in log['params']
|
||||||
logging.debug(f'选择器 {selector} 找到 {len(elements)} 个元素')
|
and log['params']['response']
|
||||||
|
and log['params']['response'].get('url')
|
||||||
|
):
|
||||||
|
url = log['params']['response']['url']
|
||||||
|
|
||||||
for element in elements:
|
# 检查是否是评论相关的API(更宽泛的匹配)
|
||||||
|
comment_api_patterns = [
|
||||||
|
'/aweme/v1/web/comment/list/',
|
||||||
|
'/comment/list/',
|
||||||
|
'/comment/detail/',
|
||||||
|
'/reply/list/'
|
||||||
|
]
|
||||||
|
|
||||||
|
is_comment_api = any(pattern in url for pattern in comment_api_patterns)
|
||||||
|
|
||||||
|
if is_comment_api and video_id in url:
|
||||||
try:
|
try:
|
||||||
if not element.is_displayed():
|
# 获取响应体
|
||||||
continue
|
response_body = self.driver.execute_cdp_cmd(
|
||||||
|
'Network.getResponseBody',
|
||||||
|
{'requestId': log['params']['requestId']}
|
||||||
|
)
|
||||||
|
|
||||||
|
if response_body and 'body' in response_body:
|
||||||
|
data = json.loads(response_body['body'])
|
||||||
|
|
||||||
# 获取评论文本
|
# 尝试多种可能的评论数据结构
|
||||||
comment_text = element.text.strip()
|
api_comments = []
|
||||||
|
|
||||||
# 过滤无效文本
|
|
||||||
if not comment_text or len(comment_text) < 2:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 过滤系统文本和按钮文本
|
# 标准结构
|
||||||
skip_texts = [
|
if 'comments' in data:
|
||||||
'回复', '点赞', '举报', '删除', '编辑', '查看更多',
|
api_comments = data['comments']
|
||||||
'展开', '收起', '暂时没有更多评论', '加载中',
|
# 备用结构
|
||||||
'发布', '取消', '确定', '登录', '注册', '关注'
|
elif 'comment_list' in data:
|
||||||
]
|
api_comments = data['comment_list']
|
||||||
|
elif 'data' in data and isinstance(data['data'], list):
|
||||||
# 检查是否为纯按钮文本
|
api_comments = data['data']
|
||||||
if comment_text in skip_texts:
|
elif 'data' in data and 'comments' in data['data']:
|
||||||
continue
|
api_comments = data['data']['comments']
|
||||||
|
|
||||||
# 清理包含按钮文本的评论
|
|
||||||
original_text = comment_text
|
|
||||||
for skip_text in skip_texts:
|
|
||||||
comment_text = comment_text.replace(skip_text, '').strip()
|
|
||||||
|
|
||||||
# 如果清理后文本太短,跳过
|
|
||||||
if len(comment_text) < 2:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 去重检查
|
|
||||||
if comment_text in collected_texts:
|
|
||||||
continue
|
|
||||||
|
|
||||||
collected_texts.add(comment_text)
|
for comment in api_comments:
|
||||||
|
if isinstance(comment, dict):
|
||||||
# 创建评论对象
|
comment_text = comment.get('text', '') or comment.get('content', '')
|
||||||
comment_info = {
|
if comment_text and len(comment_text.strip()) > 0:
|
||||||
'text': comment_text,
|
comment_info = {
|
||||||
'user_name': '',
|
'text': comment_text.strip(),
|
||||||
'digg_count': 0,
|
'user_name': comment.get('user', {}).get('nickname', '') if comment.get('user') else '',
|
||||||
'create_time': 0,
|
'digg_count': int(comment.get('digg_count', 0) or comment.get('like_count', 0)),
|
||||||
'source': f'patch_{selector}' # 标记来源
|
'create_time': comment.get('create_time', 0) or comment.get('timestamp', 0),
|
||||||
}
|
'source': 'patch_api'
|
||||||
|
}
|
||||||
comments.append(comment_info)
|
comments.append(comment_info)
|
||||||
logging.debug(f'补丁提取评论: {comment_text[:30]}...')
|
|
||||||
|
# 记录API URL信息,用于调试
|
||||||
|
if api_comments:
|
||||||
|
logging.debug(f'补丁机制从API获取到 {len(api_comments)} 条评论: {url}')
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.debug(f'处理评论元素失败: {e}')
|
logging.debug(f'补丁机制处理响应体失败: {e}')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.debug(f'选择器 {selector} 执行失败: {e}')
|
logging.debug(f'补丁机制处理日志条目失败: {e}')
|
||||||
|
continue
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f'补丁机制从网络日志提取评论失败: {e}')
|
||||||
|
|
||||||
|
return comments
|
||||||
|
|
||||||
|
def _click_comment_area(self):
|
||||||
|
"""
|
||||||
|
点击评论区域,触发评论加载
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# 尝试多种方式点击评论区域
|
||||||
|
comment_selectors = [
|
||||||
|
'[data-e2e="comment-list"]',
|
||||||
|
'[class*="comment-list"]',
|
||||||
|
'[class*="comment-container"]',
|
||||||
|
'[class*="comment-area"]',
|
||||||
|
'[class*="comment-section"]'
|
||||||
|
]
|
||||||
|
|
||||||
|
for selector in comment_selectors:
|
||||||
|
try:
|
||||||
|
elements = self.driver.find_elements("css selector", selector)
|
||||||
|
if elements and elements[0].is_displayed():
|
||||||
|
self.driver.execute_script("arguments[0].click();", elements[0])
|
||||||
|
logging.debug(f'成功点击评论区域: {selector}')
|
||||||
|
return
|
||||||
|
except:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 使用JavaScript进一步搜索评论文本(针对少量评论优化)
|
# 如果没有找到特定的评论区域,尝试点击页面中部
|
||||||
try:
|
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.6);")
|
||||||
js_comments = self.driver.execute_script("""
|
time.sleep(0.5)
|
||||||
var comments = [];
|
|
||||||
var processedTexts = new Set();
|
|
||||||
|
|
||||||
// 针对少量评论的优化选择器
|
|
||||||
var selectors = [
|
|
||||||
'div[class*="comment"]',
|
|
||||||
'li[class*="comment"]',
|
|
||||||
'[data-e2e*="comment"]',
|
|
||||||
'[class*="comment-item"]',
|
|
||||||
'[class*="comment-content"]'
|
|
||||||
];
|
|
||||||
|
|
||||||
selectors.forEach(function(selector) {
|
|
||||||
try {
|
|
||||||
var elements = document.querySelectorAll(selector);
|
|
||||||
elements.forEach(function(element) {
|
|
||||||
if (element.offsetParent === null) return; // 跳过不可见元素
|
|
||||||
|
|
||||||
var text = element.textContent.trim();
|
|
||||||
if (text.length < 2) return;
|
|
||||||
|
|
||||||
// 过滤按钮文本
|
|
||||||
var skipTexts = ['回复', '点赞', '举报', '删除', '编辑', '查看更多', '展开', '收起', '暂时没有更多评论', '关注'];
|
|
||||||
var isButtonText = skipTexts.some(function(skipText) {
|
|
||||||
return text === skipText;
|
|
||||||
});
|
|
||||||
|
|
||||||
if (isButtonText || processedTexts.has(text)) return;
|
|
||||||
|
|
||||||
// 清理文本
|
|
||||||
var cleanText = text;
|
|
||||||
skipTexts.forEach(function(skipText) {
|
|
||||||
cleanText = cleanText.replace(new RegExp(skipText, 'g'), '').trim();
|
|
||||||
});
|
|
||||||
|
|
||||||
if (cleanText.length >= 2 && !processedTexts.has(cleanText)) {
|
|
||||||
processedTexts.add(cleanText);
|
|
||||||
comments.push({
|
|
||||||
text: cleanText,
|
|
||||||
user_name: '',
|
|
||||||
digg_count: 0,
|
|
||||||
create_time: 0,
|
|
||||||
source: 'patch_js'
|
|
||||||
});
|
|
||||||
}
|
|
||||||
});
|
|
||||||
} catch (e) {
|
|
||||||
console.log('JS选择器执行失败:', selector, e);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
return comments;
|
|
||||||
""")
|
|
||||||
|
|
||||||
if js_comments:
|
|
||||||
# 去重合并
|
|
||||||
existing_texts = {comment['text'] for comment in comments}
|
|
||||||
for js_comment in js_comments:
|
|
||||||
if js_comment['text'] not in existing_texts:
|
|
||||||
comments.append(js_comment)
|
|
||||||
existing_texts.add(js_comment['text'])
|
|
||||||
|
|
||||||
logging.debug(f'JavaScript补丁额外提取 {len(js_comments)} 条评论')
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logging.debug(f'JavaScript评论提取失败: {e}')
|
|
||||||
|
|
||||||
if comments:
|
|
||||||
logging.info(f'评论抓取补丁机制成功提取 {len(comments)} 条评论')
|
|
||||||
else:
|
|
||||||
logging.warning('评论抓取补丁机制未找到任何评论')
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f'评论抓取补丁机制执行失败: {e}')
|
logging.debug(f'点击评论区域失败: {e}')
|
||||||
|
|
||||||
return comments
|
|
||||||
|
|
||||||
def _check_first_comment_button(self) -> bool:
|
def _check_first_comment_button(self) -> bool:
|
||||||
"""
|
"""
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user