优化并增强了评论内容的获取机制。
现在通过拦截网络日志中的抖音 API 响应来抓取评论内容;这种方式获取到的评论内容是随机的,因此加入了去重机制。针对评论过少和无评论的情况做了优化,并添加了监控超时限制:每条视频的评论抓取监控超时时间为 5 小时(因为有的视频评论数量非常大)。
This commit is contained in:
parent
64455034bb
commit
b26852b95f
@ -79,7 +79,6 @@ class DouyinPlayVVScraper:
|
|||||||
self.db = None
|
self.db = None
|
||||||
self.collection = None
|
self.collection = None
|
||||||
self.image_cache = {} # 图片ID到TOS链接的缓存映射 {image_id: tos_url}
|
self.image_cache = {} # 图片ID到TOS链接的缓存映射 {image_id: tos_url}
|
||||||
self.all_collected_comments = [] # 存储所有收集到的评论数据
|
|
||||||
|
|
||||||
# 实时存储相关属性
|
# 实时存储相关属性
|
||||||
self.batch_id = str(uuid.uuid4()) # 每次运行的唯一标识
|
self.batch_id = str(uuid.uuid4()) # 每次运行的唯一标识
|
||||||
@ -1771,6 +1770,12 @@ class DouyinPlayVVScraper:
|
|||||||
else:
|
else:
|
||||||
logging.warning(f'[增量更新] 第 {i} 集视频详细数据获取失败: {mix_name}')
|
logging.warning(f'[增量更新] 第 {i} 集视频详细数据获取失败: {mix_name}')
|
||||||
|
|
||||||
|
# 添加随机延迟避免请求过快,模拟人类行为
|
||||||
|
if i < len(episode_video_ids): # 不是最后一个视频时才延迟
|
||||||
|
random_delay = self.anti_detection.get_human_like_delay()
|
||||||
|
logging.info(f'🕐 [增量更新] 视频间隔等待时间: {random_delay:.1f}秒')
|
||||||
|
time.sleep(random_delay)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f'[增量更新] 处理第 {i} 集视频时出错: {mix_name} - {e}')
|
logging.error(f'[增量更新] 处理第 {i} 集视频时出错: {mix_name} - {e}')
|
||||||
continue
|
continue
|
||||||
@ -2037,8 +2042,8 @@ class DouyinPlayVVScraper:
|
|||||||
# 提交滑动任务
|
# 提交滑动任务
|
||||||
scroll_future = executor.submit(self._async_scroll_task_with_state, max_scroll_attempts, scroll_delay, shared_state)
|
scroll_future = executor.submit(self._async_scroll_task_with_state, max_scroll_attempts, scroll_delay, shared_state)
|
||||||
|
|
||||||
# 同时提交监控任务 - 监控任务会检测滑动任务状态
|
# 同时提交监控任务 - 监控任务会检测滑动任务状态(5小时超时)
|
||||||
monitor_future = executor.submit(self._async_monitor_task_with_state, video_id, collected_comment_ids, shared_state, 3600,
|
monitor_future = executor.submit(self._async_monitor_task_with_state, video_id, collected_comment_ids, shared_state, 18000,
|
||||||
document_id, episode_number, mix_name)
|
document_id, episode_number, mix_name)
|
||||||
|
|
||||||
# 等待两个任务完成
|
# 等待两个任务完成
|
||||||
@ -2049,10 +2054,90 @@ class DouyinPlayVVScraper:
|
|||||||
|
|
||||||
logging.info(f'评论滑动加载完成,共收集到 {len(all_comments)} 条评论')
|
logging.info(f'评论滑动加载完成,共收集到 {len(all_comments)} 条评论')
|
||||||
|
|
||||||
|
# 针对评论较少的情况,执行补丁机制确保不遗漏评论
|
||||||
|
# 当滑动次数较少(可能只滑动了2-3次就到底)但评论数量也较少时,
|
||||||
|
# 很可能存在页面上可见但未被网络日志捕获的评论
|
||||||
|
|
||||||
|
# 智能判断是否需要执行补丁机制
|
||||||
|
should_apply_patch = False
|
||||||
|
|
||||||
|
# 条件1: 评论数量较少(少于20条)
|
||||||
|
if len(all_comments) < 20:
|
||||||
|
should_apply_patch = True
|
||||||
|
logging.debug(f'评论数量较少({len(all_comments)}条),启用补丁机制')
|
||||||
|
|
||||||
|
# 条件2: 检查页面上是否有更多可见的评论元素
|
||||||
|
try:
|
||||||
|
visible_comment_count = self.driver.execute_script("""
|
||||||
|
var selectors = [
|
||||||
|
'[data-e2e="comment-item"]',
|
||||||
|
'[class*="comment-item"]',
|
||||||
|
'[class*="comment-content"]'
|
||||||
|
];
|
||||||
|
var totalCount = 0;
|
||||||
|
selectors.forEach(function(selector) {
|
||||||
|
var elements = document.querySelectorAll(selector);
|
||||||
|
elements.forEach(function(element) {
|
||||||
|
if (element.offsetParent !== null && element.textContent.trim().length > 2) {
|
||||||
|
totalCount++;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
return totalCount;
|
||||||
|
""")
|
||||||
|
|
||||||
|
if visible_comment_count > len(all_comments):
|
||||||
|
should_apply_patch = True
|
||||||
|
logging.debug(f'页面可见评论({visible_comment_count}条) > 已获取评论({len(all_comments)}条),启用补丁机制')
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logging.debug(f'检查页面可见评论数量失败: {e}')
|
||||||
|
# 如果检查失败,对于少量评论的情况仍然执行补丁
|
||||||
|
if len(all_comments) < 10:
|
||||||
|
should_apply_patch = True
|
||||||
|
|
||||||
|
patch_comments = []
|
||||||
|
if should_apply_patch:
|
||||||
|
logging.info('执行评论补丁机制...')
|
||||||
|
patch_comments = self._extract_comments_patch(video_id)
|
||||||
|
else:
|
||||||
|
logging.debug('无需执行补丁机制')
|
||||||
|
|
||||||
|
if patch_comments:
|
||||||
|
# 去重合并补丁评论
|
||||||
|
existing_texts = {comment.get('text', '') for comment in all_comments}
|
||||||
|
new_patch_comments = []
|
||||||
|
|
||||||
|
for patch_comment in patch_comments:
|
||||||
|
if patch_comment.get('text', '') not in existing_texts:
|
||||||
|
new_patch_comments.append(patch_comment)
|
||||||
|
existing_texts.add(patch_comment.get('text', ''))
|
||||||
|
|
||||||
|
if new_patch_comments:
|
||||||
|
all_comments.extend(new_patch_comments)
|
||||||
|
logging.info(f'补丁机制额外获取到 {len(new_patch_comments)} 条评论,总计 {len(all_comments)} 条评论')
|
||||||
|
|
||||||
|
# 如果有新的评论且启用了实时保存,更新数据库
|
||||||
|
if document_id and episode_number and new_patch_comments:
|
||||||
|
try:
|
||||||
|
self.update_video_comments_realtime(document_id, episode_number, new_patch_comments, mix_name)
|
||||||
|
logging.info(f'实时保存补丁评论到数据库: {len(new_patch_comments)} 条')
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f'实时保存补丁评论失败: {e}')
|
||||||
|
else:
|
||||||
|
logging.debug('补丁机制未发现新的评论')
|
||||||
|
else:
|
||||||
|
logging.debug('补丁机制未获取到任何评论')
|
||||||
|
|
||||||
# 保存评论到文件
|
# 保存评论到文件
|
||||||
if all_comments:
|
if all_comments:
|
||||||
self.save_comments_to_file(all_comments, video_id)
|
self.save_comments_to_file(all_comments, video_id)
|
||||||
|
|
||||||
|
# 添加随机停留时间,防止网页被爬取崩溃
|
||||||
|
rest_time = random.uniform(10, 20) # 10-20秒随机停留
|
||||||
|
logging.info(f'评论抓取完成,停留 {rest_time:.1f} 秒以保护网页稳定性...')
|
||||||
|
time.sleep(rest_time)
|
||||||
|
|
||||||
return all_comments
|
return all_comments
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -3099,8 +3184,10 @@ class DouyinPlayVVScraper:
|
|||||||
video_details['episode_number'] = i
|
video_details['episode_number'] = i
|
||||||
video_details_list.append(video_details)
|
video_details_list.append(video_details)
|
||||||
|
|
||||||
# 添加延迟避免请求过快
|
# 添加随机延迟避免请求过快,模拟人类行为
|
||||||
time.sleep(2)
|
random_delay = self.anti_detection.get_human_like_delay()
|
||||||
|
logging.info(f'🕐 视频间隔等待时间: {random_delay:.1f}秒')
|
||||||
|
time.sleep(random_delay)
|
||||||
# exit(0)
|
# exit(0)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@ -3132,6 +3219,297 @@ class DouyinPlayVVScraper:
|
|||||||
self.cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
|
self.cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
|
||||||
return self.cookies
|
return self.cookies
|
||||||
|
|
||||||
|
def _extract_comments_patch(self, video_id: str) -> list:
    """Scrape comments that are visible in the page DOM as a fallback.

    Intended for videos with few comments, where comments shown on the page
    may not have been captured from the network log. The method is
    best-effort: every failure is logged and whatever was collected so far
    is returned.

    Steps:
      1. Return an empty list immediately if ``_check_first_comment_button``
         reports that the video has no comments.
      2. Scroll to the comment section, then query a prioritized list of
         CSS selectors through ``self.driver`` and collect the text of every
         displayed element.
      3. Run an in-page JavaScript pass and merge any texts not seen yet.

    Args:
        video_id: Video identifier; only used in the log output.

    Returns:
        A list of dicts with the keys ``text``, ``user_name``,
        ``digg_count``, ``create_time`` and ``source``. Only ``text`` and
        ``source`` carry real data; the other fields are placeholders
        (``''`` / ``0``). Texts are unique within the returned list.
    """
    comments = []
    try:
        logging.info(f'开始执行评论抓取补丁机制,视频ID: {video_id}')

        # A "be the first to comment" prompt means there is nothing to scrape.
        if self._check_first_comment_button():
            logging.info('检测到"抢首评"按钮,确认视频没有评论,跳过补丁机制')
            return comments

        # Let the page settle, bring the comment area into view, settle again.
        time.sleep(1)
        self._scroll_to_comment_section()
        time.sleep(1)

        # CSS selectors ordered from most specific to most generic.
        comment_selectors = [
            # Selectors commonly used by Douyin (highest priority)
            '[data-e2e="comment-item"]',
            '[data-e2e="comment-list"] > div',
            '[class*="comment-item"]',
            '[class*="comment-content"]',

            # Selectors aimed at pages with only a few comments
            '[class*="comment"] [class*="content"]',
            '[class*="Comment"] [class*="content"]',
            '[class*="comment-text"]',
            '[class*="user-comment"]',

            # More generic comment selectors
            'div[class*="comment"]:not([class*="input"]):not([class*="button"]):not([class*="header"])',
            'li[class*="comment"]',
            '[role="listitem"][class*="comment"]',

            # Last-resort selectors
            '.comment-list > div',
            '.comment-container > div',
            '.comment-wrapper > div',

            # Deep selectors for pages with only a few comments
            'div[class*="comment"] p',
            'div[class*="comment"] span:not([class*="button"]):not([class*="icon"])',
            '[data-e2e*="comment"] div[class*="text"]',
        ]

        # UI labels that are not comments. Defined once here instead of once
        # per element; the order is kept because the labels are stripped
        # sequentially below.
        skip_texts = (
            '回复', '点赞', '举报', '删除', '编辑', '查看更多',
            '展开', '收起', '暂时没有更多评论', '加载中',
            '发布', '取消', '确定', '登录', '注册', '关注',
        )

        # Every text already present in ``comments`` (used for de-duplication
        # by both the selector pass and the JavaScript pass).
        collected_texts = set()

        for selector in comment_selectors:
            try:
                logging.debug(f'尝试选择器: {selector}')
                elements = self.driver.find_elements("css selector", selector)
                if not elements:
                    continue

                logging.debug(f'选择器 {selector} 找到 {len(elements)} 个元素')

                for element in elements:
                    try:
                        if not element.is_displayed():
                            continue

                        comment_text = element.text.strip()

                        # Drop empty/one-character texts and pure button labels.
                        if len(comment_text) < 2 or comment_text in skip_texts:
                            continue

                        # Strip UI labels embedded in the element text.
                        for skip_text in skip_texts:
                            comment_text = comment_text.replace(skip_text, '').strip()

                        # Skip texts that became too short or were already seen.
                        if len(comment_text) < 2 or comment_text in collected_texts:
                            continue

                        collected_texts.add(comment_text)
                        comments.append({
                            'text': comment_text,
                            'user_name': '',
                            'digg_count': 0,
                            'create_time': 0,
                            'source': f'patch_{selector}',  # records which selector matched
                        })
                        logging.debug(f'补丁提取评论: {comment_text[:30]}...')

                    except Exception as e:
                        # One broken element must not abort the whole selector.
                        logging.debug(f'处理评论元素失败: {e}')
                        continue

            except Exception as e:
                # An unsupported or failing selector is skipped.
                logging.debug(f'选择器 {selector} 执行失败: {e}')
                continue

        # Second pass: let the page itself collect comment texts.
        js_script = """
            var comments = [];
            var processedTexts = new Set();

            // 针对少量评论的优化选择器
            var selectors = [
                'div[class*="comment"]',
                'li[class*="comment"]',
                '[data-e2e*="comment"]',
                '[class*="comment-item"]',
                '[class*="comment-content"]'
            ];

            selectors.forEach(function(selector) {
                try {
                    var elements = document.querySelectorAll(selector);
                    elements.forEach(function(element) {
                        if (element.offsetParent === null) return; // 跳过不可见元素

                        var text = element.textContent.trim();
                        if (text.length < 2) return;

                        // 过滤按钮文本
                        var skipTexts = ['回复', '点赞', '举报', '删除', '编辑', '查看更多', '展开', '收起', '暂时没有更多评论', '关注'];
                        var isButtonText = skipTexts.some(function(skipText) {
                            return text === skipText;
                        });

                        if (isButtonText || processedTexts.has(text)) return;

                        // 清理文本
                        var cleanText = text;
                        skipTexts.forEach(function(skipText) {
                            cleanText = cleanText.replace(new RegExp(skipText, 'g'), '').trim();
                        });

                        if (cleanText.length >= 2 && !processedTexts.has(cleanText)) {
                            processedTexts.add(cleanText);
                            comments.push({
                                text: cleanText,
                                user_name: '',
                                digg_count: 0,
                                create_time: 0,
                                source: 'patch_js'
                            });
                        }
                    });
                } catch (e) {
                    console.log('JS选择器执行失败:', selector, e);
                }
            });

            return comments;
        """
        try:
            js_comments = self.driver.execute_script(js_script)

            if js_comments:
                # Merge only texts the selector pass has not produced, and
                # count what was really added so the log line is accurate.
                added_count = 0
                for js_comment in js_comments:
                    js_text = js_comment['text']
                    if js_text in collected_texts:
                        continue
                    collected_texts.add(js_text)
                    comments.append(js_comment)
                    added_count += 1

                logging.debug(f'JavaScript补丁额外提取 {added_count} 条评论')

        except Exception as e:
            logging.debug(f'JavaScript评论提取失败: {e}')

        if comments:
            logging.info(f'评论抓取补丁机制成功提取 {len(comments)} 条评论')
        else:
            logging.warning('评论抓取补丁机制未找到任何评论')

    except Exception as e:
        logging.error(f'评论抓取补丁机制执行失败: {e}')

    return comments
|
||||||
|
|
||||||
|
def _check_first_comment_button(self) -> bool:
    """Detect the "be the first to comment" prompt on the current page.

    Three probes are run in order through ``self.driver``; the first one
    that matches ends the check:

      1. a displayed element whose text contains one of the indicator
         strings,
      2. a comment input whose ``placeholder`` contains an indicator,
      3. a displayed "empty state" element whose text contains an indicator.

    Driver errors never propagate; they are logged at debug level and the
    check moves on to the next probe.

    Returns:
        bool: True when a prompt was detected (the video has no comments),
        False when nothing was detected or the check itself failed.
    """
    try:
        # Indicator texts of the "first comment" prompt.
        first_comment_indicators = ["抢首评", "首评"]

        logging.debug('检测"抢首评"按钮...')

        # Probe 1: visible text. A single "//*" query per indicator is enough:
        # it already returns every button/div/span a tag-specific query would.
        for indicator in first_comment_indicators:
            try:
                xpath = f"//*[contains(text(), '{indicator}')]"
                for element in self.driver.find_elements("xpath", xpath) or ():
                    try:
                        if element.is_displayed() and element.text.strip():
                            logging.debug(f'检测到抢首评相关文本: "{element.text.strip()}"')
                            return True
                    except Exception as e:
                        # Typically a stale element; try the next one.
                        logging.debug(f'检查抢首评文本元素失败: {e}')
                        continue
            except Exception as e:
                logging.debug(f'抢首评文本检测失败: {e}')
                continue

        # Probe 2: placeholder text of the comment input box.
        try:
            comment_input_selectors = [
                'input[placeholder*="抢首评"]',
                'textarea[placeholder*="抢首评"]',
                '[data-e2e="comment-input"]',
                '[class*="comment-input"]',
            ]

            for selector in comment_input_selectors:
                for element in self.driver.find_elements("css selector", selector) or ():
                    try:
                        placeholder = element.get_attribute('placeholder') or ''
                        if any(indicator in placeholder for indicator in first_comment_indicators):
                            logging.debug(f'检测到抢首评输入框占位符: "{placeholder}"')
                            return True
                    except Exception as e:
                        logging.debug(f'检查评论输入框元素失败: {e}')
                        continue
        except Exception as e:
            logging.debug(f'抢首评输入框检测失败: {e}')

        # Probe 3: empty-state element inside the comment area.
        try:
            empty_comment_selectors = [
                '[class*="empty"]',
                '[class*="no-comment"]',
                '[class*="comment-empty"]',
                '[data-e2e="comment-empty"]',
            ]

            for selector in empty_comment_selectors:
                for element in self.driver.find_elements("css selector", selector) or ():
                    try:
                        if not element.is_displayed():
                            continue
                        text = element.text.strip()
                        if any(indicator in text for indicator in first_comment_indicators):
                            logging.debug(f'检测到空评论状态: "{text}"')
                            return True
                    except Exception as e:
                        logging.debug(f'检查空评论状态元素失败: {e}')
                        continue
        except Exception as e:
            logging.debug(f'空评论状态检测失败: {e}')

        logging.debug('未检测到抢首评按钮或相关标识')
        return False

    except Exception as e:
        logging.debug(f'检测抢首评按钮时出错: {e}')
        return False
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
try:
|
try:
|
||||||
self.setup_driver()
|
self.setup_driver()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user