测试完成,现在的代码运行都是比较稳定的。
This commit is contained in:
parent
d4d555cdb1
commit
18efb25133
@ -87,13 +87,6 @@ class DouyinAutoScheduler:
|
|||||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||||
profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent')
|
profile_dir = os.path.join(script_dir, 'config', 'chrome_profile_timer', 'douyin_persistent')
|
||||||
|
|
||||||
# # 检查配置文件目录是否存在
|
|
||||||
# if not os.path.exists(profile_dir):
|
|
||||||
# print("⚠️ 检测到定时器浏览器配置目录不存在,需要首次登录")
|
|
||||||
# print(" 请在浏览器中完成抖音登录,并导航到【我的】→【收藏】→【合集】页面")
|
|
||||||
# print(" 完成后按回车键继续...")
|
|
||||||
# input()
|
|
||||||
# return
|
|
||||||
|
|
||||||
# 检查配置文件是否为空(可能未登录)
|
# 检查配置文件是否为空(可能未登录)
|
||||||
import glob
|
import glob
|
||||||
@ -178,7 +171,7 @@ class DouyinAutoScheduler:
|
|||||||
scraper = DouyinPlayVVScraper(
|
scraper = DouyinPlayVVScraper(
|
||||||
start_url="https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation",
|
start_url="https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation",
|
||||||
auto_continue=True,
|
auto_continue=True,
|
||||||
duration_s=180 # 增加到180秒,给更多时间收集数据
|
duration_s=60 # 增加到60秒,给更多时间收集数据
|
||||||
)
|
)
|
||||||
|
|
||||||
print("开始执行抓取任务...")
|
print("开始执行抓取任务...")
|
||||||
|
|||||||
@ -147,9 +147,17 @@
|
|||||||
{
|
{
|
||||||
"video_id": "7567050545257516331",
|
"video_id": "7567050545257516331",
|
||||||
"episode_num": 0
|
"episode_num": 0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"video_id": "7568152326477942022",
|
||||||
|
"episode_num": 0
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"video_id": "7569217928420183332",
|
||||||
|
"episode_num": 0
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"total_count": 37,
|
"total_count": 39,
|
||||||
"last_update": "2025-10-31T09:50:18.533027",
|
"last_update": "2025-11-06T11:06:44.598400",
|
||||||
"mix_name": "末世系列"
|
"mix_name": "末世系列"
|
||||||
}
|
}
|
||||||
@ -686,22 +686,32 @@ class DouyinPlayVVScraper:
|
|||||||
# 在auto_continue模式下增加页面加载等待时间
|
# 在auto_continue模式下增加页面加载等待时间
|
||||||
if self.auto_continue:
|
if self.auto_continue:
|
||||||
logging.info('自动继续模式:增加页面加载等待时间')
|
logging.info('自动继续模式:增加页面加载等待时间')
|
||||||
time.sleep(8) # 等待页面完全加载
|
time.sleep(10) # 增加到10秒,确保页面完全加载
|
||||||
else:
|
else:
|
||||||
# 普通模式也需要增加页面加载等待时间
|
# 普通模式也需要增加页面加载等待时间
|
||||||
logging.info('普通模式:增加页面加载等待时间')
|
logging.info('普通模式:增加页面加载等待时间')
|
||||||
time.sleep(8) # 等待页面完全加载
|
time.sleep(10) # 增加到10秒,确保页面完全加载
|
||||||
|
|
||||||
# 滚动触发懒加载
|
# 第一轮滚动:触发懒加载
|
||||||
for i in range(8):
|
logging.info('第一轮滚动:触发懒加载')
|
||||||
|
for i in range(10): # 增加滚动次数
|
||||||
self.driver.execute_script(f'window.scrollTo(0, {i * 900});')
|
self.driver.execute_script(f'window.scrollTo(0, {i * 900});')
|
||||||
time.sleep(1.2)
|
time.sleep(1.5) # 增加等待时间
|
||||||
|
|
||||||
|
# 等待数据加载
|
||||||
|
logging.info('等待数据加载...')
|
||||||
|
time.sleep(5)
|
||||||
|
|
||||||
# 刷新触发新请求
|
# 刷新触发新请求
|
||||||
|
logging.info('刷新页面触发新请求')
|
||||||
self.driver.refresh()
|
self.driver.refresh()
|
||||||
time.sleep(4)
|
time.sleep(6) # 增加刷新后的等待时间
|
||||||
for i in range(6):
|
|
||||||
|
# 第二轮滚动:确保所有数据加载
|
||||||
|
logging.info('第二轮滚动:确保所有数据加载')
|
||||||
|
for i in range(8):
|
||||||
self.driver.execute_script(f'window.scrollTo(0, {i * 1200});')
|
self.driver.execute_script(f'window.scrollTo(0, {i * 1200});')
|
||||||
time.sleep(1.3)
|
time.sleep(1.5)
|
||||||
|
|
||||||
def format_count(self, n: int) -> str:
|
def format_count(self, n: int) -> str:
|
||||||
if n >= 100_000_000:
|
if n >= 100_000_000:
|
||||||
@ -794,41 +804,19 @@ class DouyinPlayVVScraper:
|
|||||||
mix_name = obj.get('mix_name', '')
|
mix_name = obj.get('mix_name', '')
|
||||||
statis = obj.get('statis', {})
|
statis = obj.get('statis', {})
|
||||||
|
|
||||||
# 调试:输出包含mix_id的完整对象结构(仅输出前3个)
|
|
||||||
if len(self.play_vv_items) < 3:
|
|
||||||
logging.info(f"=== 调试:合集对象结构 ===")
|
|
||||||
logging.info(f"完整对象键: {list(obj.keys())}")
|
|
||||||
# 查找可能的视频相关字段和新增字段
|
|
||||||
for key, value in obj.items():
|
|
||||||
if 'aweme' in key.lower() or 'video' in key.lower() or 'item' in key.lower() or 'ids' in key.lower():
|
|
||||||
logging.info(f"可能的视频字段 {key}: {type(value)} - {str(value)[:200]}")
|
|
||||||
# 检查新增字段相关的键
|
|
||||||
elif any(keyword in key.lower() for keyword in ['author', 'creator', 'user', 'desc', 'description', 'total', 'count', 'episode']):
|
|
||||||
logging.info(f"可能的新字段 {key}: {type(value)} - {str(value)[:200]}")
|
|
||||||
|
|
||||||
# 特别检查ids字段
|
|
||||||
if 'ids' in obj:
|
|
||||||
ids_value = obj['ids']
|
|
||||||
logging.info(f"ids字段详细信息: {type(ids_value)} - {ids_value}")
|
|
||||||
if isinstance(ids_value, list) and len(ids_value) > 0:
|
|
||||||
logging.info(f"ids列表长度: {len(ids_value)}")
|
|
||||||
logging.info(f"第一个ID: {ids_value[0]}")
|
|
||||||
if len(ids_value) > 1:
|
|
||||||
logging.info(f"第二个ID: {ids_value[1]}")
|
|
||||||
|
|
||||||
if isinstance(statis, dict) and 'play_vv' in statis:
|
if isinstance(statis, dict) and 'play_vv' in statis:
|
||||||
play_vv = statis.get('play_vv')
|
play_vv = statis.get('play_vv')
|
||||||
if isinstance(play_vv, (int, str)) and str(play_vv).isdigit():
|
if isinstance(play_vv, (int, str)) and str(play_vv).isdigit():
|
||||||
vv = int(play_vv)
|
vv = int(play_vv)
|
||||||
|
|
||||||
# 数据验证:确保播放量大于0且合集名称不为空
|
# 数据验证:确保合集名称不为空
|
||||||
if vv <= 0:
|
if not mix_name or mix_name.strip() == "":
|
||||||
logging.warning(f"跳过无效的播放量数据: mix_name={mix_name}, play_vv={vv}")
|
logging.warning(f"跳过缺少合集名称的数据: play_vv={vv}, mix_id={mix_id}")
|
||||||
return
|
return
|
||||||
|
|
||||||
if not mix_name or mix_name.strip() == "":
|
# 🔧 修复:不跳过播放量为0的数据,而是标记并保留
|
||||||
logging.warning(f"跳过缺少合集名称的数据: play_vv={vv}")
|
# 这些数据可能是因为页面加载不完整,但合集本身是存在的
|
||||||
return
|
# 警告信息移到去重检查之后,只有真正添加时才警告
|
||||||
|
|
||||||
# 构建合集链接
|
# 构建合集链接
|
||||||
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
|
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
|
||||||
@ -994,12 +982,40 @@ class DouyinPlayVVScraper:
|
|||||||
'timestamp': datetime.now().isoformat()
|
'timestamp': datetime.now().isoformat()
|
||||||
}
|
}
|
||||||
|
|
||||||
# 添加到列表(保持原有逻辑)
|
# 🔧 修复:添加前检查是否已存在(避免重复)
|
||||||
self.play_vv_items.append(item_data)
|
# 检查是否已经有相同mix_id的数据
|
||||||
|
existing_item = None
|
||||||
|
for existing in self.play_vv_items:
|
||||||
|
if existing.get('mix_id') == mix_id:
|
||||||
|
existing_item = existing
|
||||||
|
break
|
||||||
|
|
||||||
# 实时保存到数据库
|
if existing_item:
|
||||||
if self.realtime_save_enabled:
|
# 如果已存在,比较播放量,保留更大的
|
||||||
self.save_single_item_realtime(item_data)
|
existing_vv = existing_item.get('play_vv', 0)
|
||||||
|
if vv > existing_vv:
|
||||||
|
# 当前数据更好,替换
|
||||||
|
logging.info(f'🔄 更新重复短剧: {mix_name} (播放量: {existing_vv:,} → {vv:,})')
|
||||||
|
self.play_vv_items.remove(existing_item)
|
||||||
|
self.play_vv_items.append(item_data)
|
||||||
|
else:
|
||||||
|
# 已有数据更好,跳过
|
||||||
|
logging.info(f'⏭️ 跳过重复短剧: {mix_name} (当前: {vv:,}, 已有: {existing_vv:,})')
|
||||||
|
return # 跳过当前数据
|
||||||
|
else:
|
||||||
|
# 不存在,直接添加
|
||||||
|
self.play_vv_items.append(item_data)
|
||||||
|
|
||||||
|
# 只有在真正添加时,才对播放量为0的数据发出警告
|
||||||
|
if vv <= 0:
|
||||||
|
logging.warning(f"⚠️ 添加了播放量为0的数据: {mix_name} (ID: {mix_id})")
|
||||||
|
logging.warning(f" 这可能需要后续重新获取播放量")
|
||||||
|
|
||||||
|
# 🔧 修复:不在数据收集阶段进行实时保存
|
||||||
|
# 实时保存会触发获取详细内容,导致数据收集中断
|
||||||
|
# 改为在数据收集完成后统一处理
|
||||||
|
# if self.realtime_save_enabled:
|
||||||
|
# self.save_single_item_realtime(item_data)
|
||||||
|
|
||||||
logging.info(f'提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
logging.info(f'提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
||||||
if series_author:
|
if series_author:
|
||||||
@ -1064,10 +1080,6 @@ class DouyinPlayVVScraper:
|
|||||||
# 添加到列表(保持原有逻辑)
|
# 添加到列表(保持原有逻辑)
|
||||||
self.play_vv_items.append(item_data)
|
self.play_vv_items.append(item_data)
|
||||||
|
|
||||||
# 实时保存到数据库
|
|
||||||
if self.realtime_save_enabled:
|
|
||||||
self.save_single_item_realtime(item_data)
|
|
||||||
|
|
||||||
logging.info(f'正则提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
logging.info(f'正则提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
@ -1165,6 +1177,13 @@ class DouyinPlayVVScraper:
|
|||||||
time.sleep(0.8)
|
time.sleep(0.8)
|
||||||
|
|
||||||
logging.info(f'网络收集完成,共发现 {len(self.play_vv_items)} 个目标')
|
logging.info(f'网络收集完成,共发现 {len(self.play_vv_items)} 个目标')
|
||||||
|
logging.info(f'=' * 60)
|
||||||
|
logging.info(f'网络收集阶段统计:')
|
||||||
|
logging.info(f' - 总数量: {len(self.play_vv_items)} 个合集')
|
||||||
|
logging.info(f' - 播放量为0: {sum(1 for item in self.play_vv_items if item.get("play_vv", 0) == 0)} 个')
|
||||||
|
logging.info(f' - 播放量正常: {sum(1 for item in self.play_vv_items if item.get("play_vv", 0) > 0)} 个')
|
||||||
|
logging.info(f'=' * 60)
|
||||||
|
logging.info(f'开始解析SSR数据...')
|
||||||
|
|
||||||
|
|
||||||
def parse_ssr_data(self):
|
def parse_ssr_data(self):
|
||||||
@ -1206,22 +1225,98 @@ class DouyinPlayVVScraper:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def dedupe(self):
|
def dedupe(self):
|
||||||
# 去重按play_vv数值
|
# 🔧 修复:按mix_id去重,保留播放量最大的那个
|
||||||
unique = []
|
# 原来的逻辑会导致播放量相同的不同短剧被误删
|
||||||
seen = set()
|
unique_dict = {} # 使用字典存储,key是identifier,value是item
|
||||||
|
|
||||||
for item in self.play_vv_items:
|
for item in self.play_vv_items:
|
||||||
vv = item['play_vv']
|
mix_id = item.get('mix_id', '')
|
||||||
if vv not in seen:
|
|
||||||
unique.append(item)
|
# 如果没有mix_id,使用mix_name作为备用标识
|
||||||
seen.add(vv)
|
if not mix_id:
|
||||||
|
mix_name = item.get('mix_name', '')
|
||||||
|
identifier = f"name_{mix_name}"
|
||||||
|
else:
|
||||||
|
identifier = f"id_{mix_id}"
|
||||||
|
|
||||||
|
# 如果是第一次遇到这个identifier,直接添加
|
||||||
|
if identifier not in unique_dict:
|
||||||
|
unique_dict[identifier] = item
|
||||||
|
else:
|
||||||
|
# 如果已经存在,比较播放量,保留播放量大的
|
||||||
|
existing_play_vv = unique_dict[identifier].get('play_vv', 0)
|
||||||
|
current_play_vv = item.get('play_vv', 0)
|
||||||
|
|
||||||
|
if current_play_vv > existing_play_vv:
|
||||||
|
# 当前数据的播放量更大,替换
|
||||||
|
logging.info(f'去重:发现重复短剧 {item.get("mix_name", "未知")},保留播放量更大的数据 ({existing_play_vv:,} → {current_play_vv:,})')
|
||||||
|
unique_dict[identifier] = item
|
||||||
|
else:
|
||||||
|
# 已有数据的播放量更大或相等,跳过当前数据
|
||||||
|
logging.debug(f'去重:跳过重复的短剧 {item.get("mix_name", "未知")} (mix_id: {mix_id})')
|
||||||
|
|
||||||
|
# 转换回列表
|
||||||
|
unique = list(unique_dict.values())
|
||||||
|
|
||||||
|
removed_count = len(self.play_vv_items) - len(unique)
|
||||||
|
if removed_count > 0:
|
||||||
|
logging.info(f'去重完成:移除 {removed_count} 个重复项,保留 {len(unique)} 个唯一短剧')
|
||||||
|
else:
|
||||||
|
logging.info(f'去重完成:没有重复项,保留 {len(unique)} 个唯一短剧')
|
||||||
|
|
||||||
self.play_vv_items = unique
|
self.play_vv_items = unique
|
||||||
|
|
||||||
def save_results(self):
|
def save_results(self):
|
||||||
if self.realtime_save_enabled and self.saved_items:
|
if self.realtime_save_enabled:
|
||||||
# 实时保存模式:只更新排名和统计信息
|
# 🔧 修复:在数据收集完成后,统一进行实时保存
|
||||||
self.update_ranks_for_batch()
|
logging.info(f'[实时保存] 开始保存 {len(self.play_vv_items)} 个合集的数据')
|
||||||
logging.info(f'[实时保存] 所有数据已通过实时保存功能保存到数据库,共 {len(self.saved_items)} 个合集')
|
|
||||||
logging.info(f'[实时保存] 批次ID: {self.batch_id}')
|
logging.info(f'[实时保存] 批次ID: {self.batch_id}')
|
||||||
|
|
||||||
|
# 先保存所有合集的基础信息(不获取详细内容)
|
||||||
|
for item_data in self.play_vv_items:
|
||||||
|
try:
|
||||||
|
logging.info(f'[实时保存] 保存合集基础信息: {item_data.get("mix_name", "未知")}')
|
||||||
|
self.save_collection_basic_info(item_data)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f'[实时保存] 保存合集基础信息失败: {item_data.get("mix_name", "未知")} - {e}')
|
||||||
|
|
||||||
|
# 更新排名
|
||||||
|
try:
|
||||||
|
self.update_ranks_for_batch()
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f'[实时保存] 更新排名失败: {e}')
|
||||||
|
|
||||||
|
# 然后逐个获取详细内容(如果需要)
|
||||||
|
logging.info(f'[实时保存] 基础信息保存完成,开始获取详细内容')
|
||||||
|
for item_data in self.play_vv_items:
|
||||||
|
try:
|
||||||
|
mix_id = item_data.get('mix_id', '')
|
||||||
|
mix_name = item_data.get('mix_name', '')
|
||||||
|
current_episode_count = item_data.get('updated_to_episode', 0)
|
||||||
|
|
||||||
|
if mix_id and current_episode_count > 0:
|
||||||
|
# 查找已保存的文档ID
|
||||||
|
target_collection = self.collection
|
||||||
|
if target_collection is not None:
|
||||||
|
existing_doc = target_collection.find_one({'mix_id': mix_id}, {'_id': 1})
|
||||||
|
if existing_doc:
|
||||||
|
document_id = existing_doc['_id']
|
||||||
|
logging.info(f'[实时保存] 开始获取详细内容: {mix_name}')
|
||||||
|
|
||||||
|
# 获取视频ID列表
|
||||||
|
episode_video_ids = self.update_collection_video_ids(
|
||||||
|
document_id, mix_id, mix_name, current_episode_count
|
||||||
|
)
|
||||||
|
|
||||||
|
# 获取视频详细数据
|
||||||
|
if episode_video_ids:
|
||||||
|
self.update_video_details_incrementally(
|
||||||
|
document_id, episode_video_ids, mix_name, mix_id
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f'[实时保存] 获取详细内容失败: {item_data.get("mix_name", "未知")} - {e}')
|
||||||
|
|
||||||
|
logging.info(f'[实时保存] 所有数据处理完成,共 {len(self.saved_items)} 个合集')
|
||||||
else:
|
else:
|
||||||
# 传统批量保存模式
|
# 传统批量保存模式
|
||||||
self.save_to_mongodb()
|
self.save_to_mongodb()
|
||||||
@ -1230,7 +1325,12 @@ class DouyinPlayVVScraper:
|
|||||||
def update_ranks_for_batch(self):
|
def update_ranks_for_batch(self):
|
||||||
"""为当前批次的数据更新排名"""
|
"""为当前批次的数据更新排名"""
|
||||||
target_collection = self.collection # 使用根据模式选择的集合
|
target_collection = self.collection # 使用根据模式选择的集合
|
||||||
if target_collection is None or not self.saved_items:
|
if target_collection is None:
|
||||||
|
logging.warning('[实时保存] 数据库集合未初始化,跳过排名更新')
|
||||||
|
return
|
||||||
|
|
||||||
|
if not self.saved_items:
|
||||||
|
logging.warning('[实时保存] 没有已保存的数据,跳过排名更新')
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -3847,11 +3947,32 @@ class DouyinPlayVVScraper:
|
|||||||
self.navigate()
|
self.navigate()
|
||||||
self.ensure_login()
|
self.ensure_login()
|
||||||
self.trigger_loading()
|
self.trigger_loading()
|
||||||
|
|
||||||
|
logging.info('=' * 60)
|
||||||
|
logging.info('开始数据收集阶段')
|
||||||
|
logging.info('=' * 60)
|
||||||
self.collect_network_bodies()
|
self.collect_network_bodies()
|
||||||
|
logging.info(f'✅ 网络数据收集完成:{len(self.play_vv_items)} 个合集')
|
||||||
|
|
||||||
self.parse_ssr_data()
|
self.parse_ssr_data()
|
||||||
|
logging.info(f'✅ SSR数据解析完成:{len(self.play_vv_items)} 个合集')
|
||||||
|
|
||||||
|
logging.info('=' * 60)
|
||||||
|
logging.info('开始数据去重')
|
||||||
|
logging.info('=' * 60)
|
||||||
|
before_dedupe = len(self.play_vv_items)
|
||||||
self.dedupe()
|
self.dedupe()
|
||||||
|
after_dedupe = len(self.play_vv_items)
|
||||||
|
logging.info(f'✅ 去重完成:{before_dedupe} → {after_dedupe} (移除 {before_dedupe - after_dedupe} 个重复项)')
|
||||||
|
|
||||||
|
logging.info('=' * 60)
|
||||||
|
logging.info('开始保存数据')
|
||||||
|
logging.info('=' * 60)
|
||||||
self.save_results()
|
self.save_results()
|
||||||
logging.info('完成,play_vv数量: %d', len(self.play_vv_items))
|
|
||||||
|
logging.info('=' * 60)
|
||||||
|
logging.info(f'✅ 全部完成!共处理 {len(self.play_vv_items)} 个合集')
|
||||||
|
logging.info('=' * 60)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
import traceback
|
import traceback
|
||||||
error_details = {
|
error_details = {
|
||||||
@ -3901,7 +4022,7 @@ if __name__ == '__main__':
|
|||||||
parser = argparse.ArgumentParser(description='Selenium+CDP 抖音play_vv抓取器')
|
parser = argparse.ArgumentParser(description='Selenium+CDP 抖音play_vv抓取器')
|
||||||
parser.add_argument('--url', default='https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation', help='收藏合集列表页面URL')
|
parser.add_argument('--url', default='https://www.douyin.com/user/self?showTab=favorite_collection&showSubTab=compilation', help='收藏合集列表页面URL')
|
||||||
parser.add_argument('--auto', action='store_true', help='自动继续,跳过回车等待')
|
parser.add_argument('--auto', action='store_true', help='自动继续,跳过回车等待')
|
||||||
parser.add_argument('--duration', type=int, default=180, help='网络响应收集时长(秒)')
|
parser.add_argument('--duration', type=int, default=60, help='网络响应收集时长(秒)')
|
||||||
parser.add_argument('--driver', help='覆盖chromedriver路径')
|
parser.add_argument('--driver', help='覆盖chromedriver路径')
|
||||||
parser.add_argument('--timer', action='store_true', help='启用定时器模式,应用config.py中的定时器配置')
|
parser.add_argument('--timer', action='store_true', help='启用定时器模式,应用config.py中的定时器配置')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user