删除了不需要的代码,修改了定时器执行时间为每晚24:00(即午夜00:00)
This commit is contained in:
parent
fdd79b6931
commit
c37c9106fa
@ -4,7 +4,7 @@
|
|||||||
抖音播放量自动抓取定时器 - 跨平台版本
|
抖音播放量自动抓取定时器 - 跨平台版本
|
||||||
|
|
||||||
功能:
|
功能:
|
||||||
- 每天上午9:35自动执行抖音播放量抓取任务
|
- 每晚24:00自动执行抖音播放量抓取任务
|
||||||
- 支持Windows、macOS、Linux
|
- 支持Windows、macOS、Linux
|
||||||
- 自动保存数据到MongoDB
|
- 自动保存数据到MongoDB
|
||||||
"""
|
"""
|
||||||
@ -81,10 +81,10 @@ class DouyinAutoScheduler:
|
|||||||
|
|
||||||
def setup_schedule(self):
|
def setup_schedule(self):
|
||||||
"""设置定时任务"""
|
"""设置定时任务"""
|
||||||
# 主执行时间:每天上午9:35
|
# 主执行时间:每晚24:00(午夜)
|
||||||
schedule.every().day.at("09:35").do(self.run_douyin_scraper)
|
schedule.every().day.at("00:00").do(self.run_douyin_scraper)
|
||||||
|
|
||||||
logging.info("⏰ 定时器已设置:每天上午9:35执行抖音播放量抓取")
|
logging.info("⏰ 定时器已设置:每晚24:00执行抖音播放量抓取")
|
||||||
|
|
||||||
def show_next_run(self):
|
def show_next_run(self):
|
||||||
"""显示下次执行时间"""
|
"""显示下次执行时间"""
|
||||||
@ -107,7 +107,7 @@ class DouyinAutoScheduler:
|
|||||||
"""启动定时器"""
|
"""启动定时器"""
|
||||||
self.is_running = True
|
self.is_running = True
|
||||||
logging.info("🚀 抖音播放量自动抓取定时器已启动")
|
logging.info("🚀 抖音播放量自动抓取定时器已启动")
|
||||||
logging.info("⏰ 执行时间:每天上午9:35")
|
logging.info("⏰ 执行时间:每晚24:00")
|
||||||
logging.info("📁 目标脚本:rank_data_scraper.py")
|
logging.info("📁 目标脚本:rank_data_scraper.py")
|
||||||
logging.info("💾 数据保存:MongoDB")
|
logging.info("💾 数据保存:MongoDB")
|
||||||
logging.info("⏹️ 按 Ctrl+C 停止定时器")
|
logging.info("⏹️ 按 Ctrl+C 停止定时器")
|
||||||
|
|||||||
@ -60,8 +60,6 @@ class DouyinPlayVVScraper:
|
|||||||
self.driver = None
|
self.driver = None
|
||||||
self.play_vv_items = [] # list of dicts: {play_vv, formatted, url, request_id, mix_name, watched_item}
|
self.play_vv_items = [] # list of dicts: {play_vv, formatted, url, request_id, mix_name, watched_item}
|
||||||
self.captured_responses = []
|
self.captured_responses = []
|
||||||
self.collected_aweme_ids = [] # 收集到的视频ID列表
|
|
||||||
self.mix_aweme_mapping = {} # 合集ID到视频ID列表的映射
|
|
||||||
self.mongo_client = None
|
self.mongo_client = None
|
||||||
self.db = None
|
self.db = None
|
||||||
self.collection = None
|
self.collection = None
|
||||||
@ -423,193 +421,7 @@ class DouyinPlayVVScraper:
|
|||||||
return f"{n/10_000:.1f}万"
|
return f"{n/10_000:.1f}万"
|
||||||
return str(n)
|
return str(n)
|
||||||
|
|
||||||
def _trigger_mix_aweme_api(self, mix_id: str):
|
|
||||||
"""主动触发/aweme/v1/web/mix/aweme/ API调用来获取合集中的视频列表"""
|
|
||||||
try:
|
|
||||||
if not self.driver:
|
|
||||||
logging.warning('WebDriver不可用,无法触发API调用')
|
|
||||||
return
|
|
||||||
|
|
||||||
logging.info(f'主动触发mix/aweme API调用,获取合集 {mix_id} 的视频列表')
|
|
||||||
|
|
||||||
# 构建API URL
|
|
||||||
api_url = f"https://www.douyin.com/aweme/v1/web/mix/aweme/?mix_id={mix_id}&count=20&cursor=0"
|
|
||||||
|
|
||||||
# 使用JavaScript发起fetch请求并直接处理响应
|
|
||||||
js_code = f"""
|
|
||||||
(async function() {{
|
|
||||||
try {{
|
|
||||||
const response = await fetch('{api_url}', {{
|
|
||||||
method: 'GET',
|
|
||||||
credentials: 'include',
|
|
||||||
headers: {{
|
|
||||||
'Accept': 'application/json',
|
|
||||||
'User-Agent': navigator.userAgent
|
|
||||||
}}
|
|
||||||
}});
|
|
||||||
|
|
||||||
if (response.ok) {{
|
|
||||||
const data = await response.json();
|
|
||||||
console.log('Mix aweme API response for {mix_id}:', data);
|
|
||||||
|
|
||||||
// 提取aweme_id列表
|
|
||||||
let awemeIds = [];
|
|
||||||
if (data && data.aweme_list && Array.isArray(data.aweme_list)) {{
|
|
||||||
awemeIds = data.aweme_list.map(aweme => aweme.aweme_id).filter(id => id);
|
|
||||||
}} else if (data && data.data && Array.isArray(data.data)) {{
|
|
||||||
awemeIds = data.data.map(aweme => aweme.aweme_id).filter(id => id);
|
|
||||||
}}
|
|
||||||
|
|
||||||
// 将结果存储到window对象中,供Python读取
|
|
||||||
if (!window.mixAwemeResults) {{
|
|
||||||
window.mixAwemeResults = {{}};
|
|
||||||
}}
|
|
||||||
window.mixAwemeResults['{mix_id}'] = {{
|
|
||||||
aweme_ids: awemeIds,
|
|
||||||
total_count: awemeIds.length,
|
|
||||||
raw_data: data
|
|
||||||
}};
|
|
||||||
|
|
||||||
console.log('Extracted aweme_ids for {mix_id}:', awemeIds);
|
|
||||||
return awemeIds;
|
|
||||||
}} else {{
|
|
||||||
console.error('Mix aweme API failed for {mix_id}:', response.status);
|
|
||||||
return [];
|
|
||||||
}}
|
|
||||||
}} catch (error) {{
|
|
||||||
console.error('Mix aweme API error for {mix_id}:', error);
|
|
||||||
return [];
|
|
||||||
}}
|
|
||||||
}})();
|
|
||||||
"""
|
|
||||||
|
|
||||||
# 执行JavaScript代码
|
|
||||||
result = self.driver.execute_script(js_code)
|
|
||||||
|
|
||||||
# 等待一下,然后读取结果
|
|
||||||
time.sleep(2)
|
|
||||||
|
|
||||||
# 从window对象中读取结果
|
|
||||||
try:
|
|
||||||
js_get_result = f"""
|
|
||||||
return window.mixAwemeResults && window.mixAwemeResults['{mix_id}']
|
|
||||||
? window.mixAwemeResults['{mix_id}']
|
|
||||||
: null;
|
|
||||||
"""
|
|
||||||
stored_result = self.driver.execute_script(js_get_result)
|
|
||||||
|
|
||||||
if stored_result and stored_result.get('aweme_ids'):
|
|
||||||
aweme_ids = stored_result['aweme_ids']
|
|
||||||
logging.info(f'成功获取合集 {mix_id} 的 {len(aweme_ids)} 个视频ID: {aweme_ids[:5]}...')
|
|
||||||
|
|
||||||
# 将aweme_ids添加到类属性中
|
|
||||||
if not hasattr(self, 'collected_aweme_ids'):
|
|
||||||
self.collected_aweme_ids = []
|
|
||||||
|
|
||||||
# 为这个特定的mix_id存储aweme_ids
|
|
||||||
if not hasattr(self, 'mix_aweme_mapping'):
|
|
||||||
self.mix_aweme_mapping = {}
|
|
||||||
self.mix_aweme_mapping[mix_id] = aweme_ids
|
|
||||||
|
|
||||||
# 也添加到总的collected_aweme_ids中
|
|
||||||
self.collected_aweme_ids.extend(aweme_ids)
|
|
||||||
|
|
||||||
logging.info(f'已将 {len(aweme_ids)} 个视频ID添加到合集 {mix_id}')
|
|
||||||
else:
|
|
||||||
logging.warning(f'未能获取合集 {mix_id} 的视频ID')
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logging.warning(f'读取JavaScript结果失败: {e}')
|
|
||||||
|
|
||||||
logging.info(f'已完成mix/aweme API调用,mix_id: {mix_id}')
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logging.warning(f'触发mix/aweme API调用失败: {e}')
|
|
||||||
|
|
||||||
def parse_mix_aweme_response(self, text: str, source_url: str, request_id: str = None):
|
|
||||||
"""解析合集中的视频列表API响应,提取单个视频的aweme_id和播放量"""
|
|
||||||
try:
|
|
||||||
if not text.strip():
|
|
||||||
return
|
|
||||||
|
|
||||||
# 尝试解析JSON响应
|
|
||||||
try:
|
|
||||||
data = json.loads(text)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
logging.warning(f'mix/aweme API响应不是有效JSON: {source_url}')
|
|
||||||
return
|
|
||||||
|
|
||||||
# 查找aweme_list或类似的视频列表
|
|
||||||
aweme_list = None
|
|
||||||
if isinstance(data, dict):
|
|
||||||
# 常见的响应结构
|
|
||||||
for key in ['aweme_list', 'data', 'awemes', 'items']:
|
|
||||||
if key in data and isinstance(data[key], list):
|
|
||||||
aweme_list = data[key]
|
|
||||||
break
|
|
||||||
|
|
||||||
# 如果没有直接找到,递归查找
|
|
||||||
if aweme_list is None:
|
|
||||||
aweme_list = self._find_aweme_list_recursive(data)
|
|
||||||
|
|
||||||
if aweme_list and isinstance(aweme_list, list):
|
|
||||||
logging.info(f'从mix/aweme API找到 {len(aweme_list)} 个视频')
|
|
||||||
|
|
||||||
# 收集所有aweme_id,用于后续与合集数据关联
|
|
||||||
aweme_ids = []
|
|
||||||
for aweme in aweme_list:
|
|
||||||
if isinstance(aweme, dict):
|
|
||||||
aweme_id = aweme.get('aweme_id', '')
|
|
||||||
if aweme_id:
|
|
||||||
aweme_ids.append(aweme_id)
|
|
||||||
|
|
||||||
# 获取视频标题
|
|
||||||
desc = aweme.get('desc', '')
|
|
||||||
if not desc:
|
|
||||||
# 尝试从其他字段获取标题
|
|
||||||
text_extra = aweme.get('text_extra', [])
|
|
||||||
if text_extra and isinstance(text_extra, list):
|
|
||||||
desc = ' '.join([item.get('hashtag_name', '') for item in text_extra if isinstance(item, dict)])
|
|
||||||
|
|
||||||
logging.info(f'找到视频ID: {aweme_id} - {desc[:50]}...')
|
|
||||||
|
|
||||||
# 将aweme_ids存储到类属性中,供其他函数使用
|
|
||||||
if not hasattr(self, 'collected_aweme_ids'):
|
|
||||||
self.collected_aweme_ids = []
|
|
||||||
self.collected_aweme_ids.extend(aweme_ids)
|
|
||||||
|
|
||||||
logging.info(f'累计收集到 {len(self.collected_aweme_ids)} 个视频ID')
|
|
||||||
else:
|
|
||||||
logging.warning(f'mix/aweme API响应中未找到视频列表: {source_url}')
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logging.warning(f'解析mix/aweme API响应时出错: {e}')
|
|
||||||
|
|
||||||
def _find_aweme_list_recursive(self, obj, max_depth=3, current_depth=0):
|
|
||||||
"""递归查找aweme_list"""
|
|
||||||
if current_depth >= max_depth:
|
|
||||||
return None
|
|
||||||
|
|
||||||
if isinstance(obj, dict):
|
|
||||||
for key, value in obj.items():
|
|
||||||
if 'aweme' in key.lower() and isinstance(value, list):
|
|
||||||
# 检查列表中是否包含aweme对象
|
|
||||||
if value and isinstance(value[0], dict) and 'aweme_id' in value[0]:
|
|
||||||
return value
|
|
||||||
|
|
||||||
if isinstance(value, (dict, list)):
|
|
||||||
result = self._find_aweme_list_recursive(value, max_depth, current_depth + 1)
|
|
||||||
if result:
|
|
||||||
return result
|
|
||||||
|
|
||||||
elif isinstance(obj, list):
|
|
||||||
for item in obj:
|
|
||||||
if isinstance(item, (dict, list)):
|
|
||||||
result = self._find_aweme_list_recursive(item, max_depth, current_depth + 1)
|
|
||||||
if result:
|
|
||||||
return result
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def parse_play_vv_from_text(self, text: str, source_url: str, request_id: str = None):
|
def parse_play_vv_from_text(self, text: str, source_url: str, request_id: str = None):
|
||||||
"""解析文本中的play_vv、mix_name和watched_item信息"""
|
"""解析文本中的play_vv、mix_name和watched_item信息"""
|
||||||
@ -665,10 +477,6 @@ class DouyinPlayVVScraper:
|
|||||||
# 构建合集链接
|
# 构建合集链接
|
||||||
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
|
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
|
||||||
|
|
||||||
# 获取该合集对应的aweme_id列表
|
|
||||||
mix_aweme_mapping = getattr(self, 'mix_aweme_mapping', {})
|
|
||||||
aweme_ids = mix_aweme_mapping.get(mix_id, [])
|
|
||||||
|
|
||||||
# 提取合集封面图片URL - 直接存储完整的图片链接
|
# 提取合集封面图片URL - 直接存储完整的图片链接
|
||||||
cover_image_url = ""
|
cover_image_url = ""
|
||||||
cover_image_backup_urls = [] # 备用链接列表
|
cover_image_backup_urls = [] # 备用链接列表
|
||||||
@ -713,16 +521,11 @@ class DouyinPlayVVScraper:
|
|||||||
'mix_name': mix_name,
|
'mix_name': mix_name,
|
||||||
'video_url': video_url, # 合集链接
|
'video_url': video_url, # 合集链接
|
||||||
'mix_id': mix_id, # 合集ID
|
'mix_id': mix_id, # 合集ID
|
||||||
'aweme_ids': aweme_ids.copy() if aweme_ids else [], # 该合集包含的视频ID列表
|
|
||||||
'cover_image_url': cover_image_url, # 合集封面图片主链接(完整URL)
|
'cover_image_url': cover_image_url, # 合集封面图片主链接(完整URL)
|
||||||
'cover_backup_urls': cover_image_backup_urls, # 封面图片备用链接列表
|
'cover_backup_urls': cover_image_backup_urls, # 封面图片备用链接列表
|
||||||
'timestamp': datetime.now().isoformat()
|
'timestamp': datetime.now().isoformat()
|
||||||
})
|
})
|
||||||
logging.info(f'提取到合集: {mix_name} (ID: {mix_id}, 包含{len(aweme_ids)}个视频) - {vv:,} 播放量')
|
logging.info(f'提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
||||||
|
|
||||||
# 如果aweme_ids为空,主动触发API调用获取合集中的视频列表
|
|
||||||
if not aweme_ids and mix_id:
|
|
||||||
self._trigger_mix_aweme_api(mix_id)
|
|
||||||
|
|
||||||
# 递归搜索子对象
|
# 递归搜索子对象
|
||||||
for key, value in obj.items():
|
for key, value in obj.items():
|
||||||
@ -750,10 +553,6 @@ class DouyinPlayVVScraper:
|
|||||||
# 构建合集链接
|
# 构建合集链接
|
||||||
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
|
video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else ""
|
||||||
|
|
||||||
# 获取该合集对应的aweme_id列表
|
|
||||||
mix_aweme_mapping = getattr(self, 'mix_aweme_mapping', {})
|
|
||||||
aweme_ids = mix_aweme_mapping.get(mix_id, [])
|
|
||||||
|
|
||||||
self.play_vv_items.append({
|
self.play_vv_items.append({
|
||||||
'play_vv': vv,
|
'play_vv': vv,
|
||||||
'formatted': self.format_count(vv),
|
'formatted': self.format_count(vv),
|
||||||
@ -762,14 +561,9 @@ class DouyinPlayVVScraper:
|
|||||||
'mix_name': mix_name,
|
'mix_name': mix_name,
|
||||||
'video_url': video_url, # 合集链接
|
'video_url': video_url, # 合集链接
|
||||||
'mix_id': mix_id, # 合集ID
|
'mix_id': mix_id, # 合集ID
|
||||||
'aweme_ids': aweme_ids.copy() if aweme_ids else [], # 该合集包含的视频ID列表
|
|
||||||
'timestamp': datetime.now().isoformat()
|
'timestamp': datetime.now().isoformat()
|
||||||
})
|
})
|
||||||
logging.info(f'正则提取到合集: {mix_name} (ID: {mix_id}, 包含{len(aweme_ids)}个视频) - {vv:,} 播放量')
|
logging.info(f'正则提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量')
|
||||||
|
|
||||||
# 如果aweme_ids为空,主动触发API调用获取合集中的视频列表
|
|
||||||
if not aweme_ids and mix_id:
|
|
||||||
self._trigger_mix_aweme_api(mix_id)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -779,9 +573,6 @@ class DouyinPlayVVScraper:
|
|||||||
vv = int(match)
|
vv = int(match)
|
||||||
# 检查是否已经存在相同的play_vv
|
# 检查是否已经存在相同的play_vv
|
||||||
if not any(item['play_vv'] == vv for item in self.play_vv_items):
|
if not any(item['play_vv'] == vv for item in self.play_vv_items):
|
||||||
# 获取收集到的aweme_id列表
|
|
||||||
aweme_ids = getattr(self, 'collected_aweme_ids', [])
|
|
||||||
|
|
||||||
self.play_vv_items.append({
|
self.play_vv_items.append({
|
||||||
'play_vv': vv,
|
'play_vv': vv,
|
||||||
'formatted': self.format_count(vv),
|
'formatted': self.format_count(vv),
|
||||||
@ -790,7 +581,6 @@ class DouyinPlayVVScraper:
|
|||||||
'mix_name': '', # 未知合集名称
|
'mix_name': '', # 未知合集名称
|
||||||
'video_url': '', # 未知链接
|
'video_url': '', # 未知链接
|
||||||
'mix_id': '', # 未知mix_id
|
'mix_id': '', # 未知mix_id
|
||||||
'aweme_ids': aweme_ids.copy() if aweme_ids else [], # 收集到的视频ID列表
|
|
||||||
'timestamp': datetime.now().isoformat()
|
'timestamp': datetime.now().isoformat()
|
||||||
})
|
})
|
||||||
except Exception:
|
except Exception:
|
||||||
@ -851,12 +641,8 @@ class DouyinPlayVVScraper:
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# 特殊处理mix/aweme API - 获取合集中的视频列表
|
# 解析play_vv
|
||||||
if '/aweme/v1/web/mix/aweme/' in url:
|
self.parse_play_vv_from_text(body_text, url, req_id)
|
||||||
self.parse_mix_aweme_response(body_text, url, req_id)
|
|
||||||
else:
|
|
||||||
# 解析play_vv
|
|
||||||
self.parse_play_vv_from_text(body_text, url, req_id)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
# 某些响应不可获取或过大
|
# 某些响应不可获取或过大
|
||||||
pass
|
pass
|
||||||
@ -868,26 +654,6 @@ class DouyinPlayVVScraper:
|
|||||||
|
|
||||||
logging.info(f'网络收集完成,共发现 {len(self.play_vv_items)} 个play_vv候选')
|
logging.info(f'网络收集完成,共发现 {len(self.play_vv_items)} 个play_vv候选')
|
||||||
|
|
||||||
# 更新所有条目的aweme_ids字段
|
|
||||||
self._update_aweme_ids_for_existing_items()
|
|
||||||
|
|
||||||
def _update_aweme_ids_for_existing_items(self):
|
|
||||||
"""更新所有已存在条目的aweme_ids字段"""
|
|
||||||
if not hasattr(self, 'mix_aweme_mapping') or not self.mix_aweme_mapping:
|
|
||||||
logging.info('没有mix_aweme_mapping数据,跳过aweme_ids更新')
|
|
||||||
return
|
|
||||||
|
|
||||||
updated_count = 0
|
|
||||||
for item in self.play_vv_items:
|
|
||||||
mix_id = item.get('mix_id')
|
|
||||||
if mix_id and mix_id in self.mix_aweme_mapping:
|
|
||||||
aweme_ids = self.mix_aweme_mapping[mix_id]
|
|
||||||
if aweme_ids and len(aweme_ids) > 0:
|
|
||||||
item['aweme_ids'] = aweme_ids.copy()
|
|
||||||
updated_count += 1
|
|
||||||
logging.info(f'更新合集 {item.get("mix_name", "未知")} (ID: {mix_id}) 的aweme_ids,包含 {len(aweme_ids)} 个视频')
|
|
||||||
|
|
||||||
logging.info(f'已更新 {updated_count} 个条目的aweme_ids字段')
|
|
||||||
|
|
||||||
def parse_ssr_data(self):
|
def parse_ssr_data(self):
|
||||||
logging.info('尝试解析页面SSR数据')
|
logging.info('尝试解析页面SSR数据')
|
||||||
@ -939,51 +705,10 @@ class DouyinPlayVVScraper:
|
|||||||
self.play_vv_items = unique
|
self.play_vv_items = unique
|
||||||
|
|
||||||
def save_results(self):
|
def save_results(self):
|
||||||
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
|
|
||||||
|
|
||||||
# 创建data文件夹
|
|
||||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
|
||||||
data_dir = os.path.join(script_dir, 'data')
|
|
||||||
os.makedirs(data_dir, exist_ok=True)
|
|
||||||
|
|
||||||
json_file = os.path.join(data_dir, f'douyin_cdp_play_vv_{ts}.json')
|
|
||||||
txt_file = os.path.join(data_dir, f'douyin_cdp_play_vv_{ts}.txt')
|
|
||||||
|
|
||||||
# 保存到JSON文件
|
|
||||||
with open(json_file, 'w', encoding='utf-8') as f:
|
|
||||||
json.dump({
|
|
||||||
'timestamp': ts,
|
|
||||||
'start_url': self.start_url,
|
|
||||||
'play_vv_items': self.play_vv_items,
|
|
||||||
'captured_count': len(self.play_vv_items)
|
|
||||||
}, f, ensure_ascii=False, indent=2)
|
|
||||||
|
|
||||||
# 保存到TXT文件
|
|
||||||
with open(txt_file, 'w', encoding='utf-8') as f:
|
|
||||||
f.write('抖音收藏合集真实播放量(play_vv) - Selenium+CDP\n')
|
|
||||||
f.write('=' * 60 + '\n\n')
|
|
||||||
if self.play_vv_items:
|
|
||||||
sorted_items = sorted(self.play_vv_items, key=lambda x: x['play_vv'], reverse=True)
|
|
||||||
f.write(f"✅ 提取到 {len(sorted_items)} 个唯一play_vv数值\n\n")
|
|
||||||
for i, item in enumerate(sorted_items, 1):
|
|
||||||
mix_info = f" - {item.get('mix_name', '未知合集')}" if item.get('mix_name') else ""
|
|
||||||
video_info = f" (链接: {item.get('video_url', '未知')})" if item.get('video_url') else ""
|
|
||||||
f.write(f"{i}. play_vv: {item['play_vv']:,} ({item['formatted']}){mix_info}{video_info}\n")
|
|
||||||
f.write(f" 来源: {item['url']}\n\n")
|
|
||||||
total = sum(x['play_vv'] for x in sorted_items)
|
|
||||||
f.write(f"📊 总播放量: {total:,}\n")
|
|
||||||
f.write(f"📈 最高播放量: {sorted_items[0]['play_vv']:,} ({sorted_items[0]['formatted']})\n")
|
|
||||||
else:
|
|
||||||
f.write('❌ 未能提取到play_vv数值\n')
|
|
||||||
f.write('可能原因:\n')
|
|
||||||
f.write('- 仍需登录或权限受限\n')
|
|
||||||
f.write('- API响应体不可读取或被加密\n')
|
|
||||||
f.write('- 页面结构或接口策略发生变更\n')
|
|
||||||
|
|
||||||
# 保存到MongoDB
|
# 保存到MongoDB
|
||||||
self.save_to_mongodb()
|
self.save_to_mongodb()
|
||||||
|
|
||||||
logging.info('结果已保存: %s, %s', json_file, txt_file)
|
logging.info('结果已保存到MongoDB')
|
||||||
|
|
||||||
def save_to_mongodb(self):
|
def save_to_mongodb(self):
|
||||||
"""将数据保存到MongoDB"""
|
"""将数据保存到MongoDB"""
|
||||||
@ -1000,7 +725,7 @@ class DouyinPlayVVScraper:
|
|||||||
documents = []
|
documents = []
|
||||||
|
|
||||||
for item in self.play_vv_items:
|
for item in self.play_vv_items:
|
||||||
# 保留用户要求的7个字段 + aweme_ids作为短剧集数ID列表 + cover_image_url作为合集封面图片完整链接
|
# 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接
|
||||||
doc = {
|
doc = {
|
||||||
'batch_time': batch_time,
|
'batch_time': batch_time,
|
||||||
'mix_name': item.get('mix_name', ''),
|
'mix_name': item.get('mix_name', ''),
|
||||||
@ -1009,7 +734,6 @@ class DouyinPlayVVScraper:
|
|||||||
'play_vv': item.get('play_vv', 0),
|
'play_vv': item.get('play_vv', 0),
|
||||||
'request_id': item.get('request_id', ''),
|
'request_id': item.get('request_id', ''),
|
||||||
'rank': 0, # 临时设置,后面会重新计算
|
'rank': 0, # 临时设置,后面会重新计算
|
||||||
'aweme_ids': item.get('aweme_ids', []), # 短剧集数ID列表
|
|
||||||
'cover_image_url': item.get('cover_image_url', ''), # 合集封面图片主链接(完整URL)
|
'cover_image_url': item.get('cover_image_url', ''), # 合集封面图片主链接(完整URL)
|
||||||
'cover_backup_urls': item.get('cover_backup_urls', []) # 封面图片备用链接列表
|
'cover_backup_urls': item.get('cover_backup_urls', []) # 封面图片备用链接列表
|
||||||
}
|
}
|
||||||
@ -1029,17 +753,13 @@ class DouyinPlayVVScraper:
|
|||||||
max_play_vv = max(doc['play_vv'] for doc in documents) if documents else 0
|
max_play_vv = max(doc['play_vv'] for doc in documents) if documents else 0
|
||||||
|
|
||||||
logging.info(f'MongoDB保存统计: 总播放量={total_play_vv:,}, 最高播放量={max_play_vv:,}')
|
logging.info(f'MongoDB保存统计: 总播放量={total_play_vv:,}, 最高播放量={max_play_vv:,}')
|
||||||
logging.info(f'保存的字段: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, aweme_ids, cover_image_url, cover_backup_urls')
|
logging.info(f'保存的字段: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url, cover_backup_urls')
|
||||||
|
|
||||||
# 统计封面图片提取情况
|
# 统计封面图片提取情况
|
||||||
cover_count = sum(1 for doc in documents if doc.get('cover_image_url'))
|
cover_count = sum(1 for doc in documents if doc.get('cover_image_url'))
|
||||||
backup_count = sum(1 for doc in documents if doc.get('cover_backup_urls'))
|
backup_count = sum(1 for doc in documents if doc.get('cover_backup_urls'))
|
||||||
logging.info(f'封面图片统计: {cover_count}/{len(documents)} 个合集有主封面链接, {backup_count} 个合集有备用链接')
|
logging.info(f'封面图片统计: {cover_count}/{len(documents)} 个合集有主封面链接, {backup_count} 个合集有备用链接')
|
||||||
|
|
||||||
# 输出aweme_ids统计信息
|
|
||||||
total_episodes = sum(len(doc.get('aweme_ids', [])) for doc in documents)
|
|
||||||
logging.info(f'短剧集数统计: 总共收集到 {total_episodes} 集视频ID')
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f'保存到MongoDB时出错: {e}')
|
logging.error(f'保存到MongoDB时出错: {e}')
|
||||||
|
|
||||||
|
|||||||
@ -78,8 +78,8 @@ class MiniprogramAPI:
|
|||||||
else:
|
else:
|
||||||
return str(time_obj)
|
return str(time_obj)
|
||||||
|
|
||||||
def format_video_item(self, doc):
|
def format_mix_item(self, doc):
|
||||||
"""格式化单个视频数据项 - 完全按照数据库原始字段返回"""
|
"""格式化合集数据项 - 完全按照数据库原始字段返回"""
|
||||||
return {
|
return {
|
||||||
"_id": str(doc.get("_id", "")),
|
"_id": str(doc.get("_id", "")),
|
||||||
"batch_time": self.format_time(doc.get("batch_time")),
|
"batch_time": self.format_time(doc.get("batch_time")),
|
||||||
@ -89,13 +89,12 @@ class MiniprogramAPI:
|
|||||||
"play_vv": doc.get("play_vv", 0),
|
"play_vv": doc.get("play_vv", 0),
|
||||||
"request_id": doc.get("request_id", ""),
|
"request_id": doc.get("request_id", ""),
|
||||||
"rank": doc.get("rank", 0),
|
"rank": doc.get("rank", 0),
|
||||||
"aweme_ids": doc.get("aweme_ids", []),
|
|
||||||
"cover_image_url": doc.get("cover_image_url", ""),
|
"cover_image_url": doc.get("cover_image_url", ""),
|
||||||
"cover_backup_urls": doc.get("cover_backup_urls", [])
|
"cover_backup_urls": doc.get("cover_backup_urls", [])
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_video_list(self, page=1, limit=20, sort_by="playcount"):
|
def get_mix_list(self, page=1, limit=20, sort_by="playcount"):
|
||||||
"""获取视频列表(分页)"""
|
"""获取合集列表(分页)"""
|
||||||
try:
|
try:
|
||||||
# 计算跳过的数量
|
# 计算跳过的数量
|
||||||
skip = (page - 1) * limit
|
skip = (page - 1) * limit
|
||||||
@ -103,7 +102,7 @@ class MiniprogramAPI:
|
|||||||
# 设置排序字段
|
# 设置排序字段
|
||||||
if sort_by == "growth":
|
if sort_by == "growth":
|
||||||
# 按增长排序需要特殊处理
|
# 按增长排序需要特殊处理
|
||||||
return self.get_growth_videos(page, limit)
|
return self.get_growth_mixes(page, limit)
|
||||||
else:
|
else:
|
||||||
sort_field = "play_vv" if sort_by == "playcount" else "batch_time"
|
sort_field = "play_vv" if sort_by == "playcount" else "batch_time"
|
||||||
sort_order = -1 # 降序
|
sort_order = -1 # 降序
|
||||||
@ -146,14 +145,14 @@ class MiniprogramAPI:
|
|||||||
total = total_result[0]["total"] if total_result else 0
|
total = total_result[0]["total"] if total_result else 0
|
||||||
|
|
||||||
# 格式化数据
|
# 格式化数据
|
||||||
video_list = []
|
mix_list = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
item = self.format_video_item(doc)
|
item = self.format_mix_item(doc)
|
||||||
video_list.append(item)
|
mix_list.append(item)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"success": True,
|
"success": True,
|
||||||
"data": video_list,
|
"data": mix_list,
|
||||||
"pagination": {
|
"pagination": {
|
||||||
"page": page,
|
"page": page,
|
||||||
"limit": limit,
|
"limit": limit,
|
||||||
@ -167,11 +166,11 @@ class MiniprogramAPI:
|
|||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"获取视频列表失败: {e}")
|
logging.error(f"获取合集列表失败: {e}")
|
||||||
return {"success": False, "message": f"获取数据失败: {str(e)}"}
|
return {"success": False, "message": f"获取数据失败: {str(e)}"}
|
||||||
|
|
||||||
def get_growth_videos(self, page=1, limit=20, start_date=None, end_date=None):
|
def get_growth_mixes(self, page=1, limit=20, start_date=None, end_date=None):
|
||||||
"""获取按播放量增长排序的视频列表"""
|
"""获取按播放量增长排序的合集列表"""
|
||||||
try:
|
try:
|
||||||
# 计算跳过的数量
|
# 计算跳过的数量
|
||||||
skip = (page - 1) * limit
|
skip = (page - 1) * limit
|
||||||
@ -218,14 +217,14 @@ class MiniprogramAPI:
|
|||||||
|
|
||||||
# 只保留增长为正的数据
|
# 只保留增长为正的数据
|
||||||
if growth > 0:
|
if growth > 0:
|
||||||
item = self.format_video_item(end_item)
|
item = self.format_mix_item(end_item)
|
||||||
item["growth"] = growth
|
item["growth"] = growth
|
||||||
item["start_date"] = start_date.strftime("%Y-%m-%d")
|
item["start_date"] = start_date.strftime("%Y-%m-%d")
|
||||||
item["end_date"] = end_date.strftime("%Y-%m-%d")
|
item["end_date"] = end_date.strftime("%Y-%m-%d")
|
||||||
growth_data.append(item)
|
growth_data.append(item)
|
||||||
else:
|
else:
|
||||||
# 如果开始日期没有数据,但结束日期有,也认为是新增长
|
# 如果开始日期没有数据,但结束日期有,也认为是新增长
|
||||||
item = self.format_video_item(end_item)
|
item = self.format_mix_item(end_item)
|
||||||
item["growth"] = end_item.get("play_vv", 0)
|
item["growth"] = end_item.get("play_vv", 0)
|
||||||
item["start_date"] = start_date.strftime("%Y-%m-%d")
|
item["start_date"] = start_date.strftime("%Y-%m-%d")
|
||||||
item["end_date"] = end_date.strftime("%Y-%m-%d")
|
item["end_date"] = end_date.strftime("%Y-%m-%d")
|
||||||
@ -262,14 +261,14 @@ class MiniprogramAPI:
|
|||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"获取增长视频列表失败: {e}")
|
logging.error(f"获取增长合集列表失败: {e}")
|
||||||
# 如果增长计算失败,返回按播放量排序的数据作为备选
|
# 如果增长计算失败,返回按播放量排序的数据作为备选
|
||||||
return self.get_video_list(page, limit, "playcount")
|
return self.get_mix_list(page, limit, "playcount")
|
||||||
|
|
||||||
def get_top_videos(self, limit=10):
|
def get_top_mixes(self, limit=10):
|
||||||
"""获取热门视频(TOP榜单)"""
|
"""获取热门合集(TOP榜单)"""
|
||||||
try:
|
try:
|
||||||
# 按播放量排序获取热门视频
|
# 按播放量排序获取热门合集
|
||||||
cursor = self.collection.find().sort("play_vv", -1).limit(limit)
|
cursor = self.collection.find().sort("play_vv", -1).limit(limit)
|
||||||
docs = list(cursor)
|
docs = list(cursor)
|
||||||
|
|
||||||
@ -279,7 +278,7 @@ class MiniprogramAPI:
|
|||||||
# 格式化数据
|
# 格式化数据
|
||||||
top_list = []
|
top_list = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
item = self.format_video_item(doc)
|
item = self.format_mix_item(doc)
|
||||||
top_list.append(item)
|
top_list.append(item)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@ -290,11 +289,11 @@ class MiniprogramAPI:
|
|||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"获取热门视频失败: {e}")
|
logging.error(f"获取热门合集失败: {e}")
|
||||||
return {"success": False, "message": f"获取数据失败: {str(e)}"}
|
return {"success": False, "message": f"获取数据失败: {str(e)}"}
|
||||||
|
|
||||||
def search_videos(self, keyword, page=1, limit=10):
|
def search_mixes(self, keyword, page=1, limit=10):
|
||||||
"""搜索视频"""
|
"""搜索合集"""
|
||||||
try:
|
try:
|
||||||
if not keyword:
|
if not keyword:
|
||||||
return {"success": False, "message": "请提供搜索关键词"}
|
return {"success": False, "message": "请提供搜索关键词"}
|
||||||
@ -317,7 +316,7 @@ class MiniprogramAPI:
|
|||||||
# 格式化数据
|
# 格式化数据
|
||||||
search_results = []
|
search_results = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
item = self.format_video_item(doc)
|
item = self.format_mix_item(doc)
|
||||||
search_results.append(item)
|
search_results.append(item)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@ -336,31 +335,31 @@ class MiniprogramAPI:
|
|||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"搜索视频失败: {e}")
|
logging.error(f"搜索合集失败: {e}")
|
||||||
return {"success": False, "message": f"搜索失败: {str(e)}"}
|
return {"success": False, "message": f"搜索失败: {str(e)}"}
|
||||||
|
|
||||||
def get_video_detail(self, video_id):
|
def get_mix_detail(self, mix_id):
|
||||||
"""获取视频详情"""
|
"""获取合集详情"""
|
||||||
try:
|
try:
|
||||||
from bson import ObjectId
|
from bson import ObjectId
|
||||||
|
|
||||||
# 尝试通过ObjectId查找
|
# 尝试通过ObjectId查找
|
||||||
try:
|
try:
|
||||||
doc = self.collection.find_one({"_id": ObjectId(video_id)})
|
doc = self.collection.find_one({"_id": ObjectId(mix_id)})
|
||||||
except:
|
except:
|
||||||
# 如果ObjectId无效,尝试其他字段
|
# 如果ObjectId无效,尝试其他字段
|
||||||
doc = self.collection.find_one({
|
doc = self.collection.find_one({
|
||||||
"$or": [
|
"$or": [
|
||||||
{"mix_name": video_id},
|
{"mix_name": mix_id},
|
||||||
{"request_id": video_id}
|
{"request_id": mix_id}
|
||||||
]
|
]
|
||||||
})
|
})
|
||||||
|
|
||||||
if not doc:
|
if not doc:
|
||||||
return {"success": False, "message": "未找到视频信息"}
|
return {"success": False, "message": "未找到合集信息"}
|
||||||
|
|
||||||
# 格式化详细信息 - 只返回数据库原始字段
|
# 格式化详细信息 - 只返回数据库原始字段
|
||||||
detail = self.format_video_item(doc)
|
detail = self.format_mix_item(doc)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"success": True,
|
"success": True,
|
||||||
@ -369,16 +368,16 @@ class MiniprogramAPI:
|
|||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"获取视频详情失败: {e}")
|
logging.error(f"获取合集详情失败: {e}")
|
||||||
return {"success": False, "message": f"获取详情失败: {str(e)}"}
|
return {"success": False, "message": f"获取详情失败: {str(e)}"}
|
||||||
|
|
||||||
def get_statistics(self):
|
def get_statistics(self):
|
||||||
"""获取统计信息"""
|
"""获取统计信息"""
|
||||||
try:
|
try:
|
||||||
# 基本统计
|
# 基本统计
|
||||||
total_videos = self.collection.count_documents({})
|
total_mixes = self.collection.count_documents({})
|
||||||
|
|
||||||
if total_videos == 0:
|
if total_mixes == 0:
|
||||||
return {"success": False, "message": "暂无数据"}
|
return {"success": False, "message": "暂无数据"}
|
||||||
|
|
||||||
# 播放量统计
|
# 播放量统计
|
||||||
@ -427,7 +426,7 @@ class MiniprogramAPI:
|
|||||||
return {
|
return {
|
||||||
"success": True,
|
"success": True,
|
||||||
"data": {
|
"data": {
|
||||||
"total_videos": total_videos,
|
"total_mixes": total_mixes,
|
||||||
"total_playcount": stats.get("total_playcount", 0),
|
"total_playcount": stats.get("total_playcount", 0),
|
||||||
"avg_playcount": int(stats.get("avg_playcount", 0)),
|
"avg_playcount": int(stats.get("avg_playcount", 0)),
|
||||||
"max_playcount": stats.get("max_playcount", 0),
|
"max_playcount": stats.get("max_playcount", 0),
|
||||||
@ -443,7 +442,7 @@ class MiniprogramAPI:
|
|||||||
return {"success": False, "message": f"获取统计失败: {str(e)}"}
|
return {"success": False, "message": f"获取统计失败: {str(e)}"}
|
||||||
|
|
||||||
def get_videos(self):
|
def get_videos(self):
|
||||||
"""获取视频列表 - 兼容app.py调用"""
|
"""获取合集列表 - 兼容app.py调用"""
|
||||||
from flask import request
|
from flask import request
|
||||||
|
|
||||||
page = int(request.args.get('page', 1))
|
page = int(request.args.get('page', 1))
|
||||||
@ -453,29 +452,29 @@ class MiniprogramAPI:
|
|||||||
if sort_by == 'growth':
|
if sort_by == 'growth':
|
||||||
start_date = request.args.get('start_date')
|
start_date = request.args.get('start_date')
|
||||||
end_date = request.args.get('end_date')
|
end_date = request.args.get('end_date')
|
||||||
return self.get_growth_videos(page, limit, start_date, end_date)
|
return self.get_growth_mixes(page, limit, start_date, end_date)
|
||||||
else:
|
else:
|
||||||
return self.get_video_list(page, limit, sort_by)
|
return self.get_mix_list(page, limit, sort_by)
|
||||||
|
|
||||||
def get_top(self):
|
def get_top(self):
|
||||||
"""获取热门榜单 - 兼容app.py调用"""
|
"""获取热门榜单 - 兼容app.py调用"""
|
||||||
from flask import request
|
from flask import request
|
||||||
limit = int(request.args.get('limit', 10))
|
limit = int(request.args.get('limit', 10))
|
||||||
return self.get_top_videos(limit)
|
return self.get_top_mixes(limit)
|
||||||
|
|
||||||
def search(self):
|
def search(self):
|
||||||
"""搜索视频 - 兼容app.py调用"""
|
"""搜索合集 - 兼容app.py调用"""
|
||||||
from flask import request
|
from flask import request
|
||||||
keyword = request.args.get('q', '')
|
keyword = request.args.get('q', '')
|
||||||
page = int(request.args.get('page', 1))
|
page = int(request.args.get('page', 1))
|
||||||
limit = int(request.args.get('limit', 10))
|
limit = int(request.args.get('limit', 10))
|
||||||
return self.search_videos(keyword, page, limit)
|
return self.search_mixes(keyword, page, limit)
|
||||||
|
|
||||||
def get_detail(self):
|
def get_detail(self):
|
||||||
"""获取视频详情 - 兼容app.py调用"""
|
"""获取合集详情 - 兼容app.py调用"""
|
||||||
from flask import request
|
from flask import request
|
||||||
video_id = request.args.get('id', '')
|
mix_id = request.args.get('id', '')
|
||||||
return self.get_video_detail(video_id)
|
return self.get_mix_detail(mix_id)
|
||||||
|
|
||||||
def get_stats(self):
|
def get_stats(self):
|
||||||
"""获取统计信息 - 兼容app.py调用"""
|
"""获取统计信息 - 兼容app.py调用"""
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user