From be443349601385c1660f94c9618493c56bc77565 Mon Sep 17 00:00:00 2001 From: Qyir <13521889462@163.com> Date: Mon, 20 Oct 2025 18:47:49 +0800 Subject: [PATCH 1/6] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=B0=81=E9=9D=A2?= =?UTF-8?q?=E5=9B=BE=E7=89=87=E8=BF=87=E6=9C=9F=E6=83=85=E5=86=B5=EF=BC=8C?= =?UTF-8?q?=E5=AD=98=E5=85=A5TOS=E4=B8=ADmedia/rank/=E7=9B=AE=E5=BD=95?= =?UTF-8?q?=E4=B8=AD=20=E7=83=AD=E6=92=AD=E6=80=BB=E6=A6=9C=E9=80=89?= =?UTF-8?q?=E6=8B=A9=E6=97=A5=E6=9C=9F=E4=B9=8B=E5=90=8E=E6=B0=B8=E4=B9=85?= =?UTF-8?q?=E6=98=BE=E7=A4=BA=E5=9B=BE=E7=89=87=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/Timer_worker.py | 2 +- backend/config.py | 21 + .../handlers/Rankings/rank_data_scraper.py | 179 +++++- backend/handlers/Rankings/tos_client.py | 562 ++++++++++++++++++ backend/routers/rank_api_routes.py | 3 +- docs/API接口文档.md | 16 +- 6 files changed, 762 insertions(+), 21 deletions(-) create mode 100644 backend/handlers/Rankings/tos_client.py diff --git a/backend/Timer_worker.py b/backend/Timer_worker.py index 109fb93..3aacde8 100644 --- a/backend/Timer_worker.py +++ b/backend/Timer_worker.py @@ -28,7 +28,7 @@ import config # 添加项目路径到 Python 路径 sys.path.append(os.path.join(os.path.dirname(__file__), 'handlers', 'Rankings')) -from rank_data_scraper import DouyinPlayVVScraper +from handlers.Rankings.rank_data_scraper import DouyinPlayVVScraper diff --git a/backend/config.py b/backend/config.py index f66da39..38d7cc5 100644 --- a/backend/config.py +++ b/backend/config.py @@ -18,4 +18,25 @@ LOG_DIR = 'logs' # 定时器配置 SCHEDULER_TIME = "24:00" # 定时器执行时间,格式为 HH:MM (24小时制) +# TOS/火山云对象存储配置 +TOS_CONFIG = { + 'access_key_id': os.getenv('TOS_ACCESS_KEY_ID', 'AKLTYjQyYmE1ZDAwZTY5NGZiOWI3ODZkZDhhOWE4MzVjODE'), + 'access_key_secret': os.getenv('TOS_ACCESS_KEY_SECRET', 'WlRKa05EbGhZVEUyTXpjNU5ESmpPRGt5T0RJNFl6QmhPR0pqTVRjMVpUWQ=='), + 'endpoint': 'https://tos-cn-beijing.volces.com', + 'region': 'cn-beijing', + 'bucket_name': os.getenv('TOS_BUCKET_NAME', 'km1'), + 'self_domain': os.getenv('TOS_SELF_DOMAIN', 'oss.xintiao85.com'), + 'disable_ssl_warnings': True +} + +# API配置(兼容现有代码) +API_CONFIG = { + 'huoshan': { + 'AccessKey': TOS_CONFIG['access_key_id'], + 'SecretKey': TOS_CONFIG['access_key_secret'] + }, + 'OSS_BUCKET_NAME': TOS_CONFIG['bucket_name'], + 'OSS_HOST': TOS_CONFIG['self_domain'] +} + print(f"Successfully loaded configuration for environment: {APP_ENV}") \ No newline at end of file diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py index d3d8312..95c885b 100644 --- a/backend/handlers/Rankings/rank_data_scraper.py +++ b/backend/handlers/Rankings/rank_data_scraper.py @@ -35,7 +35,13 @@ import sys import os # 添加项目根目录到 Python 路径 sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..')) +# 确保能找到backend目录下的模块 +backend_dir = os.path.join(os.path.dirname(__file__), '..', '..') +sys.path.insert(0, backend_dir) from database import db +from tos_client import oss_client +import uuid +import re # 配置日志 @@ -65,8 +71,10 @@ class DouyinPlayVVScraper: self.captured_responses = [] self.db = None self.collection = None + self.image_cache = {} # 图片ID到TOS链接的缓存映射 {image_id: tos_url} self._cleanup_old_profiles() self._setup_mongodb() + self._load_image_cache() def _setup_mongodb(self): """设置MongoDB连接""" @@ -81,10 +89,42 @@ class DouyinPlayVVScraper: logging.info(f'MongoDB连接成功,使用数据库: {self.db.name},集合: {mongo_collection}') except Exception as e: - logging.warning(f'MongoDB设置出错: {e}') + 
logging.error(f'MongoDB连接失败: {e}') self.db = None self.collection = None + def _load_image_cache(self): + """从数据库加载已存在的图片ID到TOS链接的映射""" + if self.collection is None: + return + + try: + # 查询所有有封面图片的记录 + cursor = self.collection.find( + { + 'cover_image_url_original': {'$exists': True, '$ne': ''}, + 'cover_image_url': {'$exists': True, '$ne': ''} + }, + {'cover_image_url_original': 1, 'cover_image_url': 1} + ) + + cache_count = 0 + for doc in cursor: + original_url = doc.get('cover_image_url_original', '') + tos_url = doc.get('cover_image_url', '') + + if original_url and tos_url and original_url != tos_url: + # 提取图片ID + image_id = self.extract_douyin_image_id(original_url) + if image_id: + self.image_cache[image_id] = tos_url + cache_count += 1 + + logging.info(f'从数据库加载图片缓存: {cache_count} 个图片映射') + + except Exception as e: + logging.error(f'加载图片缓存失败: {e}') + def _cleanup_old_profiles(self): """清理超过一天的旧临时Chrome配置文件""" try: @@ -696,6 +736,109 @@ class DouyinPlayVVScraper: logging.info('结果已保存到MongoDB') + def extract_douyin_image_id(self, cover_url): + """ + 从抖音图片URL中提取唯一的图片ID + + Args: + cover_url (str): 抖音图片URL + + Returns: + str: 图片ID,如果提取失败返回空字符串 + """ + if not cover_url: + return '' + + try: + # 抖音图片URL格式支持两种: + # 1. https://p{数字}-sign.douyinpic.com/obj/tos-cn-i-dy/{图片ID}?{参数} + # 2. https://p{数字}-sign.douyinpic.com/obj/douyin-user-image-file/{图片ID}?{参数} + # 使用正则表达式提取图片ID + patterns = [ + r'/obj/tos-cn-i-dy/([a-f0-9]+)', + r'/obj/douyin-user-image-file/([a-f0-9]+)' + ] + + for pattern in patterns: + match = re.search(pattern, cover_url) + if match: + image_id = match.group(1) + logging.debug(f'提取图片ID成功: {image_id} from {cover_url}') + return image_id + + logging.warning(f'无法从URL中提取图片ID: {cover_url}') + return '' + + except Exception as e: + logging.error(f'提取图片ID异常: {cover_url}, 错误: {e}') + return '' + + def upload_cover_image(self, cover_url, mix_name): + """ + 上传封面图片到TOS并返回永久链接(带去重功能) + + Args: + cover_url: 临时封面图片链接 + mix_name: 合集名称,用于生成文件名 + + Returns: + str: 永久链接URL,如果上传失败则返回原链接 + """ + if not cover_url: + return cover_url + + try: + # 提取图片ID + image_id = self.extract_douyin_image_id(cover_url) + + # 如果能提取到图片ID,检查缓存 + if image_id: + if image_id in self.image_cache: + cached_url = self.image_cache[image_id] + logging.info(f'使用缓存图片: {image_id} -> {cached_url} (合集: {mix_name})') + return cached_url + + # 生成随机文件名,保持原有的扩展名 + file_extension = '.jpg' # 抖音封面图片通常是jpg格式 + + # 改进的扩展名检测逻辑 + url_without_params = cover_url.split('?')[0] + url_path = url_without_params.split('/')[-1] # 获取URL路径的最后一部分 + + # 只有当最后一部分包含点且点后面的内容是常见图片扩展名时才使用 + if '.' in url_path: + potential_ext = url_path.split('.')[-1].lower() + # 检查是否为常见的图片扩展名 + if potential_ext in ['jpg', 'jpeg', 'png', 'gif', 'webp', 'bmp']: + file_extension = '.' 
+ potential_ext + + # 生成唯一文件名 + random_filename = f"{uuid.uuid4().hex}{file_extension}" + object_key = f"media/rank/{random_filename}" + + logging.info(f'开始上传封面图片: {mix_name}') + logging.info(f'封面图片URL: {cover_url}') + + # 从URL上传到TOS并获取新的URL + oss_url = oss_client.upload_from_url( + url=cover_url, + object_key=object_key, + return_url=True + ) + + logging.info(f'封面图片上传成功: {mix_name} -> {oss_url}') + + # 如果有图片ID,将结果缓存 + if image_id: + self.image_cache[image_id] = oss_url + logging.debug(f'图片缓存已更新: {image_id} -> {oss_url}') + + return oss_url + + except Exception as e: + logging.error(f'封面图片上传失败: {mix_name} - {str(e)}') + return cover_url # 上传失败时返回原链接 + def save_to_mongodb(self): """将数据保存到MongoDB""" if self.collection is None: @@ -711,16 +854,34 @@ class DouyinPlayVVScraper: documents = [] for item in self.play_vv_items: + # 获取原始封面图片URL + original_cover_url = item.get('cover_image_url', '') + mix_name = item.get('mix_name', '') + + # 处理封面图片 + permanent_cover_url = '' + if original_cover_url: + # 上传封面图片到TOS获取永久链接 + permanent_cover_url = self.upload_cover_image(original_cover_url, mix_name) + + # 如果上传失败且有原始链接,记录警告但继续保存 + if permanent_cover_url == original_cover_url: + logging.warning(f'封面图片上传失败,使用原始链接: {mix_name}') + else: + # 没有封面图片,使用空字符串 + permanent_cover_url = '' + # 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 doc = { 'batch_time': batch_time, - 'mix_name': item.get('mix_name', ''), + 'mix_name': mix_name, 'video_url': item.get('video_url', ''), 'playcount': item.get('formatted', ''), 'play_vv': item.get('play_vv', 0), 'request_id': item.get('request_id', ''), 'rank': 0, # 临时设置,后面会重新计算 - 'cover_image_url': item.get('cover_image_url', ''), # 合集封面图片主链接(完整URL) + 'cover_image_url_original': original_cover_url, # 保存原始临时链接用于调试 + 'cover_image_url': permanent_cover_url, # 合集封面图片永久链接 'cover_backup_urls': item.get('cover_backup_urls', []) # 封面图片备用链接列表 } documents.append(doc) @@ -739,12 +900,16 @@ class DouyinPlayVVScraper: max_play_vv = max(doc['play_vv'] for doc in documents) if documents else 0 logging.info(f'MongoDB保存统计: 总播放量={total_play_vv:,}, 最高播放量={max_play_vv:,}') - logging.info(f'保存的字段: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url, cover_backup_urls') + logging.info(f'保存的字段: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url') - # 统计封面图片提取情况 + # 统计封面图片处理情况 cover_count = sum(1 for doc in documents if doc.get('cover_image_url')) - backup_count = sum(1 for doc in documents if doc.get('cover_backup_urls')) - logging.info(f'封面图片统计: {cover_count}/{len(documents)} 个合集有主封面链接, {backup_count} 个合集有备用链接') + original_count = sum(1 for item in self.play_vv_items if item.get('cover_image_url')) + success_count = sum(1 for doc in documents if doc.get('cover_image_url') and doc.get('cover_image_url') != doc.get('cover_image_url_original', '')) + + logging.info(f'封面图片统计: {cover_count}/{len(documents)} 个合集有封面链接') + logging.info(f'封面上传统计: {success_count}/{original_count} 个封面成功上传到TOS') + logging.info(f'图片缓存统计: 当前缓存 {len(self.image_cache)} 个图片映射') except Exception as e: logging.error(f'保存到MongoDB时出错: {e}') diff --git a/backend/handlers/Rankings/tos_client.py b/backend/handlers/Rankings/tos_client.py new file mode 100644 index 0000000..caef32d --- /dev/null +++ b/backend/handlers/Rankings/tos_client.py @@ -0,0 +1,562 @@ +from typing import Any, Optional +import mimetypes +from io import StringIO +import os +import tos +import urllib3 +from urllib3.exceptions import InsecureRequestWarning +from config import 
API_CONFIG +# 火山对象存储 +class TOSClient: + def __init__( + self, + access_key_id: str, + access_key_secret: str, + endpoint: str, + region: str, + bucket_name: str, + self_domain: str, + disable_ssl_warnings: bool = True + ): + """ + 初始化OSS客户端 + + Args: + access_key_id: ak + access_key_secret: sk + endpoint: OSS访问端点 (如: https://oss-cn-hangzhou.aliyuncs.com) + bucket_name: 存储桶名称 + self_domain: 自定义域名 + disable_ssl_warnings: 是否禁用SSL警告 + """ + # 禁用SSL警告(如果需要) + if disable_ssl_warnings: + urllib3.disable_warnings(InsecureRequestWarning) + sts_token: str = "token_test" + self.bucket_name = bucket_name + self.self_domain = self_domain + self.endpoint = endpoint + self.client = tos.TosClientV2( + ak=access_key_id, + sk=access_key_secret, + endpoint=self_domain, + region=region, + is_custom_domain=True, + # bucket_name, + # security_token=sts_token, + connection_time=30, socket_timeout=60, max_retry_count=3 + ) + + def get_base_url(self, object_key: str) -> str: + """获取基础URL(不带签名参数)""" + # endpoint = self.endpoint.replace('https://', '').replace('http://', '') + return f"https://{self.self_domain}/{object_key}" + + def generate_url(self, object_key: str, expires: int = 3600) -> str: + """生成带签名的临时访问URL""" + # 生成签名URL + pre_signed_url_output = self.client.pre_signed_url( + tos.HttpMethodType.Http_Method_Get, + bucket=self.bucket_name, + key=object_key, + expires=expires) + return pre_signed_url_output.signed_url + + def upload_string( + self, + content_str: str, + object_key: str, + headers: Optional[dict] = None, + return_url: bool = True, + ) -> str: + """ + 上传本地文件到OSS + + Args: + local_file_path: 本地文件路径 + object_key: OSS对象键(路径),如果为None则使用本地文件名 + headers: 自定义HTTP头 + + Returns: + str: 文件在OSS的公开URL + + Raises: + Exception: 如果上传失败 + """ + + + try: + # if headers is None: + # headers = {} + # if content_type and 'Content-Type' not in headers: + # headers['Content-Type'] = content_type + content = StringIO(content_str) + result = self.client.put_object( + bucket=self.bucket_name, + key=object_key, + content_type='text/plain', + content=content, + ) + + # HTTP状态码 + print('upload_string http status code:{}'.format(result.status_code)) + # 请求ID。请求ID是本次请求的唯一标识,建议在日志中添加此参数 + # print('request_id: {}'.format(result.request_id)) + # hash_crc64_ecma 表示该对象的64位CRC值, 可用于验证上传对象的完整性 + # print('crc64: {}'.format(result.hash_crc64_ecma)) + if result.status_code != 200: + raise Exception(f"上传失败,HTTP状态码: {result.status_code}") + + return self.get_base_url(object_key) if return_url else object_key # 修改返回逻辑 + except Exception as e: + raise Exception(f"上传文件到OSS失败: {str(e)}") + + + def upload_file( + self, + local_file_path: str, + object_key: Optional[str] = None, + headers: Optional[dict] = None, + return_url: bool = True, + expires: int = 3600 # 新增参数,默认1小时 + ) -> str: + """ + 上传本地文件到OSS + + Args: + local_file_path: 本地文件路径 + object_key: OSS对象键(路径),如果为None则使用本地文件名 + headers: 自定义HTTP头 + + Returns: + str: 文件在OSS的公开URL + + Raises: + Exception: 如果上传失败 + """ + if not os.path.exists(local_file_path): + raise FileNotFoundError(f"本地文件不存在: {local_file_path}") + + # 如果没有指定object_key,则使用文件名 + if object_key is None: + object_key = os.path.basename(local_file_path) + + # 自动设置Content-Type + content_type, _ = mimetypes.guess_type(local_file_path) + + try: + # file_name为本地文件的完整路径。 + result = self.client.put_object_from_file( + bucket=self.bucket_name, + key=object_key, + content_type=content_type or '', + file_path=local_file_path, + ) + + if result.status_code != 200: + raise Exception(f"上传失败,HTTP状态码: {result.status_code}") + + return 
self.get_base_url(object_key) if return_url else object_key # 修改返回逻辑 + except Exception as e: + raise Exception(f"上传文件到OSS失败: {str(e)}") + + + def upload_bytes( + self, + data: bytes, + object_key: str, + content_type: Optional[str] = None, + headers: Optional[dict] = None, + return_url: bool = True, + expires: int = 3600 # 新增参数 + ) -> str: + """ + 上传字节数据到OSS + Args: + data: 要上传的字节数据 + object_key: OSS对象键(路径) + content_type: 内容类型 (如: image/jpeg) + headers: 自定义HTTP头 + Returns: + str: 文件在OSS的公开URL + Raises: + Exception: 如果上传失败 + """ + + try: + result = self.client.put_object( + bucket=self.bucket_name, + key=object_key, + content_type=content_type or 'application/octet-stream', + content=data, + ) + + if result.status_code != 200: + raise Exception(f"上传失败,HTTP状态码: {result.status_code}") + + return self.get_base_url(object_key) if return_url else object_key # 修改返回逻辑 + except Exception as e: + raise Exception(f"上传字节数据到OSS失败: {str(e)}") + + def upload_from_url( + self, + url: str, + object_key: str, + headers: Optional[dict] = None, + timeout: int = 30, + return_url: bool = True, + expires: int = 3600 # 新增参数 + ) -> str: + """ + 从网络URL下载文件并上传到OSS + + Args: + url: 网络文件URL + object_key: OSS对象键(路径) + headers: 自定义HTTP头 + timeout: 下载超时时间(秒) + return_url: 是否返回完整URL + + Returns: + str: 文件在OSS的公开URL或object_key + + Raises: + Exception: 如果下载或上传失败 + """ + import requests + from io import BytesIO + + if not url.startswith(('http://', 'https://')): + raise ValueError("URL必须以http://或https://开头") + + try: + # 下载文件 + response = requests.get(url, stream=True, timeout=timeout) + response.raise_for_status() + + # 获取内容类型 + content_type = response.headers.get('Content-Type', '') + if not content_type: + content_type = mimetypes.guess_type(url)[0] or 'application/octet-stream' + + # 上传到OSS + return self.upload_bytes( + data=response.content, + object_key=object_key, + content_type=content_type, + headers=headers, + return_url=return_url, + expires=expires # 传递参数 + ) + except requests.exceptions.RequestException as e: + raise Exception(f"下载网络文件失败: {str(e)}") + except Exception as e: + raise Exception(f"上传网络文件到OSS失败: {str(e)}") + + def _format_object_key(self, object_key: str) -> str: + """ + 格式化OSS对象键(路径) + """ + # 如果object_key包含self_domain,截取self_domain后面的字符作为新的object_key + if self.self_domain and self.self_domain in object_key: + # 找到self_domain在object_key中的位置,截取后面的部分 + domain_index = object_key.find(self.self_domain) + if domain_index != -1: + # 截取self_domain后面的部分,去掉开头的斜杠 + object_key = object_key[domain_index + len(self.self_domain):].lstrip('/') + return object_key + + # 删除文件 + def delete_file(self, object_key: str) -> bool: + """ + 删除OSS上的文件 + + Args: + object_key: OSS对象键(路径) + + Returns: + bool: 删除是否成功 + """ + try: + self.client.delete_object( + bucket=self.bucket_name, + key=self._format_object_key(object_key), + ) + return True + except Exception as e: + print(f"删除文件失败: {str(e)}") + return False + + def download_file(self, object_key: str) -> bytes: + """ + 从TOS下载文件并返回文件数据 + + Args: + object_key: OSS对象键(路径) + + Returns: + bytes: 文件的字节数据 + + Raises: + Exception: 如果下载失败 + """ + try: + object_key = self._format_object_key(object_key) + + object_stream = self.client.get_object( + bucket=self.bucket_name, + key=object_key, + ) + content = object_stream.read() or b'' + if not content: + raise Exception(f"文件内容为空: {object_key}") + return content + except tos.exceptions.TosClientError as e: + # 操作失败,捕获客户端异常,一般情况为非法请求参数或网络异常 + print('TOS下载 fail with client error, message:{}, cause: {}'.format(e.message, e.cause)) + 
raise Exception(f"下载异常: {object_key} {e.message}") + except tos.exceptions.TosServerError as e: + # 操作失败,捕获服务端异常,可从返回信息中获取详细错误信息 + print('TOS下载 fail with server error, code: {}'.format(e.code)) + # request id 可定位具体问题,强烈建议日志中保存 + print('TOS下载 error with request id: {}'.format(e.request_id)) + print('TOS下载 error with message: {}'.format(e.message)) + print('TOS下载 error with http code: {}'.format(e.status_code)) + print('TOS下载 error with ec: {}'.format(e.ec)) + print('TOS下载 error with request url: {}'.format(e.request_url)) + raise Exception(f"下载异常: {object_key} {e.message}") + except Exception as e: + raise Exception(f"下载文件失败: {str(e)}") + + +class TOSChunkUploader: + """TOS分片上传类""" + + def __init__(self, tos_client: TOSClient): + """ + 初始化分片上传器 + + Args: + tos_client: TOS客户端实例 + """ + self.client = tos_client.client + self.bucket_name = tos_client.bucket_name + self.self_domain = tos_client.self_domain + + def init_multipart_upload(self, object_key: str, content_type: Optional[str] = None) -> str | None: + """ + 初始化分片上传 + + Args: + object_key: 对象键 + content_type: 内容类型 + + Returns: + str: 上传ID + + Raises: + Exception: 如果初始化失败 + """ + try: + # 设置默认内容类型 + if not content_type: + content_type = mimetypes.guess_type(object_key)[0] or 'application/octet-stream' + + # 初始化分片上传 + result = self.client.create_multipart_upload( + bucket=self.bucket_name, + key=object_key, + content_type=content_type + ) + + return result.upload_id + + except tos.exceptions.TosClientError as e: + raise Exception(f"初始化分片上传失败(客户端错误): {e.message}") + except tos.exceptions.TosServerError as e: + raise Exception(f"初始化分片上传失败(服务端错误): {e.message}") + except Exception as e: + raise Exception(f"初始化分片上传失败: {str(e)}") + + def upload_part(self, object_key: str, upload_id: str, part_number: int, data: bytes) -> dict: + """ + 上传分片 + + Args: + object_key: 对象键 + upload_id: 上传ID + part_number: 分片号(从1开始) + data: 分片数据 + + Returns: + dict: 包含完整分片信息的字典 + + Raises: + Exception: 如果上传失败 + """ + try: + from io import BytesIO + import hashlib + + # 计算分片大小 + part_size = len(data) + + # 计算CRC64(如果需要的话,这里先设为None) + hash_crc64_ecma = None + + # 上传分片 + result = self.client.upload_part( + bucket=self.bucket_name, + key=object_key, + upload_id=upload_id, + part_number=part_number, + content=BytesIO(data) + ) + + return { + 'part_number': part_number, + 'etag': result.etag, + 'part_size': part_size, + 'hash_crc64_ecma': hash_crc64_ecma, + 'is_completed': True + } + + except tos.exceptions.TosClientError as e: + raise Exception(f"上传分片失败(客户端错误): {e.message}") + except tos.exceptions.TosServerError as e: + raise Exception(f"上传分片失败(服务端错误): {e.message}") + except Exception as e: + raise Exception(f"上传分片失败: {str(e)}") + + def complete_multipart_upload(self, object_key: str, upload_id: str, parts: list) -> str: + """ + 完成分片上传 + + Args: + object_key: 对象键 + upload_id: 上传ID + parts: 分片信息列表,每个元素包含part_number和etag + + Returns: + str: 文件的完整URL + + Raises: + Exception: 如果完成上传失败 + """ + try: + # 按分片号排序 + sorted_parts = sorted(parts, key=lambda x: x['part_number']) + + # 构建分片列表并计算偏移量 + part_list = [] + current_offset = 0 + + for part in sorted_parts: + part_list.append(tos.models2.PartInfo( + part_number=part['part_number'], + etag=part['etag'], + part_size=part.get('part_size'), + offset=current_offset, + hash_crc64_ecma=part.get('hash_crc64_ecma'), + is_completed=part.get('is_completed', True) + )) + + # 更新偏移量 + if part.get('part_size'): + current_offset += part['part_size'] + + # 完成分片上传 + result = self.client.complete_multipart_upload( + bucket=self.bucket_name, + 
key=object_key, + upload_id=upload_id, + parts=part_list + ) + + # 返回完整URL + return f"https://{self.self_domain}/{object_key}" + + except tos.exceptions.TosClientError as e: + raise Exception(f"完成分片上传失败(客户端错误): {e.message}") + except tos.exceptions.TosServerError as e: + raise Exception(f"完成分片上传失败(服务端错误): {e.message}") + except Exception as e: + raise Exception(f"完成分片上传失败: {str(e)}") + + def abort_multipart_upload(self, object_key: str, upload_id: str) -> bool: + """ + 取消分片上传 + + Args: + object_key: 对象键 + upload_id: 上传ID + + Returns: + bool: 是否取消成功 + """ + try: + self.client.abort_multipart_upload( + bucket=self.bucket_name, + key=object_key, + upload_id=upload_id + ) + return True + + except tos.exceptions.TosClientError as e: + print(f"取消分片上传失败(客户端错误): {e.message}") + return False + except tos.exceptions.TosServerError as e: + print(f"取消分片上传失败(服务端错误): {e.message}") + return False + except Exception as e: + print(f"取消分片上传失败: {str(e)}") + return False + + def list_parts(self, object_key: str, upload_id: str) -> list: + """ + 列出已上传的分片 + + Args: + object_key: 对象键 + upload_id: 上传ID + + Returns: + list: 已上传的分片列表 + """ + try: + result = self.client.list_parts( + bucket=self.bucket_name, + key=object_key, + upload_id=upload_id + ) + + parts = [] + for part in result.parts: + parts.append({ + 'part_number': part.part_number, + 'etag': part.etag, + 'size': part.size, + 'last_modified': part.last_modified + }) + + return parts + + except Exception as e: + print(f"列出分片失败: {str(e)}") + return [] + + +# 创建OSS客户端 +from config import TOS_CONFIG +oss_client = TOSClient( + access_key_id=TOS_CONFIG['access_key_id'], + access_key_secret=TOS_CONFIG['access_key_secret'], + endpoint=TOS_CONFIG['endpoint'], + region=TOS_CONFIG['region'], + bucket_name=TOS_CONFIG['bucket_name'], + self_domain=TOS_CONFIG['self_domain'], + disable_ssl_warnings=TOS_CONFIG['disable_ssl_warnings'] +) + +# 创建分片上传器 +chunk_uploader = TOSChunkUploader(oss_client) diff --git a/backend/routers/rank_api_routes.py b/backend/routers/rank_api_routes.py index 90846c3..54061b5 100644 --- a/backend/routers/rank_api_routes.py +++ b/backend/routers/rank_api_routes.py @@ -128,8 +128,7 @@ def format_mix_item(doc): "play_vv": doc.get("play_vv", 0), "request_id": doc.get("request_id", ""), "rank": doc.get("rank", 0), - "cover_image_url": doc.get("cover_image_url", ""), - "cover_backup_urls": doc.get("cover_backup_urls", []) + "cover_image_url": doc.get("cover_image_url", "") } def get_mix_list(page=1, limit=20, sort_by="playcount"): diff --git a/docs/API接口文档.md b/docs/API接口文档.md index 97e463f..325306c 100644 --- a/docs/API接口文档.md +++ b/docs/API接口文档.md @@ -42,8 +42,7 @@ "play_vv": 120000000, "request_id": "request_xxx", "rank": 1, - "cover_image_url": "https://p3.douyinpic.com/xxx", - "cover_backup_urls": ["url1", "url2"] + "cover_image_url": "https://p3.douyinpic.com/xxx" } ``` @@ -199,7 +198,6 @@ GET /api/rank/videos?page=1&limit=20&sort=growth&start_date=2025-10-16&end_date= "request_id": "request_xxx", "rank": 1, "cover_image_url": "https://p3.douyinpic.com/xxx", - "cover_backup_urls": ["url1", "url2"], "growth": 5000000, "growth_rate": 4.35 } @@ -252,8 +250,7 @@ GET /api/rank/top?limit=10 "play_vv": 120000000, "request_id": "request_xxx", "rank": 1, - "cover_image_url": "https://p3.douyinpic.com/xxx", - "cover_backup_urls": ["url1", "url2"] + "cover_image_url": "https://p3.douyinpic.com/xxx" } ], "total": 10, @@ -297,8 +294,7 @@ GET /api/rank/search?q=关键词&page=1&limit=10 "play_vv": 120000000, "request_id": "request_xxx", "rank": 1, - 
"cover_image_url": "https://p3.douyinpic.com/xxx", - "cover_backup_urls": ["url1", "url2"] + "cover_image_url": "https://p3.douyinpic.com/xxx" } ], "keyword": "关键词", @@ -347,8 +343,7 @@ GET /api/rank/detail?id=674f1234567890abcdef "play_vv": 120000000, "request_id": "request_xxx", "rank": 1, - "cover_image_url": "https://p3.douyinpic.com/xxx", - "cover_backup_urls": ["url1", "url2"] + "cover_image_url": "https://p3.douyinpic.com/xxx" }, "update_time": "2025-10-17 15:30:00" } @@ -887,8 +882,7 @@ wx.request({ - 提供搜索建议 ### 3. 图片加载 -- 优先使用 `cover_image_url` -- 备用 `cover_backup_urls` 作为备选 +- 使用 `cover_image_url` 作为封面图片 - 添加图片加载失败处理 ### 4. 数据更新 From 8b607f6e248d18aa11ef369389c4de4d22be6dfe Mon Sep 17 00:00:00 2001 From: Qyir <13521889462@163.com> Date: Tue, 21 Oct 2025 15:12:18 +0800 Subject: [PATCH 2/6] =?UTF-8?q?=E5=9C=A8Rankings=5Flist=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E5=BA=93=E9=87=8C=E9=9D=A2=E6=B7=BB=E5=8A=A0=E4=B8=89=E4=B8=AA?= =?UTF-8?q?=E5=AD=97=E6=AE=B5=EF=BC=9A=E5=90=88=E9=9B=86=E4=BD=9C=E8=80=85?= =?UTF-8?q?=EF=BC=8C=E5=90=88=E9=9B=86=E6=8F=8F=E8=BF=B0=EF=BC=8C=E5=90=88?= =?UTF-8?q?=E9=9B=86=E6=80=BB=E9=9B=86=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../handlers/Rankings/rank_data_scraper.py | 140 +++++++++++++++++- backend/routers/rank_api_routes.py | 8 +- 2 files changed, 141 insertions(+), 7 deletions(-) diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py index 95c885b..b3d3dd7 100644 --- a/backend/handlers/Rankings/rank_data_scraper.py +++ b/backend/handlers/Rankings/rank_data_scraper.py @@ -481,10 +481,13 @@ class DouyinPlayVVScraper: if len(self.play_vv_items) < 3: logging.info(f"=== 调试:合集对象结构 ===") logging.info(f"完整对象键: {list(obj.keys())}") - # 查找可能的视频相关字段 + # 查找可能的视频相关字段和新增字段 for key, value in obj.items(): if 'aweme' in key.lower() or 'video' in key.lower() or 'item' in key.lower() or 'ids' in key.lower(): logging.info(f"可能的视频字段 {key}: {type(value)} - {str(value)[:200]}") + # 检查新增字段相关的键 + elif any(keyword in key.lower() for keyword in ['author', 'creator', 'user', 'desc', 'description', 'total', 'count', 'episode']): + logging.info(f"可能的新字段 {key}: {type(value)} - {str(value)[:200]}") # 特别检查ids字段 if 'ids' in obj: @@ -539,6 +542,112 @@ class DouyinPlayVVScraper: elif isinstance(pic, str): cover_image_url = pic + # 提取新增的三个字段 + series_author = "" + desc = "" + updated_to_episode = 0 + + # 提取合集作者/影视工作室 + if 'author' in obj: + author = obj['author'] + if isinstance(author, dict): + # 尝试多个可能的作者字段 + series_author = (author.get('nickname') or + author.get('unique_id') or + author.get('short_id') or + author.get('name') or '') + elif isinstance(author, str): + series_author = author + elif 'creator' in obj: + creator = obj['creator'] + if isinstance(creator, dict): + series_author = (creator.get('nickname') or + creator.get('unique_id') or + creator.get('name') or '') + elif isinstance(creator, str): + series_author = creator + elif 'user' in obj: + user = obj['user'] + if isinstance(user, dict): + series_author = (user.get('nickname') or + user.get('unique_id') or + user.get('name') or '') + elif isinstance(user, str): + series_author = user + + # 提取合集描述 - 扩展更多可能的字段 + description_fields = ['desc', 'share_info'] # 保持字段列表 + + # 先检查desc字段 + if 'desc' in obj and obj['desc']: + desc_value = str(obj['desc']).strip() + if desc_value: + desc = desc_value + logging.info(f"从desc提取到描述") + + # 如果desc中没有找到有效描述,检查share_info + if not desc and 'share_info' in obj and 
isinstance(obj['share_info'], dict): + share_desc = obj['share_info'].get('share_desc', '').strip() + if share_desc: + desc = share_desc + logging.info(f"从share_info.share_desc提取到描述") + + # 如果share_info中没有找到有效描述,继续检查desc字段 + if not desc: + for field in description_fields: + if field in obj and obj[field]: + desc_value = str(obj[field]).strip() + if desc_value: + desc = desc_value + logging.info(f"从{field}提取到描述") + break + + # 如果还没有找到描述,尝试从嵌套对象中查找desc字段 + if not desc: + def search_nested_desc(data, depth=0): + if depth > 3: # 限制递归深度 + return None + + if isinstance(data, dict): + # 检查当前层级的desc字段 + if 'desc' in data and data['desc']: + desc_value = str(data['desc']).strip() + if 5 <= len(desc_value) <= 1000: + return desc_value + + # 递归检查嵌套对象 + for value in data.values(): + if isinstance(value, dict): + nested_result = search_nested_desc(value, depth + 1) + if nested_result: + return nested_result + return None + + desc = search_nested_desc(obj) + + + # 提取合集总集数 - 从statis字段中获取 + updated_to_episode = 0 # 初始化默认值 + if 'statis' in obj and isinstance(obj['statis'], dict): + statis = obj['statis'] + if 'updated_to_episode' in statis: + try: + episodes = int(statis['updated_to_episode']) + if episodes > 0: + updated_to_episode = episodes + logging.info(f"从statis.updated_to_episode提取到集数: {episodes}") + except ValueError: + logging.warning("updated_to_episode字段值无法转换为整数") + else: + logging.info("未找到statis字段或statis不是字典类型") + try: + episodes = int(obj['updated_to_episode']) + if episodes > 0: + updated_to_episode = episodes + logging.info(f"从updated_to_episode提取到集数: {episodes}") + except ValueError: + pass # 忽略无法转换为整数的情况 + self.play_vv_items.append({ 'play_vv': vv, 'formatted': self.format_count(vv), @@ -549,9 +658,18 @@ class DouyinPlayVVScraper: 'mix_id': mix_id, # 合集ID 'cover_image_url': cover_image_url, # 合集封面图片主链接(完整URL) 'cover_backup_urls': cover_image_backup_urls, # 封面图片备用链接列表 + 'series_author': series_author, # 合集作者/影视工作室 + 'desc': desc, # 合集描述 + 'updated_to_episode': updated_to_episode, # 合集总集数 'timestamp': datetime.now().isoformat() }) logging.info(f'提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量') + if series_author: + logging.info(f' 作者: {series_author}') + if desc: + logging.info(f' 描述: {desc[:100]}{"..." 
if len(desc) > 100 else ""}') + if updated_to_episode > 0: + logging.info(f' 总集数: {updated_to_episode}') # 递归搜索子对象 for key, value in obj.items(): @@ -567,17 +685,21 @@ class DouyinPlayVVScraper: def _extract_from_text_regex(self, text: str, source_url: str, request_id: str = None): """使用正则表达式从文本中提取信息""" - # 查找包含完整合集信息的JSON片段 - mix_pattern = r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)[^{}]*\}[^{}]*\}' + # 查找包含完整合集信息的JSON片段,包括statis中的updated_to_episode + mix_pattern = r'\{[^{}]*"mix_id"\s*:\s*"([^"]*)"[^{}]*"mix_name"\s*:\s*"([^"]*)"[^{}]*"statis"\s*:\s*\{[^{}]*"play_vv"\s*:\s*(\d+)[^{}]*"updated_to_episode"\s*:\s*(\d+)[^{}]*\}[^{}]*\}' for match in re.finditer(mix_pattern, text): try: mix_id = match.group(1) mix_name = match.group(2) vv = int(match.group(3)) + episodes = int(match.group(4)) # 构建合集链接 video_url = f"https://www.douyin.com/collection/{mix_id}" if mix_id else "" + + if episodes > 0: + logging.info(f"从statis.updated_to_episode提取到集数: {episodes}") self.play_vv_items.append({ 'play_vv': vv, @@ -587,6 +709,7 @@ class DouyinPlayVVScraper: 'mix_name': mix_name, 'video_url': video_url, # 合集链接 'mix_id': mix_id, # 合集ID + 'updated_to_episode': episodes if episodes > 0 else None, # 从statis.updated_to_episode提取的集数 'timestamp': datetime.now().isoformat() }) logging.info(f'正则提取到合集: {mix_name} (ID: {mix_id}) - {vv:,} 播放量') @@ -607,6 +730,7 @@ class DouyinPlayVVScraper: 'mix_name': '', # 未知合集名称 'video_url': '', # 未知链接 'mix_id': '', # 未知mix_id + 'updated_to_episode': None, # 未知集数 'timestamp': datetime.now().isoformat() }) except Exception: @@ -871,7 +995,7 @@ class DouyinPlayVVScraper: # 没有封面图片,使用空字符串 permanent_cover_url = '' - # 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + # 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + 新增的3个字段 doc = { 'batch_time': batch_time, 'mix_name': mix_name, @@ -882,7 +1006,11 @@ class DouyinPlayVVScraper: 'rank': 0, # 临时设置,后面会重新计算 'cover_image_url_original': original_cover_url, # 保存原始临时链接用于调试 'cover_image_url': permanent_cover_url, # 合集封面图片永久链接 - 'cover_backup_urls': item.get('cover_backup_urls', []) # 封面图片备用链接列表 + 'cover_backup_urls': item.get('cover_backup_urls', []), # 封面图片备用链接列表 + # 新增的三个字段 + 'series_author': item.get('series_author', ''), # 合集作者/影视工作室 + 'desc': item.get('desc', ''), # 合集描述 + 'updated_to_episode': item.get('updated_to_episode', 0) # 合集总集数 } documents.append(doc) @@ -900,7 +1028,7 @@ class DouyinPlayVVScraper: max_play_vv = max(doc['play_vv'] for doc in documents) if documents else 0 logging.info(f'MongoDB保存统计: 总播放量={total_play_vv:,}, 最高播放量={max_play_vv:,}') - logging.info(f'保存的字段: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url') + logging.info(f'保存的字段: batch_time, mix_name, video_url, playcount, play_vv, request_id, rank, cover_image_url_original, cover_image_url, series_author, desc, updated_to_episode') # 统计封面图片处理情况 cover_count = sum(1 for doc in documents if doc.get('cover_image_url')) diff --git a/backend/routers/rank_api_routes.py b/backend/routers/rank_api_routes.py index 54061b5..660cb2b 100644 --- a/backend/routers/rank_api_routes.py +++ b/backend/routers/rank_api_routes.py @@ -128,7 +128,13 @@ def format_mix_item(doc): "play_vv": doc.get("play_vv", 0), "request_id": doc.get("request_id", ""), "rank": doc.get("rank", 0), - "cover_image_url": doc.get("cover_image_url", "") + "cover_image_url": doc.get("cover_image_url", ""), + # 新增字段 + "series_author": doc.get("series_author", ""), + "desc": doc.get("desc", 
""), + "updated_to_episode": doc.get("updated_to_episode", 0), + "cover_backup_urls": doc.get("cover_backup_urls", []), + "mix_id": doc.get("mix_id", "") } def get_mix_list(page=1, limit=20, sort_by="playcount"): From 8b1149da5607f88d8b57aef4417fb70a37fa1874 Mon Sep 17 00:00:00 2001 From: Qyir <13521889462@163.com> Date: Tue, 21 Oct 2025 17:56:00 +0800 Subject: [PATCH 3/6] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=BA=86episode=5Fvideo?= =?UTF-8?q?=5Fids=E5=AD=97=E6=AE=B5=EF=BC=8C=E5=9C=A8=E8=BF=90=E8=A1=8Cran?= =?UTF-8?q?k=5Fdata=5Fscraper.py=20=E6=88=96=E5=AE=9A=E6=97=B6=E5=99=A8?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=E7=9A=84=E6=97=B6=E5=80=99=E4=B8=8D=E4=BB=85?= =?UTF-8?q?=E4=BF=9D=E8=AF=81=E4=BA=86=E5=8E=9F=E6=9D=A5=E7=9A=84=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=BA=93=E5=AD=98=E5=85=A5=E6=95=B0=E6=8D=AE=E5=BA=93?= =?UTF-8?q?=E8=BF=98=E4=BF=9D=E5=AD=98=E4=BA=86=20=E6=AF=8F=E4=B8=80?= =?UTF-8?q?=E9=83=A8=E7=9F=AD=E5=89=A7=E7=9A=84=E6=AF=8F=E4=B8=80=E9=9B=86?= =?UTF-8?q?=E7=9A=84=E8=A7=86=E9=A2=91ID?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 提示: 拉取代码在您运行脚本之后会主动创建一个episode_video_ids文件夹 里面存入的是您第一次运行脚本的每一集的视频ID(作为缓存) 判断的方法是:在运行脚本之后检查每一集的缓存数量是否与本剧的集数相同,相同则使用缓存的视频ID 不相同则重新获取 获取视频ID的时间不长 --- .../handlers/Rankings/rank_data_scraper.py | 243 +++++++++++++++++- backend/routers/rank_api_routes.py | 3 +- 2 files changed, 242 insertions(+), 4 deletions(-) diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py index b3d3dd7..1869847 100644 --- a/backend/handlers/Rankings/rank_data_scraper.py +++ b/backend/handlers/Rankings/rank_data_scraper.py @@ -23,6 +23,7 @@ import logging import os import shutil from datetime import datetime +import requests from selenium import webdriver import os @@ -995,7 +996,20 @@ class DouyinPlayVVScraper: # 没有封面图片,使用空字符串 permanent_cover_url = '' - # 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + 新增的3个字段 + # 获取合集中的所有视频ID + mix_id = item.get('mix_id', '') + episode_video_ids = [] + if mix_id: + logging.info(f'获取合集 {mix_name} 的所有视频ID') + current_episode_count = item.get('updated_to_episode', 0) + episode_video_ids = self.get_collection_videos( + mix_id=mix_id, + mix_name=mix_name, + current_episode_count=current_episode_count + ) + logging.info(f'合集 {mix_name} 共获取到 {len(episode_video_ids)} 个视频ID') + + # 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + 新增字段 doc = { 'batch_time': batch_time, 'mix_name': mix_name, @@ -1007,10 +1021,11 @@ class DouyinPlayVVScraper: 'cover_image_url_original': original_cover_url, # 保存原始临时链接用于调试 'cover_image_url': permanent_cover_url, # 合集封面图片永久链接 'cover_backup_urls': item.get('cover_backup_urls', []), # 封面图片备用链接列表 - # 新增的三个字段 + # 新增的字段 'series_author': item.get('series_author', ''), # 合集作者/影视工作室 'desc': item.get('desc', ''), # 合集描述 - 'updated_to_episode': item.get('updated_to_episode', 0) # 合集总集数 + 'updated_to_episode': item.get('updated_to_episode', 0), # 合集总集数 + 'episode_video_ids': episode_video_ids # 每一集的视频ID列表 } documents.append(doc) @@ -1042,6 +1057,228 @@ class DouyinPlayVVScraper: except Exception as e: logging.error(f'保存到MongoDB时出错: {e}') + def get_video_info(self, video_id: str) -> dict: + """获取视频详细信息 + Args: + video_id: 视频ID + Returns: + dict: 包含视频详细信息的字典 + """ + video_url = f'https://www.douyin.com/video/{video_id}' + logging.info(f'获取视频信息: {video_url}') + + # 清除之前的网络日志 + self.driver.execute_cdp_cmd('Network.clearBrowserCache', {}) + self.driver.execute_cdp_cmd('Network.clearBrowserCookies', {}) + self.driver.get(video_url) + time.sleep(3) + + # 等待页面加载完成 + try: + 
from selenium.webdriver.support.ui import WebDriverWait + from selenium.webdriver.support import expected_conditions as EC + from selenium.webdriver.common.by import By + + WebDriverWait(self.driver, 10).until( + EC.presence_of_element_located((By.TAG_NAME, "video")) + ) + except Exception as e: + logging.warning(f'等待视频元素超时: {e}') + + # 获取网络请求日志 + logs = self.driver.get_log('performance') + video_info = {} + + for entry in logs: + try: + log = json.loads(entry['message'])['message'] + if ( + 'Network.responseReceived' in log['method'] + and 'response' in log['params'] + and 'url' in log['params']['response'] + and '/web/api/v2/aweme/iteminfo' in log['params']['response']['url'] + ): + request_id = log['params']['requestId'] + response = self.driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id}) + if response and 'body' in response: + data = json.loads(response['body']) + if 'item_list' in data and len(data['item_list']) > 0: + item = data['item_list'][0] + video_info = { + 'video_id': item.get('aweme_id'), + 'create_time': item.get('create_time'), + 'desc': item.get('desc'), + 'duration': item.get('duration'), + 'mix_info': { + 'mix_id': item.get('mix_info', {}).get('mix_id'), + 'mix_name': item.get('mix_info', {}).get('mix_name'), + 'total': item.get('mix_info', {}).get('total') + } + } + break + except Exception as e: + logging.warning(f'解析日志条目时出错: {e}') + + return video_info + + def get_collection_videos(self, mix_id: str, mix_name: str = '', current_episode_count: int = 0) -> list: + """获取合集中的所有视频ID列表,支持增量更新 + Args: + mix_id: 合集ID + mix_name: 合集名称,用于日志 + current_episode_count: 当前已知的集数 + Returns: + list: 按集数排序的视频ID列表 + """ + try: + # 检查缓存文件 + cache_dir = os.path.join(os.path.dirname(__file__), 'episode_video_ids') + # 确保缓存目录存在 + os.makedirs(cache_dir, exist_ok=True) + cache_file = os.path.join(cache_dir, f'video_ids_{mix_id}.json') + cached_videos = [] + + try: + if os.path.exists(cache_file): + with open(cache_file, 'r', encoding='utf-8') as f: + cache_data = json.load(f) + cached_videos = cache_data.get('episodes', []) + last_update = cache_data.get('last_update') + + # 如果缓存的集数等于当前集数,直接返回缓存的结果 + if len(cached_videos) == current_episode_count: + logging.info(f"使用缓存的视频列表: {mix_name} (ID: {mix_id})") + return [video['video_id'] for video in cached_videos] + except Exception as e: + logging.warning(f"读取缓存文件失败: {e}") + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'application/json, text/plain, */*', + 'Accept-Language': 'zh-CN,zh;q=0.9', + 'Referer': 'https://www.douyin.com/', + } + + params = { + 'device_platform': 'webapp', + 'aid': '6383', + 'channel': 'channel_pc_web', + 'pc_client_type': '1', + 'version_code': '170400', + 'version_name': '17.4.0', + 'cookie_enabled': 'true', + 'platform': 'PC', + 'downlink': '10', + 'mix_id': mix_id, + 'cursor': '0', + 'count': '30', + 'screen_width': '1920', + 'screen_height': '1080', + 'browser_language': 'zh-CN', + 'browser_platform': 'Win32', + 'browser_name': 'Chrome', + 'browser_version': '120.0.0.0', + 'browser_online': 'true', + 'engine_name': 'Blink', + 'engine_version': '120.0.0.0', + 'os_name': 'Windows', + 'os_version': '10', + 'cpu_core_num': '16', + 'device_memory': '8', + 'effective_type': '4g', + 'round_trip_time': '50', + } + + all_videos = [] + + while True: + response = requests.get( + 'https://www.douyin.com/aweme/v1/web/mix/aweme/', + params=params, + cookies=self.get_cookies_dict(), + 
headers=headers + ) + + if response.status_code != 200: + logging.error(f"请求失败: {response.status_code}") + logging.error(f"响应内容: {response.text}") + break + + try: + data = response.json() + aweme_list = data.get('aweme_list', []) + if not aweme_list: + break + + for aweme in aweme_list: + video_id = aweme.get('aweme_id') + if video_id: + all_videos.append({ + 'video_id': video_id, + 'episode_num': int(aweme.get('episode_num', 0)) + }) + + has_more = data.get('has_more', False) + if not has_more: + break + + params['cursor'] = str(len(all_videos)) + time.sleep(1) + + except json.JSONDecodeError as e: + logging.error(f"JSON解析错误: {e}") + logging.error(f"响应内容: {response.text}") + break + + if not all_videos: + if cached_videos: + logging.warning(f"获取视频列表失败,使用缓存数据: {mix_name} (ID: {mix_id})") + return [video['video_id'] for video in cached_videos] + return [] + + logging.info(f"获取到 {len(all_videos)} 个视频ID") + + # 按集数排序 + all_videos.sort(key=lambda x: x['episode_num']) + + # 整理视频ID和集数信息 + episode_info = [] + for video in all_videos: + episode_info.append({ + 'video_id': video['video_id'], + 'episode_num': video['episode_num'] + }) + + # 检查是否有新增视频 + if len(episode_info) > len(cached_videos): + logging.info(f"发现新增视频: {mix_name} (ID: {mix_id}), 新增 {len(episode_info) - len(cached_videos)} 集") + + # 保存到缓存文件 + with open(cache_file, 'w', encoding='utf-8') as f: + json.dump({ + 'episodes': episode_info, + 'total_count': len(episode_info), + 'last_update': datetime.now().isoformat(), + 'mix_name': mix_name + }, f, ensure_ascii=False, indent=2) + + # 返回视频ID列表 + return [video['video_id'] for video in all_videos] + + except Exception as e: + logging.error(f"获取合集视频时出错: {e}") + # 如果出错且有缓存,返回缓存的结果 + if cached_videos: + logging.warning(f"使用缓存的视频列表: {mix_name} (ID: {mix_id})") + return [video['video_id'] for video in cached_videos] + return [] + + def get_cookies_dict(self): + """获取当前页面的cookies""" + if not hasattr(self, 'cookies') or not self.cookies: + self.cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()} + return self.cookies + def run(self): try: self.setup_driver() diff --git a/backend/routers/rank_api_routes.py b/backend/routers/rank_api_routes.py index 660cb2b..aec3aae 100644 --- a/backend/routers/rank_api_routes.py +++ b/backend/routers/rank_api_routes.py @@ -134,7 +134,8 @@ def format_mix_item(doc): "desc": doc.get("desc", ""), "updated_to_episode": doc.get("updated_to_episode", 0), "cover_backup_urls": doc.get("cover_backup_urls", []), - "mix_id": doc.get("mix_id", "") + "mix_id": doc.get("mix_id", ""), + "episode_video_ids": doc.get("episode_video_ids", []) } def get_mix_list(page=1, limit=20, sort_by="playcount"): From 2a32b2a8c0454d18a4d668a0de7ca5c0a7bf7cf9 Mon Sep 17 00:00:00 2001 From: Qyir <13521889462@163.com> Date: Thu, 23 Oct 2025 10:04:44 +0800 Subject: [PATCH 4/6] =?UTF-8?q?1.=E6=B7=BB=E5=8A=A0=E5=88=A4=E6=96=AD?= =?UTF-8?q?=E4=BB=A3=E7=A0=81=EF=BC=8C=E5=90=AF=E5=8A=A8=E5=AE=9A=E6=97=B6?= =?UTF-8?q?=E5=99=A8=E6=97=B6=E4=B8=8D=E8=B0=83=E7=94=A8=E4=B8=BB=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E7=9A=84=E6=9F=90=E5=87=A0=E4=B8=AA=E5=87=BD=E6=95=B0?= =?UTF-8?q?=EF=BC=8C=E7=A1=AE=E4=BF=9D=E5=AE=9A=E6=97=B6=E5=99=A8=E6=AD=A3?= =?UTF-8?q?=E5=B8=B8=E8=AE=A1=E7=AE=97=E6=92=AD=E6=94=BE=E9=87=8F=E5=B7=AE?= =?UTF-8?q?=E5=80=BC=202.=E6=96=B0=E5=A2=9E=E5=8A=9F=E8=83=BD=EF=BC=9A?= =?UTF-8?q?=E8=8E=B7=E5=8F=96=E7=82=B9=E8=B5=9E=EF=BC=8C=E6=94=B6=E8=97=8F?= =?UTF-8?q?=EF=BC=8C=E8=BD=AC=E5=8F=91=E6=95=B0=E9=87=8F+=E8=AF=84?= 
=?UTF-8?q?=E8=AE=BA=E5=86=85=E5=AE=B9=E5=88=97=E8=A1=A8=EF=BC=88=E4=B8=8D?= =?UTF-8?q?=E5=AE=8C=E6=95=B4=EF=BC=8C=E6=AD=A3=E5=9C=A8=E7=BB=A7=E7=BB=AD?= =?UTF-8?q?=E4=BC=98=E5=8C=96=EF=BC=89=203.=E5=A2=9E=E5=8A=A0=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=BA=93=E6=96=87=E4=BB=B6=E5=A4=B9=EF=BC=8C=E5=BD=93?= =?UTF-8?q?=E5=90=AF=E5=8A=A8=E5=AE=9A=E6=97=B6=E5=99=A8=E6=97=B6=E5=AD=98?= =?UTF-8?q?=E5=82=A8=E5=88=B0Ranking=5Fstorage=5Flist=E4=B8=AD=EF=BC=8C=20?= =?UTF-8?q?=E6=8C=89=E7=85=A7Ranking=5Fstorage=5Flist=E4=B8=AD=E7=9A=84?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E8=BF=9B=E8=A1=8C=E8=AE=A1=E7=AE=97=E6=92=AD?= =?UTF-8?q?=E6=94=BE=E9=87=8F=E5=B7=AE=E5=80=BC=EF=BC=8C=E8=AE=A1=E7=AE=97?= =?UTF-8?q?=E7=BB=93=E6=9E=9C=E5=AD=98=E5=85=A5Ranking=5Fstorage=E4=B8=AD?= =?UTF-8?q?=20=E5=8D=95=E7=8B=AC=E8=BF=90=E8=A1=8Crank=5Fdata=5Fscraper.py?= =?UTF-8?q?=E7=9A=84=E6=97=B6=E5=80=99=E5=AD=98=E5=85=A5Rankings=5Flist?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 原因: Rankings_list里面存储的数据结构较多 Ranking_storage_list里面存储的主要是播放量 Rankings_list里面存入的是播放量差值 --- backend/Timer_worker.py | 66 ++- .../handlers/Rankings/rank_data_scraper.py | 479 +++++++++++++++++- backend/routers/rank_api_routes.py | 3 +- 3 files changed, 522 insertions(+), 26 deletions(-) diff --git a/backend/Timer_worker.py b/backend/Timer_worker.py index 3aacde8..99a0c82 100644 --- a/backend/Timer_worker.py +++ b/backend/Timer_worker.py @@ -61,6 +61,8 @@ class DouyinAutoScheduler: # 设置环境变量,确保自动模式 os.environ['AUTO_CONTINUE'] = '1' + # 设置定时器模式环境变量,跳过评论抓取等函数 + os.environ['TIMER_MODE'] = '1' # 直接创建并运行 DouyinPlayVVScraper 实例 scraper = DouyinPlayVVScraper( @@ -68,10 +70,10 @@ class DouyinAutoScheduler: auto_continue=True, duration_s=60 ) - + logging.info("📁 开始执行抓取任务...") scraper.run() - + logging.info("✅ 抖音播放量抓取任务执行成功") # 数据抓取完成后,自动生成当日榜单 @@ -89,7 +91,7 @@ class DouyinAutoScheduler: from datetime import timedelta # 获取集合 - douyin_collection = db['Rankings_list'] # 使用真实抓取的数据 + douyin_collection = db['Ranking_storage_list'] # 使用定时器抓取的数据 rankings_collection = db['Ranking_storage'] today = date.today() @@ -107,10 +109,20 @@ class DouyinAutoScheduler: try: logging.info("🔄 正在生成时间轴对比榜单...") - # 获取今天的数据,按短剧名称去重,只保留播放量最高的 - today_videos_raw = list(douyin_collection.find({}).sort("play_vv", -1)) + # 获取最新批次的数据 + latest_batch = douyin_collection.find_one(sort=[("batch_time", -1)]) + if not latest_batch: + logging.warning("⚠️ 未找到任何数据") + return False + + latest_batch_time = latest_batch.get("batch_time") + logging.info(f"📊 找到最新批次时间: {latest_batch_time}") - # 按短剧名称去重,每个短剧只保留播放量最高的一条 + # 只获取最新批次的数据 + today_videos_raw = list(douyin_collection.find({"batch_time": latest_batch_time}).sort("play_vv", -1)) + logging.info(f"📊 最新批次数据数量: {len(today_videos_raw)}") + + # 按短剧名称去重(虽然同一批次应该不会有重复,但为了代码健壮性保留此逻辑) unique_videos = {} for video in today_videos_raw: mix_name = video.get("mix_name", "") @@ -121,26 +133,36 @@ class DouyinAutoScheduler: logging.info(f"📊 今日数据去重后:{len(today_videos)} 个独特短剧(原始数据:{len(today_videos_raw)} 条)") - # 获取昨天的榜单数据(如果存在),取最新的计算结果 - yesterday_ranking = rankings_collection.find_one({ - "date": yesterday_str, - "type": "comprehensive" - }, sort=[("calculation_sequence", -1)]) + # 获取昨天最后一批次的数据 + yesterday_batch = douyin_collection.find_one({ + "batch_time": {"$regex": f"^{yesterday_str}"} + }, sort=[("batch_time", -1)]) yesterday_data = {} - if yesterday_ranking and "data" in yesterday_ranking: - # 将昨天的数据转换为字典,以短剧名称为键 - for item in yesterday_ranking["data"]: - title = item.get("title", "") - if title: - yesterday_data[title] = { - 
"rank": item.get("rank", 0), - "play_vv": item.get("play_vv", 0), - "video_id": item.get("video_id", "") + if yesterday_batch: + # 获取昨天最后一批次的所有数据 + yesterday_videos = list(douyin_collection.find({ + "batch_time": yesterday_batch["batch_time"] + }).sort("play_vv", -1)) + + # 按短剧名称去重,保留播放量最高的记录 + for video in yesterday_videos: + mix_name = video.get("mix_name", "") + if mix_name and (mix_name not in yesterday_data or video.get("play_vv", 0) > yesterday_data[mix_name].get("play_vv", 0)): + yesterday_data[mix_name] = { + "play_vv": video.get("play_vv", 0), + "video_id": str(video.get("_id", "")), + "rank": 0 # 稍后计算排名 } - logging.info(f"📊 找到昨天的榜单数据,共 {len(yesterday_data)} 个短剧") + + # 计算排名 + sorted_videos = sorted(yesterday_data.items(), key=lambda x: x[1]["play_vv"], reverse=True) + for rank, (mix_name, data) in enumerate(sorted_videos, 1): + yesterday_data[mix_name]["rank"] = rank + + logging.info(f"📊 找到昨天的原始数据,共 {len(yesterday_data)} 个短剧") else: - logging.info("📊 未找到昨天的榜单数据,将作为首次生成") + logging.info("📊 未找到昨天的原始数据,将作为首次生成") if today_videos: # 先计算所有视频的播放量差值 diff --git a/backend/handlers/Rankings/rank_data_scraper.py b/backend/handlers/Rankings/rank_data_scraper.py index 1869847..bd11206 100644 --- a/backend/handlers/Rankings/rank_data_scraper.py +++ b/backend/handlers/Rankings/rank_data_scraper.py @@ -83,11 +83,13 @@ class DouyinPlayVVScraper: # 使用 database.py 中的连接 self.db = db - # 设置集合 - mongo_collection = os.environ.get('MONGO_COLLECTION', 'Rankings_list') + # 根据运行模式选择集合 + is_timer_mode = os.environ.get('TIMER_MODE') == '1' + mongo_collection = 'Ranking_storage_list' if is_timer_mode else 'Rankings_list' self.collection = self.db[mongo_collection] logging.info(f'MongoDB连接成功,使用数据库: {self.db.name},集合: {mongo_collection}') + logging.info(f'当前运行模式: {"定时器模式" if is_timer_mode else "普通模式"}') except Exception as e: logging.error(f'MongoDB连接失败: {e}') @@ -447,6 +449,28 @@ class DouyinPlayVVScraper: if n >= 10_000: return f"{n/10_000:.1f}万" return str(n) + + def format_interaction_count(self, n: int) -> str: + """格式化互动数据数量,返回带单位的字符串 + Args: + n: 数量 + Returns: + str: 格式化后的字符串,如 27898 -> 2.8W, 1234 -> 1234 + """ + if n >= 100_000_000: + result = n / 100_000_000 + if result == int(result): + return f"{int(result)}亿" + else: + return f"{result:.1f}亿" + elif n >= 10_000: + result = n / 10_000 + if result == int(result): + return f"{int(result)}W" + else: + return f"{result:.1f}W" + else: + return str(n) @@ -1008,6 +1032,80 @@ class DouyinPlayVVScraper: current_episode_count=current_episode_count ) logging.info(f'合集 {mix_name} 共获取到 {len(episode_video_ids)} 个视频ID') + + # 获取每个视频的详细互动数据 + logging.info(f'开始获取合集 {mix_name} 的视频详细互动数据') + video_details_list = self.get_collection_video_details( + episode_video_ids=episode_video_ids, + mix_name=mix_name, + max_comments_per_video=10 # 每个视频最多获取10条评论 + ) + + # 构建每集的详细信息,使用获取到的真实数据 + episode_details = [] + total_episodes = item.get('updated_to_episode', 0) + + for i in range(total_episodes): + episode_number = i + 1 + video_id = episode_video_ids[i] if i < len(episode_video_ids) else '' + + # 查找对应的视频详细数据 + video_detail = None + if i < len(video_details_list): + video_detail = video_details_list[i] + + if video_detail and video_detail.get('success', False): + # 使用获取到的真实数据 + likes = video_detail.get('likes', 0) + shares = video_detail.get('shares', 0) + favorites = video_detail.get('favorites', 0) + + episode_info = { + 'episode_number': episode_number, + 'video_id': video_id, + 'likes': likes, + 'shares': shares, + 'favorites': favorites, + 'likes_formatted': 
self.format_interaction_count(likes), + 'shares_formatted': self.format_interaction_count(shares), + 'favorites_formatted': self.format_interaction_count(favorites), + 'comments': video_detail.get('comments', []) + } + else: + # 使用默认值 + episode_info = { + 'episode_number': episode_number, + 'video_id': video_id, + 'likes': 0, + 'shares': 0, + 'favorites': 0, + 'likes_formatted': '0', + 'shares_formatted': '0', + 'favorites_formatted': '0', + 'comments': [] + } + + episode_details.append(episode_info) + + # 统计获取到的数据 + total_likes = sum(ep.get('likes', 0) for ep in episode_details) + total_comments = sum(len(ep.get('comments', [])) for ep in episode_details) + logging.info(f'合集 {mix_name} 详细数据统计: 总点赞数={total_likes:,}, 总评论数={total_comments}') + else: + # 如果没有获取到视频ID,使用默认的episode_details + episode_details = [ + { + 'episode_number': i + 1, + 'video_id': '', + 'likes': 0, + 'shares': 0, + 'favorites': 0, + 'likes_formatted': '0', + 'shares_formatted': '0', + 'favorites_formatted': '0', + 'comments': [] + } for i in range(item.get('updated_to_episode', 0)) + ] # 保留用户要求的7个字段 + cover_image_url作为合集封面图片完整链接 + 新增字段 doc = { @@ -1025,7 +1123,8 @@ class DouyinPlayVVScraper: 'series_author': item.get('series_author', ''), # 合集作者/影视工作室 'desc': item.get('desc', ''), # 合集描述 'updated_to_episode': item.get('updated_to_episode', 0), # 合集总集数 - 'episode_video_ids': episode_video_ids # 每一集的视频ID列表 + 'episode_video_ids': episode_video_ids, # 每一集的视频ID列表 + 'episode_details': episode_details # 每集的详细信息 } documents.append(doc) @@ -1095,6 +1194,7 @@ class DouyinPlayVVScraper: if ( 'Network.responseReceived' in log['method'] and 'response' in log['params'] + and log['params']['response'] and 'url' in log['params']['response'] and '/web/api/v2/aweme/iteminfo' in log['params']['response']['url'] ): @@ -1130,6 +1230,11 @@ class DouyinPlayVVScraper: Returns: list: 按集数排序的视频ID列表 """ + # 定时器模式下跳过此函数 + if os.environ.get('TIMER_MODE') == '1': + logging.info(f'定时器模式:跳过 get_collection_videos 函数') + return [] + try: # 检查缓存文件 cache_dir = os.path.join(os.path.dirname(__file__), 'episode_video_ids') @@ -1273,6 +1378,374 @@ class DouyinPlayVVScraper: return [video['video_id'] for video in cached_videos] return [] + def get_video_details(self, video_id: str, max_comments: int = 20) -> dict: + """获取单个视频的详细互动数据 + Args: + video_id: 视频ID + max_comments: 最大评论数量,默认20条 + Returns: + dict: 包含点赞数、收藏数、转发数、评论内容的字典 + """ + video_details = { + 'video_id': video_id, + 'likes': 0, + 'shares': 0, + 'favorites': 0, + 'likes_formatted': '0', + 'shares_formatted': '0', + 'favorites_formatted': '0', + 'comments': [], + 'success': False, + 'error': None + } + + try: + # 确保driver已初始化 + if self.driver is None: + logging.info('Driver未初始化,正在设置...') + self.setup_driver() + if self.driver is None: + raise Exception("无法初始化WebDriver") + + video_url = f'https://www.douyin.com/video/{video_id}' + logging.info(f'获取视频详细数据: {video_url}') + + # 导航到视频页面 + self.driver.get(video_url) + time.sleep(3) + + # 等待页面加载完成 + try: + from selenium.webdriver.support.ui import WebDriverWait + from selenium.webdriver.support import expected_conditions as EC + from selenium.webdriver.common.by import By + + WebDriverWait(self.driver, 10).until( + EC.presence_of_element_located((By.TAG_NAME, "video")) + ) + except Exception as e: + logging.warning(f'等待视频元素超时: {e}') + + # 获取网络请求日志 + logs = self.driver.get_log('performance') + + # 解析网络日志获取视频详细数据 + for entry in logs: + try: + log = json.loads(entry['message'])['message'] + if ( + 'Network.responseReceived' in log['method'] + and 'response' in 
log['params'] + and log['params']['response'] + and log['params']['response'].get('url') + ): + url = log['params']['response']['url'] + + # 检查是否是视频详情API + if '/aweme/v1/web/aweme/detail/' in url and video_id in url: + try: + # 获取响应体 + response_body = self.driver.execute_cdp_cmd( + 'Network.getResponseBody', + {'requestId': log['params']['requestId']} + ) + + if response_body and 'body' in response_body: + data = json.loads(response_body['body']) + aweme_detail = data.get('aweme_detail', {}) + + if aweme_detail: + # 获取统计数据 + statistics = aweme_detail.get('statistics', {}) + video_details['likes'] = int(statistics.get('digg_count', 0)) + video_details['shares'] = int(statistics.get('share_count', 0)) + video_details['favorites'] = int(statistics.get('collect_count', 0)) + + # 添加格式化字段 + video_details['likes_formatted'] = self.format_interaction_count(video_details['likes']) + video_details['shares_formatted'] = self.format_interaction_count(video_details['shares']) + video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites']) + + logging.info(f'视频 {video_id} 互动数据: 点赞={video_details["likes_formatted"]}, 分享={video_details["shares_formatted"]}, 收藏={video_details["favorites_formatted"]}') + + except Exception as e: + logging.warning(f'解析视频详情API响应失败: {e}') + continue + + # 检查是否是评论API + elif '/aweme/v1/web/comment/list/' in url and video_id in url: + try: + # 获取响应体 + response_body = self.driver.execute_cdp_cmd( + 'Network.getResponseBody', + {'requestId': log['params']['requestId']} + ) + + if response_body and 'body' in response_body: + data = json.loads(response_body['body']) + comments = data.get('comments', []) + + for comment in comments[:max_comments]: + comment_info = { + 'text': comment.get('text', ''), + 'user_name': comment.get('user', {}).get('nickname', ''), + 'digg_count': int(comment.get('digg_count', 0)), + 'create_time': comment.get('create_time', 0) + } + video_details['comments'].append(comment_info) + + logging.info(f'视频 {video_id} 获取到 {len(video_details["comments"])} 条评论') + + except Exception as e: + logging.warning(f'解析评论API响应失败: {e}') + continue + + except Exception as e: + continue + + # 如果网络日志没有获取到数据,尝试页面解析 + if video_details['likes'] == 0 and video_details['shares'] == 0 and video_details['favorites'] == 0: + video_details = self._parse_video_details_from_page(video_id, video_details, max_comments) + + video_details['success'] = True + return video_details + + except Exception as e: + error_msg = f'获取视频 {video_id} 详细数据失败: {e}' + logging.error(error_msg) + video_details['error'] = error_msg + return video_details + + def _parse_video_details_from_page(self, video_id: str, video_details: dict, max_comments: int = 20) -> dict: + """从页面元素解析视频详细数据(备用方案) + Args: + video_id: 视频ID + video_details: 现有的视频详细数据字典 + max_comments: 最大评论数量 + Returns: + dict: 更新后的视频详细数据字典 + """ + try: + logging.info(f'尝试从页面元素解析视频 {video_id} 的详细数据') + + # 尝试解析页面中的SSR数据 + try: + # 查找包含视频数据的script标签 + scripts = self.driver.find_elements("tag name", "script") + for script in scripts: + script_content = script.get_attribute('innerHTML') + if script_content and ('window._SSR_HYDRATED_DATA' in script_content or 'RENDER_DATA' in script_content): + # 提取JSON数据 + if 'window._SSR_HYDRATED_DATA' in script_content: + match = re.search(r'window\._SSR_HYDRATED_DATA\s*=\s*({.*?});', script_content, re.DOTALL) + else: + match = re.search(r'window\.RENDER_DATA\s*=\s*({.*?});', script_content, re.DOTALL) + + if match: + data = json.loads(match.group(1)) + + # 查找视频详情数据 + def 
find_video_data(obj, target_id): + if isinstance(obj, dict): + for key, value in obj.items(): + if key == 'aweme_id' and str(value) == str(target_id): + return obj + elif isinstance(value, (dict, list)): + result = find_video_data(value, target_id) + if result: + return result + elif isinstance(obj, list): + for item in obj: + result = find_video_data(item, target_id) + if result: + return result + return None + + video_data = find_video_data(data, video_id) + if video_data: + statistics = video_data.get('statistics', {}) + video_details['likes'] = int(statistics.get('digg_count', 0)) + video_details['shares'] = int(statistics.get('share_count', 0)) + video_details['favorites'] = int(statistics.get('collect_count', 0)) + + # 添加格式化字段 + video_details['likes_formatted'] = self.format_interaction_count(video_details['likes']) + video_details['shares_formatted'] = self.format_interaction_count(video_details['shares']) + video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites']) + + logging.info(f'从SSR数据解析到视频 {video_id} 互动数据: 点赞={video_details["likes_formatted"]}, 分享={video_details["shares_formatted"]}, 收藏={video_details["favorites_formatted"]}') + break + + except Exception as e: + logging.warning(f'解析SSR数据失败: {e}') + + # 如果SSR数据解析失败,尝试CSS选择器 + if video_details['likes'] == 0 and video_details['shares'] == 0 and video_details['favorites'] == 0: + try: + # 尝试常见的点赞、分享、收藏按钮选择器 + selectors = { + 'likes': [ + '[data-e2e="video-like-count"]', + '[class*="like"] [class*="count"]', + '[class*="digg"] [class*="count"]' + ], + 'shares': [ + '[data-e2e="video-share-count"]', + '[class*="share"] [class*="count"]' + ], + 'favorites': [ + '[data-e2e="video-collect-count"]', + '[class*="collect"] [class*="count"]', + '[class*="favorite"] [class*="count"]' + ] + } + + for data_type, selector_list in selectors.items(): + for selector in selector_list: + try: + elements = self.driver.find_elements("css selector", selector) + if elements: + text = elements[0].text.strip() + if text and text.replace('.', '').replace('万', '').replace('亿', '').isdigit(): + # 转换数字格式 + if '亿' in text: + video_details[data_type] = int(float(text.replace('亿', '')) * 100000000) + elif '万' in text: + video_details[data_type] = int(float(text.replace('万', '')) * 10000) + else: + video_details[data_type] = int(text) + break + except Exception: + continue + + if video_details['likes'] > 0 or video_details['shares'] > 0 or video_details['favorites'] > 0: + # 添加格式化字段 + video_details['likes_formatted'] = self.format_interaction_count(video_details['likes']) + video_details['shares_formatted'] = self.format_interaction_count(video_details['shares']) + video_details['favorites_formatted'] = self.format_interaction_count(video_details['favorites']) + + logging.info(f'从页面元素解析到视频 {video_id} 互动数据: 点赞={video_details["likes_formatted"]}, 分享={video_details["shares_formatted"]}, 收藏={video_details["favorites_formatted"]}') + + except Exception as e: + logging.warning(f'CSS选择器解析失败: {e}') + + # 尝试获取评论(如果还没有获取到) + if not video_details['comments']: + try: + # 滚动到评论区域 + self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") + time.sleep(2) + + # 尝试常见的评论选择器 + comment_selectors = [ + '[data-e2e="comment-item"]', + '[class*="comment-item"]', + '[class*="comment"] [class*="content"]' + ] + + for selector in comment_selectors: + try: + comment_elements = self.driver.find_elements("css selector", selector)[:max_comments] + if comment_elements: + for element in comment_elements: + try: + comment_text = 
element.text.strip() + if comment_text: + comment_info = { + 'text': comment_text, + 'user_name': '', + 'digg_count': 0, + 'create_time': 0 + } + video_details['comments'].append(comment_info) + except Exception: + continue + + if video_details['comments']: + logging.info(f'从页面元素获取到视频 {video_id} 的 {len(video_details["comments"])} 条评论') + break + except Exception: + continue + + except Exception as e: + logging.warning(f'获取评论失败: {e}') + + except Exception as e: + logging.warning(f'页面解析视频详细数据失败: {e}') + + return video_details + + def get_collection_video_details(self, episode_video_ids: list, mix_name: str = '', max_comments_per_video: int = 10) -> list: + """获取合集中所有视频的详细互动数据 + Args: + episode_video_ids: 视频ID列表 + mix_name: 合集名称,用于日志 + max_comments_per_video: 每个视频最大评论数量,默认10条 + Returns: + list: 包含每个视频详细数据的列表 + """ + # 定时器模式下跳过此函数 + if os.environ.get('TIMER_MODE') == '1': + logging.info(f'定时器模式:跳过 get_collection_video_details 函数') + return [] + + if not episode_video_ids: + logging.info(f'合集 {mix_name} 没有视频ID,跳过详细数据获取') + return [] + + logging.info(f'开始获取合集 {mix_name} 中 {len(episode_video_ids)} 个视频的详细数据') + + video_details_list = [] + + for i, video_id in enumerate(episode_video_ids, 1): + if not video_id: + logging.warning(f'合集 {mix_name} 第 {i} 集视频ID为空,跳过') + video_details_list.append({ + 'episode_number': i, + 'video_id': '', + 'likes': 0, + 'shares': 0, + 'favorites': 0, + 'comments': [], + 'success': False, + 'error': '视频ID为空' + }) + continue + + logging.info(f'获取合集 {mix_name} 第 {i}/{len(episode_video_ids)} 集视频详细数据: {video_id}') + + try: + # 获取单个视频的详细数据 + video_details = self.get_video_details(video_id, max_comments_per_video) + video_details['episode_number'] = i + video_details_list.append(video_details) + + # 添加延迟避免请求过快 + time.sleep(2) + + except Exception as e: + error_msg = f'获取视频 {video_id} 详细数据时出错: {e}' + logging.error(error_msg) + video_details_list.append({ + 'episode_number': i, + 'video_id': video_id, + 'likes': 0, + 'shares': 0, + 'favorites': 0, + 'comments': [], + 'success': False, + 'error': error_msg + }) + + # 统计获取结果 + success_count = sum(1 for detail in video_details_list if detail.get('success', False)) + total_likes = sum(detail.get('likes', 0) for detail in video_details_list) + total_comments = sum(len(detail.get('comments', [])) for detail in video_details_list) + + logging.info(f'合集 {mix_name} 视频详细数据获取完成: {success_count}/{len(episode_video_ids)} 成功, 总点赞数={total_likes:,}, 总评论数={total_comments}') + + return video_details_list + def get_cookies_dict(self): """获取当前页面的cookies""" if not hasattr(self, 'cookies') or not self.cookies: diff --git a/backend/routers/rank_api_routes.py b/backend/routers/rank_api_routes.py index aec3aae..ae62d27 100644 --- a/backend/routers/rank_api_routes.py +++ b/backend/routers/rank_api_routes.py @@ -135,7 +135,8 @@ def format_mix_item(doc): "updated_to_episode": doc.get("updated_to_episode", 0), "cover_backup_urls": doc.get("cover_backup_urls", []), "mix_id": doc.get("mix_id", ""), - "episode_video_ids": doc.get("episode_video_ids", []) + "episode_video_ids": doc.get("episode_video_ids", []), + "episode_details": doc.get("episode_details", []) } def get_mix_list(page=1, limit=20, sort_by="playcount"): From 3591f5bdc2b77ec90d0fbef661d533b1a6ec38d5 Mon Sep 17 00:00:00 2001 From: qiaoyirui0819 <3160533978@qq.com> Date: Sat, 25 Oct 2025 19:27:07 +0800 Subject: [PATCH 5/6] =?UTF-8?q?=E8=8E=B7=E5=8F=96=E8=AF=84=E8=AE=BA?= =?UTF-8?q?=E5=86=85=E5=AE=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
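Reviewer note: the per-episode detail hunks above all follow one pattern, scanning Chrome DevTools performance-log entries for the aweme detail response and reading its statistics block. A minimal standalone sketch of that pattern follows; the helper name and return shape are illustrative only, and it assumes the driver was started with the performance logging capability the scraper already enables.

import json

def extract_statistics_from_logs(driver, video_id):
    # Scan captured network events for the detail API response of this video.
    for entry in driver.get_log('performance'):
        try:
            message = json.loads(entry['message'])['message']
            if message.get('method') != 'Network.responseReceived':
                continue
            response = (message.get('params') or {}).get('response') or {}
            url = response.get('url', '')
            if '/aweme/v1/web/aweme/detail/' not in url or video_id not in url:
                continue
            # Pull the response body through CDP and read the statistics block.
            body = driver.execute_cdp_cmd(
                'Network.getResponseBody',
                {'requestId': message['params']['requestId']}
            )
            stats = json.loads(body['body']).get('aweme_detail', {}).get('statistics', {})
            return {
                'likes': int(stats.get('digg_count', 0)),
                'shares': int(stats.get('share_count', 0)),
                'favorites': int(stats.get('collect_count', 0)),
            }
        except Exception:
            continue  # skip unrelated or malformed log entries
    return None
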
backend/Timer_worker.py | 172 +- .../Rankings/drivers/chromedriver.exe | Bin 16772096 -> 17123328 bytes .../video_ids_7393206265624676386.json | 43 + .../video_ids_7430120098712520743.json | 123 + .../video_ids_7472764729748883506.json | 51 + .../video_ids_7472792051890554918.json | 27 + .../video_ids_7475575474015209499.json | 123 + .../video_ids_7475967291851540516.json | 87 + .../video_ids_7481887799080126514.json | 47 + .../video_ids_7484936087232596005.json | 79 + .../video_ids_7486423445933951028.json | 151 + .../video_ids_7488149189877041178.json | 47 + .../video_ids_7488155970636351515.json | 143 + .../video_ids_7498567012075964457.json | 67 + .../video_ids_7504244771116812300.json | 47 + .../video_ids_7505982019956901899.json | 155 + .../video_ids_7506726690928724006.json | 235 + .../video_ids_7514985066221537331.json | 55 + .../video_ids_7518025970341185547.json | 83 + .../video_ids_7525674088921073715.json | 19 + .../video_ids_7525851121891182632.json | 19 + .../video_ids_7529584800245254183.json | 55 + .../video_ids_7533908989583558683.json | 195 + .../video_ids_7535344671749310527.json | 279 + .../video_ids_7538319294611621888.json | 55 + .../video_ids_7540121907586091043.json | 87 + .../video_ids_7540496302258276362.json | 59 + .../video_ids_7542848884909803539.json | 335 + .../video_ids_7546071666771396646.json | 59 + .../video_ids_7546852679645923366.json | 35 + .../video_ids_7553983671460448296.json | 283 + .../video_ids_7554004615847495706.json | 59 + .../video_ids_7554334029604997172.json | 87 + .../video_ids_7554913720183294004.json | 23 + .../video_ids_7555328975485012010.json | 215 + .../video_ids_7556193073273784355.json | 251 + .../video_ids_7561020459769153570.json | 115 + .../handlers/Rankings/rank_data_scraper.py | 1237 +++- ...s_7486805231804681530_20251025_191144.json | 5395 ++++++++++++++++ ...s_7487578743611985210_20251025_191509.json | 2521 ++++++++ ...s_7520060104645823780_20251025_184532.json | 5719 +++++++++++++++++ backend/handlers/Rankings/tos_client.py | 2 +- backend/routers/rank_api_routes.py | 3 +- 43 files changed, 18783 insertions(+), 59 deletions(-) create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7393206265624676386.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7430120098712520743.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7472764729748883506.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7472792051890554918.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7475575474015209499.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7475967291851540516.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7481887799080126514.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7484936087232596005.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7486423445933951028.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7488149189877041178.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7488155970636351515.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7498567012075964457.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7504244771116812300.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7505982019956901899.json create mode 100644 
backend/handlers/Rankings/episode_video_ids/video_ids_7506726690928724006.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7514985066221537331.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7518025970341185547.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7525674088921073715.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7525851121891182632.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7529584800245254183.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7533908989583558683.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7535344671749310527.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7538319294611621888.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7540121907586091043.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7540496302258276362.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7542848884909803539.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7546071666771396646.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7546852679645923366.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7553983671460448296.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7554004615847495706.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7554334029604997172.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7554913720183294004.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7555328975485012010.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7556193073273784355.json create mode 100644 backend/handlers/Rankings/episode_video_ids/video_ids_7561020459769153570.json create mode 100644 backend/handlers/Rankings/saved_comments/comments_7486805231804681530_20251025_191144.json create mode 100644 backend/handlers/Rankings/saved_comments/comments_7487578743611985210_20251025_191509.json create mode 100644 backend/handlers/Rankings/saved_comments/comments_7520060104645823780_20251025_184532.json diff --git a/backend/Timer_worker.py b/backend/Timer_worker.py index 3aacde8..0a7c182 100644 --- a/backend/Timer_worker.py +++ b/backend/Timer_worker.py @@ -22,8 +22,7 @@ import sys import os import logging import argparse -from pathlib import Path -from datetime import datetime, date +from datetime import datetime, date, timedelta import config # 添加项目路径到 Python 路径 @@ -33,34 +32,83 @@ from handlers.Rankings.rank_data_scraper import DouyinPlayVVScraper # 配置日志的函数 -def setup_logging(): +def setup_logging(quiet_mode=False): """设置日志配置""" # 确保logs目录存在 - import os script_dir = os.path.dirname(os.path.abspath(__file__)) logs_dir = os.path.join(script_dir, 'handlers', 'Rankings', 'logs') os.makedirs(logs_dir, exist_ok=True) + # 在安静模式下,只记录WARNING及以上级别的日志到控制台 + console_level = logging.WARNING if quiet_mode else logging.INFO + logging.basicConfig( - level=logging.INFO, + level=logging.INFO, # 文件日志仍然记录所有INFO级别 format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler(os.path.join(logs_dir, 'scheduler.log'), encoding='utf-8'), logging.StreamHandler() ] ) + + # 如果是安静模式,调整控制台处理器的级别 + if quiet_mode: + for handler in logging.getLogger().handlers: + if 
isinstance(handler, logging.StreamHandler) and not isinstance(handler, logging.FileHandler): + handler.setLevel(console_level) class DouyinAutoScheduler: def __init__(self): self.is_running = False + def _normalize_play_vv(self, play_vv): + """标准化播放量数据类型,将字符串转换为数字""" + if isinstance(play_vv, str): + try: + return int(play_vv.replace(',', '').replace('万', '0000').replace('亿', '00000000')) + except: + return 0 + elif not isinstance(play_vv, (int, float)): + return 0 + return play_vv + + def _deduplicate_videos_by_mix_name(self, videos, include_rank=False): + """按短剧名称去重,保留播放量最高的记录""" + unique_data = {} + for video in videos: + mix_name = video.get("mix_name", "") + if mix_name: + # 标准化播放量数据类型 + play_vv = self._normalize_play_vv(video.get("play_vv", 0)) + + if mix_name not in unique_data or play_vv > unique_data[mix_name].get("play_vv", 0): + if include_rank: + # 用于昨天数据的格式 + unique_data[mix_name] = { + "play_vv": play_vv, + "video_id": str(video.get("_id", "")), + "rank": 0 # 稍后计算排名 + } + else: + # 用于今天数据的格式,直接更新原视频对象 + video["play_vv"] = play_vv + unique_data[mix_name] = video + + return unique_data + def run_douyin_scraper(self): """执行抖音播放量抓取任务""" try: - logging.info("🚀 开始执行抖音播放量抓取任务...") + logging.warning("🚀 开始执行抖音播放量抓取任务...") # 设置环境变量,确保自动模式 os.environ['AUTO_CONTINUE'] = '1' + # 设置定时器模式环境变量,跳过评论抓取等函数 + os.environ['TIMER_MODE'] = '1' + + # 只在定时器模式下设置静默模式(非测试、非单次执行、非仅生成榜单) + if hasattr(self, '_is_timer_mode') and self._is_timer_mode: + os.environ['QUIET_MODE'] = '1' # 直接创建并运行 DouyinPlayVVScraper 实例 scraper = DouyinPlayVVScraper( @@ -68,11 +116,11 @@ class DouyinAutoScheduler: auto_continue=True, duration_s=60 ) - - logging.info("📁 开始执行抓取任务...") + + logging.warning("📁 开始执行抓取任务...") scraper.run() - - logging.info("✅ 抖音播放量抓取任务执行成功") + + logging.warning("✅ 抖音播放量抓取任务执行成功") # 数据抓取完成后,自动生成当日榜单 self.generate_daily_rankings() @@ -89,7 +137,7 @@ class DouyinAutoScheduler: from datetime import timedelta # 获取集合 - douyin_collection = db['Rankings_list'] # 使用真实抓取的数据 + douyin_collection = db['Ranking_storage_list'] # 使用定时器抓取的数据 rankings_collection = db['Ranking_storage'] today = date.today() @@ -107,40 +155,58 @@ class DouyinAutoScheduler: try: logging.info("🔄 正在生成时间轴对比榜单...") - # 获取今天的数据,按短剧名称去重,只保留播放量最高的 - today_videos_raw = list(douyin_collection.find({}).sort("play_vv", -1)) + # 获取最新批次的数据 + latest_batch = douyin_collection.find_one(sort=[("batch_time", -1)]) + if not latest_batch: + logging.warning("⚠️ 未找到任何数据") + return False + + latest_batch_time = latest_batch.get("batch_time") + logging.info(f"📊 找到最新批次时间: {latest_batch_time}") - # 按短剧名称去重,每个短剧只保留播放量最高的一条 - unique_videos = {} - for video in today_videos_raw: - mix_name = video.get("mix_name", "") - if mix_name and (mix_name not in unique_videos or video.get("play_vv", 0) > unique_videos[mix_name].get("play_vv", 0)): - unique_videos[mix_name] = video + # 只获取最新批次的数据 + today_videos_raw = list(douyin_collection.find({"batch_time": latest_batch_time}).sort("play_vv", -1)) + logging.info(f"📊 最新批次数据数量: {len(today_videos_raw)}") + # 调试:检查原始数据 + if today_videos_raw: + sample_video = today_videos_raw[0] + logging.info(f"🔍 样本数据检查:") + logging.info(f" mix_name: {sample_video.get('mix_name')}") + logging.info(f" play_vv: {sample_video.get('play_vv')} (类型: {type(sample_video.get('play_vv'))})") + logging.info(f" author: {sample_video.get('author')}") + + # 按短剧名称去重并确保数据类型正确 + unique_videos = self._deduplicate_videos_by_mix_name(today_videos_raw, include_rank=False) today_videos = list(unique_videos.values()) logging.info(f"📊 今日数据去重后:{len(today_videos)} 
个独特短剧(原始数据:{len(today_videos_raw)} 条)") - # 获取昨天的榜单数据(如果存在),取最新的计算结果 - yesterday_ranking = rankings_collection.find_one({ - "date": yesterday_str, - "type": "comprehensive" - }, sort=[("calculation_sequence", -1)]) + # 获取昨天最后一批次的数据 + yesterday_start = datetime(yesterday.year, yesterday.month, yesterday.day) + yesterday_end = yesterday_start + timedelta(days=1) + yesterday_batch = douyin_collection.find_one({ + "batch_time": {"$gte": yesterday_start, "$lt": yesterday_end} + }, sort=[("batch_time", -1)]) yesterday_data = {} - if yesterday_ranking and "data" in yesterday_ranking: - # 将昨天的数据转换为字典,以短剧名称为键 - for item in yesterday_ranking["data"]: - title = item.get("title", "") - if title: - yesterday_data[title] = { - "rank": item.get("rank", 0), - "play_vv": item.get("play_vv", 0), - "video_id": item.get("video_id", "") - } - logging.info(f"📊 找到昨天的榜单数据,共 {len(yesterday_data)} 个短剧") + if yesterday_batch: + # 获取昨天最后一批次的所有数据 + yesterday_videos = list(douyin_collection.find({ + "batch_time": yesterday_batch["batch_time"] + }).sort("play_vv", -1)) + + # 按短剧名称去重,保留播放量最高的记录,并确保数据类型正确 + yesterday_data = self._deduplicate_videos_by_mix_name(yesterday_videos, include_rank=True) + + # 计算排名 + sorted_videos = sorted(yesterday_data.items(), key=lambda x: x[1]["play_vv"], reverse=True) + for rank, (mix_name, data) in enumerate(sorted_videos, 1): + yesterday_data[mix_name]["rank"] = rank + + logging.info(f"📊 找到昨天的原始数据,共 {len(yesterday_data)} 个短剧") else: - logging.info("📊 未找到昨天的榜单数据,将作为首次生成") + logging.info("📊 未找到昨天的原始数据,将作为首次生成") if today_videos: # 先计算所有视频的播放量差值 @@ -315,8 +381,6 @@ class DouyinAutoScheduler: def main(): """主函数""" - import argparse - try: parser = argparse.ArgumentParser(description='抖音播放量自动抓取定时器') parser.add_argument('--test', action='store_true', help='测试模式 - 立即执行一次') @@ -325,23 +389,51 @@ def main(): args = parser.parse_args() - # 设置日志配置 - setup_logging() + # 设置日志配置 - 只在定时器模式下启用静默模式 + quiet_mode = not (args.test or args.once or args.ranking_only) + setup_logging(quiet_mode=quiet_mode) print("正在初始化定时器...") scheduler = DouyinAutoScheduler() if args.test: + scheduler._is_timer_mode = False print("执行测试模式...") scheduler.run_test() elif args.once: + scheduler._is_timer_mode = False print("执行单次模式...") scheduler.run_once() elif args.ranking_only: + scheduler._is_timer_mode = False print("执行榜单生成模式...") scheduler.run_ranking_only() else: + scheduler._is_timer_mode = True print("启动定时器模式...") + + # 显示定时器信息(使用print确保能看到) + from datetime import datetime + current_time = datetime.now() + print(f"🕐 当前时间:{current_time.strftime('%Y-%m-%d %H:%M:%S')}") + print(f"⏰ 执行规则:每小时整点执行抖音播放量抓取") + + # 计算下次执行时间 + next_hour = current_time.replace(minute=0, second=0, microsecond=0) + if current_time.minute > 0 or current_time.second > 0: + next_hour = next_hour.replace(hour=next_hour.hour + 1) + if next_hour.hour >= 24: + from datetime import timedelta + next_hour = next_hour.replace(hour=0) + timedelta(days=1) + + wait_seconds = (next_hour - current_time).total_seconds() + wait_minutes = int(wait_seconds // 60) + + print(f"⏰ 下次执行时间:{next_hour.strftime('%Y-%m-%d %H:%M:%S')}") + print(f"⏳ 距离下次执行:{wait_minutes} 分钟 ({int(wait_seconds)} 秒)") + print("💡 定时器正在等待中,将在整点自动执行任务...") + print("⏹️ 按 Ctrl+C 停止定时器") + scheduler.setup_schedule() scheduler.start_scheduler() diff --git a/backend/handlers/Rankings/drivers/chromedriver.exe b/backend/handlers/Rankings/drivers/chromedriver.exe index 57fd644947de3ace10664fe45c8bac7fb5dcfbd9..9225a08334004725a3edc78f86b341fb158587f2 100644 GIT binary patch literal 17123328 
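Reviewer note on the Timer_worker.py changes above: the batch comparison hinges on two helpers, normalizing play_vv values that may arrive as strings and keeping only the best record per mix_name. A compact sketch of that logic follows; it is a variant that also accepts decimal counts such as '1.2万' (which the straight string-replace in the patch would send to its except branch), field names follow the hunk, everything beyond that is an assumption.

def normalize_play_vv(value):
    # Coerce play counts stored as '12,345', '1.2万' or '3亿' into integers.
    if isinstance(value, (int, float)):
        return int(value)
    if not isinstance(value, str):
        return 0
    text = value.replace(',', '')
    try:
        if '亿' in text:
            return int(float(text.replace('亿', '')) * 100_000_000)
        if '万' in text:
            return int(float(text.replace('万', '')) * 10_000)
        return int(text)
    except ValueError:
        return 0

def dedupe_by_mix_name(videos):
    # Keep only the highest-play_vv record for each mix_name.
    best = {}
    for video in videos:
        name = video.get('mix_name', '')
        if not name:
            continue
        play_vv = normalize_play_vv(video.get('play_vv', 0))
        if name not in best or play_vv > best[name]['play_vv']:
            best[name] = {**video, 'play_vv': play_vv}
    return best
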
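Reviewer note on the timer-mode banner added above: next_hour.replace(hour=next_hour.hour + 1) raises ValueError for any start time after 23:00, because datetime.replace rejects hour=24 before the >= 24 guard can run. A timedelta-based sketch that rolls over midnight safely (the function name is illustrative, not part of the patch):

from datetime import datetime, timedelta

def next_full_hour(now: datetime) -> datetime:
    # Top of the current hour, then one hour forward unless we are already on it.
    top = now.replace(minute=0, second=0, microsecond=0)
    return top if now == top else top + timedelta(hours=1)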