diff --git a/apps/jingrow/jingrow/ai/nodes/web_scrapers_create/web_scrapers_create.py b/apps/jingrow/jingrow/ai/nodes/web_scrapers_create/web_scrapers_create.py
index 2f1999b..54a2aed 100644
--- a/apps/jingrow/jingrow/ai/nodes/web_scrapers_create/web_scrapers_create.py
+++ b/apps/jingrow/jingrow/ai/nodes/web_scrapers_create/web_scrapers_create.py
@@ -517,129 +517,16 @@ def extract_product_title(soup):


 def extract_product_description(soup, base_url=''):
-    """Extract the full product description from the product detail page."""
-    description_parts = []
-
-    # Method 1: extract from the detail-tab-item container (the standard made-in-china.com structure)
-    detail_tab_item = soup.find('div', class_='detail-tab-item') or soup.find('div', class_='J-tab-cnt')
+    """Extract the full product description from the product detail page (bsc-info block only)."""
+    detail_tab_item = soup.find('div', class_='detail-tab')
     if detail_tab_item:
-        # Look for the detail-desc container
-        detail_desc = detail_tab_item.find('div', class_='detail-desc')
-        if detail_desc:
-            # Look for async-rich-info > rich-text
-            async_rich_info = detail_desc.find('div', class_='async-rich-info')
-            if async_rich_info:
-                rich_text = async_rich_info.find('div', class_='rich-text')
-                if rich_text:
-                    # Extract the full HTML content (keep all formatting, images, tables, etc.)
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-
-            # If async-rich-info was not found, look for rich-text directly
-            if not description_parts:
-                rich_text = detail_desc.find('div', class_='rich-text')
-                if rich_text:
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-
-            # If still nothing was found, extract the whole detail-desc content
-            if not description_parts:
-                html_content = str(detail_desc)
-                if html_content and len(html_content.strip()) > 50:
-                    description_parts.append(html_content)
-
-    # Method 2: look for the detail-desc container directly (if there is no detail-tab-item)
-    if not description_parts:
-        detail_desc = soup.find('div', class_='detail-desc')
-        if detail_desc:
-            # Look for async-rich-info > rich-text inside it
-            async_rich_info = detail_desc.find('div', class_='async-rich-info')
-            if async_rich_info:
-                rich_text = async_rich_info.find('div', class_='rich-text')
-                if rich_text:
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-
-            # If not found, look for rich-text directly
-            if not description_parts:
-                rich_text = detail_desc.find('div', class_='rich-text')
-                if rich_text:
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-
-    # Method 3: look for an async-rich-info or rich-text container (as a fallback)
-    if not description_parts:
-        async_rich_info = soup.find('div', class_='async-rich-info')
-        if async_rich_info:
-            rich_text = async_rich_info.find('div', class_='rich-text')
-            if rich_text:
-                html_content = str(rich_text)
-                if html_content and len(html_content.strip()) > 50:
-                    description_parts.append(html_content)
-
-        if not description_parts:
-            rich_text = soup.find('div', class_='rich-text')
-            if rich_text:
-                html_content = str(rich_text)
-                if html_content and len(html_content.strip()) > 50:
-                    description_parts.append(html_content)
-
-    # Method 4: look for a container with a "Product Description" heading
-    if not description_parts:
-        # Find headings containing "Product Description"
-        desc_headers = soup.find_all(['h2', 'h3'], string=re.compile(r'Product Description|产品描述', re.IGNORECASE))
-        for header in desc_headers:
-            # Walk up to the parent container
-            parent = header.find_parent()
-            if parent:
-                # Look for a following rich-text or async-rich-info
-                rich_text = parent.find('div', class_='rich-text') or parent.find('div', class_='async-rich-info')
-                if rich_text:
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-                        break
-
-            # If not found, collect everything in the parent container after the header
-            if not description_parts:
-                current = header.find_next_sibling()
-                content_parts = []
-                for _ in range(20):  # check at most 20 sibling elements
-                    if current:
-                        if current.name in ['h2', 'h3', 'h4', 'h5']:
-                            # stop at the next heading
-                            break
-                        # collect the content
-                        content = str(current)
-                        if content and len(content.strip()) > 20:
-                            content_parts.append(content)
-                        current = current.find_next_sibling()
-                    else:
-                        break
-
-                if content_parts:
-                    description_parts.extend(content_parts)
-                    break
-
-    # Merge all description parts
-    if description_parts:
-        # Deduplicate and merge
-        unique_parts = []
-        seen = set()
-        for part in description_parts:
-            # Use the first 100 characters as the dedup key (avoids keeping identical long content twice)
-            part_hash = part[:100] if len(part) > 100 else part
-            if part_hash not in seen:
-                seen.add(part_hash)
-                unique_parts.append(part)
-
-        # Merge into a single HTML string
-        full_description = '\n'.join(unique_parts)
-        return full_description
+        # Look for the bsc-info container
+        bsc_info = detail_tab_item.find('div', class_='bsc-info')
+        if bsc_info:
+            # Extract the full HTML content (keep all formatting, tables, etc.)
+            html_content = str(bsc_info)
+            if html_content:
+                return html_content

     return ''
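
Usage note (not part of the patch): below is a minimal, self-contained sketch of how the simplified extractor behaves. It restates the patched function and runs it against a made-up detail-tab / bsc-info fragment; the sample HTML and the html.parser choice are assumptions for illustration only.

# Usage sketch only -- not part of the patch. Mirrors the simplified
# extract_product_description from the diff and exercises it against a
# made-up HTML fragment (fragment and parser choice are assumptions).
from bs4 import BeautifulSoup


def extract_product_description(soup, base_url=''):
    """Extract the product description HTML (bsc-info block only)."""
    detail_tab_item = soup.find('div', class_='detail-tab')
    if detail_tab_item:
        bsc_info = detail_tab_item.find('div', class_='bsc-info')
        if bsc_info:
            html_content = str(bsc_info)
            if html_content:
                return html_content
    return ''


sample_html = """
<div class="detail-tab">
  <div class="bsc-info">
    <table><tr><td>Material</td><td>Stainless steel</td></tr></table>
  </div>
</div>
"""

soup = BeautifulSoup(sample_html, 'html.parser')
print(extract_product_description(soup))  # prints the <div class="bsc-info">...</div> block as HTML
print(repr(extract_product_description(BeautifulSoup('<div></div>', 'html.parser'))))  # '' when no match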