extract description from bsc-info

jingrow 2025-11-14 05:51:26 +08:00
parent 0f9d2726ed
commit c57ce68d2b


@@ -517,129 +517,16 @@ def extract_product_title(soup):
 def extract_product_description(soup, base_url=''):
-    """Extract the full product description from the product detail page"""
-    description_parts = []
-    # Method 1: extract from the detail-tab-item container (made-in-china.com's standard structure)
-    detail_tab_item = soup.find('div', class_='detail-tab-item') or soup.find('div', class_='J-tab-cnt')
+    """Extract the full product description from the product detail page (bsc-info section only)"""
+    detail_tab_item = soup.find('div', class_='detail-tab')
     if detail_tab_item:
-        # Look for the detail-desc container
-        detail_desc = detail_tab_item.find('div', class_='detail-desc')
-        if detail_desc:
-            # Look for async-rich-info > rich-text
-            async_rich_info = detail_desc.find('div', class_='async-rich-info')
-            if async_rich_info:
-                rich_text = async_rich_info.find('div', class_='rich-text')
-                if rich_text:
-                    # Extract the full HTML content, preserving all formatting, images, tables, etc.
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-            # If async-rich-info was not found, look for rich-text directly
-            if not description_parts:
-                rich_text = detail_desc.find('div', class_='rich-text')
-                if rich_text:
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-            # If still nothing was found, extract the whole detail-desc content
-            if not description_parts:
-                html_content = str(detail_desc)
-                if html_content and len(html_content.strip()) > 50:
-                    description_parts.append(html_content)
-    # Method 2: look for the detail-desc container directly (when there is no detail-tab-item)
-    if not description_parts:
-        detail_desc = soup.find('div', class_='detail-desc')
-        if detail_desc:
-            # Look for async-rich-info > rich-text inside it
-            async_rich_info = detail_desc.find('div', class_='async-rich-info')
-            if async_rich_info:
-                rich_text = async_rich_info.find('div', class_='rich-text')
-                if rich_text:
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-            # If not found, look for rich-text directly
-            if not description_parts:
-                rich_text = detail_desc.find('div', class_='rich-text')
-                if rich_text:
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-    # Method 3: look for an async-rich-info or rich-text container (as a fallback)
-    if not description_parts:
-        async_rich_info = soup.find('div', class_='async-rich-info')
-        if async_rich_info:
-            rich_text = async_rich_info.find('div', class_='rich-text')
-            if rich_text:
-                html_content = str(rich_text)
-                if html_content and len(html_content.strip()) > 50:
-                    description_parts.append(html_content)
-        if not description_parts:
-            rich_text = soup.find('div', class_='rich-text')
-            if rich_text:
-                html_content = str(rich_text)
-                if html_content and len(html_content.strip()) > 50:
-                    description_parts.append(html_content)
-    # Method 4: look for a container whose heading contains "Product Description"
-    if not description_parts:
-        # Find headings containing "Product Description"
-        desc_headers = soup.find_all(['h2', 'h3'], string=re.compile(r'Product Description|产品描述', re.IGNORECASE))
-        for header in desc_headers:
-            # Walk up to the parent container
-            parent = header.find_parent()
-            if parent:
-                # Look for a following rich-text or async-rich-info
-                rich_text = parent.find('div', class_='rich-text') or parent.find('div', class_='async-rich-info')
-                if rich_text:
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-                        break
-                # If not found, extract everything after the header within the parent container
-                if not description_parts:
-                    current = header.find_next_sibling()
-                    content_parts = []
-                    for _ in range(20):  # scan at most 20 sibling elements
-                        if current:
-                            if current.name in ['h2', 'h3', 'h4', 'h5']:
-                                # Stop at the next heading
-                                break
-                            # Extract the content
-                            content = str(current)
-                            if content and len(content.strip()) > 20:
-                                content_parts.append(content)
-                            current = current.find_next_sibling()
-                        else:
-                            break
-                    if content_parts:
-                        description_parts.extend(content_parts)
-                        break
-    # Merge all description parts
-    if description_parts:
-        # Deduplicate and merge
-        unique_parts = []
-        seen = set()
-        for part in description_parts:
-            # Use the first 100 characters as the dedup key (to avoid repeating identical long content)
-            part_hash = part[:100] if len(part) > 100 else part
-            if part_hash not in seen:
-                seen.add(part_hash)
-                unique_parts.append(part)
-        # Merge into a single HTML string
-        full_description = '\n'.join(unique_parts)
-        return full_description
+        # Look for the bsc-info container
+        bsc_info = detail_tab_item.find('div', class_='bsc-info')
+        if bsc_info:
+            # Extract the full HTML content, preserving all formatting, tables, etc.
+            html_content = str(bsc_info)
+            if html_content:
+                return html_content
     return ''
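
A minimal usage sketch of the post-commit behavior. It assumes the new version of extract_product_description is in scope and that BeautifulSoup is the parser in use (the find(..., class_=...) calls in the diff suggest it); the sample_html fragment below is made up for illustration and does not reproduce the real made-in-china.com markup.

from bs4 import BeautifulSoup

# Hypothetical page fragment: a detail-tab wrapper with a bsc-info block inside.
sample_html = """
<div class="detail-tab">
  <div class="bsc-info">
    <table><tr><td>Material</td><td>Stainless Steel</td></tr></table>
  </div>
</div>
"""

soup = BeautifulSoup(sample_html, 'html.parser')
description_html = extract_product_description(soup)

# With the new logic this prints the serialized <div class="bsc-info">...</div>
# block; it prints '' when no detail-tab / bsc-info container is present.
print(description_html)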