extract description from bsc-info
This commit is contained in:
parent 0f9d2726ed
commit c57ce68d2b
@@ -517,129 +517,16 @@ def extract_product_title(soup):
 def extract_product_description(soup, base_url=''):
-    """Extract the full product description from the product detail page."""
-    description_parts = []
-
-    # Method 1: extract from the detail-tab-item container (the standard made-in-china.com structure)
-    detail_tab_item = soup.find('div', class_='detail-tab-item') or soup.find('div', class_='J-tab-cnt')
+    """Extract the full product description from the product detail page (bsc-info section only)."""
+    detail_tab_item = soup.find('div', class_='detail-tab')
     if detail_tab_item:
-        # Look for the detail-desc container
-        detail_desc = detail_tab_item.find('div', class_='detail-desc')
-        if detail_desc:
-            # Look for async-rich-info > rich-text
-            async_rich_info = detail_desc.find('div', class_='async-rich-info')
-            if async_rich_info:
-                rich_text = async_rich_info.find('div', class_='rich-text')
-                if rich_text:
-                    # Extract the full HTML content (keep all formatting, images, tables, etc.)
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-
-            # If async-rich-info was not found, look for rich-text directly
-            if not description_parts:
-                rich_text = detail_desc.find('div', class_='rich-text')
-                if rich_text:
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-
-            # If still nothing was found, extract the entire detail-desc content
-            if not description_parts:
-                html_content = str(detail_desc)
-                if html_content and len(html_content.strip()) > 50:
-                    description_parts.append(html_content)
-
-    # Method 2: look for the detail-desc container directly (when there is no detail-tab-item)
-    if not description_parts:
-        detail_desc = soup.find('div', class_='detail-desc')
-        if detail_desc:
-            # Look for async-rich-info > rich-text inside it
-            async_rich_info = detail_desc.find('div', class_='async-rich-info')
-            if async_rich_info:
-                rich_text = async_rich_info.find('div', class_='rich-text')
-                if rich_text:
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-
-            # If not found, look for rich-text directly
-            if not description_parts:
-                rich_text = detail_desc.find('div', class_='rich-text')
-                if rich_text:
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-
-    # Method 3: look for an async-rich-info or rich-text container (fallback)
-    if not description_parts:
-        async_rich_info = soup.find('div', class_='async-rich-info')
-        if async_rich_info:
-            rich_text = async_rich_info.find('div', class_='rich-text')
-            if rich_text:
-                html_content = str(rich_text)
-                if html_content and len(html_content.strip()) > 50:
-                    description_parts.append(html_content)
-
-    if not description_parts:
-        rich_text = soup.find('div', class_='rich-text')
-        if rich_text:
-            html_content = str(rich_text)
-            if html_content and len(html_content.strip()) > 50:
-                description_parts.append(html_content)
-
-    # Method 4: look for a container with a "Product Description" heading
-    if not description_parts:
-        # Find headings that contain "Product Description"
-        desc_headers = soup.find_all(['h2', 'h3'], string=re.compile(r'Product Description|产品描述', re.IGNORECASE))
-        for header in desc_headers:
-            # Walk up to the parent container
-            parent = header.find_parent()
-            if parent:
-                # Look for a following rich-text or async-rich-info
-                rich_text = parent.find('div', class_='rich-text') or parent.find('div', class_='async-rich-info')
-                if rich_text:
-                    html_content = str(rich_text)
-                    if html_content and len(html_content.strip()) > 50:
-                        description_parts.append(html_content)
-                        break
-
-                # If not found, extract everything after the header within the parent container
-                if not description_parts:
-                    current = header.find_next_sibling()
-                    content_parts = []
-                    for _ in range(20):  # check at most 20 sibling elements
-                        if current:
-                            if current.name in ['h2', 'h3', 'h4', 'h5']:
-                                # Stop at the next heading
-                                break
-                            # Extract the content
-                            content = str(current)
-                            if content and len(content.strip()) > 20:
-                                content_parts.append(content)
-                            current = current.find_next_sibling()
-                        else:
-                            break
-
-                    if content_parts:
-                        description_parts.extend(content_parts)
-                        break
-
-    # Merge all description parts
-    if description_parts:
-        # Deduplicate, then merge
-        unique_parts = []
-        seen = set()
-        for part in description_parts:
-            # Use the first 100 characters as the dedup key (avoids comparing identical long content)
-            part_hash = part[:100] if len(part) > 100 else part
-            if part_hash not in seen:
-                seen.add(part_hash)
-                unique_parts.append(part)
-
-        # Join into the full HTML
-        full_description = '\n'.join(unique_parts)
-        return full_description
+        # Look for the bsc-info container
+        bsc_info = detail_tab_item.find('div', class_='bsc-info')
+        if bsc_info:
+            # Extract the full HTML content (keep all formatting, tables, etc.)
+            html_content = str(bsc_info)
+            if html_content:
+                return html_content
 
     return ''
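For reference, a minimal usage sketch of the new extraction path. The detail-tab / bsc-info selectors and the function body are taken from the added lines above (restated so the snippet runs on its own); the sample markup and the html.parser choice are illustrative assumptions, not part of the commit.

# Minimal sketch, assuming bs4 is installed; the sample HTML below is made up
# and only mirrors the detail-tab > bsc-info structure the new code expects.
from bs4 import BeautifulSoup

def extract_product_description(soup, base_url=''):
    """Return the bsc-info block's HTML, or '' when it is missing."""
    detail_tab_item = soup.find('div', class_='detail-tab')
    if detail_tab_item:
        bsc_info = detail_tab_item.find('div', class_='bsc-info')
        if bsc_info:
            html_content = str(bsc_info)
            if html_content:
                return html_content
    return ''

sample_html = """
<div class="detail-tab">
  <div class="bsc-info">
    <table><tr><th>Material</th><td>Stainless Steel</td></tr></table>
  </div>
</div>
"""
soup = BeautifulSoup(sample_html, 'html.parser')
print(extract_product_description(soup))  # prints the <div class="bsc-info">...</div> markup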