extract large images from bigImg-wrap

jingrow 2025-11-14 04:52:49 +08:00
parent cb1b3887e8
commit 0f9d2726ed


@@ -264,33 +264,43 @@ def get_next_page_url(soup, current_url):
# ==================== Product detail page extraction ====================
def extract_product_detail_images(soup, base_url=''):
"""从产品详情页提取所有图片"""
"""从产品详情页提取所有图片(从 bigImg-wrap 中提取大图)"""
    images = []
    # Locate the image slider container
    slide_page = soup.find('div', class_='sr-proMainInfo-slide-page')
    if slide_page:
        # Find the list of image items
        slide_ul = slide_page.find('ul', class_='sr-proMainInfo-slide-pageUl')
        if slide_ul:
            # Find every li.J-pic-dot
            pic_items = slide_ul.find_all('li', class_='J-pic-dot')
            for item in pic_items:
                img_tag = item.find('img')
                if img_tag:
                    img_url = img_tag.get('data-original') or img_tag.get('src', '')
                    if img_url:
                        # Make sure the URL is absolute
                        if img_url.startswith('//'):
                            img_url = 'https:' + img_url
                        elif img_url.startswith('/'):
                            if base_url:
                                parsed = urlparse(base_url)
                                img_url = f"{parsed.scheme}://{parsed.netloc}" + img_url
                            else:
                                img_url = base_url + img_url
                        if img_url not in images:
                            images.append(img_url)
    # Extract the large images from bigImg-wrap
    big_img_wrap = soup.find('div', class_='bigImg-wrap') or soup.find('div', class_='J-bigImg-wrap')
    if big_img_wrap:
        # Find all J-pic-large-item elements
        pic_large_items = big_img_wrap.find_all('div', class_='J-pic-large-item')
        for item in pic_large_items:
            img_url = None
            # Prefer the fsrc attribute
            pic_inside = item.find('div', class_='sr-proMainInfo-slide-picInside')
            if pic_inside:
                img_url = pic_inside.get('fsrc')
            # If there is no fsrc, take the src of the img inside the enlargeHref link
            if not img_url:
                enlarge_href = item.find('a', class_='enlargeHref')
                if enlarge_href:
                    img_tag = enlarge_href.find('img')
                    if img_tag:
                        img_url = img_tag.get('src')
            if img_url:
                # Make sure the URL is absolute
                if img_url.startswith('//'):
                    img_url = 'https:' + img_url
                elif img_url.startswith('/'):
                    if base_url:
                        parsed = urlparse(base_url)
                        img_url = f"{parsed.scheme}://{parsed.netloc}" + img_url
                    else:
                        img_url = base_url + img_url
                if img_url not in images:
                    images.append(img_url)
    return images
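
For reference, a minimal usage sketch of the new bigImg-wrap path (not part of the commit). The HTML fragment and URLs below are invented; only the class and attribute names (bigImg-wrap, J-pic-large-item, sr-proMainInfo-slide-picInside, fsrc, enlargeHref) are taken from the code above.

# Sketch only: exercises extract_product_detail_images on a hypothetical fragment.
from bs4 import BeautifulSoup

sample_html = """
<div class="bigImg-wrap">
  <div class="J-pic-large-item">
    <div class="sr-proMainInfo-slide-picInside" fsrc="//img.example.com/large-1.jpg"></div>
  </div>
  <div class="J-pic-large-item">
    <a class="enlargeHref" href="#"><img src="//img.example.com/large-2.jpg"></a>
  </div>
</div>
"""

soup = BeautifulSoup(sample_html, 'html.parser')
print(extract_product_detail_images(soup, base_url='https://www.example.com'))
# ['https://img.example.com/large-1.jpg', 'https://img.example.com/large-2.jpg']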
@@ -506,6 +516,134 @@ def extract_product_title(soup):
    return ''
def extract_product_description(soup, base_url=''):
"""从产品详情页提取完整的产品详细描述"""
    description_parts = []
    # Method 1: extract from the detail-tab-item container (the standard made-in-china.com structure)
    detail_tab_item = soup.find('div', class_='detail-tab-item') or soup.find('div', class_='J-tab-cnt')
    if detail_tab_item:
        # Look for the detail-desc container
        detail_desc = detail_tab_item.find('div', class_='detail-desc')
        if detail_desc:
            # Look for async-rich-info > rich-text
            async_rich_info = detail_desc.find('div', class_='async-rich-info')
            if async_rich_info:
                rich_text = async_rich_info.find('div', class_='rich-text')
                if rich_text:
                    # Extract the full HTML content (keeping formatting, images, tables, etc.)
                    html_content = str(rich_text)
                    if html_content and len(html_content.strip()) > 50:
                        description_parts.append(html_content)
            # If async-rich-info was not found, look for rich-text directly
            if not description_parts:
                rich_text = detail_desc.find('div', class_='rich-text')
                if rich_text:
                    html_content = str(rich_text)
                    if html_content and len(html_content.strip()) > 50:
                        description_parts.append(html_content)
            # If still nothing was found, take the whole detail-desc content
            if not description_parts:
                html_content = str(detail_desc)
                if html_content and len(html_content.strip()) > 50:
                    description_parts.append(html_content)
    # Method 2: look for the detail-desc container directly (when there is no detail-tab-item)
    if not description_parts:
        detail_desc = soup.find('div', class_='detail-desc')
        if detail_desc:
            # Look for async-rich-info > rich-text inside it
            async_rich_info = detail_desc.find('div', class_='async-rich-info')
            if async_rich_info:
                rich_text = async_rich_info.find('div', class_='rich-text')
                if rich_text:
                    html_content = str(rich_text)
                    if html_content and len(html_content.strip()) > 50:
                        description_parts.append(html_content)
            # If not found, look for rich-text directly
            if not description_parts:
                rich_text = detail_desc.find('div', class_='rich-text')
                if rich_text:
                    html_content = str(rich_text)
                    if html_content and len(html_content.strip()) > 50:
                        description_parts.append(html_content)
    # Method 3: look for an async-rich-info or rich-text container anywhere (fallback)
    if not description_parts:
        async_rich_info = soup.find('div', class_='async-rich-info')
        if async_rich_info:
            rich_text = async_rich_info.find('div', class_='rich-text')
            if rich_text:
                html_content = str(rich_text)
                if html_content and len(html_content.strip()) > 50:
                    description_parts.append(html_content)
    if not description_parts:
        rich_text = soup.find('div', class_='rich-text')
        if rich_text:
            html_content = str(rich_text)
            if html_content and len(html_content.strip()) > 50:
                description_parts.append(html_content)
    # Method 4: look for a container whose heading contains "Product Description"
    if not description_parts:
        # Find headings that contain "Product Description"
        desc_headers = soup.find_all(['h2', 'h3'], string=re.compile(r'Product Description|产品描述', re.IGNORECASE))
        for header in desc_headers:
            # Walk up to the parent container
            parent = header.find_parent()
            if parent:
                # Look for a following rich-text or async-rich-info block
                rich_text = parent.find('div', class_='rich-text') or parent.find('div', class_='async-rich-info')
                if rich_text:
                    html_content = str(rich_text)
                    if html_content and len(html_content.strip()) > 50:
                        description_parts.append(html_content)
                        break
            # If nothing was found, take everything in the parent container after the heading
            if not description_parts:
                current = header.find_next_sibling()
                content_parts = []
                for _ in range(20):  # inspect at most 20 sibling elements
                    if current:
                        if current.name in ['h2', 'h3', 'h4', 'h5']:
                            # Stop at the next heading
                            break
                        # Extract the content
                        content = str(current)
                        if content and len(content.strip()) > 20:
                            content_parts.append(content)
                        current = current.find_next_sibling()
                    else:
                        break
                if content_parts:
                    description_parts.extend(content_parts)
                    break
    # Merge all description parts
    if description_parts:
        # De-duplicate, then merge
        unique_parts = []
        seen = set()
        for part in description_parts:
            # Use the first 100 characters as the de-duplication key (catches identical long content)
            part_hash = part[:100] if len(part) > 100 else part
            if part_hash not in seen:
                seen.add(part_hash)
                unique_parts.append(part)
        # Join into a single block of HTML
        full_description = '\n'.join(unique_parts)
        return full_description
    return ''
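
A similarly hedged sketch (not part of the commit) for the main path of extract_product_description, i.e. method 1 above. The fragment and its text are invented; the selectors detail-tab-item, detail-desc, async-rich-info and rich-text match the code, and the text is long enough to pass the 50-character length check.

# Sketch only: method 1 (detail-tab-item > detail-desc > async-rich-info > rich-text).
from bs4 import BeautifulSoup

sample_html = """
<div class="detail-tab-item">
  <div class="detail-desc">
    <div class="async-rich-info">
      <div class="rich-text">
        <p>Hypothetical product description, long enough to clear the
        50-character threshold applied by extract_product_description.</p>
      </div>
    </div>
  </div>
</div>
"""

soup = BeautifulSoup(sample_html, 'html.parser')
html_description = extract_product_description(soup)
# The rich-text block comes back as raw HTML, formatting preserved.
print(html_description.startswith('<div class="rich-text">'))  # True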
# ==================== Crawling functions ====================
async def crawl_product_detail(crawler, product_url, product_index=None, total_products=None, base_url=''):
@@ -545,6 +683,11 @@ async def crawl_product_detail(crawler, product_url, product_index=None, total_products=None, base_url=''):
        if product_details:
            detail_info['Product Details'] = product_details

        # Extract the full product description
        description = extract_product_description(soup, base_url)
        if description:
            detail_info['description'] = description

        return detail_info

    except Exception as e:
@@ -875,6 +1018,10 @@ async def crawl_and_create_list(crawler, start_url: str, config: Dict[str, Any],
            # Add Product Details (the complete product details)
            if 'Product Details' in detail_info:
                full_product_data['Product Details'] = detail_info['Product Details']

            # Add the full product description
            if 'description' in detail_info:
                full_product_data['description'] = detail_info['description']

            # Create the record asynchronously
            create_result = await create_record_async(full_product_data, config, label2field)