From 0f9d2726edce2bec7ebe534af001ce53b5d69c20 Mon Sep 17 00:00:00 2001
From: jingrow
Date: Fri, 14 Nov 2025 04:52:49 +0800
Subject: [PATCH] extract large images from bigImg-wrap

---
 .../web_scrapers_create.py | 197 +++++++++++++++---
 1 file changed, 172 insertions(+), 25 deletions(-)

diff --git a/apps/jingrow/jingrow/ai/nodes/web_scrapers_create/web_scrapers_create.py b/apps/jingrow/jingrow/ai/nodes/web_scrapers_create/web_scrapers_create.py
index 9ba8232..2f1999b 100644
--- a/apps/jingrow/jingrow/ai/nodes/web_scrapers_create/web_scrapers_create.py
+++ b/apps/jingrow/jingrow/ai/nodes/web_scrapers_create/web_scrapers_create.py
@@ -264,33 +264,43 @@ def get_next_page_url(soup, current_url):
 
 # ==================== Product detail page extraction functions ====================
 
 def extract_product_detail_images(soup, base_url=''):
-    """Extract all images from a product detail page"""
+    """Extract all images from a product detail page (large images from bigImg-wrap)"""
     images = []
 
-    # Find the image container
-    slide_page = soup.find('div', class_='sr-proMainInfo-slide-page')
-    if slide_page:
-        # Find all image items
-        slide_ul = slide_page.find('ul', class_='sr-proMainInfo-slide-pageUl')
-        if slide_ul:
-            # Find all li.J-pic-dot items
-            pic_items = slide_ul.find_all('li', class_='J-pic-dot')
-            for item in pic_items:
-                img_tag = item.find('img')
-                if img_tag:
-                    img_url = img_tag.get('data-original') or img_tag.get('src', '')
-                    if img_url:
-                        # Make sure the URL is absolute
-                        if img_url.startswith('//'):
-                            img_url = 'https:' + img_url
-                        elif img_url.startswith('/'):
-                            if base_url:
-                                parsed = urlparse(base_url)
-                                img_url = f"{parsed.scheme}://{parsed.netloc}" + img_url
-                            else:
-                                img_url = base_url + img_url
-                        if img_url not in images:
-                            images.append(img_url)
+    # Extract the large images from bigImg-wrap
+    big_img_wrap = soup.find('div', class_='bigImg-wrap') or soup.find('div', class_='J-bigImg-wrap')
+    if big_img_wrap:
+        # Find all J-pic-large-item elements
+        pic_large_items = big_img_wrap.find_all('div', class_='J-pic-large-item')
+        for item in pic_large_items:
+            img_url = None
+
+            # Prefer the fsrc attribute
+            pic_inside = item.find('div', class_='sr-proMainInfo-slide-picInside')
+            if pic_inside:
+                img_url = pic_inside.get('fsrc')
+
+            # If there is no fsrc, take the img src inside the enlargeHref link
+            if not img_url:
+                enlarge_href = item.find('a', class_='enlargeHref')
+                if enlarge_href:
+                    img_tag = enlarge_href.find('img')
+                    if img_tag:
+                        img_url = img_tag.get('src')
+
+            if img_url:
+                # Make sure the URL is absolute
+                if img_url.startswith('//'):
+                    img_url = 'https:' + img_url
+                elif img_url.startswith('/'):
+                    if base_url:
+                        parsed = urlparse(base_url)
+                        img_url = f"{parsed.scheme}://{parsed.netloc}" + img_url
+                    else:
+                        img_url = base_url + img_url
+
+                if img_url not in images:
+                    images.append(img_url)
 
     return images
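[Side sketch, not part of the patch: a quick check of the new extraction order,
assuming the patched extract_product_detail_images is in scope and bs4 is
installed. The markup and URLs are invented to mirror the selectors this hunk
targets: the first item resolves via fsrc, the second via the enlargeHref img.]

    from bs4 import BeautifulSoup

    html = '''
    <div class="bigImg-wrap">
      <div class="J-pic-large-item">
        <div class="sr-proMainInfo-slide-picInside" fsrc="//img.example.com/large-1.jpg"></div>
      </div>
      <div class="J-pic-large-item">
        <a class="enlargeHref" href="#"><img src="/img/large-2.jpg"></a>
      </div>
    </div>
    '''
    soup = BeautifulSoup(html, 'html.parser')
    print(extract_product_detail_images(soup, base_url='https://www.example.com/product'))
    # ['https://img.example.com/large-1.jpg', 'https://www.example.com/img/large-2.jpg']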
@@ -506,6 +516,134 @@ def extract_product_title(soup):
     return ''
 
 
+def extract_product_description(soup, base_url=''):
+    """Extract the full product description from a product detail page (base_url is accepted but not yet used)"""
+    description_parts = []
+
+    # Method 1: extract from the detail-tab-item container (the standard made-in-china.com layout)
+    detail_tab_item = soup.find('div', class_='detail-tab-item') or soup.find('div', class_='J-tab-cnt')
+    if detail_tab_item:
+        # Find the detail-desc container
+        detail_desc = detail_tab_item.find('div', class_='detail-desc')
+        if detail_desc:
+            # Look for async-rich-info > rich-text
+            async_rich_info = detail_desc.find('div', class_='async-rich-info')
+            if async_rich_info:
+                rich_text = async_rich_info.find('div', class_='rich-text')
+                if rich_text:
+                    # Keep the full HTML content (formatting, images, tables, etc.)
+                    html_content = str(rich_text)
+                    if html_content and len(html_content.strip()) > 50:
+                        description_parts.append(html_content)
+
+            # If async-rich-info is missing, look for rich-text directly
+            if not description_parts:
+                rich_text = detail_desc.find('div', class_='rich-text')
+                if rich_text:
+                    html_content = str(rich_text)
+                    if html_content and len(html_content.strip()) > 50:
+                        description_parts.append(html_content)
+
+            # If still nothing, take the whole detail-desc content
+            if not description_parts:
+                html_content = str(detail_desc)
+                if html_content and len(html_content.strip()) > 50:
+                    description_parts.append(html_content)
+
+    # Method 2: look for the detail-desc container directly (when there is no detail-tab-item)
+    if not description_parts:
+        detail_desc = soup.find('div', class_='detail-desc')
+        if detail_desc:
+            # Look for async-rich-info > rich-text inside it
+            async_rich_info = detail_desc.find('div', class_='async-rich-info')
+            if async_rich_info:
+                rich_text = async_rich_info.find('div', class_='rich-text')
+                if rich_text:
+                    html_content = str(rich_text)
+                    if html_content and len(html_content.strip()) > 50:
+                        description_parts.append(html_content)
+
+            # If not found, look for rich-text directly
+            if not description_parts:
+                rich_text = detail_desc.find('div', class_='rich-text')
+                if rich_text:
+                    html_content = str(rich_text)
+                    if html_content and len(html_content.strip()) > 50:
+                        description_parts.append(html_content)
+
+    # Method 3: fall back to any async-rich-info or rich-text container
+    if not description_parts:
+        async_rich_info = soup.find('div', class_='async-rich-info')
+        if async_rich_info:
+            rich_text = async_rich_info.find('div', class_='rich-text')
+            if rich_text:
+                html_content = str(rich_text)
+                if html_content and len(html_content.strip()) > 50:
+                    description_parts.append(html_content)
+
+    if not description_parts:
+        rich_text = soup.find('div', class_='rich-text')
+        if rich_text:
+            html_content = str(rich_text)
+            if html_content and len(html_content.strip()) > 50:
+                description_parts.append(html_content)
+
+    # Method 4: look for a container introduced by a "Product Description" heading
+    if not description_parts:
+        # Find headings that contain "Product Description"
+        desc_headers = soup.find_all(['h2', 'h3'], string=re.compile(r'Product Description|产品描述', re.IGNORECASE))
+        for header in desc_headers:
+            # Walk up to the parent container
+            parent = header.find_parent()
+            if parent:
+                # Look for a rich-text or async-rich-info that follows
+                rich_text = parent.find('div', class_='rich-text') or parent.find('div', class_='async-rich-info')
+                if rich_text:
+                    html_content = str(rich_text)
+                    if html_content and len(html_content.strip()) > 50:
+                        description_parts.append(html_content)
+                        break
+
+            # If not found, collect everything in the parent after the heading
+            if not description_parts:
+                current = header.find_next_sibling()
+                content_parts = []
+                for _ in range(20):  # Scan at most 20 sibling elements
+                    if current:
+                        if current.name in ['h2', 'h3', 'h4', 'h5']:
+                            # Stop at the next heading
+                            break
+                        # Collect the content
+                        content = str(current)
+                        if content and len(content.strip()) > 20:
+                            content_parts.append(content)
+                        current = current.find_next_sibling()
+                    else:
+                        break
+
+                if content_parts:
+                    description_parts.extend(content_parts)
+                    break
+
+    # Merge all description parts
+    if description_parts:
+        # Deduplicate, then merge
+        unique_parts = []
+        seen = set()
+        for part in description_parts:
+            # Use the first 100 characters as the dedup key (catches identical long fragments)
+            part_hash = part[:100] if len(part) > 100 else part
+            if part_hash not in seen:
+                seen.add(part_hash)
+                unique_parts.append(part)
+
+        # Join into a single HTML fragment
+        full_description = '\n'.join(unique_parts)
+        return full_description
+
+    return ''
+
+
 # ==================== Crawl functions ====================
 
 async def crawl_product_detail(crawler, product_url, product_index=None, total_products=None, base_url=''):
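[Side sketch, not part of the patch: the Method 1 happy path on a minimal
fixture, assuming the patched extract_product_description is in scope and bs4
is installed. The markup is invented, and the description text must clear the
50-character threshold used above.]

    from bs4 import BeautifulSoup

    html = '''
    <div class="detail-tab-item">
      <div class="detail-desc">
        <div class="async-rich-info">
          <div class="rich-text">
            <p>Heavy-duty widget, stainless steel housing, IP67 rated, for outdoor use.</p>
          </div>
        </div>
      </div>
    </div>
    '''
    soup = BeautifulSoup(html, 'html.parser')
    desc = extract_product_description(soup)
    print(desc.startswith('<div class="rich-text">'))  # True: the rich-text HTML is kept verbatim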
@@ -545,6 +683,11 @@ async def crawl_product_detail(crawler, product_url, product_index=None, total_p
         if product_details:
             detail_info['Product Details'] = product_details
 
+        # Extract the full product description
+        description = extract_product_description(soup, base_url)
+        if description:
+            detail_info['description'] = description
+
         return detail_info
 
     except Exception as e:
@@ -875,6 +1018,10 @@ async def crawl_and_create_list(crawler, start_url: str, config: Dict[str, Any],
                     # Add Product Details (the complete product detail blocks)
                     if 'Product Details' in detail_info:
                         full_product_data['Product Details'] = detail_info['Product Details']
+
+                    # Add the full product description
+                    if 'description' in detail_info:
+                        full_product_data['description'] = detail_info['description']
 
                     # Create the record asynchronously
                     create_result = await create_record_async(full_product_data, config, label2field)
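[Side sketch, not part of the patch: the '//' and '/' completion logic is
repeated in both the old and new image code; urllib.parse.urljoin could
centralize it, inheriting the base scheme rather than forcing https when a
base URL is available. The helper name absolutize is hypothetical.]

    from urllib.parse import urljoin

    def absolutize(img_url, base_url=''):
        # With a base URL, urljoin resolves protocol-relative ('//cdn...'),
        # root-relative ('/img/...') and plain relative paths in one call.
        if base_url:
            return urljoin(base_url, img_url)
        # No base available: default protocol-relative URLs to https.
        if img_url.startswith('//'):
            return 'https:' + img_url
        return img_url

    # absolutize('/img/a.jpg', 'https://www.example.com/p/1.html')
    # -> 'https://www.example.com/img/a.jpg'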