extract large images from bigImg-wrap

jingrow 2025-11-14 04:52:49 +08:00
parent cb1b3887e8
commit 0f9d2726ed


@@ -264,33 +264,43 @@ def get_next_page_url(soup, current_url):
# ==================== Product detail page extraction ====================
def extract_product_detail_images(soup, base_url=''):
"""从产品详情页提取所有图片"""
"""从产品详情页提取所有图片(从 bigImg-wrap 中提取大图)"""
    images = []
    # Locate the image slider container
    slide_page = soup.find('div', class_='sr-proMainInfo-slide-page')
    if slide_page:
        # Find the list of image items
        slide_ul = slide_page.find('ul', class_='sr-proMainInfo-slide-pageUl')
        if slide_ul:
            # Find every li.J-pic-dot
            pic_items = slide_ul.find_all('li', class_='J-pic-dot')
            for item in pic_items:
                img_tag = item.find('img')
                if img_tag:
                    img_url = img_tag.get('data-original') or img_tag.get('src', '')
                    if img_url:
                        # Make sure the URL is absolute
                        if img_url.startswith('//'):
                            img_url = 'https:' + img_url
                        elif img_url.startswith('/'):
                            if base_url:
                                parsed = urlparse(base_url)
                                img_url = f"{parsed.scheme}://{parsed.netloc}" + img_url
                            else:
                                img_url = base_url + img_url
                        if img_url not in images:
                            images.append(img_url)
    # Extract the large images from bigImg-wrap
    big_img_wrap = soup.find('div', class_='bigImg-wrap') or soup.find('div', class_='J-bigImg-wrap')
    if big_img_wrap:
        # Find all J-pic-large-item elements
        pic_large_items = big_img_wrap.find_all('div', class_='J-pic-large-item')
        for item in pic_large_items:
            img_url = None
            # Prefer the fsrc attribute
            pic_inside = item.find('div', class_='sr-proMainInfo-slide-picInside')
            if pic_inside:
                img_url = pic_inside.get('fsrc')
            # If there is no fsrc, take the src of the img inside the enlargeHref link
            if not img_url:
                enlarge_href = item.find('a', class_='enlargeHref')
                if enlarge_href:
                    img_tag = enlarge_href.find('img')
                    if img_tag:
                        img_url = img_tag.get('src')
            if img_url:
                # Make sure the URL is absolute
                if img_url.startswith('//'):
                    img_url = 'https:' + img_url
                elif img_url.startswith('/'):
                    if base_url:
                        parsed = urlparse(base_url)
                        img_url = f"{parsed.scheme}://{parsed.netloc}" + img_url
                    else:
                        img_url = base_url + img_url
                if img_url not in images:
                    images.append(img_url)
    return images
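
For reference, a minimal usage sketch of the new bigImg-wrap path (not part of the commit). The HTML fragment and URLs below are invented; only the class and attribute names (bigImg-wrap, J-pic-large-item, sr-proMainInfo-slide-picInside, fsrc, enlargeHref) are taken from the code above.

# Sketch only: exercises extract_product_detail_images on a hypothetical fragment.
from bs4 import BeautifulSoup

sample_html = """
<div class="bigImg-wrap">
  <div class="J-pic-large-item">
    <div class="sr-proMainInfo-slide-picInside" fsrc="//img.example.com/large-1.jpg"></div>
  </div>
  <div class="J-pic-large-item">
    <a class="enlargeHref" href="#"><img src="//img.example.com/large-2.jpg"></a>
  </div>
</div>
"""

soup = BeautifulSoup(sample_html, 'html.parser')
print(extract_product_detail_images(soup, base_url='https://www.example.com'))
# ['https://img.example.com/large-1.jpg', 'https://img.example.com/large-2.jpg']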
@@ -506,6 +516,134 @@ def extract_product_title(soup):
    return ''
def extract_product_description(soup, base_url=''):
"""从产品详情页提取完整的产品详细描述"""
    description_parts = []
    # Method 1: extract from the detail-tab-item container (the standard made-in-china.com structure)
    detail_tab_item = soup.find('div', class_='detail-tab-item') or soup.find('div', class_='J-tab-cnt')
    if detail_tab_item:
        # Look for the detail-desc container
        detail_desc = detail_tab_item.find('div', class_='detail-desc')
        if detail_desc:
            # Look for async-rich-info > rich-text
            async_rich_info = detail_desc.find('div', class_='async-rich-info')
            if async_rich_info:
                rich_text = async_rich_info.find('div', class_='rich-text')
                if rich_text:
                    # Extract the full HTML content (keeping formatting, images, tables, etc.)
                    html_content = str(rich_text)
                    if html_content and len(html_content.strip()) > 50:
                        description_parts.append(html_content)
            # If async-rich-info was not found, look for rich-text directly
            if not description_parts:
                rich_text = detail_desc.find('div', class_='rich-text')
                if rich_text:
                    html_content = str(rich_text)
                    if html_content and len(html_content.strip()) > 50:
                        description_parts.append(html_content)
            # If still nothing was found, take the whole detail-desc content
            if not description_parts:
                html_content = str(detail_desc)
                if html_content and len(html_content.strip()) > 50:
                    description_parts.append(html_content)
    # Method 2: look for the detail-desc container directly (when there is no detail-tab-item)
    if not description_parts:
        detail_desc = soup.find('div', class_='detail-desc')
        if detail_desc:
            # Look for async-rich-info > rich-text inside it
            async_rich_info = detail_desc.find('div', class_='async-rich-info')
            if async_rich_info:
                rich_text = async_rich_info.find('div', class_='rich-text')
                if rich_text:
                    html_content = str(rich_text)
                    if html_content and len(html_content.strip()) > 50:
                        description_parts.append(html_content)
            # If not found, look for rich-text directly
            if not description_parts:
                rich_text = detail_desc.find('div', class_='rich-text')
                if rich_text:
                    html_content = str(rich_text)
                    if html_content and len(html_content.strip()) > 50:
                        description_parts.append(html_content)
    # Method 3: look for an async-rich-info or rich-text container anywhere (fallback)
    if not description_parts:
        async_rich_info = soup.find('div', class_='async-rich-info')
        if async_rich_info:
            rich_text = async_rich_info.find('div', class_='rich-text')
            if rich_text:
                html_content = str(rich_text)
                if html_content and len(html_content.strip()) > 50:
                    description_parts.append(html_content)
    if not description_parts:
        rich_text = soup.find('div', class_='rich-text')
        if rich_text:
            html_content = str(rich_text)
            if html_content and len(html_content.strip()) > 50:
                description_parts.append(html_content)
    # Method 4: look for a container whose heading contains "Product Description"
    if not description_parts:
        # Find headings that contain "Product Description"
        desc_headers = soup.find_all(['h2', 'h3'], string=re.compile(r'Product Description|产品描述', re.IGNORECASE))
        for header in desc_headers:
            # Walk up to the parent container
            parent = header.find_parent()
            if parent:
                # Look for a following rich-text or async-rich-info block
                rich_text = parent.find('div', class_='rich-text') or parent.find('div', class_='async-rich-info')
                if rich_text:
                    html_content = str(rich_text)
                    if html_content and len(html_content.strip()) > 50:
                        description_parts.append(html_content)
                        break
            # If nothing was found, take everything in the parent container after the heading
            if not description_parts:
                current = header.find_next_sibling()
                content_parts = []
                for _ in range(20):  # inspect at most 20 sibling elements
                    if current:
                        if current.name in ['h2', 'h3', 'h4', 'h5']:
                            # Stop at the next heading
                            break
                        # Extract the content
                        content = str(current)
                        if content and len(content.strip()) > 20:
                            content_parts.append(content)
                        current = current.find_next_sibling()
                    else:
                        break
                if content_parts:
                    description_parts.extend(content_parts)
                    break
    # Merge all description parts
    if description_parts:
        # De-duplicate, then merge
        unique_parts = []
        seen = set()
        for part in description_parts:
            # Use the first 100 characters as the de-duplication key (catches identical long content)
            part_hash = part[:100] if len(part) > 100 else part
            if part_hash not in seen:
                seen.add(part_hash)
                unique_parts.append(part)
        # Join into a single block of HTML
        full_description = '\n'.join(unique_parts)
        return full_description
    return ''
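
A similarly hedged sketch (not part of the commit) for the main path of extract_product_description, i.e. method 1 above. The fragment and its text are invented; the selectors detail-tab-item, detail-desc, async-rich-info and rich-text match the code, and the text is long enough to pass the 50-character length check.

# Sketch only: method 1 (detail-tab-item > detail-desc > async-rich-info > rich-text).
from bs4 import BeautifulSoup

sample_html = """
<div class="detail-tab-item">
  <div class="detail-desc">
    <div class="async-rich-info">
      <div class="rich-text">
        <p>Hypothetical product description, long enough to clear the
        50-character threshold applied by extract_product_description.</p>
      </div>
    </div>
  </div>
</div>
"""

soup = BeautifulSoup(sample_html, 'html.parser')
html_description = extract_product_description(soup)
# The rich-text block comes back as raw HTML, formatting preserved.
print(html_description.startswith('<div class="rich-text">'))  # True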
# ==================== Crawling functions ====================
async def crawl_product_detail(crawler, product_url, product_index=None, total_products=None, base_url=''):
@@ -545,6 +683,11 @@ async def crawl_product_detail(crawler, product_url, product_index=None, total_products=None, base_url=''):
        if product_details:
            detail_info['Product Details'] = product_details

        # Extract the full product description
        description = extract_product_description(soup, base_url)
        if description:
            detail_info['description'] = description

        return detail_info

    except Exception as e:
@@ -875,6 +1018,10 @@ async def crawl_and_create_list(crawler, start_url: str, config: Dict[str, Any],
            # Add Product Details (the complete product details)
            if 'Product Details' in detail_info:
                full_product_data['Product Details'] = detail_info['Product Details']

            # Add the full product description
            if 'description' in detail_info:
                full_product_data['description'] = detail_info['description']

            # Create the record asynchronously
            create_result = await create_record_async(full_product_data, config, label2field)