From 0f3617e09e72a178abdd95b621db93ba1c666da2 Mon Sep 17 00:00:00 2001
From: jingrow
Date: Fri, 14 Nov 2025 19:52:20 +0800
Subject: [PATCH] create category and menu before scraping

---
 .../made_in_china_scraper.py | 136 +++++++++++++++++-
 1 file changed, 132 insertions(+), 4 deletions(-)

diff --git a/apps/jingrow/jingrow/ai/nodes/made_in_china_scraper/made_in_china_scraper.py b/apps/jingrow/jingrow/ai/nodes/made_in_china_scraper/made_in_china_scraper.py
index 43345a7..269e476 100644
--- a/apps/jingrow/jingrow/ai/nodes/made_in_china_scraper/made_in_china_scraper.py
+++ b/apps/jingrow/jingrow/ai/nodes/made_in_china_scraper/made_in_china_scraper.py
@@ -749,6 +749,72 @@ async def upload_images(images: List[str], record_name: str, record_type: str, f
     return uploaded_urls
 
 
+def get_or_create_menu(menu_name: str, menu_pagetype: str, category_pagetype: str, category_record_name: str, site: str = None, parent_menu_name: str = None) -> Optional[str]:
+    """Find or create a menu and return the menu record's name, or None on failure.
+    The menu is linked to a category record; an optional parent menu is attached by title lookup.
+    """
+    if not menu_name or not menu_pagetype or not category_record_name:
+        return None
+
+    if not site:
+        return None  # the site field is required
+
+    try:
+        # Look up an existing menu by its title and site fields
+        filters = [["title", "=", menu_name], ["site", "=", site]]
+        existing = jingrow.get_list(menu_pagetype, filters=filters, limit=1)
+        if existing:
+            return existing[0].get("name")
+
+        # Derive menu_type and the matching category link field from category_pagetype
+        menu_type = None
+        category_field = None
+        if "Product Category" in category_pagetype:
+            menu_type = "Product Category"
+            category_field = "product_category"
+        elif "Article Category" in category_pagetype:
+            menu_type = "Article Category"
+            category_field = "article_category"
+        elif "Page Category" in category_pagetype:
+            menu_type = "Page Category"
+            category_field = "page_category"
+        elif "Project Category" in category_pagetype:
+            menu_type = "Project Category"
+            category_field = "project_category"
+        elif "Presentation Category" in category_pagetype:
+            menu_type = "Presentation Category"
+            category_field = "presentation_category"
+
+        if not menu_type or not category_field:
+            return None  # unsupported category type
+
+        # Build the payload for the new menu record
+        menu_data = {
+            "title": menu_name,
+            "menu_type": menu_type,
+            "position": "Header",  # default position
+            "status": "Published",  # default status
+            "site": site,
+            category_field: category_record_name  # link the menu to its category
+        }
+
+        # Look up the parent menu by title and site, and attach it when found
+        if parent_menu_name:
+            parent_filters = [["title", "=", parent_menu_name], ["site", "=", site]]
+            parent_menus = jingrow.get_list(menu_pagetype, filters=parent_filters, limit=1)
+            if parent_menus:
+                parent_name = parent_menus[0].get("name")
+                menu_data["parent_jsite_menu"] = parent_name
+
+        created = jingrow.create_pg(menu_pagetype, menu_data)
+        if created:
+            return created.get("name")
+    except Exception as e:
+        jingrow.log_error(f"查找或创建菜单时出错: {str(e)}")
+
+    return None
+
+
 def get_or_create_category(category_name: str, category_pagetype: str, site: str = None, parent_category: str = "Products") -> Optional[str]:
     """查找或创建分类,返回分类记录的name,失败返回None
     统一使用title字段查找和创建分类
@@ -1006,6 +1072,54 @@ async def create_record_async(product_data: Dict[str, Any], config: Dict[str, An
     }
 
 
+def prepare_category_and_menu(config: Dict[str, Any], category_name: str = None, site: str = None) -> bool:
+    """Create the category and its menu before scraping starts.
+    Returns True on success (or when no category name is given), False on failure.
+    """
+    if not category_name:
+        return True  # no category name, nothing to prepare
+
+    try:
+        category_pagetype = config.get("category_pagetype", "Jsite Product Category")
+        parent_category = config.get("parent_category", "Products")
+        menu_pagetype = "Jsite Menu"
+
+        # Fall back to config["default_site"] when no site is passed; URL-like values are discarded
+        if not site and config.get("default_site"):
+            site = str(config.get("default_site")).strip()
+            if site.startswith(('http://', 'https://')):
+                site = None
+
+        # 1. Make sure the parent category exists (a failure here is only logged)
+        parent_category_name = get_or_create_category(parent_category, category_pagetype, site, None)
+        if not parent_category_name:
+            jingrow.log_error(f"警告:无法创建或找到父分类 {parent_category}")
+
+        # 2. Create the category itself; without it there is nothing to attach a menu to
+        category_record_name = get_or_create_category(category_name, category_pagetype, site, parent_category)
+        if not category_record_name:
+            jingrow.log_error(f"警告:无法创建或找到分类 {category_name}")
+            return False
+
+        # 3. Create the matching menus, mirroring the category hierarchy
+        if site:
+            # Ensure the parent menu exists first (an existing menu is returned, not duplicated)
+            if parent_category_name:
+                get_or_create_menu(parent_category, menu_pagetype, category_pagetype, parent_category_name, site, None)
+
+            # Create the menu for the category (an existing menu is returned, not duplicated)
+            if category_record_name:
+                menu_record_name = get_or_create_menu(category_name, menu_pagetype, category_pagetype, category_record_name, site, parent_category)
+                if not menu_record_name:
+                    jingrow.log_error(f"警告:无法创建或找到菜单 {category_name}")
+                    # A menu failure does not block scraping; it is only logged
+
+        return True
+    except Exception as e:
+        jingrow.log_error(f"准备分类和菜单时出错: {str(e)}")
+        return False
+
+
 async def crawl_and_create_list(crawler, start_url: str, config: Dict[str, Any], base_url: str = '', max_pages: int = 100):
     """爬取产品列表并逐条创建记录"""
     record_type = config.get("pagetype")
@@ -1027,14 +1141,28 @@ async def crawl_and_create_list(crawler, start_url: str, config: Dict[str, Any],
     created_records = []
     failed_records = []
 
+    # Create the category and menu up front, before scraping starts
+    # With no category name yet, fetch page 1 to extract one (NOTE(review): the loop below presumably fetches page 1 again - confirm)
+    if not category_name:
+        products, _, cat_name = await crawl_single_list_page(crawler, start_url, 1, base_url)
+        if cat_name:
+            category_name = cat_name
+
+    # Resolve the site value used to create the category and menu; URL-like values are discarded
+    site_value = None
+    if config.get("default_site"):
+        site_value = str(config.get("default_site")).strip()
+        if site_value.startswith(('http://', 'https://')):
+            site_value = None
+
+    # Create the category and menu ahead of time (only when a category name is known)
+    if category_name:
+        prepare_category_and_menu(config, category_name, site_value)
+
     while current_url and page_num <= max_pages:
         # 爬取当前页
         products, next_url, cat_name = await crawl_single_list_page(crawler, current_url, page_num, base_url)
 
-        # 如果配置中没有分类名称,且是第一页,尝试从页面提取
-        if not category_name and page_num == 1 and cat_name:
-            category_name = cat_name
-
         if not products:
             break
 