create category and menu before scraping

This commit is contained in:
jingrow 2025-11-14 19:52:20 +08:00
parent fa895638a6
commit 0f3617e09e

View File

@ -749,6 +749,72 @@ async def upload_images(images: List[str], record_name: str, record_type: str, f
return uploaded_urls
def get_or_create_menu(menu_name: str, menu_pagetype: str, category_pagetype: str, category_record_name: str, site: Optional[str] = None, parent_menu_name: Optional[str] = None) -> Optional[str]:
    """Find a menu by title and site, creating it when it does not exist.

    The menu is linked to a category record, and an optional parent menu lets
    the menu tree follow the same hierarchy as the categories.

    Args:
        menu_name: Title of the menu to look up or create.
        menu_pagetype: Pagetype under which menus are stored.
        category_pagetype: Pagetype of the linked category. It must contain one
            of the supported category type names (Product, Article, Page,
            Project or Presentation Category).
        category_record_name: Record name of the category the menu links to.
        site: Site the menu belongs to. Required; nothing is looked up or
            created without it.
        parent_menu_name: Title of the parent menu. It is attached only when a
            menu with that title already exists for the same site.

    Returns:
        The ``name`` of the existing or newly created menu record, or ``None``
        when a required argument is missing, the category type is not
        supported, or the lookup/creation fails.
    """
    if not menu_name or not menu_pagetype or not category_record_name:
        return None
    if not site:
        return None  # the menu's site field is mandatory

    # (menu type, category link field) pairs, checked in this order. The menu
    # type doubles as the substring searched for inside category_pagetype.
    menu_type_fields = (
        ("Product Category", "product_category"),
        ("Article Category", "article_category"),
        ("Page Category", "page_category"),
        ("Project Category", "project_category"),
        ("Presentation Category", "presentation_category"),
    )

    try:
        # A menu is identified by its title together with its site.
        filters = [["title", "=", menu_name], ["site", "=", site]]
        existing = jingrow.get_list(menu_pagetype, filters=filters, limit=1)
        if existing:
            return existing[0].get("name")

        # Derive the menu type and the link field from the category pagetype.
        # A missing pagetype is treated as unsupported instead of raising.
        pagetype_text = category_pagetype or ""
        matched = next(
            (pair for pair in menu_type_fields if pair[0] in pagetype_text),
            None,
        )
        if matched is None:
            return None  # unsupported category type
        menu_type, category_field = matched

        menu_data = {
            "title": menu_name,
            "menu_type": menu_type,
            "position": "Header",  # default position
            "status": "Published",  # default status
            "site": site,
            category_field: category_record_name,  # link to the category
        }

        # Attach the parent menu only when it can actually be found.
        if parent_menu_name:
            parent_filters = [["title", "=", parent_menu_name], ["site", "=", site]]
            parent_menus = jingrow.get_list(menu_pagetype, filters=parent_filters, limit=1)
            if parent_menus:
                menu_data["parent_jsite_menu"] = parent_menus[0].get("name")

        created = jingrow.create_pg(menu_pagetype, menu_data)
        if created:
            return created.get("name")
    except Exception as e:
        # Best effort: report the failure instead of hiding it, then fall
        # through to the None result callers already handle.
        jingrow.log_error(f"查找或创建菜单时出错: {str(e)}")
    return None
def get_or_create_category(category_name: str, category_pagetype: str, site: str = None, parent_category: str = "Products") -> Optional[str]:
"""查找或创建分类返回分类记录的name失败返回None
统一使用title字段查找和创建分类
@ -1006,6 +1072,54 @@ async def create_record_async(product_data: Dict[str, Any], config: Dict[str, An
}
def prepare_category_and_menu(config: Dict[str, Any], category_name: str = None, site: str = None) -> bool:
    """Create the category and its menu before scraping starts.

    Args:
        config: Scraper configuration. Reads ``category_pagetype`` (default
            "Jsite Product Category"), ``parent_category`` (default
            "Products") and, when ``site`` is not given, ``default_site``.
        category_name: Title of the category to prepare. When empty, nothing
            is done and the call counts as successful.
        site: Site the records belong to. When empty, ``default_site`` from
            the config is used unless it looks like an http(s) URL.

    Returns:
        True when the category exists or was created (or there was nothing to
        do); False when the category could not be prepared or an exception
        occurred. A menu failure is logged but does not make the call fail.
    """
    if not category_name:
        return True  # no category name, nothing to prepare

    try:
        category_pagetype = config.get("category_pagetype", "Jsite Product Category")
        parent_category = config.get("parent_category", "Products")
        menu_pagetype = "Jsite Menu"

        # Fall back to the configured default site; a value that looks like a
        # URL is discarded.
        default_site = None if site else config.get("default_site")
        if default_site:
            candidate = str(default_site).strip()
            site = None if candidate.startswith(('http://', 'https://')) else candidate

        # Step 1: make sure the parent category exists (failure is only logged).
        parent_record = get_or_create_category(parent_category, category_pagetype, site, None)
        if not parent_record:
            jingrow.log_error(f"警告:无法创建或找到父分类 {parent_category}")

        # Step 2: the category itself is required.
        category_record = get_or_create_category(category_name, category_pagetype, site, parent_category)
        if not category_record:
            jingrow.log_error(f"警告:无法创建或找到分类 {category_name}")
            return False

        # Step 3: mirror the category hierarchy in the menus; menus need a site.
        if not site:
            return True

        if parent_record:
            # Existing menus are returned as-is, so this does not duplicate.
            get_or_create_menu(parent_category, menu_pagetype, category_pagetype, parent_record, site, None)

        menu_record = get_or_create_menu(category_name, menu_pagetype, category_pagetype, category_record, site, parent_category)
        if not menu_record:
            # A missing menu does not block scraping; record a warning only.
            jingrow.log_error(f"警告:无法创建或找到菜单 {category_name}")
        return True
    except Exception as e:
        jingrow.log_error(f"准备分类和菜单时出错: {str(e)}")
        return False
async def crawl_and_create_list(crawler, start_url: str, config: Dict[str, Any], base_url: str = '', max_pages: int = 100):
"""爬取产品列表并逐条创建记录"""
record_type = config.get("pagetype")
@ -1027,14 +1141,28 @@ async def crawl_and_create_list(crawler, start_url: str, config: Dict[str, Any],
created_records = []
failed_records = []
# 在采集开始前,提前创建分类和菜单(最佳实践)
# 先尝试从第一页提取分类名称(如果需要)
if not category_name:
products, _, cat_name = await crawl_single_list_page(crawler, start_url, 1, base_url)
if cat_name:
category_name = cat_name
# 获取site值用于创建分类和菜单
site_value = None
if config.get("default_site"):
site_value = str(config.get("default_site")).strip()
if site_value.startswith(('http://', 'https://')):
site_value = None
# 提前创建分类和菜单(如果分类名称已确定)
if category_name:
prepare_category_and_menu(config, category_name, site_value)
while current_url and page_num <= max_pages:
# 爬取当前页
products, next_url, cat_name = await crawl_single_list_page(crawler, current_url, page_num, base_url)
# 如果配置中没有分类名称,且是第一页,尝试从页面提取
if not category_name and page_num == 1 and cat_name:
category_name = cat_name
if not products:
break