create category and menu before scraping
This commit is contained in:
parent
fa895638a6
commit
0f3617e09e
@ -749,6 +749,72 @@ async def upload_images(images: List[str], record_name: str, record_type: str, f
|
||||
return uploaded_urls
|
||||
|
||||
|
||||
def get_or_create_menu(menu_name: str, menu_pagetype: str, category_pagetype: str, category_record_name: str, site: Optional[str] = None, parent_menu_name: Optional[str] = None) -> Optional[str]:
    """Find or create a menu record and return its ``name``; return None on failure.

    The menu is linked to a category record, and may be attached to a parent
    menu so that the menu hierarchy follows the category hierarchy.

    Args:
        menu_name: Title of the menu; used together with ``site`` to look up
            an existing record.
        menu_pagetype: Pagetype of the menu records to query/create.
        category_pagetype: Pagetype of the linked category. It must contain
            one of the supported category type names (see the table below).
        category_record_name: ``name`` of the category record the menu links to.
        site: Value for the menu's ``site`` field. Required.
        parent_menu_name: Title of the parent menu. When no menu with that
            title exists for ``site``, the menu is created without a parent.

    Returns:
        The ``name`` of the existing or newly created menu record, or None if
        a required argument is missing, the category type is unsupported, or
        the backend call fails.
    """
    if not menu_name or not menu_pagetype or not category_record_name:
        return None

    # The site field is mandatory on menu records.
    if not site:
        return None

    try:
        # An existing menu is identified by its title and site.
        existing = jingrow.get_list(
            menu_pagetype,
            filters=[["title", "=", menu_name], ["site", "=", site]],
            limit=1,
        )
        if existing:
            return existing[0].get("name")

        # Without a category pagetype the menu type cannot be determined.
        if not category_pagetype:
            return None

        # (menu_type, category link field); the first entry whose type name
        # occurs inside category_pagetype wins, in this order.
        supported_types = (
            ("Product Category", "product_category"),
            ("Article Category", "article_category"),
            ("Page Category", "page_category"),
            ("Project Category", "project_category"),
            ("Presentation Category", "presentation_category"),
        )
        matched = next(
            (entry for entry in supported_types if entry[0] in category_pagetype),
            None,
        )
        if matched is None:
            return None  # unsupported category type
        menu_type, category_field = matched

        menu_data = {
            "title": menu_name,
            "menu_type": menu_type,
            "position": "Header",   # default position
            "status": "Published",  # default status
            "site": site,
            category_field: category_record_name,  # link to the category
        }

        # Attach to the parent menu when one with that title exists for the site.
        if parent_menu_name:
            parent_menus = jingrow.get_list(
                menu_pagetype,
                filters=[["title", "=", parent_menu_name], ["site", "=", site]],
                limit=1,
            )
            if parent_menus:
                menu_data["parent_jsite_menu"] = parent_menus[0].get("name")

        created = jingrow.create_pg(menu_pagetype, menu_data)
        if created:
            return created.get("name")
    except Exception as e:
        # Best effort: the caller treats None as "menu unavailable", but the
        # failure is logged instead of being silently discarded.
        jingrow.log_error(f"查找或创建菜单时出错 {menu_name}: {str(e)}")

    return None
|
||||
|
||||
|
||||
def get_or_create_category(category_name: str, category_pagetype: str, site: str = None, parent_category: str = "Products") -> Optional[str]:
|
||||
"""查找或创建分类,返回分类记录的name,失败返回None
|
||||
统一使用title字段查找和创建分类
|
||||
@ -1006,6 +1072,54 @@ async def create_record_async(product_data: Dict[str, Any], config: Dict[str, An
|
||||
}
|
||||
|
||||
|
||||
def prepare_category_and_menu(config: Dict[str, Any], category_name: str = None, site: str = None) -> bool:
    """Create the category and its menu before scraping starts.

    Reads ``category_pagetype`` (default ``"Jsite Product Category"``),
    ``parent_category`` (default ``"Products"``) and, when ``site`` is not
    given, ``default_site`` from ``config``. A ``default_site`` that starts
    with ``http://`` or ``https://`` is discarded.

    Args:
        config: Scraper configuration mapping.
        category_name: Title of the category to prepare. When empty, nothing
            is done and True is returned.
        site: Site value for the category and menu records. Menus are only
            created when a site is available.

    Returns:
        True on success (including when only the menu step failed or was
        skipped), False when the category could not be found/created or an
        exception occurred.
    """
    # Nothing to prepare without a category name; this counts as success.
    if not category_name:
        return True

    try:
        category_pagetype = config.get("category_pagetype", "Jsite Product Category")
        parent_category = config.get("parent_category", "Products")
        menu_pagetype = "Jsite Menu"

        # Fall back to the configured default site, rejecting URL-like values.
        if not site:
            default_site = config.get("default_site")
            if default_site:
                candidate = str(default_site).strip()
                site = None if candidate.startswith(('http://', 'https://')) else candidate

        # Step 1: make sure the parent category exists (failure is only logged).
        parent_record = get_or_create_category(parent_category, category_pagetype, site, None)
        if not parent_record:
            jingrow.log_error(f"警告:无法创建或找到父分类 {parent_category}")

        # Step 2: the category itself is required; give up if it is unavailable.
        category_record = get_or_create_category(category_name, category_pagetype, site, parent_category)
        if not category_record:
            jingrow.log_error(f"警告:无法创建或找到分类 {category_name}")
            return False

        # Step 3: mirror the category hierarchy in the menus; needs a site.
        if not site:
            return True

        # Ensure the parent menu first; an existing menu is reused, not duplicated.
        if parent_record:
            get_or_create_menu(parent_category, menu_pagetype, category_pagetype, parent_record, site, None)

        menu_record = get_or_create_menu(category_name, menu_pagetype, category_pagetype, category_record, site, parent_category)
        if not menu_record:
            # A missing menu does not block scraping; only a warning is logged.
            jingrow.log_error(f"警告:无法创建或找到菜单 {category_name}")

        return True
    except Exception as e:
        jingrow.log_error(f"准备分类和菜单时出错: {str(e)}")
        return False
|
||||
|
||||
|
||||
async def crawl_and_create_list(crawler, start_url: str, config: Dict[str, Any], base_url: str = '', max_pages: int = 100):
|
||||
"""爬取产品列表并逐条创建记录"""
|
||||
record_type = config.get("pagetype")
|
||||
@ -1027,14 +1141,28 @@ async def crawl_and_create_list(crawler, start_url: str, config: Dict[str, Any],
|
||||
created_records = []
|
||||
failed_records = []
|
||||
|
||||
# 在采集开始前,提前创建分类和菜单(最佳实践)
|
||||
# 先尝试从第一页提取分类名称(如果需要)
|
||||
if not category_name:
|
||||
products, _, cat_name = await crawl_single_list_page(crawler, start_url, 1, base_url)
|
||||
if cat_name:
|
||||
category_name = cat_name
|
||||
|
||||
# 获取site值用于创建分类和菜单
|
||||
site_value = None
|
||||
if config.get("default_site"):
|
||||
site_value = str(config.get("default_site")).strip()
|
||||
if site_value.startswith(('http://', 'https://')):
|
||||
site_value = None
|
||||
|
||||
# 提前创建分类和菜单(如果分类名称已确定)
|
||||
if category_name:
|
||||
prepare_category_and_menu(config, category_name, site_value)
|
||||
|
||||
while current_url and page_num <= max_pages:
|
||||
# 爬取当前页
|
||||
products, next_url, cat_name = await crawl_single_list_page(crawler, current_url, page_num, base_url)
|
||||
|
||||
# 如果配置中没有分类名称,且是第一页,尝试从页面提取
|
||||
if not category_name and page_num == 1 and cat_name:
|
||||
category_name = cat_name
|
||||
|
||||
if not products:
|
||||
break
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user