create category and menu before scraping
This commit is contained in:
parent
fa895638a6
commit
0f3617e09e
@ -749,6 +749,72 @@ async def upload_images(images: List[str], record_name: str, record_type: str, f
|
|||||||
return uploaded_urls
|
return uploaded_urls
|
||||||
|
|
||||||
|
|
||||||
|
def get_or_create_menu(menu_name: str, menu_pagetype: str, category_pagetype: str, category_record_name: str, site: str = None, parent_menu_name: str = None) -> Optional[str]:
    """Find or create a menu and return the menu record's ``name``.

    The menu is linked to a category record, so that menus can mirror the
    category hierarchy.

    Args:
        menu_name: Title of the menu; used together with ``site`` to look up
            an existing menu.
        menu_pagetype: Pagetype of the menu records to query/create.
        category_pagetype: Pagetype of the linked category. It must contain
            one of the supported category type names (see the table below),
            otherwise no menu is created.
        category_record_name: Name of the category record to link.
        site: Site the menu belongs to. Required.
        parent_menu_name: Title of the parent menu on the same site. If no
            such menu is found, the menu is created without a parent.

    Returns:
        The ``name`` of the existing or newly created menu, or ``None`` when
        a required argument is missing, the category type is unsupported, or
        the lookup/creation fails.
    """
    if not menu_name or not menu_pagetype or not category_record_name:
        return None

    if not site:
        return None  # the site field is mandatory

    # (menu_type, category link field) pairs. Matching is by substring and
    # the first hit wins, so the order of this table is significant.
    supported_types = (
        ("Product Category", "product_category"),
        ("Article Category", "article_category"),
        ("Page Category", "page_category"),
        ("Project Category", "project_category"),
        ("Presentation Category", "presentation_category"),
    )

    try:
        # Look the menu up by title and site; reuse it if it already exists.
        filters = [["title", "=", menu_name], ["site", "=", site]]
        existing = jingrow.get_list(menu_pagetype, filters=filters, limit=1)
        if existing:
            return existing[0].get("name")

        # Derive menu_type and the matching category field from the
        # category pagetype. A missing pagetype simply matches nothing.
        pagetype_text = category_pagetype or ""
        matched = next(
            (pair for pair in supported_types if pair[0] in pagetype_text),
            None,
        )
        if matched is None:
            return None  # unsupported category type
        menu_type, category_field = matched

        menu_data = {
            "title": menu_name,
            "menu_type": menu_type,
            "position": "Header",  # default position
            "status": "Published",  # default status
            "site": site,
            category_field: category_record_name,  # link to the category
        }

        # Attach the parent menu when one with that title exists on the site.
        if parent_menu_name:
            parent_filters = [["title", "=", parent_menu_name], ["site", "=", site]]
            parent_menus = jingrow.get_list(menu_pagetype, filters=parent_filters, limit=1)
            if parent_menus:
                menu_data["parent_jsite_menu"] = parent_menus[0].get("name")

        created = jingrow.create_pg(menu_pagetype, menu_data)
        if created:
            return created.get("name")
    except Exception as e:
        # Still best-effort (callers get None), but record the failure
        # instead of discarding it silently.
        jingrow.log_error(f"查找或创建菜单时出错: {str(e)}")

    return None
|
||||||
|
|
||||||
|
|
||||||
def get_or_create_category(category_name: str, category_pagetype: str, site: str = None, parent_category: str = "Products") -> Optional[str]:
|
def get_or_create_category(category_name: str, category_pagetype: str, site: str = None, parent_category: str = "Products") -> Optional[str]:
|
||||||
"""查找或创建分类,返回分类记录的name,失败返回None
|
"""查找或创建分类,返回分类记录的name,失败返回None
|
||||||
统一使用title字段查找和创建分类
|
统一使用title字段查找和创建分类
|
||||||
@ -1006,6 +1072,54 @@ async def create_record_async(product_data: Dict[str, Any], config: Dict[str, An
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def prepare_category_and_menu(config: Dict[str, Any], category_name: str = None, site: str = None) -> bool:
    """Create the category and its menu before scraping starts.

    Steps, in order: ensure the parent category exists, ensure the category
    itself exists, then (only when a site is known) ensure the parent menu
    and the category's menu exist.

    Args:
        config: Scraper configuration. Reads ``category_pagetype`` (default
            ``"Jsite Product Category"``), ``parent_category`` (default
            ``"Products"``) and ``default_site``.
        category_name: Title of the category to prepare. When empty, there is
            nothing to do and the call succeeds.
        site: Site to attach records to. When not given, ``default_site``
            from ``config`` is used, unless that value is an http(s) URL.

    Returns:
        ``True`` on success or when there is nothing to prepare; ``False``
        when the category could not be found/created or an exception was
        raised. Parent-category and menu failures are logged as warnings
        and do not make the call fail.
    """
    if not category_name:
        return True  # no category name: nothing to prepare

    try:
        category_pagetype = config.get("category_pagetype", "Jsite Product Category")
        parent_category = config.get("parent_category", "Products")
        menu_pagetype = "Jsite Menu"

        # Resolve the site from config when the caller did not supply one.
        # A default_site that is an http(s) URL is discarded.
        if not site and config.get("default_site"):
            site = str(config.get("default_site")).strip()
            if site.startswith(('http://', 'https://')):
                site = None

        # 1. Make sure the parent category exists (warning only on failure).
        parent_category_name = get_or_create_category(parent_category, category_pagetype, site, None)
        if not parent_category_name:
            jingrow.log_error(f"警告:无法创建或找到父分类 {parent_category}")

        # 2. Create the category itself; without it the preparation fails.
        category_record_name = get_or_create_category(category_name, category_pagetype, site, parent_category)
        if not category_record_name:
            jingrow.log_error(f"警告:无法创建或找到分类 {category_name}")
            return False

        # 3. Create the matching menus, following the same hierarchy.
        #    Menus need a site, so this step is skipped without one.
        if site:
            # Ensure the parent menu exists first.
            if parent_category_name:
                parent_menu_name = get_or_create_menu(parent_category, menu_pagetype, category_pagetype, parent_category_name, site, None)
                if not parent_menu_name:
                    jingrow.log_error(f"警告:无法创建或找到父菜单 {parent_category}")

            # category_record_name is known to be truthy here (checked in
            # step 2), so the menu can be created unconditionally.
            menu_record_name = get_or_create_menu(category_name, menu_pagetype, category_pagetype, category_record_name, site, parent_category)
            if not menu_record_name:
                # A menu failure must not block scraping; only warn.
                jingrow.log_error(f"警告:无法创建或找到菜单 {category_name}")

        return True
    except Exception as e:
        jingrow.log_error(f"准备分类和菜单时出错: {str(e)}")
        return False
|
||||||
|
|
||||||
|
|
||||||
async def crawl_and_create_list(crawler, start_url: str, config: Dict[str, Any], base_url: str = '', max_pages: int = 100):
|
async def crawl_and_create_list(crawler, start_url: str, config: Dict[str, Any], base_url: str = '', max_pages: int = 100):
|
||||||
"""爬取产品列表并逐条创建记录"""
|
"""爬取产品列表并逐条创建记录"""
|
||||||
record_type = config.get("pagetype")
|
record_type = config.get("pagetype")
|
||||||
@ -1027,14 +1141,28 @@ async def crawl_and_create_list(crawler, start_url: str, config: Dict[str, Any],
|
|||||||
created_records = []
|
created_records = []
|
||||||
failed_records = []
|
failed_records = []
|
||||||
|
|
||||||
|
# 在采集开始前,提前创建分类和菜单(最佳实践)
|
||||||
|
# 先尝试从第一页提取分类名称(如果需要)
|
||||||
|
if not category_name:
|
||||||
|
products, _, cat_name = await crawl_single_list_page(crawler, start_url, 1, base_url)
|
||||||
|
if cat_name:
|
||||||
|
category_name = cat_name
|
||||||
|
|
||||||
|
# 获取site值用于创建分类和菜单
|
||||||
|
site_value = None
|
||||||
|
if config.get("default_site"):
|
||||||
|
site_value = str(config.get("default_site")).strip()
|
||||||
|
if site_value.startswith(('http://', 'https://')):
|
||||||
|
site_value = None
|
||||||
|
|
||||||
|
# 提前创建分类和菜单(如果分类名称已确定)
|
||||||
|
if category_name:
|
||||||
|
prepare_category_and_menu(config, category_name, site_value)
|
||||||
|
|
||||||
while current_url and page_num <= max_pages:
|
while current_url and page_num <= max_pages:
|
||||||
# 爬取当前页
|
# 爬取当前页
|
||||||
products, next_url, cat_name = await crawl_single_list_page(crawler, current_url, page_num, base_url)
|
products, next_url, cat_name = await crawl_single_list_page(crawler, current_url, page_num, base_url)
|
||||||
|
|
||||||
# 如果配置中没有分类名称,且是第一页,尝试从页面提取
|
|
||||||
if not category_name and page_num == 1 and cat_name:
|
|
||||||
category_name = cat_name
|
|
||||||
|
|
||||||
if not products:
|
if not products:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user