add made_in_china_scraper and add parent category config
This commit is contained in:
parent
e75e2d0f75
commit
fa895638a6
@ -1,10 +1,10 @@
|
|||||||
{
|
{
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"type": "web_scrapers_create",
|
"type": "made_in_china_scraper",
|
||||||
"label": "网站采集并创建",
|
"label": "中国制造网产品采集",
|
||||||
"icon": "fa-spider",
|
"icon": "fa-spider",
|
||||||
"color": "rgba(71, 180, 133, 1)",
|
"color": "rgba(71, 180, 133, 1)",
|
||||||
"description": "采集网站产品信息并立即创建记录(一体化节点),支持字段映射和图片上传",
|
"description": "采集中国制造网产品信息并立即创建记录(一体化节点),支持字段映射和图片上传",
|
||||||
"group": "数据",
|
"group": "数据",
|
||||||
"component_type": "GenericNode"
|
"component_type": "GenericNode"
|
||||||
},
|
},
|
||||||
@ -68,6 +68,11 @@
|
|||||||
"title": "分类名称",
|
"title": "分类名称",
|
||||||
"description": "要关联的分类名称(分类记录的title字段值)。如果配置了此字段,所有创建的记录都会关联到该分类;如果不配置,会尝试从页面提取分类名称"
|
"description": "要关联的分类名称(分类记录的title字段值)。如果配置了此字段,所有创建的记录都会关联到该分类;如果不配置,会尝试从页面提取分类名称"
|
||||||
},
|
},
|
||||||
|
"parent_category": {
|
||||||
|
"type": "string",
|
||||||
|
"title": "父分类",
|
||||||
|
"description": "创建分类时的父分类名称(分类记录的title字段值)。如果不配置,使用默认值 Products"
|
||||||
|
},
|
||||||
"max_pages": {
|
"max_pages": {
|
||||||
"type": "integer",
|
"type": "integer",
|
||||||
"title": "最大页数",
|
"title": "最大页数",
|
||||||
@ -105,6 +110,7 @@
|
|||||||
"category_pagetype",
|
"category_pagetype",
|
||||||
"category_field",
|
"category_field",
|
||||||
"category_name",
|
"category_name",
|
||||||
|
"parent_category",
|
||||||
"max_pages",
|
"max_pages",
|
||||||
"default_site"
|
"default_site"
|
||||||
]
|
]
|
||||||
@ -749,11 +749,11 @@ async def upload_images(images: List[str], record_name: str, record_type: str, f
|
|||||||
return uploaded_urls
|
return uploaded_urls
|
||||||
|
|
||||||
|
|
||||||
def get_or_create_category(category_name: str, category_pagetype: str, site: str = None) -> Optional[str]:
|
def get_or_create_category(category_name: str, category_pagetype: str, site: str = None, parent_category: str = "Products") -> Optional[str]:
|
||||||
"""查找或创建分类,返回分类记录的name,失败返回None
|
"""查找或创建分类,返回分类记录的name,失败返回None
|
||||||
统一使用title字段查找和创建分类
|
统一使用title字段查找和创建分类
|
||||||
如果提供了site,会在查找和创建时使用site字段过滤
|
如果提供了site,会在查找和创建时使用site字段过滤
|
||||||
创建分类时会自动设置父分类为"Products"
|
创建分类时会自动设置父分类(默认值为"Products")
|
||||||
"""
|
"""
|
||||||
if not category_name or not category_pagetype:
|
if not category_name or not category_pagetype:
|
||||||
return None
|
return None
|
||||||
@ -772,16 +772,17 @@ def get_or_create_category(category_name: str, category_pagetype: str, site: str
|
|||||||
if site:
|
if site:
|
||||||
category_data["site"] = site
|
category_data["site"] = site
|
||||||
|
|
||||||
# 查找父分类"Products"并设置
|
# 查找父分类并设置
|
||||||
parent_filters = [["title", "=", "Products"]]
|
if parent_category:
|
||||||
if site:
|
parent_filters = [["title", "=", parent_category]]
|
||||||
parent_filters.append(["site", "=", site])
|
if site:
|
||||||
parent_categories = jingrow.get_list(category_pagetype, filters=parent_filters, limit=1)
|
parent_filters.append(["site", "=", site])
|
||||||
if parent_categories:
|
parent_categories = jingrow.get_list(category_pagetype, filters=parent_filters, limit=1)
|
||||||
parent_name = parent_categories[0].get("name")
|
if parent_categories:
|
||||||
# 父分类字段名格式:parent_分类pagetype名称(小写下划线格式)
|
parent_name = parent_categories[0].get("name")
|
||||||
parent_field = "parent_" + category_pagetype.lower().replace(" ", "_")
|
# 父分类字段名格式:parent_分类pagetype名称(小写下划线格式)
|
||||||
category_data[parent_field] = parent_name
|
parent_field = "parent_" + category_pagetype.lower().replace(" ", "_")
|
||||||
|
category_data[parent_field] = parent_name
|
||||||
|
|
||||||
created = jingrow.create_pg(category_pagetype, category_data)
|
created = jingrow.create_pg(category_pagetype, category_data)
|
||||||
if created:
|
if created:
|
||||||
@ -792,7 +793,7 @@ def get_or_create_category(category_name: str, category_pagetype: str, site: str
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def map_product_data_to_record(product_data: Dict[str, Any], field_map: List[Dict], label2field: Dict, record_type: str, default_site: str = "", category_name: str = None, category_field: str = None, category_pagetype: str = None) -> Dict[str, Any]:
|
def map_product_data_to_record(product_data: Dict[str, Any], field_map: List[Dict], label2field: Dict, record_type: str, default_site: str = "", category_name: str = None, category_field: str = None, category_pagetype: str = None, parent_category: str = "Products") -> Dict[str, Any]:
|
||||||
"""将产品数据映射为记录字段"""
|
"""将产品数据映射为记录字段"""
|
||||||
record_data = {}
|
record_data = {}
|
||||||
mapped_fields = set()
|
mapped_fields = set()
|
||||||
@ -871,7 +872,7 @@ def map_product_data_to_record(product_data: Dict[str, Any], field_map: List[Dic
|
|||||||
if site_value.startswith(('http://', 'https://')):
|
if site_value.startswith(('http://', 'https://')):
|
||||||
site_value = None
|
site_value = None
|
||||||
|
|
||||||
category_record_name = get_or_create_category(category_name, category_pagetype, site_value)
|
category_record_name = get_or_create_category(category_name, category_pagetype, site_value, parent_category)
|
||||||
if category_record_name:
|
if category_record_name:
|
||||||
record_data[category_field] = category_record_name
|
record_data[category_field] = category_record_name
|
||||||
|
|
||||||
@ -932,8 +933,11 @@ async def create_record_async(product_data: Dict[str, Any], config: Dict[str, An
|
|||||||
pass
|
pass
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# 获取父分类配置(默认值为"Products")
|
||||||
|
parent_category = config.get("parent_category", "Products")
|
||||||
|
|
||||||
# 映射字段
|
# 映射字段
|
||||||
record_data = map_product_data_to_record(product_data, field_map, label2field, record_type, default_site, category_name, category_field, category_pagetype)
|
record_data = map_product_data_to_record(product_data, field_map, label2field, record_type, default_site, category_name, category_field, category_pagetype, parent_category)
|
||||||
|
|
||||||
# 处理图片上传
|
# 处理图片上传
|
||||||
image_field = None
|
image_field = None
|
||||||
@ -1,5 +1,5 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "web_scrapers_create"
|
name = "made_in_china_scraper"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
requires-python = ">=3.10"
|
requires-python = ">=3.10"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
Loading…
x
Reference in New Issue
Block a user