jcloude/press/api/monitoring.py

# Copyright (c) 2021, JINGROW
# For license information, please see license.txt
from itertools import groupby

import jingrow
from jingrow.rate_limiter import rate_limit

from jcloude.exceptions import AlertRuleNotEnabled
from jcloude.jcloude.pagetype.monitor_server.monitor_server import get_monitor_server_ips
from jcloude.utils import log_error, servers_using_alternative_port_for_communication


def get_benches():
    """Return active benches and their sites, excluding self-hosted standalone and monitoring-disabled servers."""
    self_hosted_stand_alone_servers = jingrow.get_all(
        "Server",
        {"is_standalone": True, "is_self_hosted": True, "status": "Active"},
        pluck="name",
    )
    monitoring_disabled_servers = jingrow.get_all(
        "Server", {"is_monitoring_disabled": True, "status": ("!=", "Archived")}, pluck="name"
    )
    excluded_servers = set(self_hosted_stand_alone_servers + monitoring_disabled_servers)
    sites = jingrow.get_all(
        "Site",
        ["name", "bench"],
        {"status": "Active", "server": ("not in", excluded_servers), "is_monitoring_disabled": False},
        ignore_ifnull=True,
    )
    # groupby below requires sites to be sorted by the grouping key
    sites.sort(key=lambda x: (x.bench, x.name))
    bench_map = {
        bench.name: bench
        for bench in jingrow.get_all(
            "Bench",
            {"name": ("in", set(site.bench for site in sites))},
            ["name", "cluster", "server", "group"],
            ignore_ifnull=True,
        )
    }
    benches = []
    for bench_name, _sites in groupby(sites, lambda x: x.bench):
        bench = bench_map[bench_name]
        bench.update({"sites": [site.name for site in _sites]})
        benches.append(bench)
    return benches


def get_clusters():
    """Return clusters with a job-to-targets map built from their proxy, app, database and NFS servers."""
    servers = {}
    servers["proxy"] = jingrow.get_all("Proxy Server", {"status": ("!=", "Archived")}, ["name", "cluster"])
    servers["app"] = jingrow.get_all(
        "Server", {"status": ("!=", "Archived"), "is_monitoring_disabled": False}, ["name", "cluster"]
    )
    servers["database"] = jingrow.get_all(
        "Database Server",
        {"status": ("!=", "Archived"), "is_monitoring_disabled": False},
        ["name", "cluster"],
    )
    servers["nfs"] = jingrow.get_all("NFS Server", {"status": ("!=", "Archived")}, ["name", "cluster"])
    clusters = jingrow.get_all("Cluster")
    job_map = get_job_map()
    servers_using_alternative_port = servers_using_alternative_port_for_communication()
    for cluster in clusters:
        cluster["jobs"] = {}
        for server_type, server_type_servers in servers.items():
            for server in server_type_servers:
                if server.cluster == cluster.name:
                    for job in job_map[server_type]:
                        if server.name in servers_using_alternative_port:
                            # Servers communicating over the alternative port are targeted on 8443
                            cluster["jobs"].setdefault(job, []).append(f"{server.name}:8443")
                        else:
                            cluster["jobs"].setdefault(job, []).append(server.name)
    return clusters


def get_domains():
    """Return site domains that have a TLS certificate set."""
    return jingrow.get_all(
        "Site Domain", ["name", "site"], {"tls_certificate": ("is", "set")}, order_by="name"
    )


def get_tls():
    """Return servers whose TLS certificates should be monitored."""
    tls = []
    server_types = [
        "Server",
        "Proxy Server",
        "Database Server",
        "Registry Server",
        "Log Server",
        "Monitor Server",
        "Analytics Server",
        "Trace Server",
        "NFS Server",
    ]
    for server_type in server_types:
        filters = {"status": ("!=", "Archived")}
        if server_type in ("Server", "Database Server"):
            filters["is_monitoring_disabled"] = False
            filters["is_for_recovery"] = False
        tls += jingrow.get_all(server_type, filters, ["name"])
    return tls


def get_targets_method_rate_limit() -> int:
    if (
        jingrow.local
        and hasattr(jingrow.local, "request_ip")
        and jingrow.local.request_ip in get_monitor_server_ips()
    ):
        # Effectively no limit for known monitor servers
        return 1000
    # For unknown IPs, allow only 2 requests per minute
    return 2


MONITORING_ENDPOINT_RATE_LIMIT_WINDOW_SECONDS = 60


@jingrow.whitelist(allow_guest=True)
@rate_limit(limit=get_targets_method_rate_limit, seconds=MONITORING_ENDPOINT_RATE_LIMIT_WINDOW_SECONDS)
def targets(token=None):
    """Return monitoring targets (benches, clusters, domains, TLS hosts) if the token matches the configured monitor token."""
    if not token:
        jingrow.throw_permission_error()
    monitor_token = jingrow.db.get_single_value("Jcloude Settings", "monitor_token", cache=True)
    if token != monitor_token:
        return None
    return {"benches": get_benches(), "clusters": get_clusters(), "domains": get_domains(), "tls": get_tls()}


@jingrow.whitelist(allow_guest=True, xss_safe=True)
def alert(*args, **kwargs):
    """Store an incoming Alertmanager webhook payload as an Alertmanager Webhook Log."""
    user = jingrow.session.user
    try:
        webhook_token = jingrow.db.get_value(
            "Monitor Server",
            jingrow.db.get_single_value("Jcloude Settings", "monitor_server", cache=True),
            "webhook_token",
            cache=True,
        )
        if jingrow.request.args.get("webhook_token") != webhook_token:
            raise jingrow.AuthenticationError("Invalid credentials")
        jingrow.set_user("Administrator")
        pg = jingrow.get_pg(
            {
                "pagetype": "Alertmanager Webhook Log",
                "payload": jingrow.request.get_data().decode(),
            }
        )
        pg.insert()
    except AlertRuleNotEnabled:
        pass
    except jingrow.AuthenticationError:
        log_error("Alertmanager Webhook Authentication Error", args=args, kwargs=kwargs)
    except Exception:
        log_error("Alertmanager Webhook Error", args=args, kwargs=kwargs)
        raise
    finally:
        jingrow.set_user(user)


def get_job_map() -> dict[str, list[str]]:
    """Return the monitoring jobs to scrape for each server type."""
    DEFAULT_JOB_MAP = {
        "proxy": ["node", "nginx", "proxysql", "mariadb_proxy"],
        "app": ["node", "nginx", "docker", "cadvisor", "gunicorn", "rq"],
        "nfs": ["node", "nginx", "docker", "cadvisor", "gunicorn", "rq"],
        "database": ["node", "mariadb"],
    }
    if jingrow.local and hasattr(jingrow.local, "request_ip"):
        if jingrow.get_value(
            "Monitor Server",
            {"ip": jingrow.local.request_ip, "status": ("!=", "Archived")},
            "only_monitor_uptime_metrics",
            cache=True,
        ):
            # Monitor servers limited to uptime metrics only scrape the node job
            return {
                "proxy": ["node"],
                "app": ["node"],
                "nfs": ["node"],
                "database": ["node"],
            }
        return DEFAULT_JOB_MAP
    return DEFAULT_JOB_MAP
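

# Usage sketch (illustrative, not part of the module): how a monitor server
# could pull scrape targets from the `targets` endpoint above. The endpoint
# URL and the `message` response envelope are assumptions based on typical
# whitelisted-method routing; adjust them to the actual deployment.
#
#     import requests
#
#     url = "https://cloud.example.com/api/method/jcloude.api.monitoring.targets"
#     response = requests.get(url, params={"token": "<monitor_token>"}, timeout=30)
#     response.raise_for_status()
#
#     data = response.json().get("message") or {}
#     for cluster in data.get("clusters", []):
#         for job, hosts in cluster.get("jobs", {}).items():
#             print(job, hosts)  # e.g. "node" ["f1.example.com", "m1.example.com:8443"]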