jcloude/press/api/server.py

870 lines
27 KiB
Python

# Copyright (c) 2022, Frappe and contributors
# For license information, please see license.txt
from __future__ import annotations
from datetime import datetime, timedelta
from datetime import timezone as tz
from typing import TYPE_CHECKING
import jingrow
import requests
from frappe.utils import convert_utc_to_timezone, flt
from frappe.utils.caching import redis_cache
from frappe.utils.password import get_decrypted_password
from press.api.analytics import get_rounded_boundaries
from press.api.bench import all as all_benches
from press.api.site import protected
from press.exceptions import MonitorServerDown
from press.press.doctype.auto_scale_record.auto_scale_record import validate_scaling_schedule
from press.press.doctype.site_plan.plan import Plan, filter_by_roles
from press.press.doctype.team.team import get_child_team_members
from press.utils import get_current_team
if TYPE_CHECKING:
from press.press.doctype.auto_scale_record.auto_scale_record import AutoScaleRecord
from press.press.doctype.cluster.cluster import Cluster
from press.press.doctype.database_server.database_server import DatabaseServer
from press.press.doctype.server.server import Server
from press.press.doctype.server_plan.server_plan import ServerPlan
def poly_get_pg(doctypes, name):
    """Load the document called ``name`` from the first doctype that contains it.

    The entries of ``doctypes`` are probed in order with ``frappe.db.exists``.
    When none of them has a record called ``name``, the lookup is still
    attempted against the last doctype in the list.
    """
    fallback = doctypes[-1]
    matched = next(
        (candidate for candidate in doctypes if frappe.db.exists(candidate, name)),
        fallback,
    )
    return frappe.get_pg(matched, name)
def get_mount_point(server: str, server_type=None) -> str:
    """Guess the data-disk mount point of ``server``.

    ``server_type`` may be a doctype name or one of the aliases
    "Application Server" / "Replication Server". When it is omitted, a server
    whose name starts with "m" is looked up as a "Database Server" and
    anything else as a "Server".

    Returns "/" for every provider other than "AWS EC2"; for AWS EC2 the
    answer is delegated to the document's ``guess_data_disk_mountpoint``.
    """
    doctype_aliases = {
        "Application Server": "Server",
        "Replication Server": "Database Server",
    }
    if server_type is None:
        doctype = "Database Server" if server[0] == "m" else "Server"
    else:
        doctype = doctype_aliases.get(server_type, server_type)

    server_pg: Server | DatabaseServer = frappe.get_pg(doctype, server)
    if server_pg.provider == "AWS EC2":
        return server_pg.guess_data_disk_mountpoint()
    return "/"
@jingrow.whitelist()
def all(server_filter=None):
    """List the non-archived servers of the current team and its child teams.

    Args:
        server_filter: optional dict with the keys ``server_type``
            ("App Servers", "Database Servers" or anything else for both)
            and ``tag`` (a Resource Tag name to restrict the listing to).
            Missing keys are treated as empty, i.e. "no restriction".

    Returns:
        A list of dict rows (name, title, status, creation, cluster) where
        each row is extended with ``plan``, ``app_server``, ``tags`` and
        ``region_info``.
    """
    if server_filter is None:
        server_filter = {"server_type": "", "tag": ""}
    server_type = server_filter.get("server_type", "")
    tag = server_filter.get("tag", "")

    team = get_current_team()
    child_teams = [child.name for child in get_child_team_members(team)]
    teams = [team, *child_teams]

    if server_type == "App Servers":
        query = _server_list_query("Server", teams, tag)
    elif server_type == "Database Servers":
        query = _server_list_query("Database Server", teams, tag)
    else:
        query = _server_list_query("Server", teams, tag) + _server_list_query(
            "Database Server", teams, tag
        )

    # union isn't supported in qb for run method
    # http://git.jingrow.com/frappe/frappe/issues/15609
    servers = frappe.db.sql(query.get_sql(), as_dict=True)
    for server in servers:
        # NOTE(review): the plan is always read from the "Server" doctype, also
        # for rows that came from "Database Server" - confirm this is intended.
        server_plan_name = frappe.get_value("Server", server.name, "plan")
        server["plan"] = frappe.get_pg("Server Plan", server_plan_name) if server_plan_name else None
        server["app_server"] = f"f{server.name[1:]}"
        server["tags"] = frappe.get_all("Resource Tag", {"parent": server.name}, pluck="tag_name")
        server["region_info"] = frappe.db.get_value(
            "Cluster", server.cluster, ["title", "image"], as_dict=True
        )
    return servers


def _server_list_query(doctype, teams, tag):
    """Build the listing query for one server doctype, optionally restricted to a tag."""
    table = frappe.qb.DocType(doctype)
    query = (
        frappe.qb.from_(table)
        .select(table.name, table.title, table.status, table.creation, table.cluster)
        .where(((table.team).isin(teams)) & (table.status != "Archived"))
    )
    if tag:
        res_tag = frappe.qb.DocType("Resource Tag")
        query = query.inner_join(res_tag).on(
            (res_tag.parent == table.name) & (res_tag.tag_name == tag)
        )
    return query
@jingrow.whitelist()
def server_tags():
    """Return the ``tag`` values of the current team's "Press Tag" records for servers."""
    tag_filters = {"team": get_current_team(), "doctype_name": "Server"}
    return frappe.get_all("Press Tag", tag_filters, pluck="tag")
@jingrow.whitelist()
@protected(["Server", "Database Server"])
def get(name):
    """Return the summary dict of the "Server" or "Database Server" called ``name``.

    The dict carries identity fields, the matching app server name, cluster
    details, the tags attached to the server and the tags the owning team
    has defined for servers.
    """
    server = poly_get_pg(["Server", "Database Server"], name)

    # Don't use `f` series if self hosted
    if server.is_self_hosted:
        app_server = server.name
    else:
        app_server = f"f{server.name[1:]}"

    region_info = frappe.db.get_value(
        "Cluster", server.cluster, ["name", "title", "image"], as_dict=True
    )
    attached_tags = [{"name": row.tag, "tag": row.tag_name} for row in server.tags]
    team_tags = frappe.get_all(
        "Press Tag", {"team": server.team, "doctype_name": "Server"}, ["name", "tag"]
    )
    server_kind = "database-server" if server.meta.name == "Database Server" else "server"

    return {
        "name": server.name,
        "title": server.title,
        "status": server.status,
        "team": server.team,
        "app_server": app_server,
        "region_info": region_info,
        "server_tags": attached_tags,
        "tags": team_tags,
        "type": server_kind,
    }
@jingrow.whitelist()
@protected(["Server", "Database Server"])
def overview(name):
    """Return the plan and ownership information of the server called ``name``.

    ``plan`` is the "Server Plan" document (or ``None`` when the server has
    no plan) with its ``disk`` replaced by the ``disk_size`` of the
    "Virtual Machine" of the same name.
    """
    server = poly_get_pg(["Server", "Database Server"], name)

    plan = None
    if server.plan:
        plan = frappe.get_pg("Server Plan", server.plan)
    if plan:
        # override plan disk size with the actual disk size
        # TODO: Remove this once we remove old dashboard
        plan.disk = frappe.db.get_value("Virtual Machine", name, "disk_size")

    team_user = frappe.get_value("Team", server.team, "user")
    owner = frappe.db.get_value(
        "User",
        team_user,
        ["first_name", "last_name", "user_image"],
        as_dict=True,
    )
    return {
        "plan": plan or None,
        "info": {"owner": owner, "created_on": server.creation},
    }
@jingrow.whitelist()
@protected(["Server", "Database Server"])
def archive(name):
    """Call ``drop_server`` on the "Server" or "Database Server" called ``name``."""
    poly_get_pg(["Server", "Database Server"], name).drop_server()
@jingrow.whitelist()
@protected(["Server"])
def get_reclaimable_size(name):
    """Return the agent's response to "server/reclaimable-size" for the server ``name``."""
    app_server: Server = frappe.get_pg("Server", name)
    agent = app_server.agent
    return agent.get("server/reclaimable-size")
@jingrow.whitelist()
def new(server):
    """Create a database server and an app server for the current team.

    Args:
        server: dict with ``title``, ``cluster``, ``app_plan``, ``db_plan``
            and optionally ``auto_increase_storage``.

    Returns:
        ``{"server": <app server name>, "job": <job name>}`` for the app
        server that was created last.

    Every precondition (ARM support, enabled team, machine availability for
    both plans, an active primary proxy in the cluster) is verified before
    the first server is created, so a failing check does not leave a
    database server behind without its app server.
    """
    cluster_name = server["cluster"]
    server_plan_platform = frappe.get_value("Server Plan", server["app_plan"], "platform")
    cluster_has_arm_support = frappe.get_value("Cluster", cluster_name, "has_arm_support")
    if server_plan_platform == "arm64" and not cluster_has_arm_support:
        frappe.throw(f"ARM Instances are currently unavailable in the {cluster_name} region")

    team = get_current_team(get_pg=True)
    if not team.enabled:
        frappe.throw("You cannot create a new server because your account is disabled")

    cluster: Cluster = frappe.get_pg("Cluster", cluster_name)
    db_plan: ServerPlan = frappe.get_pg("Server Plan", server["db_plan"])
    app_plan: ServerPlan = frappe.get_pg("Server Plan", server["app_plan"])
    for plan in (db_plan, app_plan):
        if not cluster.check_machine_availability(plan.instance_type):
            frappe.throw(
                f"No machines of {plan.instance_type} are currently available in the {cluster.name} region"
            )

    proxy_servers = frappe.get_all(
        "Proxy Server",
        {"status": "Active", "cluster": cluster.name, "is_primary": True},
        limit=1,
    )
    if not proxy_servers:
        # Previously this surfaced as an IndexError after the database server was created.
        frappe.throw(f"No active primary proxy server is available in the {cluster.name} region")

    auto_increase_storage = server.get("auto_increase_storage", False)
    db_server, _db_job = cluster.create_server(
        "Database Server",
        server["title"],
        db_plan,
        team=team.name,
        auto_increase_storage=auto_increase_storage,
    )

    # to be used by app server
    cluster.database_server = db_server.name
    cluster.proxy_server = proxy_servers[0].name

    app_server, job = cluster.create_server(
        "Server", server["title"], app_plan, team=team.name, auto_increase_storage=auto_increase_storage
    )
    return {"server": app_server.name, "job": job.name}
def get_cpu_and_memory_usage(name: str, time_range: str = "4m") -> dict[str, float]:
    """Returns simplified CPU and memory usage [0..1] for autoscale triggers

    Args:
        name: value of the ``instance`` label to query.
        time_range: PromQL range used for both averages.

    Returns:
        ``{"vcpu": ..., "memory": ...}``. Both values are 0.0 when no monitor
        server is configured or when the response is not a successful
        two-element vector.

    Raises:
        requests.exceptions.RequestException: if the monitor server cannot be
            reached or does not answer within the timeout.
    """
    no_usage = {"vcpu": 0.0, "memory": 0.0}
    monitor_server = frappe.db.get_single_value("Press Settings", "monitor_server")
    if not monitor_server:
        return no_usage

    # The query text is kept exactly as before; only its layout in the source changed.
    query = (
        "\n"
        f'1 - avg(rate(node_cpu_seconds_total{{instance="{name}",job="node",mode="idle"}}[{time_range}]))\n'
        "OR\n"
        "1 -\n"
        "(\n"
        f'avg_over_time(node_memory_MemAvailable_bytes{{instance="{name}", job="node"}}[{time_range}])\n'
        "/\n"
        f'avg_over_time(node_memory_MemTotal_bytes{{instance="{name}", job="node"}}[{time_range}])\n'
        ")\n"
    )
    url = f"https://{monitor_server}/prometheus/api/v1/query"
    password = get_decrypted_password("Monitor Server", monitor_server, "grafana_password")

    # Without a timeout a stalled monitor server would block the caller indefinitely.
    # NOTE(review): 10 seconds is a guess - tune to the monitor server's latency.
    response = requests.get(
        url,
        params={"query": query},
        auth=("frappe", str(password)),
        timeout=10,
    ).json()

    data = response.get("data") or {}
    results = data.get("result") or []
    if response.get("status") == "success" and data.get("resultType") == "vector" and len(results) == 2:
        # First sample is the CPU expression, second the memory expression.
        return {
            "vcpu": float(results[0]["value"][1]),
            "memory": float(results[1]["value"][1]),
        }
    return no_usage
@jingrow.whitelist()
@protected(["Server", "Database Server"])
def usage(name):
    """Return the latest disk, memory, free-memory and CPU usage of the server ``name``.

    ``disk`` is divided down to GiB, ``memory`` to MiB and ``free_memory`` is
    the 10 minute average of ``MemAvailable`` in bytes. A key is only present
    when Prometheus returned data for it; ``vcpu`` is always set from
    ``get_cpu_and_memory_usage``.
    """
    mount_point = get_mount_point(name)

    def passthrough(metric):
        return metric

    promql_by_metric = {
        "disk": f"""sum(node_filesystem_size_bytes{{instance="{name}", job="node", mountpoint=~"{mount_point}"}} - node_filesystem_avail_bytes{{instance="{name}", job="node", mountpoint=~"{mount_point}"}}) by ()/ (1024 * 1024 * 1024)""",
        "memory": f"""(node_memory_MemTotal_bytes{{instance="{name}",job="node"}} - node_memory_MemFree_bytes{{instance="{name}",job="node"}} - (node_memory_Cached_bytes{{instance="{name}",job="node"}} + node_memory_Buffers_bytes{{instance="{name}",job="node"}})) / (1024 * 1024)""",
        "free_memory": f"""avg_over_time(node_memory_MemAvailable_bytes{{instance="{name}", job="node"}}[10m])""",
    }

    latest = {}
    for metric, promql in promql_by_metric.items():
        datasets = prometheus_query(promql, passthrough, "Asia/Kolkata", 120, 120)["datasets"]
        if datasets:
            latest[metric] = datasets[0]["values"][-1]

    # Changing CPU usage to a vector result averaging over a 120s window
    latest["vcpu"] = get_cpu_and_memory_usage(name)["vcpu"]
    return latest
@protected(["Server", "Database Server"])
def total_resource(name):
    """Return the total vCPU count, disk size (GiB) and memory (MiB) of the server ``name``.

    Each value is the last point of the first dataset Prometheus returned;
    a key is omitted when the corresponding query produced no data.
    """
    mount_point = get_mount_point(name)

    def passthrough(metric):
        return metric

    promql_by_resource = {
        "vcpu": f"""(count(count(node_cpu_seconds_total{{instance="{name}",job="node"}}) by (cpu)))""",
        "disk": f"""sum(node_filesystem_size_bytes{{instance="{name}", job="node", mountpoint=~"{mount_point}"}}) by () / (1024 * 1024 * 1024)""",
        "memory": f"""(node_memory_MemTotal_bytes{{instance="{name}",job="node"}}) / (1024 * 1024)""",
    }

    totals = {}
    for resource, promql in promql_by_resource.items():
        datasets = prometheus_query(promql, passthrough, "Asia/Kolkata", 120, 120)["datasets"]
        if not datasets:
            continue
        totals[resource] = datasets[0]["values"][-1]
    return totals
def calculate_swap(name):
    """Return swap figures for the server ``name``.

    Keys (each present only when Prometheus returned data):
        ``swap_used``: used swap as a percentage of total swap.
        ``swap``: total swap in GiB.
        ``required``: (memory + swap totals) minus free, cached, buffered and
            swap-cached memory, in GiB.
    """

    def passthrough(metric):
        return metric

    swap_used_promql = f"""((node_memory_SwapTotal_bytes{{instance="{name}",job="node"}} - node_memory_SwapFree_bytes{{instance="{name}",job="node"}}) / node_memory_SwapTotal_bytes{{instance="{name}",job="node"}}) * 100"""
    swap_promql = f"""node_memory_SwapTotal_bytes{{instance="{name}",job="node"}} / (1024 * 1024 * 1024)"""
    # Multi-line query assembled line by line; the resulting text is unchanged.
    required_promql = (
        "(\n"
        f'(node_memory_MemTotal_bytes{{instance="{name}",job="node"}} +\n'
        f'node_memory_SwapTotal_bytes{{instance="{name}",job="node"}}\n'
        ") -\n"
        f'(node_memory_MemFree_bytes{{instance="{name}",job="node"}} +\n'
        f'node_memory_SwapFree_bytes{{instance="{name}",job="node"}} +\n'
        f'node_memory_Cached_bytes{{instance="{name}",job="node"}} +\n'
        f'node_memory_Buffers_bytes{{instance="{name}",job="node"}} +\n'
        f'node_memory_SwapCached_bytes{{instance="{name}",job="node"}}\n'
        ")\n"
        ") /\n"
        "(1024 * 1024 * 1024)"
    )
    promql_by_key = {
        "swap_used": swap_used_promql,
        "swap": swap_promql,
        "required": required_promql,
    }

    figures = {}
    for key, promql in promql_by_key.items():
        datasets = prometheus_query(promql, passthrough, "Asia/Kolkata", 120, 120)["datasets"]
        if datasets:
            figures[key] = datasets[0]["values"][-1]
    return figures
@jingrow.whitelist()
@protected(["Server", "Database Server"])
@redis_cache(ttl=10 * 60)
def analytics(name, query, timezone, duration, server_type=None):
    """Return the time series for one named chart of the server ``name``.

    Args:
        name: value of the ``instance`` label to query.
        query: chart key, e.g. "cpu", "network", "space" or "database_uptime";
            an unknown key raises ``KeyError``.
        timezone: timezone the labels are converted to.
        duration: one of the keys accepted by ``get_timespan_timegrain``.
        server_type: optional hint forwarded to ``get_mount_point``.

    Returns:
        The ``{"datasets": [...], "labels": [...]}`` dict built by
        ``prometheus_query``.
    """
    mount_point = get_mount_point(name, server_type)
    timespan, timegrain = get_timespan_timegrain(duration)

    def by_label(key):
        """Name each series after the value of one of its metric labels."""
        return lambda metric: metric[key]

    def constant(text):
        """Give every series the same fixed name."""
        return lambda _metric: text

    def load_average_name(metric):
        return f"Load Average {metric['__name__'][9:]}"  # strip "node_load" prefix

    def connection_name(metric):
        if metric["__name__"] == "mysql_global_variables_max_connections":
            return "Max Connections"
        return "Connected Clients"

    # Each chart key maps to (PromQL expression, series naming function).
    query_map = {
        "cpu": (
            f"""sum by (mode)(rate(node_cpu_seconds_total{{instance="{name}", job="node"}}[{timegrain}s])) * 100""",
            by_label("mode"),
        ),
        "network": (
            f"""rate(node_network_receive_bytes_total{{instance="{name}", job="node", device=~"ens.*"}}[{timegrain}s]) * 8""",
            by_label("device"),
        ),
        "iops": (
            f"""rate(node_disk_reads_completed_total{{instance="{name}", job="node"}}[{timegrain}s])""",
            by_label("device"),
        ),
        "space": (
            f"""100 - ((node_filesystem_avail_bytes{{instance="{name}", job="node", mountpoint=~"{mount_point}"}} * 100) / node_filesystem_size_bytes{{instance="{name}", job="node", mountpoint=~"{mount_point}"}})""",
            by_label("mountpoint"),
        ),
        "loadavg": (
            f"""{{__name__=~"node_load1|node_load5|node_load15", instance="{name}", job="node"}}""",
            load_average_name,
        ),
        "memory": (
            f"""node_memory_MemTotal_bytes{{instance="{name}",job="node"}} - node_memory_MemFree_bytes{{instance="{name}",job="node"}} - (node_memory_Cached_bytes{{instance="{name}",job="node"}} + node_memory_Buffers_bytes{{instance="{name}",job="node"}})""",
            constant("Used"),
        ),
        "database_uptime": (
            f"""mysql_up{{instance="{name}",job="mariadb"}}""",
            constant("Uptime"),
        ),
        "database_commands_count": (
            f"""sum(round(increase(mysql_global_status_commands_total{{instance='{name}', command=~"select|update|insert|delete|begin|commit|rollback"}}[{timegrain}s]))) by (command)""",
            by_label("command"),
        ),
        "database_connections": (
            f"""{{__name__=~"mysql_global_status_threads_connected|mysql_global_variables_max_connections", instance="{name}"}}""",
            connection_name,
        ),
        "innodb_bp_size": (
            f"""mysql_global_variables_innodb_buffer_pool_size{{instance='{name}'}}""",
            constant("Buffer Pool Size"),
        ),
        "innodb_bp_size_of_total_ram": (
            f"""avg by (instance) ((mysql_global_variables_innodb_buffer_pool_size{{instance=~"{name}"}} * 100)) / on (instance) (avg by (instance) (node_memory_MemTotal_bytes{{instance=~"{name}"}}))""",
            constant("Buffer Pool Size of Total Ram"),
        ),
        "innodb_bp_miss_percent": (
            (
                "\n"
                "avg by (instance) (\n"
                f'rate(mysql_global_status_innodb_buffer_pool_reads{{instance=~"{name}"}}[{timegrain}s])\n'
                "/\n"
                f'rate(mysql_global_status_innodb_buffer_pool_read_requests{{instance=~"{name}"}}[{timegrain}s])\n'
                ")\n"
            ),
            constant("Buffer Pool Miss Percentage"),
        ),
        "innodb_avg_row_lock_time": (
            f"""(rate(mysql_global_status_innodb_row_lock_time{{instance="{name}"}}[{timegrain}s]) / 1000)/rate(mysql_global_status_innodb_row_lock_waits{{instance="{name}"}}[{timegrain}s])""",
            constant("Avg Row Lock Time"),
        ),
    }

    promql, series_name = query_map[query]
    return prometheus_query(promql, series_name, timezone, timespan, timegrain)
@jingrow.whitelist()
@protected(["Server", "Database Server"])
@redis_cache(ttl=10 * 60)
def get_request_by_site(name, query, timezone, duration):
    """Return ``get_request_by_`` analytics for the server ``name`` over ``duration``."""
    from press.api.analytics import ResourceType, get_request_by_

    window = get_timespan_timegrain(duration)  # (timespan, timegrain)
    return get_request_by_(name, query, timezone, *window, ResourceType.SERVER)
@jingrow.whitelist()
@protected(["Server", "Database Server"])
@redis_cache(ttl=10 * 60)
def get_background_job_by_site(name, query, timezone, duration):
    """Return ``get_background_job_by_`` analytics for the server ``name`` over ``duration``."""
    from press.api.analytics import ResourceType, get_background_job_by_

    window = get_timespan_timegrain(duration)  # (timespan, timegrain)
    return get_background_job_by_(name, query, timezone, *window, ResourceType.SERVER)
@jingrow.whitelist()
@protected(["Server", "Database Server"])
@redis_cache(ttl=10 * 60)
def get_slow_logs_by_site(name, query, timezone, duration, normalize=False):
    """Return ``get_slow_logs`` analytics for the server ``name`` over ``duration``.

    ``normalize`` is forwarded unchanged as the last positional argument.
    """
    from press.api.analytics import ResourceType, get_slow_logs

    window = get_timespan_timegrain(duration)  # (timespan, timegrain)
    return get_slow_logs(name, query, timezone, *window, ResourceType.SERVER, normalize)
def prometheus_query(query, function, timezone, timespan, timegrain):
    """Run a PromQL range query on the monitor server and shape it for charts.

    Args:
        query: PromQL expression.
        function: callable that receives a series' ``metric`` dict and
            returns the name to show for that series.
        timezone: timezone the returned labels are converted to.
        timespan: length of the queried window in seconds.
        timegrain: step between two points in seconds.

    Returns:
        ``{"datasets": [{"name": ..., "values": [...]}, ...], "labels": [...]}``.
        Every dataset has one value per label; points Prometheus did not
        return stay ``None``. Both lists are empty when no monitor server is
        configured or the response carries no series.

    Raises:
        MonitorServerDown: if the request to the monitor server fails or
            times out.
    """
    monitor_server = frappe.db.get_single_value("Press Settings", "monitor_server")
    if not monitor_server:
        return {"datasets": [], "labels": []}

    url = f"https://{monitor_server}/prometheus/api/v1/query_range"
    password = get_decrypted_password("Monitor Server", monitor_server, "grafana_password")

    start, end = get_rounded_boundaries(
        timespan,
        timegrain,
    )  # timezone not passed as only utc time allowed in promql

    params = {
        "query": query,
        "start": start.timestamp(),
        "end": end.timestamp(),
        "step": f"{timegrain}s",
    }

    try:
        # A timeout keeps a stalled monitor server from blocking the worker;
        # requests raises Timeout (a RequestException) which is handled below.
        # NOTE(review): 30 seconds is a guess - tune to the monitor server's latency.
        response = requests.get(url, params=params, auth=("frappe", str(password)), timeout=30).json()
    except requests.exceptions.RequestException:
        frappe.throw("Unable to connect to monitor server", MonitorServerDown)

    # Payloads without "data" are treated like an empty result.
    series = (response.get("data") or {}).get("result")
    if not series:
        return {"datasets": [], "labels": []}

    timegrain_delta = timedelta(seconds=timegrain)
    timestamps = [
        (start + i * timegrain_delta).timestamp() for i in range((end - start) // timegrain_delta + 1)
    ]
    # Constant-time slot lookup instead of a list scan for every sample.
    slot_of = {timestamp: slot for slot, timestamp in enumerate(timestamps)}

    datasets = []
    for entry in series:
        values = [None] * len(timestamps)  # Initialize with None
        for timestamp, value in entry["values"]:
            slot = slot_of.get(timestamp)
            if slot is None:
                # Sample does not fall on the expected grid; it cannot be placed.
                continue
            values[slot] = flt(value, 2)
        datasets.append({"name": function(entry["metric"]), "values": values})

    labels = [
        convert_utc_to_timezone(datetime.fromtimestamp(timestamp, tz=tz.utc).replace(tzinfo=None), timezone)
        for timestamp in timestamps
    ]
    return {"datasets": datasets, "labels": labels}
@jingrow.whitelist()
def options():
    """Return the choices offered when creating a server.

    Throws when the current team does not have ``servers_enabled``. The
    result lists the public, non-"Generic" clusters, the app and database
    server plans, and the prices of an enabled storage and snapshot plan.
    """
    team = get_current_team(get_pg=True)
    if not team.servers_enabled:
        frappe.throw("Servers feature is not yet enabled on your account")

    def enabled_plan_prices(doctype):
        return frappe.db.get_value(
            doctype,
            {"enabled": 1},
            ["price_inr", "price_usd"],
            as_dict=True,
        )

    regions = frappe.get_all(
        "Cluster",
        {"cloud_provider": ("!=", "Generic"), "public": True},
        ["name", "title", "image", "beta"],
    )
    storage_plan = enabled_plan_prices("Server Storage Plan")
    snapshot_plan = enabled_plan_prices("Server Snapshot Plan")

    return {
        "regions": regions,
        "app_plans": plans("Server"),
        "db_plans": plans("Database Server"),
        "storage_plan": storage_plan,
        "snapshot_plan": snapshot_plan,
    }
@jingrow.whitelist()
def get_autoscale_discount():
    """Return the cached ``autoscale_discount`` value from "Press Settings"."""
    discount = frappe.db.get_single_value("Press Settings", "autoscale_discount", cache=True)
    return discount
@jingrow.whitelist()
def secondary_server_plans(
    name,
    cluster=None,
    platform=None,
    current_plan: str | None = None,
):
    """List enabled server plans priced like ``current_plan``, with the autoscale discount applied.

    Args:
        name: value matched against the plan's ``server_type``.
        cluster: optional cluster the plans must belong to.
        platform: platform the plans must have.
        current_plan: plan whose ``price_inr`` the candidates must equal.

    Returns:
        The matching rows after ``filter_by_roles``; ``price_usd`` and
        ``price_inr`` are multiplied by the ``autoscale_discount`` setting.
    """
    current_price = frappe.db.get_value("Server Plan", current_plan, "price_inr")
    plan_table = frappe.qb.DocType("Server Plan")
    role_table = frappe.qb.DocType("Has Role")
    autoscale_discount = frappe.db.get_single_value("Press Settings", "autoscale_discount")

    columns = [
        plan_table.name,
        plan_table.title,
        (plan_table.price_usd * autoscale_discount).as_("price_usd"),
        (plan_table.price_inr * autoscale_discount).as_("price_inr"),
        plan_table.vcpu,
        plan_table.memory,
        plan_table.disk,
        plan_table.cluster,
        plan_table.instance_type,
        plan_table.premium,
        plan_table.platform,
        role_table.role,
    ]
    role_join = (role_table.parenttype == "Server Plan") & (role_table.parent == plan_table.name)

    query = frappe.qb.from_(plan_table).select(*columns).join(role_table).on(role_join)
    query = query.where(plan_table.server_type == name)
    # NOTE(review): the platform filter is applied unconditionally here and
    # again below when `platform` is truthy - confirm which one is intended.
    query = query.where(plan_table.platform == platform)
    query = query.where(plan_table.price_inr == current_price)
    query = query.where(plan_table.enabled == 1)

    if cluster:
        query = query.where(plan_table.cluster == cluster)
    if platform:
        query = query.where(plan_table.platform == platform)

    matching_plans = query.run(as_dict=1)
    return filter_by_roles(matching_plans)
@jingrow.whitelist()
def plans(name, cluster=None, platform=None):
    """Return the "Server Plan" records whose ``server_type`` is ``name``.

    ``cluster`` and ``platform`` narrow the result only when they are given.
    """
    # Removed default platform of x86_64;
    # Still use x86_64 for new database servers
    filters = {"server_type": name}
    if cluster:
        filters["cluster"] = cluster
    if platform:
        filters["platform"] = platform

    plan_fields = [
        "name",
        "title",
        "price_usd",
        "price_inr",
        "vcpu",
        "memory",
        "disk",
        "cluster",
        "instance_type",
        "premium",
        "platform",
    ]
    return Plan.get_plans(doctype="Server Plan", fields=plan_fields, filters=filters)
@jingrow.whitelist()
def play(play):
    """Return the whitelisted fields of an "Ansible Play" together with its tasks.

    Every field that is not explicitly allowed is removed from the result;
    the "Ansible Task" rows of the play are attached as ``steps``, ordered
    by creation.
    """
    details = frappe.get_pg("Ansible Play", play).as_dict()

    allowed_fields = (
        "name",
        "play",
        "creation",
        "status",
        "start",
        "end",
        "duration",
    )
    for field in [key for key in details if key not in allowed_fields]:
        details.pop(field, None)

    details.steps = frappe.get_all(
        "Ansible Task",
        filters={"play": details.name},
        fields=["task", "status", "start", "end", "duration", "output"],
        order_by="creation",
    )
    return details
@jingrow.whitelist()
@protected(["Server", "Database Server"])
def change_plan(name, plan):
    """Call ``change_plan(plan)`` on the "Server" or "Database Server" called ``name``."""
    target = poly_get_pg(["Server", "Database Server"], name)
    target.change_plan(plan)
@jingrow.whitelist()
@protected(["Server", "Database Server"])
def press_jobs(name):
    """Return ``detail()`` of every "Press Job" whose ``server`` is ``name``."""
    job_names = frappe.get_all("Press Job", {"server": name}, pluck="name")
    return [frappe.get_pg("Press Job", job_name).detail() for job_name in job_names]
@jingrow.whitelist()
@protected(["Server", "Database Server"])
def jobs(filters=None, order_by=None, limit_start=None, limit_page_length=None):
    """List "Agent Job" records matching ``filters``, newest first by default.

    The status "Undelivered" is reported as "Pending".
    """
    records = frappe.get_all(
        "Agent Job",
        fields=["name", "job_type", "creation", "status", "start", "end", "duration"],
        filters=filters,
        start=limit_start,
        limit=limit_page_length,
        order_by=order_by or "creation desc",
    )
    for record in records:
        if record["status"] == "Undelivered":
            record["status"] = "Pending"
    return records
@jingrow.whitelist()
@protected(["Server", "Database Server"])
def plays(filters=None, order_by=None, limit_start=None, limit_page_length=None):
    """List "Ansible Play" records matching ``filters``, newest first by default."""
    listing_fields = ["name", "play", "creation", "status", "start", "end", "duration"]
    ordering = order_by or "creation desc"
    return frappe.get_all(
        "Ansible Play",
        fields=listing_fields,
        filters=filters,
        start=limit_start,
        limit=limit_page_length,
        order_by=ordering,
    )
@jingrow.whitelist()
@protected("Server")
def get_title_and_cluster(name):
    """Return the ``title`` and ``cluster`` of the "Server" called ``name`` as a dict."""
    wanted_fields = ["title", "cluster"]
    return frappe.db.get_value("Server", name, wanted_fields, as_dict=True)
@jingrow.whitelist()
@protected(["Server", "Database Server"])
def groups(name):
    """Return the benches of the app server behind ``name``.

    For a "Database Server" the "Server" whose ``database_server`` points at
    it is used instead.
    """
    bench_host = poly_get_pg(["Server", "Database Server"], name)
    if bench_host.doctype == "Database Server":
        app_server_name = frappe.db.get_value("Server", {"database_server": bench_host.name}, "name")
        bench_host = frappe.get_pg("Server", app_server_name)
    return all_benches(server=bench_host.name)
@jingrow.whitelist()
@protected(["Server", "Database Server"])
def reboot(name):
    """Call ``reboot()`` on the "Server" or "Database Server" called ``name`` and return its result."""
    target = poly_get_pg(["Server", "Database Server"], name)
    return target.reboot()
@jingrow.whitelist()
@protected(["Server", "Database Server"])
def rename(name, title):
    """Set the ``title`` of the "Server" or "Database Server" called ``name`` and save it."""
    target = poly_get_pg(["Server", "Database Server"], name)
    target.title = title
    target.save()
def get_timespan_timegrain(duration: str) -> tuple[int, int]:
    """Translate a duration label into ``(timespan, timegrain)`` in seconds.

    Supported labels are "1 Hour", "6 Hour", "24 Hour", "7 Days" and
    "15 Days"; any other value raises ``KeyError``.
    """
    minute = 60
    hour = 60 * minute
    day = 24 * hour
    windows = {
        "1 Hour": (hour, 2 * minute),
        "6 Hour": (6 * hour, 5 * minute),
        "24 Hour": (day, 30 * minute),
        "7 Days": (7 * day, 60 * minute),
        "15 Days": (15 * day, 90 * minute),
    }
    return windows[duration]
@jingrow.whitelist(allow_guest=True)
def benches_are_idle(server: str, access_token: str) -> None:
    """Shut down the secondary server if all benches are idle.
    This function is only triggered by secondary servers:
    http://git.jingrow.com/jingrow/agent/pull/346/files#diff-7355d9c50cadfa3f4c74fc77a4ad8ab08e4da8f6c3326bbf9b0de0f00a0aa0daR87-R93

    Args:
        server: name of the secondary "Server" reporting idleness.
        access_token: hash that must verify against the server's agent password.

    Nothing happens when the token does not verify, when no primary server
    references ``server``, or when the scale-down conditions are not met.
    Otherwise a scale-down "Auto Scale Record" is inserted as Administrator.
    """
    from passlib.hash import pbkdf2_sha256 as pbkdf2

    server_pg = frappe.get_cached_pg("Server", server)
    agent_password = server_pg.get_password("agent_password")
    if not pbkdf2.verify(agent_password, access_token):
        return

    primary = frappe.db.get_value("Server", {"secondary_server": server}, ["name", "scaled_up"])
    if not primary:
        # No primary points at this secondary; unpacking None used to raise TypeError.
        return
    primary_server, is_server_scaled_up = primary

    running_scale_down = frappe.db.get_value(
        "Auto Scale Record", {"secondary_server": server, "status": ("IN", ("Running", "Pending"))}
    )
    scaled_up_at = frappe.db.get_value(
        "Auto Scale Record", {"secondary_server": server, "scale_up": True}, "modified"
    )
    cool_off_period = frappe.db.get_single_value("Press Settings", "cool_off_period")

    should_scale_down = (
        not running_scale_down
        and is_server_scaled_up
        and scaled_up_at
        and (frappe.utils.now_datetime() - scaled_up_at) > timedelta(seconds=cool_off_period or 300)
    )
    if not should_scale_down:
        return

    # Scale down here
    current_user = frappe.session.user
    frappe.set_user("Administrator")
    try:
        auto_scale_record: AutoScaleRecord = frappe.get_pg(
            {
                "doctype": "Auto Scale Record",
                "scale_up": False,
                "scale_down": True,
                "primary_server": primary_server,
            }
        )
        auto_scale_record.insert()
    finally:
        # Always drop the elevated session, even when the insert fails.
        frappe.set_user(current_user)
@jingrow.whitelist()
@protected(["Server"])
def schedule_auto_scale(
    name, scheduled_scale_up_time: str | datetime, scheduled_scale_down_time: str | datetime
) -> None:
    """Schedule two auto scale record with scale up and down actions at given times

    Times given as strings are parsed with the format "%Y-%m-%d %H:%M:%S".
    Throws a validation error when the scale down is less than an hour after
    the scale up; the schedule is also checked by ``validate_scaling_schedule``
    before any record is inserted.
    """
    secondary_server = frappe.db.get_value("Server", name, "secondary_server")

    def as_datetime(value):
        if isinstance(value, str):
            return datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
        return value

    scale_up_at = as_datetime(scheduled_scale_up_time)
    scale_down_at = as_datetime(scheduled_scale_down_time)

    if scale_down_at - scale_up_at < timedelta(hours=1):
        frappe.throw("Scheduled scales must be an hour apart", frappe.ValidationError)

    validate_scaling_schedule(
        name,
        scale_up_at,
        scale_down_at,
    )

    schedule = (("Scale Up", scale_up_at), ("Scale Down", scale_down_at))
    for action, scheduled_time in schedule:
        record = frappe.get_pg(
            {
                "doctype": "Auto Scale Record",
                "action": action,
                "status": "Scheduled",
                "scheduled_time": scheduled_time,
                "primary_server": name,
                "secondary_server": secondary_server,
            }
        )
        record.insert()
@jingrow.whitelist()
@protected(["Server"])
def get_configured_autoscale_triggers(name) -> list[dict[str, float]] | None:
    """Return name, metric, threshold and action of every "Auto Scale Trigger" whose parent is ``name``."""
    trigger_fields = ["name", "metric", "threshold", "action"]
    return frappe.db.get_all("Auto Scale Trigger", {"parent": name}, trigger_fields)