249 lines
6.7 KiB
Nix
249 lines
6.7 KiB
Nix
# Portions of this file are sourced from
|
|
# https://github.com/NickCao/flakes/blob/3b03efb676ea602575c916b2b8bc9d9cd13b0d85/nixos/hcloud/iad1/prometheus.nix
|
|
{
|
|
config,
|
|
lib,
|
|
pkgs,
|
|
data,
|
|
...
|
|
}:
|
|
let
|
|
# Shorthand for this host's Prometheus option tree.
cfg = config.services.prometheus;

# Shared zone description; this module reads its `hosts`, `primary` and
# `nameservers` attributes.
common = import ../../../../zones/common.nix;

# Appends the rebmit.link suffix to a short host name.
toFqdn = host: "${host}.rebmit.link";

# Hosts from the zone description whose `endpoints` list is not empty.
publicHosts = lib.filterAttrs (_: host: host.endpoints != [ ]) common.hosts;

# One FQDN per public host, in attribute-name order; used as scrape targets.
targets = map toFqdn (builtins.attrNames publicHosts);

# FQDN of the primary nameserver, used by the ZoneStale alert expression.
primaryNameserver = toFqdn common.primary;

# FQDNs of all nameservers, probed by the "dns" scrape job.
nameservers = map toFqdn common.nameservers;
|
|
# Relabeling shared by the scrape jobs that go through the blackbox exporter:
# the configured target is copied into the `target` URL parameter and into the
# `instance` label, then the scrape address is replaced by the exporter's own
# listen address and port.
relabel_configs =
  let
    blackbox = cfg.exporters.blackbox;

    # Rule that copies the value of label `from` into label `to`.
    copyLabel = from: to: {
      source_labels = [ from ];
      target_label = to;
    };
  in
  [
    (copyLabel "__address__" "__param_target")
    (copyLabel "__param_target" "instance")
    {
      target_label = "__address__";
      replacement = "${blackbox.listenAddress}:${toString blackbox.port}";
    }
  ];
|
|
in
|
|
{
|
|
# Secrets used by the monitoring stack; both come from the host's sops file.
sops.secrets = {
  # Basic-auth password used by the authenticated scrape jobs. Owned by the
  # user the prometheus unit runs as so that the server can read it.
  "prom/password" = {
    owner = config.systemd.services.prometheus.serviceConfig.User;
    restartUnits = [ "prometheus.service" ];
    sopsFile = config.sops.secretFiles.host;
  };

  # Password for the ntfy webhook; handed to Alertmanager through
  # systemd's LoadCredential, so no owner is set here.
  "prom/alertmanager-ntfy" = {
    restartUnits = [ "alertmanager.service" ];
    sopsFile = config.sops.secretFiles.host;
  };
};
|
|
|
|
# Prometheus server and its Alertmanager, both bound to loopback.
# NOTE: this must stay a literal attribute set so that the separate
# `services.prometheus.exporters.blackbox` definition in this file merges into it.
services.prometheus = {
  enable = true;
  listenAddress = "127.0.0.1";
  port = config.networking.ports.prometheus;
  webExternalUrl = "https://prom.rebmit.moe";
  retentionTime = "7d";

  # Scrape targets and evaluate rules once per minute.
  globalConfig = {
    evaluation_interval = "1m";
    scrape_interval = "1m";
  };

  scrapeConfigs =
    let
      # Job that scrapes `metrics_path` over HTTPS with basic auth on every
      # public host (`targets`).
      authenticatedJob = job_name: metrics_path: {
        inherit job_name metrics_path;
        scheme = "https";
        basic_auth = {
          username = "prometheus";
          password_file = config.sops.secrets."prom/password".path;
        };
        static_configs = [ { inherit targets; } ];
      };

      # Job that probes `probeTargets` with blackbox module `module`; the
      # shared relabeling redirects the scrape to the local exporter.
      blackboxJob = job_name: module: probeTargets: {
        inherit job_name relabel_configs;
        scheme = "http";
        metrics_path = "/probe";
        params.module = [ module ];
        static_configs = [ { targets = probeTargets; } ];
      };
    in
    [
      (authenticatedJob "metrics" "/metrics")
      (authenticatedJob "caddy" "/caddy")
      (authenticatedJob "ping" "/ping")
      (blackboxJob "dns" "dns_soa" nameservers)
      (blackboxJob "http" "http_2xx" [
        "https://rebmit.moe"
        "https://chat.rebmit.moe"
        "https://idp.rebmit.moe"
        "https://rss.rebmit.moe"
      ])
    ];

  # A single rule file, serialized as JSON, holding one group of alerts.
  rules =
    let
      alertRules = [
        # Any scrape target unreachable for five minutes.
        {
          alert = "NodeDown";
          expr = ''up == 0'';
          for = "5m";
        }
        # Less than 10% of memory available.
        {
          alert = "OOM";
          expr = ''node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1'';
        }
        # Less than 10% of space available on /persist.
        {
          alert = "DiskFull";
          expr = ''node_filesystem_avail_bytes{mountpoint=~"/persist"} / node_filesystem_size_bytes < 0.1'';
        }
        # Any systemd unit reported in the failed state.
        {
          alert = "UnitFailed";
          expr = ''node_systemd_unit_state{state="failed"} == 1'';
        }
        # A probed instance whose DNS serial differs from the one seen on the
        # primary nameserver for five minutes.
        {
          alert = "ZoneStale";
          expr = ''probe_dns_serial{instance="${primaryNameserver}"} != ignoring(instance) group_right() probe_dns_serial'';
          for = "5m";
        }
      ];
    in
    [
      (builtins.toJSON {
        groups = [
          {
            name = "metrics";
            rules = alertRules;
          }
        ];
      })
    ];

  # Send alerts to the local Alertmanager, which is served under /alert.
  alertmanagers =
    let
      am = cfg.alertmanager;
    in
    [
      {
        path_prefix = "/alert";
        static_configs = [ { targets = [ "${am.listenAddress}:${toString am.port}" ]; } ];
      }
    ];

  alertmanager = {
    enable = true;
    listenAddress = "127.0.0.1";
    port = config.networking.ports.prometheus-alertmanager;
    webExternalUrl = "https://${config.networking.fqdn}/alert";
    # Empty cluster listen address — presumably to turn clustering off for
    # this single instance; confirm against the Alertmanager flag reference.
    extraFlags = [ ''--cluster.listen-address=""'' ];

    configuration = {
      # Every alert goes to the single "ntfy" receiver.
      route.receiver = "ntfy";

      receivers = [
        {
          name = "ntfy";
          webhook_configs = [
            {
              # The message template is URL-escaped and passed in the `m`
              # query parameter.
              url = "https://push.rebmit.workers.moe/alert?tpl=yes&m=${lib.escapeURL ''
                Alert {{.status}}
                {{range .alerts}}-----{{range $k,$v := .labels}}
                {{$k}}={{$v}}{{end}}
                {{end}}
              ''}";
              http_config.basic_auth = {
                username = "alertmanager";
                # Matches the credential name loaded via LoadCredential on
                # alertmanager.service.
                password_file = "/run/credentials/alertmanager.service/alertmanager-ntfy";
              };
            }
          ];
        }
      ];
    };
  };
};
|
|
|
|
# Expose the ntfy password to Alertmanager as the systemd credential
# "alertmanager-ntfy", read from the sops secret's path.
systemd.services.alertmanager.serviceConfig.LoadCredential = [
  "alertmanager-ntfy:${config.sops.secrets."prom/alertmanager-ntfy".path}"
];
|
|
|
|
# Blackbox exporter on loopback, providing the two probe modules referenced by
# the "http" and "dns" scrape jobs.
services.prometheus.exporters.blackbox =
  let
    yaml = pkgs.formats.yaml { };

    probeModules = {
      # Plain HTTP probe.
      http_2xx.prober = "http";

      # DNS probe asking for the SOA record of rebmit.moe.
      dns_soa = {
        prober = "dns";
        dns = {
          query_name = "rebmit.moe";
          query_type = "SOA";
        };
      };
    };
  in
  {
    enable = true;
    listenAddress = "127.0.0.1";
    port = config.networking.ports.prometheus-blackbox-exporter;
    configFile = yaml.generate "config.yml" { modules = probeModules; };
  };
|
|
|
|
# Caddy virtual host in front of Prometheus: requires and verifies a client
# certificate against the CA in `data.cloudflare_aop_ca_certificate`, then
# proxies to the loopback listener.
services.caddy.virtualHosts."prom.rebmit.moe" =
  let
    # CA certificate written to the store so Caddy can load it from a file.
    clientCaFile = builtins.toFile "cloudflare_aop_ca_certificate" data.cloudflare_aop_ca_certificate;
    upstream = "${cfg.listenAddress}:${toString cfg.port}";
  in
  {
    serverAliases = [ "prom.rebmit.workers.moe" ];
    extraConfig = ''
      tls internal {
      client_auth {
      mode require_and_verify
      trust_pool file ${clientCaFile}
      }
      }
      reverse_proxy ${upstream}
    '';
  };
|
|
|
|
# State directories of Prometheus and Alertmanager kept under /persist.
preservation.preserveAt."/persist".directories =
  let
    # Entry with mode, user and group all set to "-".
    # NOTE(review): "-" presumably means "leave as is" in the preservation
    # module — confirm against its documentation.
    keepAsIs = directory: {
      inherit directory;
      mode = "-";
      user = "-";
      group = "-";
    };
  in
  map keepAsIs [
    "/var/lib/prometheus2"
    "/var/lib/private/alertmanager"
  ];
|
|
}
|