# homelab/nomad-job/prometheus.nomad
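# Runs a single Prometheus server on the homelab Nomad cluster. Its scrape
# configuration and alert rules are rendered from the templates defined below.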

job "prometheus" {
2022-10-08 06:52:09 +00:00
datacenters = ["homelab"]
2022-12-10 16:10:32 +00:00
priority = 50
2022-05-10 17:41:18 +00:00
type = "service"
2022-11-21 19:33:18 +00:00
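  # Run only on amd64 nodes (the image pulled from the local registry below is
  # presumably only built for amd64).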
  constraint {
    attribute = "${attr.cpu.arch}"
    value     = "amd64"
  }

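  # force_deploy is presumably bumped by hand to force a new deployment when
  # nothing else in the spec has changed.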
  meta {
    force_deploy = 1
  }

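  # Only place the job on nodes whose node.class contains "cluster".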
  constraint {
    attribute = "${node.class}"
    operator  = "set_contains"
    value     = "cluster"
  }

group "prometheus" {
count = 1
network {
port "prometheus_ui" {
static = 9090
}
}
restart {
attempts = 2
interval = "30m"
delay = "15s"
mode = "fail"
}
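    # Vault policy that presumably grants read access to the secrets referenced
    # in the templates below (e.g. the Home Assistant token).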
    vault {
      policies = ["prometheus"]
    }

    ephemeral_disk {
      size = 300
    }

task "prometheus" {
      template {
        change_mode = "noop"
        destination = "local/prometheus.yml"
        data        = <<EOH
---
global:
  scrape_interval: 10s
  evaluation_interval: 10s

alerting:
  alertmanagers:
    - consul_sd_configs:
        - server: 'consul.service.consul:8500'
          services: ['alertmanager']

rule_files:
  - "nomad-alert-rules.yml"

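# All scrape jobs below discover their targets through the local Consul
# catalog rather than static target lists.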
scrape_configs:
  - job_name: 'nomad_metrics'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['nomad-client', 'nomad']
    relabel_configs:
      - source_labels: ['__meta_consul_tags']
        regex: '(.*)http(.*)'
        action: keep
      - source_labels: ['__meta_consul_dc']
        target_label: 'dc'
    scrape_interval: 5s
    metrics_path: /v1/metrics
    params:
      format: ['prometheus']

  - job_name: 'traefik'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['traefik-local-admin', 'traefik-admin']
    relabel_configs:
      - source_labels: ['__meta_consul_service']
        target_label: instance

  - job_name: 'alertmanager'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['alertmanager']
    relabel_configs:
      - source_labels: ['__meta_consul_dc']
        target_label: instance

  - job_name: 'crowdsec'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['crowdsec-metrics']
    relabel_configs:
      - source_labels: [__meta_consul_node]
        target_label: machine

  - job_name: 'nodeexp'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['node-exporter']
    relabel_configs:
      - source_labels: [__meta_consul_node]
        target_label: instance

  - job_name: 'loki'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['loki']
    relabel_configs:
      - source_labels: ['__meta_consul_dc']
        target_label: instance

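  # Home Assistant protects its Prometheus endpoint with a long-lived access
  # token, pulled from Vault when this template is rendered.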
  - job_name: 'HASS'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['hass']
    relabel_configs:
      - source_labels: ['__meta_consul_dc']
        target_label: instance
    scrape_interval: 60s
    metrics_path: /api/prometheus
    authorization:
      credentials: {{ with secret "secrets/data/nomad/prometheus"}}'{{ .Data.data.hass_token }}'{{end}}

  - job_name: 'nut'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['nutexporter']
    metrics_path: /ups_metrics
    relabel_configs:
      - source_labels: ['__meta_consul_dc']
        target_label: instance
EOH
      }
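      # Alert rules for Prometheus. The [[ ]] delimiters stop the template
      # engine from interpreting the {{ ... }} placeholders that Prometheus
      # itself expands in annotations.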
      template {
        destination     = "local/nomad-alert-rules.yml"
        left_delimiter  = "[["
        right_delimiter = "]]"
        data            = <<EOH
---
groups:
  - name: nomad_alerts
    rules:
      - alert: NomadBlockedEvaluation
        expr: nomad_nomad_blocked_evals_total_blocked > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad blocked evaluation (instance {{ $labels.instance }})
          description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: NomadJobQueued
        expr: nomad_nomad_job_summary_queued > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Nomad job queued (instance {{ $labels.instance }})
          description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: NomadBatchError
        expr: nomad_nomad_job_summary_failed{parent_id=~".+"} > 0
        labels:
          severity: warning
        annotations:
          summary: Nomad batch {{ $labels.parent_id }} error
      - alert: TestGitea
        expr: nomad_nomad_job_summary_running{exported_job="git"} == 0
        labels:
          severity: warning
  - name: nut_alerts
    rules:
      - alert: UPSonBattery
        expr: network_ups_tools_ups_status{flag="OB"} == 1
        labels:
          severity: warning
        annotations:
          summary: UPS switched to battery power
      - alert: UPSLowBattery
        expr: network_ups_tools_ups_status{flag="LB"} == 1
        labels:
          severity: critical
        annotations:
          summary: UPS is on low battery, please shut down all devices
      - alert: UPSBatteryNeedsReplacement
        expr: network_ups_tools_ups_status{flag="RB"} == 1
        labels:
          severity: warning
        annotations:
          summary: UPS battery needs to be replaced
  - name: Node_alerts
    rules:
      - alert: HostOutOfMemory
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory (instance {{ $labels.instance }})
          description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostMemoryUnderMemoryPressure
        expr: (rate(node_vmstat_pgmajfault[1m]) > 600) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostOutOfDiskSpace
        expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostHighCpuLoad
        expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8)
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Host high CPU load (instance {{ $labels.instance }})
          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualDiskWriteLatency
        expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: HostUnusualDiskReadLatency
        expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
EOH
      }
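      # Prometheus container from the local registry. TSDB data is written to
      # the diskstation mount (presumably NAS-backed) so history outlives
      # reschedules, and is kept for 40 days.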
driver = "docker"
config {
2024-03-17 17:58:24 +00:00
image = "docker.service.consul:5000/prom/prometheus:latest"
2022-05-10 17:41:18 +00:00
args = [
"--config.file=/etc/prometheus/prometheus.yml",
"--storage.tsdb.path=/prometheus",
2023-10-23 17:16:27 +00:00
"--storage.tsdb.retention.time=40d",
2022-05-10 17:41:18 +00:00
]
volumes = [
2022-05-13 18:17:20 +00:00
"local/nomad-alert-rules.yml:/etc/prometheus/nomad-alert-rules.yml",
2022-05-10 17:41:18 +00:00
"local/prometheus.yml:/etc/prometheus/prometheus.yml",
"/mnt/diskstation/nomad/prometheus:/prometheus"
]
ports = ["prometheus_ui"]
}
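      # Consul service registration; "urlprefix-/" is a reverse-proxy routing
      # tag and the homer.* tags populate the Homer dashboard.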
      service {
        name = "prometheus"
        tags = [
          "urlprefix-/",
          "homer.enable=true",
          "homer.name=Prometheus",
          "homer.service=Monitoring",
          "homer.type=Prometheus",
          "homer.logo=https://upload.wikimedia.org/wikipedia/commons/thumb/3/38/Prometheus_software_logo.svg/173px-Prometheus_software_logo.svg.png",
          "homer.target=_blank",
          "homer.url=http://${NOMAD_ADDR_prometheus_ui}",
        ]
        port = "prometheus_ui"

        check {
          name     = "prometheus_ui port alive"
          type     = "http"
          path     = "/-/healthy"
          interval = "10s"
          timeout  = "2s"
        }
      }
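      # Roughly 350 MB of memory in normal operation, with memory_max allowing
      # bursts up to 500 MB.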
      resources {
        memory     = 350
        memory_max = 500
      }
    }
  }
}