# homelab/prometheus.nomad
# (165 lines, 3.9 KiB, plaintext)

# Nomad service job that runs a single Prometheus instance in Docker.
#
# Two files are rendered into the task's local/ directory and bind-mounted
# into the container under /etc/prometheus:
#   * prometheus.yml         - main configuration (Consul-based discovery)
#   * nomad-alert-rules.yml  - alerting rules for Nomad job/eval metrics
job "prometheus" {
  datacenters = ["homelab"]
  type        = "service"

  group "prometheus" {
    count = 1

    network {
      # Fixed host port for the Prometheus web UI / API.
      port "prometheus_ui" {
        static = 9090
      }
    }

    # At most 2 restarts per 30 minutes, 15s apart; after that the
    # allocation is marked failed instead of being retried.
    restart {
      attempts = 2
      interval = "30m"
      delay    = "15s"
      mode     = "fail"
    }

    ephemeral_disk {
      size = 300
    }

    task "prometheus" {
      # Main Prometheus configuration.
      # change_mode = "noop": a re-render of this file does not restart or
      # signal the task.
      # The heredoc body is kept at column 0 because its indentation is part
      # of the rendered YAML.
      template {
        change_mode = "noop"
        destination = "local/prometheus.yml"
        data        = <<EOH
---
global:
  scrape_interval: 10s
  evaluation_interval: 10s
alerting:
  alertmanagers:
    - consul_sd_configs:
        - server: 'consul.service.consul:8500'
          services: ['alertmanager']
rule_files:
  - "nomad-alert-rules.yml"
scrape_configs:
  - job_name: 'nomad_metrics'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['nomad-client', 'nomad']
    relabel_configs:
      - source_labels: ['__meta_consul_tags']
        regex: '(.*)http(.*)'
        action: keep
    scrape_interval: 5s
    metrics_path: /v1/metrics
    params:
      format: ['prometheus']
  - job_name: 'traefik-local'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['traefik-local-admin','traefik-admin']
  - job_name: 'alertmanager'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['alertmanager']
  - job_name: 'crowdsec'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['crowdsec-metrics']
    relabel_configs:
      - source_labels: [__meta_consul_node]
        target_label: machine
EOH
      }

      # Alerting rules referenced by rule_files in prometheus.yml.
      # The template delimiters are switched to [[ ]] so that the {{ ... }}
      # expressions below are written to the file verbatim for Prometheus
      # instead of being evaluated by Nomad's template renderer.
      template {
        destination     = "local/nomad-alert-rules.yml"
        right_delimiter = "]]"
        left_delimiter  = "[["
        data            = <<EOH
---
groups:
  - name: nomad_alerts
    rules:
      - alert: NomadJobFailed
        expr: nomad_nomad_job_summary_failed > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad job failed (instance {{ $labels.instance }})
          description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: NomadBlockedEvaluation
        expr: nomad_nomad_blocked_evals_total_blocked > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad blocked evaluation (instance {{ $labels.instance }})
          description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: NomadJobLost
        expr: nomad_nomad_job_summary_lost > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad job lost (instance {{ $labels.instance }})
          description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
EOH
      }

      driver = "docker"

      config {
        # NOTE(review): ":latest" is an unpinned tag; consider pinning a
        # specific Prometheus version for reproducible deployments.
        image = "prom/prometheus:latest"

        args = [
          "--config.file=/etc/prometheus/prometheus.yml",
          "--storage.tsdb.path=/prometheus",
          "--storage.tsdb.retention.time=15d",
        ]

        # The two rendered templates plus a host directory for the TSDB
        # (matches --storage.tsdb.path above).
        volumes = [
          "local/nomad-alert-rules.yml:/etc/prometheus/nomad-alert-rules.yml",
          "local/prometheus.yml:/etc/prometheus/prometheus.yml",
          "/mnt/diskstation/nomad/prometheus:/prometheus"
        ]

        ports = ["prometheus_ui"]
      }

      service {
        name = "prometheus"

        # Service tags: a "urlprefix-/" routing tag and a set of "homer.*"
        # tags (presumably consumed by a Homer dashboard - confirm).
        tags = [
          "urlprefix-/",
          "homer.enable=true",
          "homer.name=Prometheus",
          "homer.service=Monitoring",
          "homer.type=Prometheus",
          "homer.logo=https://upload.wikimedia.org/wikipedia/commons/thumb/3/38/Prometheus_software_logo.svg/173px-Prometheus_software_logo.svg.png",
          "homer.target=_blank",
          "homer.url=http://${NOMAD_ADDR_prometheus_ui}",
        ]

        port = "prometheus_ui"

        # HTTP health check against /-/healthy on the UI port.
        check {
          name     = "prometheus_ui port alive"
          type     = "http"
          path     = "/-/healthy"
          interval = "10s"
          timeout  = "2s"
        }
      }

      resources {
        memory = 200
      }
    }
  }
}