add alert manager and loki

This commit is contained in:
vincent 2022-05-13 20:17:20 +02:00
parent 95e196d042
commit 34cca8db64
4 changed files with 284 additions and 0 deletions

41
alertmanager.nomad Normal file
View File

@ -0,0 +1,41 @@
# Nomad service job that runs a single Prometheus Alertmanager container
# and registers it as the "alertmanager" service.
job "alertmanager" {
  type        = "service"
  datacenters = ["homelab"]

  # Free-form job metadata; nothing else in this job reads the value.
  meta {
    forcedeploy = "0"
  }

  group "alertmanager" {
    # Host networking with the HTTP port pinned to 9093.
    network {
      mode = "host"

      port "http" {
        static = 9093
      }
    }

    task "alertmanager" {
      driver = "docker"

      # No tag is given, so the image reference is left floating.
      config {
        image = "prom/alertmanager"
        ports = ["http"]
      }

      resources {
        memory = 75
      }

      service {
        name = "alertmanager"
        port = "http"

        # urlprefix-style routing tag: expose under /alertmanager and strip
        # that prefix before forwarding.
        # NOTE(review): this is the tag format Fabio uses - confirm which
        # proxy actually consumes it.
        tags = ["urlprefix-/alertmanager strip=/alertmanager"]

        # HTTP probe against Alertmanager's health endpoint.
        check {
          name     = "alertmanager_ui port alive"
          type     = "http"
          path     = "/-/healthy"
          interval = "10s"
          timeout  = "2s"
        }
      }
    }
  }
}

102
loki.nomad Normal file
View File

@ -0,0 +1,102 @@
# Nomad service job that runs a single-process Loki instance with
# filesystem-backed storage and registers it as the "loki" service.
job "loki" {
  datacenters = ["homelab"]
  type        = "service"

  # Free-form job metadata; nothing else in this job reads the value.
  meta {
    forcedeploy = "0"
  }

  group "loki" {
    # Host networking with the HTTP port pinned to 3100, matching
    # http_listen_port in the rendered configuration below.
    network {
      mode = "host"

      port "http" {
        static = 3100
      }
    }

    task "loki" {
      driver = "docker"

      service {
        name = "loki"
        port = "http"

        check {
          name     = "Loki HTTP"
          type     = "http"
          path     = "/ready"
          interval = "5s"
          timeout  = "2s"

          # Restart the task after 2 consecutive failed checks, but only
          # once the 60s start-up grace period has passed.
          check_restart {
            limit           = 2
            grace           = "60s"
            ignore_warnings = false
          }
        }
      }

      config {
        image = "grafana/loki"
        ports = ["http"]

        # Read the configuration rendered by the template block below. The
        # task's local/ directory is visible inside the container as /local.
        # The previous path (/etc/loki/local-config.yaml) was never mounted
        # from the rendered file, so the template had no effect.
        args = [
          "-config.file",
          "/local/loki/local-config.yaml",
        ]

        # NOTE(review): no volume is mounted at /loki, so chunks and index
        # appear to live in the container filesystem only - confirm whether
        # persistence across restarts is wanted.
      }

      template {
        destination = "local/loki/local-config.yaml"
        data        = <<EOH
auth_enabled: false
server:
  http_listen_port: 3100
ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  # Any chunk not receiving new logs in this time will be flushed
  chunk_idle_period: 1h
  # All chunks will be flushed when they hit this age, default is 1h
  max_chunk_age: 1h
  # Loki will attempt to build chunks up to 1 MiB (1048576 bytes), flushing if chunk_idle_period or max_chunk_age is reached first
  chunk_target_size: 1048576
  # Must be greater than index read cache TTL if using an index cache (Default index read cache TTL is 5m)
  chunk_retain_period: 30s
  max_transfer_retries: 0 # Chunk transfers disabled
schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h
storage_config:
  boltdb_shipper:
    active_index_directory: /loki/boltdb-shipper-active
    cache_location: /loki/boltdb-shipper-cache
    cache_ttl: 24h # Can be increased for faster performance over longer query periods, uses more disk space
    shared_store: filesystem
  filesystem:
    directory: /loki/chunks
compactor:
  working_directory: /tmp/loki/boltdb-shipper-compactor
  shared_store: filesystem
limits_config:
  reject_old_samples: true
  reject_old_samples_max_age: 168h
chunk_store_config:
  max_look_back_period: 0s
table_manager:
  retention_deletes_enabled: false
  retention_period: 0s
EOH
      }

      resources {
        memory = 150
      }
    }
  }
}

View File

@ -32,6 +32,13 @@ job "prometheus" {
global:
scrape_interval: 10s
evaluation_interval: 10s
alerting:
alertmanagers:
- consul_sd_configs:
- server: 'consul.service.consul:8500'
services: ['alertmanager']
rule_files:
- "nomad-alert-rules.yml"
scrape_configs:
@ -54,8 +61,47 @@ scrape_configs:
consul_sd_configs:
- server: 'consul.service.consul:8500'
services: ['traefik-local-admin','traefik-admin']
- job_name: 'alertmanager'
consul_sd_configs:
- server: 'consul.service.consul:8500'
services: ['alertmanager']
EOH
}
# Renders the Nomad alerting rules to local/nomad-alert-rules.yml.
template {
  # The template delimiters are switched to [[ ]] so that the {{ ... }}
  # expressions in the rule annotations are written out verbatim instead of
  # being evaluated when the file is rendered.
  left_delimiter  = "[["
  right_delimiter = "]]"
  destination     = "local/nomad-alert-rules.yml"
  data            = <<EOH
---
groups:
  - name: nomad_alerts
    rules:
      - alert: NomadJobFailed
        expr: nomad_nomad_job_summary_failed > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad job failed (instance {{ $labels.instance }})
          description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: NomadBlockedEvaluation
        expr: nomad_nomad_blocked_evals_total_blocked > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad blocked evaluation (instance {{ $labels.instance }})
          description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: NomadJobLost
        expr: nomad_nomad_job_summary_lost > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad job lost (instance {{ $labels.instance }})
          description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
EOH
}
@ -69,6 +115,7 @@ EOH
"--storage.tsdb.retention.time=15d",
]
volumes = [
"local/nomad-alert-rules.yml:/etc/prometheus/nomad-alert-rules.yml",
"local/prometheus.yml:/etc/prometheus/prometheus.yml",
"/mnt/diskstation/nomad/prometheus:/prometheus"
]

94
promtail.nomad Normal file
View File

@ -0,0 +1,94 @@
# Nomad system job that runs one Promtail container per node in the listed
# datacenters. It tails Nomad allocation log files and pushes them to Loki.
job "promtail" {
  datacenters = ["homelab", "hetzner"]
  type        = "system"

  # Free-form job metadata; nothing else in this job reads the value.
  meta {
    forcedeploy = "0"
  }

  group "promtail" {
    # Host networking with the HTTP port pinned to 3200.
    network {
      mode = "host"

      port "http" {
        static = 3200
      }
    }

    task "promtail" {
      driver = "docker"

      service {
        name = "promtail"
        port = "http"

        check {
          name     = "Promtail HTTP"
          type     = "http"
          path     = "/targets"
          interval = "5s"
          timeout  = "2s"

          # Restart the task after 2 consecutive failed checks, but only
          # once the 60s start-up grace period has passed.
          check_restart {
            limit           = 2
            grace           = "60s"
            ignore_warnings = false
          }
        }
      }

      config {
        image = "grafana/promtail"
        ports = ["http"]

        # The listen port is taken from the Nomad-assigned "http" port so it
        # always agrees with the network block above.
        args = [
          "-config.file=/local/promtail.yml",
          "-server.http-listen-port=${NOMAD_PORT_http}",
        ]

        volumes = [
          # Holds the positions file written by promtail.
          # NOTE(review): the host path looks like a shared mount; if every
          # node sees the same directory, all promtail instances write the
          # same /data/positions.yaml - confirm and make it per-node if so.
          "/mnt/diskstation/nomad/promtail:/data",
          # Nomad data directory, mounted read-only: the scrape config only
          # reads log files below /nomad/alloc/.
          "/var/lib/nomad/:/nomad/:ro",
        ]
      }

      # Exposes the Nomad node hostname inside the container.
      # NOTE(review): presumably this lets promtail identify itself by node
      # name when matching the __host__ label set below - confirm.
      env {
        HOSTNAME = "${attr.unique.hostname}"
      }

      template {
        destination = "/local/promtail.yml"
        data        = <<EOTC
positions:
  filename: /data/positions.yaml
clients:
  - url: http://loki.service.consul:3100/loki/api/v1/push
scrape_configs:
  - job_name: 'nomad-logs'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
    relabel_configs:
      - source_labels: [__meta_consul_node]
        target_label: __host__
      - source_labels: [__meta_consul_service_metadata_external_source]
        target_label: source
        regex: (.*)
        replacement: '$1'
      - source_labels: [__meta_consul_service_id]
        regex: '_nomad-task-([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})-.*'
        target_label: 'task_id'
        replacement: '$1'
      # Relabel regexes are anchored at both ends, so the surrounding .* is
      # needed to match the tag anywhere in the comma-joined tag list.
      - source_labels: [__meta_consul_tags]
        regex: '.*,(app|monitoring),.*'
        target_label: 'group'
        replacement: '$1'
      - source_labels: [__meta_consul_service]
        target_label: job
      - source_labels: ['__meta_consul_node']
        regex: '(.*)'
        target_label: 'instance'
        replacement: '$1'
      - source_labels: [__meta_consul_service_id]
        regex: '_nomad-task-([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})-.*'
        target_label: '__path__'
        replacement: '/nomad/alloc/$1/alloc/logs/*std*.{?,??}'
EOTC
      }

      resources {
        memory = 50
      }
    }
  }
}