add alertmanager and loki
This commit is contained in:
parent 95e196d042
commit 34cca8db64
alertmanager.nomad (new file, 41 lines)
@@ -0,0 +1,41 @@
job "alertmanager" {
  datacenters = ["homelab"]
  type        = "service"
  meta {
    forcedeploy = "0"
  }

  group "alertmanager" {
    network {
      mode = "host"
      port "http" {
        static = 9093
      }
    }
    task "alertmanager" {
      driver = "docker"
      service {
        name = "alertmanager"
        port = "http"
        tags = ["urlprefix-/alertmanager strip=/alertmanager"]
        check {
          name     = "alertmanager_ui port alive"
          type     = "http"
          path     = "/-/healthy"
          interval = "10s"
          timeout  = "2s"
        }
      }

      config {
        image = "prom/alertmanager"
        ports = ["http"]
      }
      resources {
        memory = 75
      }
    }
  }
}
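Once the job is placed, it can be smoke-tested from any host that resolves Consul DNS (a sketch: alertmanager.service.consul assumes the usual Consul DNS naming used by the other jobs here, and since the job mounts no config, Alertmanager runs with the prom/alertmanager image's bundled default alertmanager.yml, so receivers still need configuring before notifications go anywhere):

    # same endpoint the Consul health check polls
    curl http://alertmanager.service.consul:9093/-/healthy
    # alerts currently held by Alertmanager (v2 API)
    curl http://alertmanager.service.consul:9093/api/v2/alerts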
loki.nomad (new file, 102 lines)
@@ -0,0 +1,102 @@
job "loki" {
  datacenters = ["homelab"]
  type        = "service"
  meta {
    forcedeploy = "0"
  }

  group "loki" {
    network {
      mode = "host"
      port "http" {
        static = 3100
      }
    }
    task "loki" {
      driver = "docker"
      service {
        name = "loki"
        port = "http"
        check {
          name     = "Loki HTTP"
          type     = "http"
          path     = "/ready"
          interval = "5s"
          timeout  = "2s"

          check_restart {
            limit           = 2
            grace           = "60s"
            ignore_warnings = false
          }
        }
      }
      config {
        image = "grafana/loki"
        ports = ["http"]
        args = [
          "-config.file",
          "/local/loki/local-config.yaml",
        ]
      }
      template {
        data = <<EOH
auth_enabled: false
server:
  http_listen_port: 3100
ingester:
  lifecycler:
    address: 127.0.0.1
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
    final_sleep: 0s
  # Any chunk not receiving new logs in this time will be flushed
  chunk_idle_period: 1h
  # All chunks will be flushed when they hit this age, default is 1h
  max_chunk_age: 1h
  # Loki will attempt to build chunks up to 1.5MB, flushing if chunk_idle_period or max_chunk_age is reached first
  chunk_target_size: 1048576
  # Must be greater than index read cache TTL if using an index cache (Default index read cache TTL is 5m)
  chunk_retain_period: 30s
  max_transfer_retries: 0 # Chunk transfers disabled
schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h
storage_config:
  boltdb_shipper:
    active_index_directory: /loki/boltdb-shipper-active
    cache_location: /loki/boltdb-shipper-cache
    cache_ttl: 24h # Can be increased for faster performance over longer query periods, uses more disk space
    shared_store: filesystem
  filesystem:
    directory: /loki/chunks
compactor:
  working_directory: /tmp/loki/boltdb-shipper-compactor
  shared_store: filesystem
limits_config:
  reject_old_samples: true
  reject_old_samples_max_age: 168h
chunk_store_config:
  max_look_back_period: 0s
table_manager:
  retention_deletes_enabled: false
  retention_period: 0s
EOH
        destination = "local/loki/local-config.yaml"
      }
      resources {
        memory = 150
      }
    }

  }
}
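A quick end-to-end check against the running Loki (a sketch: the hostname assumes Consul DNS, and the payload follows Loki's v1 push API, which takes a nanosecond epoch timestamp as a string):

    # readiness, same path as the job's health check
    curl http://loki.service.consul:3100/ready
    # push one test log line
    curl -s -X POST http://loki.service.consul:3100/loki/api/v1/push \
      -H 'Content-Type: application/json' \
      -d "{\"streams\":[{\"stream\":{\"job\":\"smoke-test\"},\"values\":[[\"$(date +%s%N)\",\"hello loki\"]]}]}"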
prometheus.nomad
@@ -32,6 +32,13 @@ job "prometheus" {
global:
  scrape_interval: 10s
  evaluation_interval: 10s
alerting:
  alertmanagers:
    - consul_sd_configs:
        - server: 'consul.service.consul:8500'
          services: ['alertmanager']
rule_files:
  - "nomad-alert-rules.yml"

scrape_configs:
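Whether Prometheus actually picked the Alertmanager up through Consul can be verified against its API (a sketch: it assumes the Prometheus job, shown only partially in this diff, is registered in Consul as prometheus and listens on the default port 9090):

    # Alertmanager instances Prometheus will fan alerts out to
    curl http://prometheus.service.consul:9090/api/v1/alertmanagers

The response's activeAlertmanagers list should contain the instance on port 9093.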
@@ -54,8 +61,47 @@
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['traefik-local-admin','traefik-admin']
  - job_name: 'alertmanager'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
        services: ['alertmanager']


EOH
      }
      template {
        destination     = "local/nomad-alert-rules.yml"
        right_delimiter = "]]"
        left_delimiter  = "[["
        data            = <<EOH
---
groups:
  - name: nomad_alerts
    rules:
      - alert: NomadJobFailed
        expr: nomad_nomad_job_summary_failed > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad job failed (instance {{ $labels.instance }})
          description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: NomadBlockedEvaluation
        expr: nomad_nomad_blocked_evals_total_blocked > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad blocked evaluation (instance {{ $labels.instance }})
          description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: NomadJobLost
        expr: nomad_nomad_job_summary_lost > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Nomad job lost (instance {{ $labels.instance }})
          description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
EOH
      }
@@ -69,6 +115,7 @@ EOH
          "--storage.tsdb.retention.time=15d",
        ]
        volumes = [
          "local/nomad-alert-rules.yml:/etc/prometheus/nomad-alert-rules.yml",
          "local/prometheus.yml:/etc/prometheus/prometheus.yml",
          "/mnt/diskstation/nomad/prometheus:/prometheus"
        ]
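The [[ / ]] delimiters on the rules template exist so the {{ $labels.* }} and {{ $value }} placeholders pass through Nomad's renderer untouched and reach Prometheus intact. Once rendered, both files can be linted before a deploy (a sketch: assumes promtool is installed and the rendered files were copied out of the allocation's local/ directory):

    promtool check rules nomad-alert-rules.yml
    promtool check config prometheus.yml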
promtail.nomad (new file, 94 lines)
@@ -0,0 +1,94 @@
job "promtail" {
  datacenters = ["homelab", "hetzner"]
  type        = "system"
  meta {
    forcedeploy = "0"
  }

  group "promtail" {
    network {
      mode = "host"
      port "http" {
        static = 3200
      }
    }
    task "promtail" {
      driver = "docker"
      service {
        name = "promtail"
        port = "http"
        check {
          name     = "Promtail HTTP"
          type     = "http"
          path     = "/targets"
          interval = "5s"
          timeout  = "2s"

          check_restart {
            limit           = 2
            grace           = "60s"
            ignore_warnings = false
          }
        }
      }
      config {
        image = "grafana/promtail"
        ports = ["http"]
        args = [
          "-config.file=/local/promtail.yml",
          "-server.http-listen-port=${NOMAD_PORT_http}",
        ]
        volumes = [
          "/mnt/diskstation/nomad/promtail:/data",
          "/var/lib/nomad/:/nomad/"
        ]

      }
      env {
        HOSTNAME = "${attr.unique.hostname}"
      }
      template {
        data = <<EOTC
positions:
  filename: /data/positions.yaml
clients:
  - url: http://loki.service.consul:3100/loki/api/v1/push
scrape_configs:
  - job_name: 'nomad-logs'
    consul_sd_configs:
      - server: 'consul.service.consul:8500'
    relabel_configs:
      - source_labels: [__meta_consul_node]
        target_label: __host__
      - source_labels: [__meta_consul_service_metadata_external_source]
        target_label: source
        regex: (.*)
        replacement: '$1'
      - source_labels: [__meta_consul_service_id]
        regex: '_nomad-task-([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})-.*'
        target_label: 'task_id'
        replacement: '$1'
      - source_labels: [__meta_consul_tags]
        regex: ',(app|monitoring),'
        target_label: 'group'
        replacement: '$1'
      - source_labels: [__meta_consul_service]
        target_label: job
      - source_labels: ['__meta_consul_node']
        regex: '(.*)'
        target_label: 'instance'
        replacement: '$1'
      - source_labels: [__meta_consul_service_id]
        regex: '_nomad-task-([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})-.*'
        target_label: '__path__'
        replacement: '/nomad/alloc/$1/alloc/logs/*std*.{?,??}'
EOTC
        destination = "/local/promtail.yml"
      }
      resources {
        memory = 50
      }
    }
  }
}
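To confirm the whole pipeline from Promtail into Loki, query for any Consul-registered service; the job label is copied from __meta_consul_service by the relabel rules above (a sketch: assumes logcli is installed and pointed at the server):

    export LOKI_ADDR=http://loki.service.consul:3100
    logcli query --limit=10 '{job="alertmanager"}'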