[Alerts]Migrate OpsGenie alerts to OnCall

This commit is contained in:
Mario Tischlinger 2023-10-11 15:22:41 +00:00
parent 24d21fdc08
commit b0ca82ab32
7 changed files with 60 additions and 42 deletions

View File

@ -174,7 +174,6 @@ plan_staging:
-var "host=$HOST_STAGING"
-var "timeout=$TIMEOUT"
-var "prom_service_acct=$PROM_SA"
-var "opsgenie_key=$OPSGENIE_KEY"
-var "public_bucket_url=$PUBLIC_BUCKET_URL"
-var "private_bucket=$PRIVATE_BUCKET"
-var "letsencrypt_email=$LE_EMAIL"
@ -207,7 +206,6 @@ deploy_staging:
-var "host=$HOST_STAGING"
-var "timeout=$TIMEOUT"
-var "prom_service_acct=$PROM_SA"
-var "opsgenie_key=$OPSGENIE_KEY"
-var "public_bucket_url=$PUBLIC_BUCKET_URL"
-var "private_bucket=$PRIVATE_BUCKET"
-var "letsencrypt_email=$LE_EMAIL"
@ -239,7 +237,6 @@ plan_production:
-var "host=$HOST"
-var "timeout=$TIMEOUT"
-var "prom_service_acct=$PROM_SA"
-var "opsgenie_key=$OPSGENIE_KEY"
-var "public_bucket_url=$PUBLIC_BUCKET_URL"
-var "private_bucket=$PRIVATE_BUCKET"
-var "letsencrypt_email=$LE_EMAIL"
@ -272,7 +269,6 @@ deploy_production:
-var "host=$HOST"
-var "timeout=$TIMEOUT"
-var "prom_service_acct=$PROM_SA"
-var "opsgenie_key=$OPSGENIE_KEY"
-var "public_bucket_url=$PUBLIC_BUCKET_URL"
-var "private_bucket=$PRIVATE_BUCKET"
-var "letsencrypt_email=$LE_EMAIL"
@ -303,7 +299,6 @@ plan_production_testnet:
-var "instance_type=$INSTANCE_TYPE"
-var "timeout=$TIMEOUT"
-var "prom_service_acct=$PROM_SA"
-var "opsgenie_key=$OPSGENIE_KEY"
-var "public_bucket_url=$PUBLIC_BUCKET_URL"
-var "private_bucket=$PRIVATE_BUCKET"
-var "letsencrypt_email=$LE_EMAIL"
@ -333,7 +328,6 @@ deploy_production_testnet:
-var "instance_type=$INSTANCE_TYPE"
-var "timeout=$TIMEOUT"
-var "prom_service_acct=$PROM_SA"
-var "opsgenie_key=$OPSGENIE_KEY"
-var "public_bucket_url=$PUBLIC_BUCKET_URL"
-var "private_bucket=$PRIVATE_BUCKET"
-var "letsencrypt_email=$LE_EMAIL"

View File

@ -50,7 +50,6 @@ module "blc-mainnet" {
instance_type = var.instance_type[1]
timeout = var.timeout
prom_service_acct = var.prom_service_acct
opsgenie_key = var.opsgenie_key
rpcpass = var.rpcpass
charge_token = var.charge_token
k8s_autossh_lb = var.k8s_autossh_lb
@ -84,7 +83,6 @@ module "blc-testnet" {
instance_type = var.instance_type[1]
timeout = var.timeout
prom_service_acct = var.prom_service_acct
opsgenie_key = var.opsgenie_key
rpcpass = var.rpcpass
charge_token = var.charge_token
k8s_autossh_lb = var.k8s_autossh_lb

View File

@ -15,30 +15,41 @@ write_files:
content: |
#!/bin/bash
# Save the number and names of running containers
NUM_RUNNING_CONT=$(docker ps -q | wc -l)
NEEDED_CONT=8
RUNNING_CONT="$(docker ps --format '{{.Names}}' | tr '\n' ', ' | sed -e 's/,$//g')"
# Set the file path
output_file="/var/tmp/tempmetrics"
scrape_file="/var/tmp/nodeexporter/container_metrics.prom"
# If the running count differs from NEEDED_CONT (8), send an alert to OpsGenie
if [ $${NUM_RUNNING_CONT} != $${NEEDED_CONT} ]
then
curl -s -X POST https://api.opsgenie.com/v2/alerts \
-H "Content-Type: application/json" \
-H "Authorization: GenieKey ${opsgenie_key}" \
-d \
'{
"message": "Satellite API instance does not have all '$${NEEDED_CONT}' containers running",
"alias": "satapi-missing-containers",
"description":"Currently running '$${NUM_RUNNING_CONT}'/'$${NEEDED_CONT}': '$${RUNNING_CONT}'",
"tags": ["SatAPI","Critical"],
"entity":"${announce_addr}",
"priority":"P2"
}'
else
echo "$${NUM_RUNNING_CONT}/$${NEEDED_CONT} containers are running"
# Export one "running_container" gauge per container for the node-exporter
# textfile collector: 1 when docker reports the container as "Up", 0 otherwise.
# Make sure the scrape directory exists, otherwise the mv below fails
mkdir -p "$(dirname "$scrape_file")"
while true; do
# Start each pass from an empty temp file (the redirect creates it if missing)
: > "$output_file"
# List every container, running or not, as "<name> <status text>"
docker ps -a --format "{{.Names}} {{.Status}}" | while read -r container_name status _; do
# Only the first word of the status is compared; the rest of the line is dropped into "_"
if [[ "$status" == "Up" ]]; then
echo "running_container{cont=\"$container_name\"} 1" >> "$output_file"
else
echo "running_container{cont=\"$container_name\"} 0" >> "$output_file"
fi
done
# Move the finished file into place so the scraper never reads a partial write
mv "$output_file" "$scrape_file"
# Sleep for 30 seconds before the next iteration
sleep 30
done
- path: /etc/systemd/system/check-containers.service
permissions: 0644
owner: root
@ -50,17 +61,19 @@ write_files:
[Service]
ExecStart=/bin/bash /home/bs/check_containers.sh
Restart=always
RestartSec=1
User=root
- path: /etc/systemd/system/check-containers.timer
permissions: 0644
owner: root
content: |
[Unit]
Description=Run check-containers service every 10 minutes (7 min delay)
Description=Run check-containers service after initial 7min delay
[Timer]
OnBootSec=420s
OnUnitActiveSec=10m
Persistent=true
[Install]
@ -88,7 +101,7 @@ write_files:
-v /proc:/host/proc:ro \
-v /sys:/host/sys:ro \
-v /:/rootfs:ro \
-v metrics:/metrics:ro \
-v /var/tmp/nodeexporter:/metrics:ro \
-v /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro \
"${node_exporter_docker}" --path.procfs /host/proc --path.sysfs /host/sys --collector.textfile.directory /metrics --collector.filesystem.ignored-mount-points "^/(sys|proc|dev|host|etc($|/))" --collector.systemd
ExecStop=/usr/bin/docker stop node-exporter
@ -335,4 +348,4 @@ runcmd:
- systemctl enable --now api-workers.service
- systemctl enable --now sse-server.service
- systemctl enable --now node-exporter.service
- systemctl enable --now check-containers.timer
- systemctl enable --now check-containers.timer

View File

@ -23,7 +23,6 @@ data "template_file" "blc" {
node_exporter_docker = var.node_exporter_docker
autossh_docker = var.autossh_docker
certbot_docker = var.certbot_docker
opsgenie_key = var.opsgenie_key
k8s_autossh_lb = var.k8s_autossh_lb
rpcpass = var.rpcpass
k8s_autossh_ssh_port = var.net == "testnet" ? "2222" : "2223"

View File

@ -48,10 +48,6 @@ variable "timeout" {
type = string
}
variable "opsgenie_key" {
type = string
}
variable "prom_service_acct" {
type = string
}

View File

@ -67,6 +67,28 @@ write_files:
zone: us-west1-c
port: 9100
# Scrape job for port 9100 on GCE instances discovered in project
# satellite-api, zones us-east1-b/c/d.
# NOTE(review): presumably these are the iridium-analyzer hosts running
# node-exporter (inferred from the job name and port) - confirm.
- job_name: iridium-analyzer-node-exporter
relabel_configs:
# Copy GCE discovery metadata onto each scraped series as plain labels.
- source_labels:
- '__meta_gce_label_network'
target_label: 'network'
- source_labels:
- '__meta_gce_label_name'
target_label: 'name'
- source_labels:
- '__meta_gce_instance_name'
target_label: 'instance_name'
gce_sd_configs:
# One discovery entry per zone, all scraping port 9100.
- project: satellite-api
zone: us-east1-b
port: 9100
- project: satellite-api
zone: us-east1-c
port: 9100
- project: satellite-api
zone: us-east1-d
port: 9100
- job_name: lightningd
relabel_configs:
- source_labels:

View File

@ -102,10 +102,6 @@ variable "prom_allowed_source_ip" {
default = []
}
variable "opsgenie_key" {
type = string
default = ""
}
variable "satellite_lb" {
type = string