mirror of
https://github.com/Blockstream/satellite-api.git
synced 2024-11-18 20:40:02 +01:00
[Alerts]Migrate OpsGenie alerts to OnCall
This commit is contained in:
parent
24d21fdc08
commit
b0ca82ab32
@ -174,7 +174,6 @@ plan_staging:
|
||||
-var "host=$HOST_STAGING"
|
||||
-var "timeout=$TIMEOUT"
|
||||
-var "prom_service_acct=$PROM_SA"
|
||||
-var "opsgenie_key=$OPSGENIE_KEY"
|
||||
-var "public_bucket_url=$PUBLIC_BUCKET_URL"
|
||||
-var "private_bucket=$PRIVATE_BUCKET"
|
||||
-var "letsencrypt_email=$LE_EMAIL"
|
||||
@ -207,7 +206,6 @@ deploy_staging:
|
||||
-var "host=$HOST_STAGING"
|
||||
-var "timeout=$TIMEOUT"
|
||||
-var "prom_service_acct=$PROM_SA"
|
||||
-var "opsgenie_key=$OPSGENIE_KEY"
|
||||
-var "public_bucket_url=$PUBLIC_BUCKET_URL"
|
||||
-var "private_bucket=$PRIVATE_BUCKET"
|
||||
-var "letsencrypt_email=$LE_EMAIL"
|
||||
@ -239,7 +237,6 @@ plan_production:
|
||||
-var "host=$HOST"
|
||||
-var "timeout=$TIMEOUT"
|
||||
-var "prom_service_acct=$PROM_SA"
|
||||
-var "opsgenie_key=$OPSGENIE_KEY"
|
||||
-var "public_bucket_url=$PUBLIC_BUCKET_URL"
|
||||
-var "private_bucket=$PRIVATE_BUCKET"
|
||||
-var "letsencrypt_email=$LE_EMAIL"
|
||||
@ -272,7 +269,6 @@ deploy_production:
|
||||
-var "host=$HOST"
|
||||
-var "timeout=$TIMEOUT"
|
||||
-var "prom_service_acct=$PROM_SA"
|
||||
-var "opsgenie_key=$OPSGENIE_KEY"
|
||||
-var "public_bucket_url=$PUBLIC_BUCKET_URL"
|
||||
-var "private_bucket=$PRIVATE_BUCKET"
|
||||
-var "letsencrypt_email=$LE_EMAIL"
|
||||
@ -303,7 +299,6 @@ plan_production_testnet:
|
||||
-var "instance_type=$INSTANCE_TYPE"
|
||||
-var "timeout=$TIMEOUT"
|
||||
-var "prom_service_acct=$PROM_SA"
|
||||
-var "opsgenie_key=$OPSGENIE_KEY"
|
||||
-var "public_bucket_url=$PUBLIC_BUCKET_URL"
|
||||
-var "private_bucket=$PRIVATE_BUCKET"
|
||||
-var "letsencrypt_email=$LE_EMAIL"
|
||||
@ -333,7 +328,6 @@ deploy_production_testnet:
|
||||
-var "instance_type=$INSTANCE_TYPE"
|
||||
-var "timeout=$TIMEOUT"
|
||||
-var "prom_service_acct=$PROM_SA"
|
||||
-var "opsgenie_key=$OPSGENIE_KEY"
|
||||
-var "public_bucket_url=$PUBLIC_BUCKET_URL"
|
||||
-var "private_bucket=$PRIVATE_BUCKET"
|
||||
-var "letsencrypt_email=$LE_EMAIL"
|
||||
|
@ -50,7 +50,6 @@ module "blc-mainnet" {
|
||||
instance_type = var.instance_type[1]
|
||||
timeout = var.timeout
|
||||
prom_service_acct = var.prom_service_acct
|
||||
opsgenie_key = var.opsgenie_key
|
||||
rpcpass = var.rpcpass
|
||||
charge_token = var.charge_token
|
||||
k8s_autossh_lb = var.k8s_autossh_lb
|
||||
@ -84,7 +83,6 @@ module "blc-testnet" {
|
||||
instance_type = var.instance_type[1]
|
||||
timeout = var.timeout
|
||||
prom_service_acct = var.prom_service_acct
|
||||
opsgenie_key = var.opsgenie_key
|
||||
rpcpass = var.rpcpass
|
||||
charge_token = var.charge_token
|
||||
k8s_autossh_lb = var.k8s_autossh_lb
|
||||
|
@ -15,30 +15,41 @@ write_files:
|
||||
content: |
|
||||
#!/bin/bash
|
||||
|
||||
# Save # and names of running containers
|
||||
NUM_RUNNING_CONT=$(docker ps -q | wc -l)
|
||||
NEEDED_CONT=8
|
||||
RUNNING_CONT="$(docker ps --format '{{.Names}}' | tr '\n' ', ' | sed -e 's/,$//g')"
|
||||
# Set the file path
|
||||
output_file="/var/tmp/tempmetrics"
|
||||
scrape_file="/var/tmp/nodeexporter/container_metrics.prom"
|
||||
|
||||
# If less than 8 are running, send alert to opsgenie
|
||||
if [ $${NUM_RUNNING_CONT} != $${NEEDED_CONT} ]
|
||||
then
|
||||
curl -s -X POST https://api.opsgenie.com/v2/alerts \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Authorization: GenieKey ${opsgenie_key}" \
|
||||
-d \
|
||||
'{
|
||||
"message": "Satellite API instance does not have all '$${NEEDED_CONT}' containers running",
|
||||
"alias": "satapi-missing-containers",
|
||||
"description":"Currently running '$${NUM_RUNNING_CONT}'/'$${NEEDED_CONT}': '$${RUNNING_CONT}'",
|
||||
"tags": ["SatAPI","Critical"],
|
||||
"entity":"${announce_addr}",
|
||||
"priority":"P2"
|
||||
}'
|
||||
else
|
||||
echo "$${NUM_RUNNING_CONT}/$${NEEDED_CONT} containers are running"
|
||||
# Check if the file exists, if not, create it
|
||||
if [[ ! -f "$output_file" ]]; then
|
||||
touch "$output_file"
|
||||
fi
|
||||
|
||||
while true; do
|
||||
# Empty the file to start fresh
|
||||
> "$output_file"
|
||||
|
||||
# Fetch the list of all container names and their statuses
|
||||
docker ps -a --format "{{.Names}} {{.Status}}" | while read -r line; do
|
||||
# Split the line into container name and status
|
||||
container_name=$(echo "$line" | awk '{print $1}')
|
||||
status=$(echo "$line" | awk '{print $2}')
|
||||
|
||||
# Check if the container status is Running
|
||||
if [[ "$status" == "Up" ]]; then
|
||||
echo "running_container{cont=\"$container_name\"} 1" >> "$output_file"
|
||||
else
|
||||
echo "running_container{cont=\"$container_name\"} 0" >> "$output_file"
|
||||
fi
|
||||
done
|
||||
|
||||
# move file for nodeexporter to scrape when finished writing it
|
||||
mv $output_file $scrape_file
|
||||
|
||||
# Sleep for 30 seconds before the next iteration
|
||||
sleep 30
|
||||
done
|
||||
|
||||
|
||||
- path: /etc/systemd/system/check-containers.service
|
||||
permissions: 0644
|
||||
owner: root
|
||||
@ -50,17 +61,19 @@ write_files:
|
||||
|
||||
[Service]
|
||||
ExecStart=/bin/bash /home/bs/check_containers.sh
|
||||
Restart=always
|
||||
RestartSec=1
|
||||
User=root
|
||||
|
||||
- path: /etc/systemd/system/check-containers.timer
|
||||
permissions: 0644
|
||||
owner: root
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Run check-containers service every 10 minutes (7 min delay)
|
||||
Description=Run check-containers service after initial 7min delay
|
||||
|
||||
[Timer]
|
||||
OnBootSec=420s
|
||||
OnUnitActiveSec=10m
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
@ -88,7 +101,7 @@ write_files:
|
||||
-v /proc:/host/proc:ro \
|
||||
-v /sys:/host/sys:ro \
|
||||
-v /:/rootfs:ro \
|
||||
-v metrics:/metrics:ro \
|
||||
-v /var/tmp/nodeexporter:/metrics:ro \
|
||||
-v /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro \
|
||||
"${node_exporter_docker}" --path.procfs /host/proc --path.sysfs /host/sys --collector.textfile.directory /metrics --collector.filesystem.ignored-mount-points "^/(sys|proc|dev|host|etc($|/))" --collector.systemd
|
||||
ExecStop=/usr/bin/docker stop node-exporter
|
||||
@ -335,4 +348,4 @@ runcmd:
|
||||
- systemctl enable --now api-workers.service
|
||||
- systemctl enable --now sse-server.service
|
||||
- systemctl enable --now node-exporter.service
|
||||
- systemctl enable --now check-containers.timer
|
||||
- systemctl enable --now check-containers.timer
|
@ -23,7 +23,6 @@ data "template_file" "blc" {
|
||||
node_exporter_docker = var.node_exporter_docker
|
||||
autossh_docker = var.autossh_docker
|
||||
certbot_docker = var.certbot_docker
|
||||
opsgenie_key = var.opsgenie_key
|
||||
k8s_autossh_lb = var.k8s_autossh_lb
|
||||
rpcpass = var.rpcpass
|
||||
k8s_autossh_ssh_port = var.net == "testnet" ? "2222" : "2223"
|
||||
|
@ -48,10 +48,6 @@ variable "timeout" {
|
||||
type = string
|
||||
}
|
||||
|
||||
variable "opsgenie_key" {
|
||||
type = string
|
||||
}
|
||||
|
||||
variable "prom_service_acct" {
|
||||
type = string
|
||||
}
|
||||
|
@ -67,6 +67,28 @@ write_files:
|
||||
zone: us-west1-c
|
||||
port: 9100
|
||||
|
||||
- job_name: iridium-analyzer-node-exporter
|
||||
relabel_configs:
|
||||
- source_labels:
|
||||
- '__meta_gce_label_network'
|
||||
target_label: 'network'
|
||||
- source_labels:
|
||||
- '__meta_gce_label_name'
|
||||
target_label: 'name'
|
||||
- source_labels:
|
||||
- '__meta_gce_instance_name'
|
||||
target_label: 'instance_name'
|
||||
gce_sd_configs:
|
||||
- project: satellite-api
|
||||
zone: us-east1-b
|
||||
port: 9100
|
||||
- project: satellite-api
|
||||
zone: us-east1-c
|
||||
port: 9100
|
||||
- project: satellite-api
|
||||
zone: us-east1-d
|
||||
port: 9100
|
||||
|
||||
- job_name: lightningd
|
||||
relabel_configs:
|
||||
- source_labels:
|
||||
|
@ -102,10 +102,6 @@ variable "prom_allowed_source_ip" {
|
||||
default = []
|
||||
}
|
||||
|
||||
variable "opsgenie_key" {
|
||||
type = string
|
||||
default = ""
|
||||
}
|
||||
|
||||
variable "satellite_lb" {
|
||||
type = string
|
||||
|
Loading…
Reference in New Issue
Block a user