From b0ca82ab3226da28a31e8c6cae205209bc0b73a3 Mon Sep 17 00:00:00 2001 From: Mario Tischlinger Date: Wed, 11 Oct 2023 15:22:41 +0000 Subject: [PATCH] [Alerts]Migrate OpsGenie alerts to OnCall --- .gitlab-ci.yml | 6 -- terraform/main.tf | 2 - terraform/modules/blc/cloud-init/blc.yaml | 63 +++++++++++-------- terraform/modules/blc/data.tf | 1 - terraform/modules/blc/variables.tf | 4 -- .../prometheus/cloud-init/prometheus.yml | 22 +++++++ terraform/variables.tf | 4 -- 7 files changed, 60 insertions(+), 42 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1653fe6..d8873d0 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -174,7 +174,6 @@ plan_staging: -var "host=$HOST_STAGING" -var "timeout=$TIMEOUT" -var "prom_service_acct=$PROM_SA" - -var "opsgenie_key=$OPSGENIE_KEY" -var "public_bucket_url=$PUBLIC_BUCKET_URL" -var "private_bucket=$PRIVATE_BUCKET" -var "letsencrypt_email=$LE_EMAIL" @@ -207,7 +206,6 @@ deploy_staging: -var "host=$HOST_STAGING" -var "timeout=$TIMEOUT" -var "prom_service_acct=$PROM_SA" - -var "opsgenie_key=$OPSGENIE_KEY" -var "public_bucket_url=$PUBLIC_BUCKET_URL" -var "private_bucket=$PRIVATE_BUCKET" -var "letsencrypt_email=$LE_EMAIL" @@ -239,7 +237,6 @@ plan_production: -var "host=$HOST" -var "timeout=$TIMEOUT" -var "prom_service_acct=$PROM_SA" - -var "opsgenie_key=$OPSGENIE_KEY" -var "public_bucket_url=$PUBLIC_BUCKET_URL" -var "private_bucket=$PRIVATE_BUCKET" -var "letsencrypt_email=$LE_EMAIL" @@ -272,7 +269,6 @@ deploy_production: -var "host=$HOST" -var "timeout=$TIMEOUT" -var "prom_service_acct=$PROM_SA" - -var "opsgenie_key=$OPSGENIE_KEY" -var "public_bucket_url=$PUBLIC_BUCKET_URL" -var "private_bucket=$PRIVATE_BUCKET" -var "letsencrypt_email=$LE_EMAIL" @@ -303,7 +299,6 @@ plan_production_testnet: -var "instance_type=$INSTANCE_TYPE" -var "timeout=$TIMEOUT" -var "prom_service_acct=$PROM_SA" - -var "opsgenie_key=$OPSGENIE_KEY" -var "public_bucket_url=$PUBLIC_BUCKET_URL" -var "private_bucket=$PRIVATE_BUCKET" -var "letsencrypt_email=$LE_EMAIL" @@ -333,7 +328,6 @@ deploy_production_testnet: -var "instance_type=$INSTANCE_TYPE" -var "timeout=$TIMEOUT" -var "prom_service_acct=$PROM_SA" - -var "opsgenie_key=$OPSGENIE_KEY" -var "public_bucket_url=$PUBLIC_BUCKET_URL" -var "private_bucket=$PRIVATE_BUCKET" -var "letsencrypt_email=$LE_EMAIL" diff --git a/terraform/main.tf b/terraform/main.tf index c8bc2b3..7b0e64e 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -50,7 +50,6 @@ module "blc-mainnet" { instance_type = var.instance_type[1] timeout = var.timeout prom_service_acct = var.prom_service_acct - opsgenie_key = var.opsgenie_key rpcpass = var.rpcpass charge_token = var.charge_token k8s_autossh_lb = var.k8s_autossh_lb @@ -84,7 +83,6 @@ module "blc-testnet" { instance_type = var.instance_type[1] timeout = var.timeout prom_service_acct = var.prom_service_acct - opsgenie_key = var.opsgenie_key rpcpass = var.rpcpass charge_token = var.charge_token k8s_autossh_lb = var.k8s_autossh_lb diff --git a/terraform/modules/blc/cloud-init/blc.yaml b/terraform/modules/blc/cloud-init/blc.yaml index 1222833..3222473 100644 --- a/terraform/modules/blc/cloud-init/blc.yaml +++ b/terraform/modules/blc/cloud-init/blc.yaml @@ -15,30 +15,41 @@ write_files: content: | #!/bin/bash - # Save # and names of running containers - NUM_RUNNING_CONT=$(docker ps -q | wc -l) - NEEDED_CONT=8 - RUNNING_CONT="$(docker ps --format '{{.Names}}' | tr '\n' ', ' | sed -e 's/,$//g')" + # Set the file path + output_file="/var/tmp/tempmetrics" + scrape_file="/var/tmp/nodeexporter/container_metrics.prom" - # If less than 8 are running, send alert to opsgenie - if [ $${NUM_RUNNING_CONT} != $${NEEDED_CONT} ] - then - curl -s -X POST https://api.opsgenie.com/v2/alerts \ - -H "Content-Type: application/json" \ - -H "Authorization: GenieKey ${opsgenie_key}" \ - -d \ - '{ - "message": "Satellite API instance does not have all '$${NEEDED_CONT}' containers running", - "alias": "satapi-missing-containers", - "description":"Currently running '$${NUM_RUNNING_CONT}'/'$${NEEDED_CONT}': '$${RUNNING_CONT}'", - "tags": ["SatAPI","Critical"], - "entity":"${announce_addr}", - "priority":"P2" - }' - else - echo "$${NUM_RUNNING_CONT}/$${NEEDED_CONT} containers are running" + # Check if the file exists, if not, create it + if [[ ! -f "$output_file" ]]; then + touch "$output_file" fi + while true; do + # Empty the file to start fresh + > "$output_file" + + # Fetch the list of all container names and their statuses + docker ps -a --format "{{.Names}} {{.Status}}" | while read -r line; do + # Split the line into container name and status + container_name=$(echo "$line" | awk '{print $1}') + status=$(echo "$line" | awk '{print $2}') + + # Check if the container status is Running + if [[ "$status" == "Up" ]]; then + echo "running_container{cont=\"$container_name\"} 1" >> "$output_file" + else + echo "running_container{cont=\"$container_name\"} 0" >> "$output_file" + fi + done + + # move file for nodeexporter to scrape when finished writing it + mv $output_file $scrape_file + + # Sleep for 30 seconds before the next iteration + sleep 30 + done + + - path: /etc/systemd/system/check-containers.service permissions: 0644 owner: root @@ -50,17 +61,19 @@ write_files: [Service] ExecStart=/bin/bash /home/bs/check_containers.sh + Restart=always + RestartSec=1 + User=root - path: /etc/systemd/system/check-containers.timer permissions: 0644 owner: root content: | [Unit] - Description=Run check-containers service every 10 minutes (7 min delay) + Description=Run check-containers service after initial 7min delay [Timer] OnBootSec=420s - OnUnitActiveSec=10m Persistent=true [Install] @@ -88,7 +101,7 @@ write_files: -v /proc:/host/proc:ro \ -v /sys:/host/sys:ro \ -v /:/rootfs:ro \ - -v metrics:/metrics:ro \ + -v /var/tmp/nodeexporter:/metrics:ro \ -v /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro \ "${node_exporter_docker}" --path.procfs /host/proc --path.sysfs /host/sys --collector.textfile.directory /metrics --collector.filesystem.ignored-mount-points "^/(sys|proc|dev|host|etc($|/))" --collector.systemd ExecStop=/usr/bin/docker stop node-exporter @@ -335,4 +348,4 @@ runcmd: - systemctl enable --now api-workers.service - systemctl enable --now sse-server.service - systemctl enable --now node-exporter.service - - systemctl enable --now check-containers.timer + - systemctl enable --now check-containers.timer \ No newline at end of file diff --git a/terraform/modules/blc/data.tf b/terraform/modules/blc/data.tf index 255a4e5..a1dda3a 100644 --- a/terraform/modules/blc/data.tf +++ b/terraform/modules/blc/data.tf @@ -23,7 +23,6 @@ data "template_file" "blc" { node_exporter_docker = var.node_exporter_docker autossh_docker = var.autossh_docker certbot_docker = var.certbot_docker - opsgenie_key = var.opsgenie_key k8s_autossh_lb = var.k8s_autossh_lb rpcpass = var.rpcpass k8s_autossh_ssh_port = var.net == "testnet" ? "2222" : "2223" diff --git a/terraform/modules/blc/variables.tf b/terraform/modules/blc/variables.tf index c5398fd..ab3d22c 100644 --- a/terraform/modules/blc/variables.tf +++ b/terraform/modules/blc/variables.tf @@ -48,10 +48,6 @@ variable "timeout" { type = string } -variable "opsgenie_key" { - type = string -} - variable "prom_service_acct" { type = string } diff --git a/terraform/modules/prometheus/cloud-init/prometheus.yml b/terraform/modules/prometheus/cloud-init/prometheus.yml index 41814e8..872398a 100644 --- a/terraform/modules/prometheus/cloud-init/prometheus.yml +++ b/terraform/modules/prometheus/cloud-init/prometheus.yml @@ -67,6 +67,28 @@ write_files: zone: us-west1-c port: 9100 + - job_name: iridium-analyzer-node-exporter + relabel_configs: + - source_labels: + - '__meta_gce_label_network' + target_label: 'network' + - source_labels: + - '__meta_gce_label_name' + target_label: 'name' + - source_labels: + - '__meta_gce_instance_name' + target_label: 'instance_name' + gce_sd_configs: + - project: satellite-api + zone: us-east1-b + port: 9100 + - project: satellite-api + zone: us-east1-c + port: 9100 + - project: satellite-api + zone: us-east1-d + port: 9100 + - job_name: lightningd relabel_configs: - source_labels: diff --git a/terraform/variables.tf b/terraform/variables.tf index 110e80e..ab51bb0 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -102,10 +102,6 @@ variable "prom_allowed_source_ip" { default = [] } -variable "opsgenie_key" { - type = string - default = "" -} variable "satellite_lb" { type = string