From 6b6448437e17006b1b5d2a802c4e637f18009734 Mon Sep 17 00:00:00 2001 From: nitramiz Date: Mon, 30 Aug 2021 11:53:40 -0700 Subject: [PATCH] tf: update prometheus instance --- .gitlab-ci.yml | 4 +- terraform/main.tf | 3 +- .../prometheus/cloud-init/prometheus.yml | 131 ++---------------- terraform/modules/prometheus/data.tf | 1 - terraform/modules/prometheus/firewall.tf | 2 +- terraform/modules/prometheus/main.tf | 2 +- terraform/modules/prometheus/variables.tf | 7 +- terraform/variables.tf | 10 +- 8 files changed, 20 insertions(+), 140 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8f53baf..64c93c1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -7,7 +7,7 @@ variables: API_IMAGE: us.gcr.io/satellite-api/satellite-api SSE_IMAGE: us.gcr.io/satellite-api/satellite-api-sse -image: blockstream/gcloud-docker:v0.14.5 +image: blockstream/gcloud-docker:1.0.5 stages: - test - build @@ -363,7 +363,6 @@ plan_misc: -var "onion_host=$ONION_HOST" -var "prom_allowed_source_ip=$PROMETHEUS_ALLOWED_SOURCE_IP" -var "prom_service_acct=$PROM_SA" - -var "opsgenie_key=$OPSGENIE_KEY" -var "satellite_lb=$SATELLITE_LB" -var "satellite_api_lb=$SATELLITE_API_LB" -var "satellite_api_lb_staging=$SATELLITE_API_LB_STAGING" @@ -392,7 +391,6 @@ deploy_misc: -var "onion_host=$ONION_HOST" -var "prom_allowed_source_ip=$PROMETHEUS_ALLOWED_SOURCE_IP" -var "prom_service_acct=$PROM_SA" - -var "opsgenie_key=$OPSGENIE_KEY" -var "satellite_lb=$SATELLITE_LB" -var "satellite_api_lb=$SATELLITE_API_LB" -var "satellite_api_lb_staging=$SATELLITE_API_LB_STAGING" diff --git a/terraform/main.tf b/terraform/main.tf index f5243b5..cfde802 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -7,7 +7,7 @@ terraform { source = "hashicorp/google-beta" } } - required_version = ">= 0.14" + required_version = ">= 0.15" backend "gcs" { bucket = "terraform-bs-source" @@ -169,7 +169,6 @@ module "prometheus" { zone = var.zone instance_type = var.instance_type[1] prom_allowed_source_ip = var.prom_allowed_source_ip - opsgenie_key = var.opsgenie_key prom_service_acct = var.prom_service_acct } diff --git a/terraform/modules/prometheus/cloud-init/prometheus.yml b/terraform/modules/prometheus/cloud-init/prometheus.yml index 4f15747..41814e8 100644 --- a/terraform/modules/prometheus/cloud-init/prometheus.yml +++ b/terraform/modules/prometheus/cloud-init/prometheus.yml @@ -9,42 +9,6 @@ users: uid: 2000 write_files: - - path: /home/bs/prometheus/alertmanager.yml - permissions: 0644 - owner: root - content: | - route: - group_by: [cluster, alertname] - # If an alert isn't caught by a route, send it to the pager. - receiver: noc-pager - routes: - - match: - severity: page - receiver: noc-pager - - receivers: - - name: noc-pager - opsgenie_configs: - - api_key: ${opsgenie_key} - teams: SecOps - tags: satellite-api - - - path: /home/bs/prometheus/rules/alerts.yml - permissions: 0644 - owner: root - content: | - groups: - - name: node - rules: - #- alert: NoHostsInNetwork - # expr: sum by (name) (up{name=~".+"}) == 0 - # for: 1m - # labels: - # severity: page - # annotations: - # summary: No hosts in network {{ $labels.name }}, production traffic impacted! - # description: There are currently no hosts up in the network {{ $labels.name }}, verify the instance groups. https://wiki.blockstream.io/OpsPlaybooks/Esplora-Runbooks#NoHostsInNetwork - - path: /home/bs/prometheus/prometheus.yml permissions: 0644 owner: root @@ -55,16 +19,6 @@ write_files: external_labels: project: satellite-api - rule_files: - - /config/rules/alerts.yml - - alerting: - alertmanagers: - - scheme: http - static_configs: - - targets: - - "127.0.0.1:9093" - scrape_configs: - job_name: prometheus relabel_configs: @@ -91,7 +45,7 @@ write_files: zone: us-west1-c port: 9100 - - job_name: satellite-api + - job_name: satellite-api-node-exporter relabel_configs: - source_labels: - '__meta_gce_label_network' @@ -104,40 +58,12 @@ write_files: target_label: 'instance_name' gce_sd_configs: - project: satellite-api - filter: (labels.type = "lightning-app") zone: us-west1-a port: 9100 - project: satellite-api - filter: (labels.type = "lightning-app") zone: us-west1-b port: 9100 - project: satellite-api - filter: (labels.type = "lightning-app") - zone: us-west1-c - port: 9100 - - - job_name: satellite-api-tor - relabel_configs: - - source_labels: - - '__meta_gce_label_network' - target_label: 'network' - - source_labels: - - '__meta_gce_label_name' - target_label: 'name' - - source_labels: - - '__meta_gce_instance_name' - target_label: 'instance_name' - gce_sd_configs: - - project: satellite-api - filter: (labels.type = "tor") - zone: us-west1-a - port: 9100 - - project: satellite-api - filter: (labels.type = "tor") - zone: us-west1-b - port: 9100 - - project: satellite-api - filter: (labels.type = "tor") zone: us-west1-c port: 9100 @@ -156,16 +82,15 @@ write_files: - project: satellite-api filter: (labels.type = "lightning-app-blc") zone: us-west1-a - port: 9900 + port: 9750 - project: satellite-api filter: (labels.type = "lightning-app-blc") zone: us-west1-b - port: 9900 + port: 9750 - project: satellite-api filter: (labels.type = "lightning-app-blc") zone: us-west1-c - port: 9900 - + port: 9750 - path: /etc/systemd/system/prometheus.service permissions: 0644 @@ -180,7 +105,6 @@ write_files: Restart=always RestartSec=1 Environment=HOME=/home/bs - ExecStartPre=/usr/bin/docker-credential-gcr configure-docker ExecStartPre=/usr/bin/docker pull ${prom_docker} ExecStartPre=/sbin/iptables -A INPUT -m tcp -p tcp --dport 9090 -j ACCEPT ExecStartPre=/sbin/iptables -A INPUT -m tcp -p tcp --dport 80 -j ACCEPT @@ -193,7 +117,7 @@ write_files: --name prometheus \ "${prom_docker}" --config.file=/config/prometheus.yml --web.enable-lifecycle --web.enable-admin-api --storage.tsdb.path=/data/metrics --storage.tsdb.retention=${retention} ExecStop=/usr/bin/docker stop prometheus - ExecStopPost=/usr/bin/docker rm prometheus + ExecStopPost=-/usr/bin/docker rm prometheus ExecStopPost=/sbin/iptables -D INPUT -m tcp -p tcp --dport 9090 -j ACCEPT ExecStopPost=/sbin/iptables -D INPUT -m tcp -p tcp --dport 80 -j ACCEPT ExecStopPost=/sbin/iptables -D PREROUTING -t nat -p tcp --dport 80 -j REDIRECT --to-port 9090 @@ -201,37 +125,6 @@ write_files: [Install] WantedBy=multi-user.target - - path: /etc/systemd/system/alertmanager.service - permissions: 0644 - owner: root - content: | - [Unit] - Description=alertmanager-server instance - Wants=gcr-online.target docker.service - After=gcr-online.service docker.service - - [Service] - Restart=always - RestartSec=1 - Environment=HOME=/home/bs - ExecStartPre=/usr/bin/docker-credential-gcr configure-docker - ExecStartPre=/usr/bin/docker pull ${prom_docker} - ExecStartPre=/sbin/iptables -A INPUT -m tcp -p tcp --dport 9093 -j ACCEPT - ExecStart=/usr/bin/docker run \ - --network=host \ - -v /mnt/disks/data:/data:rw \ - -v /home/bs/prometheus:/config:ro \ - --read-only \ - --name alertmanager \ - --entrypoint=/bin/alertmanager \ - "${prom_docker}" --config.file=/config/alertmanager.yml - ExecStop=/usr/bin/docker stop alertmanager - ExecStopPost=/usr/bin/docker rm alertmanager - ExecStopPost=/sbin/iptables -D INPUT -m tcp -p tcp --dport 9093 -j ACCEPT - - [Install] - WantedBy=multi-user.target - - path: /etc/systemd/system/node-exporter.service permissions: 0644 owner: root @@ -245,7 +138,6 @@ write_files: Restart=always RestartSec=1 Environment=HOME=/home/bs - ExecStartPre=/usr/bin/docker-credential-gcr configure-docker ExecStartPre=/usr/bin/docker pull ${node_exporter_docker} ExecStartPre=/sbin/iptables -A INPUT -m tcp -p tcp --dport 9100 -j ACCEPT ExecStart=/usr/bin/docker run \ @@ -256,9 +148,10 @@ write_files: -v /sys:/host/sys:ro \ -v /:/rootfs:ro \ -v metrics:/metrics:ro \ - "${node_exporter_docker}" --path.procfs /host/proc --path.sysfs /host/sys --collector.textfile.directory /metrics --collector.filesystem.ignored-mount-points "^/(sys|proc|dev|host|etc($|/))" + -v /var/run/dbus/system_bus_socket:/var/run/dbus/system_bus_socket:ro \ + "${node_exporter_docker}" --path.procfs /host/proc --path.sysfs /host/sys --collector.textfile.directory /metrics --collector.filesystem.ignored-mount-points "^/(sys|proc|dev|host|etc($|/))" --collector.systemd ExecStop=/usr/bin/docker stop node-exporter - ExecStopPost=/usr/bin/docker rm node-exporter + ExecStopPost=-/usr/bin/docker rm node-exporter ExecStopPost=/sbin/iptables -D INPUT -m tcp -p tcp --dport 9100 -j ACCEPT [Install] @@ -268,9 +161,5 @@ runcmd: - /bin/mkdir -p /mnt/disks/data/metrics - /bin/chown nobody:nobody /mnt/disks/data/metrics - systemctl daemon-reload - - systemctl start prometheus.service - - systemctl enable prometheus.service - - systemctl start alertmanager.service - - systemctl enable alertmanager.service - - systemctl start node-exporter.service - - systemctl enable node-exporter.service + - systemctl enable --now prometheus.service + - systemctl enable --now node-exporter.service diff --git a/terraform/modules/prometheus/data.tf b/terraform/modules/prometheus/data.tf index 3ca1326..4913ae9 100644 --- a/terraform/modules/prometheus/data.tf +++ b/terraform/modules/prometheus/data.tf @@ -12,7 +12,6 @@ data "template_file" "prometheus" { prom_docker = var.prom_docker node_exporter_docker = var.node_exporter_docker retention = var.retention - opsgenie_key = var.opsgenie_key } } diff --git a/terraform/modules/prometheus/firewall.tf b/terraform/modules/prometheus/firewall.tf index d9de90b..1e159d0 100644 --- a/terraform/modules/prometheus/firewall.tf +++ b/terraform/modules/prometheus/firewall.tf @@ -9,7 +9,7 @@ resource "google_compute_firewall" "all-traffic" { ports = ["80"] } - source_ranges = ["${var.prom_allowed_source_ip}/32"] + source_ranges = var.prom_allowed_source_ip target_service_accounts = [ google_service_account.prometheus[0].email, diff --git a/terraform/modules/prometheus/main.tf b/terraform/modules/prometheus/main.tf index ac72e37..9cf11fa 100644 --- a/terraform/modules/prometheus/main.tf +++ b/terraform/modules/prometheus/main.tf @@ -52,7 +52,7 @@ resource "google_compute_instance" "prometheus-server" { boot_disk { initialize_params { - size = "10" + size = "20" image = var.boot_image } } diff --git a/terraform/modules/prometheus/variables.tf b/terraform/modules/prometheus/variables.tf index 67c357f..2128ac1 100644 --- a/terraform/modules/prometheus/variables.tf +++ b/terraform/modules/prometheus/variables.tf @@ -42,11 +42,7 @@ variable "prom_service_acct" { } variable "prom_allowed_source_ip" { - type = string -} - -variable "opsgenie_key" { - type = string + type = list(any) } variable "prom_docker" { @@ -56,4 +52,3 @@ variable "prom_docker" { variable "node_exporter_docker" { type = string } - diff --git a/terraform/variables.tf b/terraform/variables.tf index ff2b2b3..1f151ba 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -98,8 +98,8 @@ variable "lb_svc_acct" { } variable "prom_allowed_source_ip" { - type = string - default = "" + type = list(any) + default = [] } variable "opsgenie_key" { @@ -227,17 +227,17 @@ variable "charge_docker" { variable "tor_docker" { type = string - default = "blockstream/tor@sha256:46594b0a84f7503de70078652e7bd94f6152b7976d11779ad9f143f02508284c" + default = "blockstream/tor:0.4.3.7" } variable "node_exporter_docker" { type = string - default = "prom/node-exporter@sha256:55302581333c43d540db0e144cf9e7735423117a733cdec27716d87254221086" + default = "prom/node-exporter:v1.1.2" } variable "prom_docker" { type = string - default = "blockstream/prometheus@sha256:cab8c2359ab187aa6c9e9c7fcfcc3060b62742417030a77862c747e091d3c6d6" + default = "prom/prometheus:v2.29.1" } variable "gcloud_docker" {