From 9b143e76380c5f25477bf6cd8dc105c667c6e7d4 Mon Sep 17 00:00:00 2001 From: ObNitram Date: Sun, 18 Jan 2026 15:36:23 +0100 Subject: [PATCH] Add observability stack with Grafana, Prometheus, Loki, and cAdvisor - Introduced a new Grafana dashboard for monitoring Docker containers using cAdvisor. - Created provisioning files for Grafana dashboards and data sources. - Added Dockerfiles and configuration files for Loki and Prometheus. - Implemented a Docker Compose stack for the observability services. - Configured Traefik as a reverse proxy for the services with appropriate routing. - Added scripts for SSH tunneling to access the telemetry dashboard. - Included secrets management for Grafana admin credentials. --- .github/workflows/action-build-and-push.yaml | 71 + .../workflows/b&p-observability-alloy.yaml | 17 + .../workflows/b&p-observability-grafana.yaml | 17 + .github/workflows/b&p-observability-loki.yaml | 17 + .../b&p-observability-prometheus.yaml | 17 + observability/alloy/Dockerfile | 3 + observability/alloy/config.alloy | 29 + observability/dev-stack.yaml | 159 + observability/grafana/Dockerfile | 11 + observability/grafana/grafana.ini | 41 + .../provisioning/dashboards/17346_rev9.json | 1570 ++ .../provisioning/dashboards/1860_rev42.json | 15766 ++++++++++++++++ .../provisioning/dashboards/19908_rev1.json | 991 + .../provisioning/dashboards/default.yml | 27 + .../provisioning/datasources/datasource.yml | 18 + observability/loki/Dockerfile | 7 + observability/loki/loki-config.yaml | 51 + observability/prometheus/Dockerfile | 13 + observability/prometheus/prometheus.yaml | 41 + .../secrets/grafana-admin-password.txt | 1 + observability/secrets/grafana-admin-user.txt | 1 + observability/stack.yaml | 149 + scripts/bind-telemetry.sh | 18 + traefik/compose.yaml | 104 + 24 files changed, 19139 insertions(+) create mode 100644 .github/workflows/action-build-and-push.yaml create mode 100644 .github/workflows/b&p-observability-alloy.yaml create mode 100644 
.github/workflows/b&p-observability-grafana.yaml create mode 100644 .github/workflows/b&p-observability-loki.yaml create mode 100644 .github/workflows/b&p-observability-prometheus.yaml create mode 100644 observability/alloy/Dockerfile create mode 100644 observability/alloy/config.alloy create mode 100644 observability/dev-stack.yaml create mode 100644 observability/grafana/Dockerfile create mode 100644 observability/grafana/grafana.ini create mode 100644 observability/grafana/provisioning/dashboards/17346_rev9.json create mode 100644 observability/grafana/provisioning/dashboards/1860_rev42.json create mode 100644 observability/grafana/provisioning/dashboards/19908_rev1.json create mode 100644 observability/grafana/provisioning/dashboards/default.yml create mode 100644 observability/grafana/provisioning/datasources/datasource.yml create mode 100644 observability/loki/Dockerfile create mode 100644 observability/loki/loki-config.yaml create mode 100644 observability/prometheus/Dockerfile create mode 100644 observability/prometheus/prometheus.yaml create mode 100644 observability/secrets/grafana-admin-password.txt create mode 100644 observability/secrets/grafana-admin-user.txt create mode 100644 observability/stack.yaml create mode 100755 scripts/bind-telemetry.sh create mode 100644 traefik/compose.yaml diff --git a/.github/workflows/action-build-and-push.yaml b/.github/workflows/action-build-and-push.yaml new file mode 100644 index 0000000..2012287 --- /dev/null +++ b/.github/workflows/action-build-and-push.yaml @@ -0,0 +1,71 @@ +name: Docker Build & Push to GitHub Container Registry (Use git tags with format vX.Y.Z as release version) + +on: + workflow_call: + inputs: + image_name: + type: string + required: true + description: > + Docker image name without registry and owner. + Example: observability/alloy + + build_context: + type: string + required: false + default: . + description: > + Path to the Docker build context. + Example: . 
or ./services/api + + file_path: + type: string + required: false + default: ./Dockerfile + description: > + Path to the Dockerfile relative to the repository root. + Example: ./Dockerfile or ./services/api/Dockerfile + +jobs: + build-and-push: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Ensure lowercase username because some registries do not allow uppercase letters + id: lowercase + run: echo "username=${GITHUB_REPOSITORY_OWNER,,}" >> "$GITHUB_OUTPUT" + + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ghcr.io/${{ steps.lowercase.outputs.username }}/${{ inputs.image_name }} + tags: | + type=match,pattern=.*(v\d+\.\d+\.\d+),group=1 + type=match,pattern=.*(v\d+\.\d+)\.\d+,group=1 + type=match,pattern=.*(v\d+)\.\d+\.\d+,group=1 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v6 + with: + context: ${{ inputs.build_context }} + file: ${{ inputs.file_path }} + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/b&p-observability-alloy.yaml b/.github/workflows/b&p-observability-alloy.yaml new file mode 100644 index 0000000..3b067e0 --- /dev/null +++ b/.github/workflows/b&p-observability-alloy.yaml @@ -0,0 +1,17 @@ +name: Build and Push Alloy + +on: + push: + tags: + - "alloy-v*.*.*" + +jobs: + build-and-push: + uses: ./.github/workflows/action-build-and-push.yaml + permissions: + contents: read + packages: write + with: + image_name: observability/alloy + build_context: ./observability/alloy + file_path: ./observability/alloy/Dockerfile \ No newline at end of 
file diff --git a/.github/workflows/b&p-observability-grafana.yaml b/.github/workflows/b&p-observability-grafana.yaml new file mode 100644 index 0000000..033d8ce --- /dev/null +++ b/.github/workflows/b&p-observability-grafana.yaml @@ -0,0 +1,17 @@ +name: Build and Push Grafana Observability + +on: + push: + tags: + - "grafana-v*.*.*" + +jobs: + build-and-push: + uses: ./.github/workflows/action-build-and-push.yaml + permissions: + contents: read + packages: write + with: + image_name: observability/grafana + build_context: ./observability/grafana + file_path: ./observability/grafana/Dockerfile \ No newline at end of file diff --git a/.github/workflows/b&p-observability-loki.yaml b/.github/workflows/b&p-observability-loki.yaml new file mode 100644 index 0000000..1fddf96 --- /dev/null +++ b/.github/workflows/b&p-observability-loki.yaml @@ -0,0 +1,17 @@ +name: Build and Push Loki Database + +on: + push: + tags: + - "loki-v*.*.*" + +jobs: + build-and-push: + uses: ./.github/workflows/action-build-and-push.yaml + permissions: + contents: read + packages: write + with: + image_name: observability/loki + build_context: ./observability/loki + file_path: ./observability/loki/Dockerfile \ No newline at end of file diff --git a/.github/workflows/b&p-observability-prometheus.yaml b/.github/workflows/b&p-observability-prometheus.yaml new file mode 100644 index 0000000..b067c48 --- /dev/null +++ b/.github/workflows/b&p-observability-prometheus.yaml @@ -0,0 +1,17 @@ +name: Build and Push Prometheus Observability + +on: + push: + tags: + - "prometheus-v*.*.*" + +jobs: + build-and-push: + uses: ./.github/workflows/action-build-and-push.yaml + permissions: + contents: read + packages: write + with: + image_name: observability/prometheus + build_context: ./observability/prometheus + file_path: ./observability/prometheus/Dockerfile \ No newline at end of file diff --git a/observability/alloy/Dockerfile b/observability/alloy/Dockerfile new file mode 100644 index 0000000..f232807 --- 
/dev/null +++ b/observability/alloy/Dockerfile @@ -0,0 +1,3 @@ +FROM grafana/alloy:latest + +COPY config.alloy /etc/alloy/config.alloy diff --git a/observability/alloy/config.alloy b/observability/alloy/config.alloy new file mode 100644 index 0000000..9eab798 --- /dev/null +++ b/observability/alloy/config.alloy @@ -0,0 +1,29 @@ + +discovery.docker "containers" { + host = "unix:///var/run/docker.sock" +} + +discovery.relabel "swarm" { + targets = [] + + rule { + source_labels = ["__meta_docker_container_name"] + regex = "^/?(.*)$" + target_label = "container_name" + } +} + +loki.source.docker "docker" { + host = "unix:///var/run/docker.sock" + targets = discovery.docker.containers.targets + relabel_rules = discovery.relabel.swarm.rules + forward_to = [loki.write.default.receiver] + + labels = { job = "docker" } // stable label to query on +} + +loki.write "default" { + endpoint { + url = "http://loki:3100/loki/api/v1/push" + } +} \ No newline at end of file diff --git a/observability/dev-stack.yaml b/observability/dev-stack.yaml new file mode 100644 index 0000000..43da785 --- /dev/null +++ b/observability/dev-stack.yaml @@ -0,0 +1,162 @@ +networks: + observability-network: + driver: overlay + attachable: false + internal: true + driver_opts: + encrypted: "true" + + grafana-frontend-network: + driver: overlay + attachable: false + internal: true + driver_opts: + encrypted: "true" + + swarm-proxy-network: + external: true + +secrets: + grafana-admin-user: + file: ./secrets/grafana-admin-user.txt + grafana-admin-password: + file: ./secrets/grafana-admin-password.txt + +volumes: + grafana-storage: {} + prometheus-storage: {} + loki-storage: {} + +services: + grafana: + image: grafana/grafana:latest + environment: + GF_SECURITY_ADMIN_USER__FILE: /run/secrets/grafana-admin-user + GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/grafana-admin-password + # The Traefik router below forwards /grafana without stripping the prefix, so Grafana must serve from that sub-path. + GF_SERVER_ROOT_URL: "%(protocol)s://%(domain)s:%(http_port)s/grafana/" + GF_SERVER_SERVE_FROM_SUB_PATH: "true" + secrets: + - grafana-admin-user + - grafana-admin-password + volumes: + - grafana-storage:/var/lib/grafana + - 
./grafana/provisioning/:/etc/grafana/provisioning/:ro + - ./grafana/grafana.ini:/etc/grafana/grafana.ini:ro + deploy: + mode: replicated + replicas: 1 + restart_policy: + condition: any + labels: + - "traefik.enable=true" + - "traefik.http.routers.grafana.entrypoints=admin" + - "traefik.http.routers.grafana.rule=Host(`admin.localhost`) && PathPrefix(`/grafana`)" + + - "traefik.http.routers.grafana.middlewares=compress-all" + - "traefik.http.middlewares.compress-all.compress=true" + - "traefik.http.middlewares.compress-all.compress.encodings=zstd, br, gzip" + + - "traefik.http.routers.grafana.service=grafana" + - "traefik.http.services.grafana.loadbalancer.server.port=3000" + ports: + - target: 3000 + published: 3002 + protocol: tcp + mode: ingress + networks: + - swarm-proxy-network + - grafana-frontend-network + + prometheus: + image: prom/prometheus:latest + volumes: + - ./prometheus/prometheus.yaml:/etc/prometheus/prometheus.yml:ro + - prometheus-storage:/prometheus + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + deploy: + mode: replicated + replicas: 1 + restart_policy: + condition: any + placement: + constraints: + - node.role == manager + ports: + - target: 9090 + published: 3001 + protocol: tcp + mode: ingress + networks: + - observability-network + - grafana-frontend-network + + loki: + image: grafana/loki:latest + command: + - -config.file=/etc/loki/local-config.yaml + volumes: + - loki-storage:/loki + - ./loki/loki-config.yaml:/etc/loki/local-config.yaml:ro + deploy: + mode: replicated + replicas: 1 + restart_policy: + condition: any + placement: + constraints: + - node.role == manager + ports: + - target: 3100 + published: 3100 + protocol: tcp + mode: ingress + networks: + - observability-network + - grafana-frontend-network + + alloy: + image: grafana/alloy:latest + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + - ./alloy/config.alloy:/etc/alloy/config.alloy:ro + deploy: + mode: global + 
endpoint_mode: dnsrr + restart_policy: + condition: any + networks: + - observability-network + + cadvisor: + image: gcr.io/cadvisor/cadvisor:latest + volumes: + - /:/rootfs:ro + - /run:/run:ro + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + - /dev/disk/:/dev/disk:ro + deploy: + mode: global + endpoint_mode: dnsrr + restart_policy: + condition: any + networks: + - observability-network + + node_exporter: + image: quay.io/prometheus/node-exporter:latest + hostname: "{{.Node.Hostname}}" + command: + - "--path.rootfs=/host" + volumes: + - "/:/host:ro,rslave" + deploy: + mode: global + endpoint_mode: dnsrr + restart_policy: + condition: any + networks: + - observability-network diff --git a/observability/grafana/Dockerfile b/observability/grafana/Dockerfile new file mode 100644 index 0000000..24e972a --- /dev/null +++ b/observability/grafana/Dockerfile @@ -0,0 +1,11 @@ +FROM grafana/grafana:latest + +# Disable Grafana sign up option +ENV GF_USERS_ALLOW_SIGN_UP=false + +# Copy local Grafana config and provisioning into the image +COPY grafana.ini /etc/grafana/grafana.ini +COPY provisioning/ /etc/grafana/provisioning/ + +# Document the HTTP port; USER, ENTRYPOINT and CMD are inherited unchanged from the base image +EXPOSE 3000 diff --git a/observability/grafana/grafana.ini b/observability/grafana/grafana.ini new file mode 100644 index 0000000..df920a9 --- /dev/null +++ b/observability/grafana/grafana.ini @@ -0,0 +1,41 @@ +; https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#configuration-options +[server] +http_port = 3000 + +; https://grafana.com/docs/grafana/latest/setup-grafana/configure-grafana/#serve_from_sub_path +; Set to true (and make root_url include the sub-path) to serve Grafana from a sub-path +serve_from_sub_path = false +; root_url = http://grafana.localhost:3030/ + +enable_gzip = true + +[analytics] +reporting_enabled = true +check_for_updates = true +check_for_plugin_updates = true +feedback_links_enabled = false + +[security] +disable_gravatar = false + +[users] +allow_sign_up = false 
+allow_org_create = false + +[log] +level = info ; debug, info, warn, error + +[explore] +enabled = true +[help] +enabled = true +[profile] +enabled = true +[news] +news_feed_enabled = false + +[public_dashboards] +enabled = false + + + diff --git a/observability/grafana/provisioning/dashboards/17346_rev9.json b/observability/grafana/provisioning/dashboards/17346_rev9.json new file mode 100644 index 0000000..43c0a3b --- /dev/null +++ b/observability/grafana/provisioning/dashboards/17346_rev9.json @@ -0,0 +1,1570 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.3.1" + }, + { + "type": "panel", + "id": "piechart", + "name": "Pie chart", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Official dashboard for Standalone Traefik", + "editable": false, + "fiscalYearStartMonth": 0, + "gnetId": 17346, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 9, + "panels": [], + "title": "General", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + 
"description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 5, + "x": 0, + "y": 1 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "count(traefik_config_reloads_total)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Traefik Instances", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 5, + "y": 1 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": 
"table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "sum(rate(traefik_entrypoint_requests_total{entrypoint=~\"$entrypoint\"}[$interval])) by (entrypoint)", + "legendFormat": "{{entrypoint}}", + "range": true, + "refId": "A" + } + ], + "title": "Requests per Entrypoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "description": "https://medium.com/@tristan_96324/prometheus-apdex-alerting-d17a065e39d0", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": 
"(sum(rate(traefik_entrypoint_request_duration_seconds_bucket{le=\"0.3\",code=\"200\",entrypoint=~\"$entrypoint\"}[$interval])) by (method) + \n sum(rate(traefik_entrypoint_request_duration_seconds_bucket{le=\"1.2\",code=\"200\",entrypoint=~\"$entrypoint\"}[$interval])) by (method)) / 2 / \n sum(rate(traefik_entrypoint_request_duration_seconds_count{code=\"200\",entrypoint=~\"$entrypoint\"}[$interval])) by (method)\n", + "legendFormat": "{{method}}", + "range": true, + "refId": "A" + } + ], + "title": "Apdex score", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "description": "Mean Distribution", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [], + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 0, + "y": 3 + }, + "id": 14, + "options": { + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "sum(rate(traefik_service_requests_total{service=~\"$service.*\",protocol=\"http\"}[$interval])) by (method, code)", + "legendFormat": "{{method}}[{{code}}]", + "range": true, + "refId": "A" + } + ], + "title": "Http Code ", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + 
"fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n traefik_service_request_duration_seconds_sum{service=~\"$service.*\",protocol=\"http\"} / \n traefik_service_request_duration_seconds_count{service=~\"$service.*\",protocol=\"http\"},\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)\n\n", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Top slow services", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Most requested services", + "type": "timeseries" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 11, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + 
}, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "label_replace(\n 1 - (sum by (service)\n (rate(traefik_service_request_duration_seconds_bucket{le=\"1.2\",service=~\"$service.*\"}[$interval])) / sum by (service) \n (rate(traefik_service_request_duration_seconds_count{service=~\"$service.*\"}[$interval]))\n ) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\"\n)", + "legendFormat": "{{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Services failing SLO of 1200ms", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": 
[] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "label_replace(\n 1 - (sum by (service)\n (rate(traefik_service_request_duration_seconds_bucket{le=\"0.3\",service=~\"$service.*\"}[$interval])) / sum by (service) \n (rate(traefik_service_request_duration_seconds_count{service=~\"$service.*\"}[$interval]))\n ) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\"\n)", + "legendFormat": "{{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Services failing SLO of 300ms", + "type": "timeseries" + } + ], + "title": "SLO", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 16, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + 
"gridPos": { + "h": 12, + "w": 8, + "x": 0, + "y": 19 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",code=~\"2..\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "2xx over $interval", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 8, + "x": 8, + "y": 19 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": 
"Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",code=~\"5..\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "5xx over $interval", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 8, + "x": 16, + "y": 19 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": 
"topk(15,\n label_replace(\n sum by (service,method,code) \n (rate(traefik_service_requests_total{service=~\"$service.*\",code!~\"2..|5..\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}}[{{code}}] on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Other codes over $interval", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method) \n (rate(traefik_service_requests_bytes_total{service=~\"$service.*\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", 
\"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}} on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Requests Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 31 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "topk(15,\n label_replace(\n sum by (service,method) \n (rate(traefik_service_responses_bytes_total{service=~\"$service.*\",protocol=\"http\"}[$interval])) > 0,\n \"service\", \"$1\", \"service\", \"([^-]+-[^-]+).*\")\n)", + "legendFormat": "{{method}} on {{service}}", + "range": true, + "refId": "A" + } + ], + "title": "Responses Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + 
}, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "sum(traefik_open_connections{entrypoint=~\"$entrypoint\"}) by (entrypoint)\n", + "legendFormat": "{{entrypoint}}", + "range": true, + "refId": "A" + } + ], + "title": "Connections per Entrypoint", + "type": "timeseries" + } + ], + "title": "HTTP Details", + "type": "row" + } + ], + "refresh": false, + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "DS_PROMETHEUS", + "label": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + 
{ + "auto": true, + "auto_count": 30, + "auto_min": "1m", + "current": { + "selected": false, + "text": "auto", + "value": "$__auto_interval_interval" + }, + "hide": 0, + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "2h", + "value": "2h" + }, + { + "selected": false, + "text": "4h", + "value": "4h" + }, + { + "selected": false, + "text": "8h", + "value": "8h" + } + ], + "query": "1m,5m,10m,30m,1h,2h,4h,8h", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "definition": "label_values(traefik_open_connections, entrypoint)", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "entrypoint", + "options": [], + "query": { + "query": "label_values(traefik_open_connections, entrypoint)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "definition": "label_values(traefik_service_requests_total, service)", + "hide": 0, + "includeAll": true, + "multi": false, + "name": "service", + "options": [], + "query": { + "query": "label_values(traefik_service_requests_total, service)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Traefik Official Standalone Dashboard", + "uid": 
"n5bu_kv45", + "version": 7, + "weekStart": "" +} \ No newline at end of file diff --git a/observability/grafana/provisioning/dashboards/1860_rev42.json b/observability/grafana/provisioning/dashboards/1860_rev42.json new file mode 100644 index 0000000..fdc3a00 --- /dev/null +++ b/observability/grafana/provisioning/dashboards/1860_rev42.json @@ -0,0 +1,15766 @@ +{ + "__requires": [ + { + "type": "panel", + "id": "bargauge", + "name": "Bar gauge", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.6.1" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [ + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "GitHub", + "type": "link", + "url": "https://github.com/rfmoz/grafana-dashboards" + }, + { + "icon": "external link", + "tags": [], + "targetBlank": true, + "title": "Grafana", + "type": "link", + "url": "https://grafana.com/grafana/dashboards/1860" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 261, + "panels": [], + "title": "Quick CPU / Mem / Disk", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Resource 
pressure via PSI", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green" + }, + { + "color": "dark-yellow", + "value": 70 + }, + { + "color": "dark-red", + "value": 90 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 323, + "options": { + "displayMode": "basic", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "legendFormat": "CPU", + "range": false, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "legendFormat": "Mem", + "range": false, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": true, + "legendFormat": "I/O", + "range": false, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_pressure_irq_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": 
"time_series", + "instant": true, + "legendFormat": "Irq", + "range": false, + "refId": "D", + "step": 240 + } + ], + "title": "Pressure", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Overall CPU busy percentage (averaged across all cores)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 20, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "100 * (1 - avg(rate(node_cpu_seconds_total{mode=\"idle\", instance=\"$node\"}[$__rate_interval])))", + "instant": true, + "legendFormat": "", + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "CPU Busy", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "System load over all CPU cores together", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + 
}, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 155, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "scalar(node_load1{instance=\"$node\",job=\"$job\"}) * 100 / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "format": "time_series", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Sys Load", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Real RAM usage excluding cache and reclaimable memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 16, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "clamp_min((1 - (node_memory_MemAvailable_bytes{instance=\"$node\", job=\"$job\"} 
/ node_memory_MemTotal_bytes{instance=\"$node\", job=\"$job\"})) * 100, 0)", + "format": "time_series", + "instant": true, + "range": false, + "refId": "B", + "step": 240 + } + ], + "title": "RAM Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Percentage of swap space currently used by the system", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 10 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 25 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 21, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "((node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"})) * 100", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "SWAP Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Used Root FS", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 
100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 1 + }, + "id": 154, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "(\n (node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}\n - node_filesystem_avail_bytes{instance=\"$node\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"})\n / node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", mountpoint=\"/\", fstype!=\"rootfs\"}\n) * 100\n", + "format": "time_series", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Root FS Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 1 + }, + "id": 14, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + 
"showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "CPU Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 20, + "y": 1 + }, + "id": 75, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "RAM Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { 
+ "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 22, + "y": 1 + }, + "id": 18, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"}", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "SWAP Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 70 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 3 + }, + "id": 23, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": 
"node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}", + "format": "time_series", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "RootFS Total", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 20, + "y": 3 + }, + "id": 15, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "exemplar": false, + "expr": "node_time_seconds{instance=\"$node\",job=\"$job\"} - node_boot_time_seconds{instance=\"$node\",job=\"$job\"}", + "instant": true, + "range": false, + "refId": "A", + "step": 240 + } + ], + "title": "Uptime", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 263, + "panels": [], + "title": "Basic CPU / Mem / Net / Disk", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "CPU time spent busy vs idle, split by activity type", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": 
"text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "percent" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Busy Iowait" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Idle" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy System" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy User" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Busy Other" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 77, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "width": 250 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + 
"exemplar": false, + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"system\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "instant": false, + "legendFormat": "Busy System", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"user\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Busy User", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"iowait\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Busy Iowait", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=~\".*irq\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Busy IRQs", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq'}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Busy Other", + "range": true, + "refId": "E", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\", mode=\"idle\"}[$__rate_interval])) / 
scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Idle", + "range": true, + "refId": "F", + "step": 240 + } + ], + "title": "CPU Basic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "RAM and swap usage overview, including caches", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Swap used" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache + Buffer" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", 
+ "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 78, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Total", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "legendFormat": "Used", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} + node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Cache + Buffer", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Free", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "legendFormat": "Swap used", + "range": true, + "refId": "E", + "step": 240 + } + ], + "title": "Memory Basic", + 
"type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Per-interface network traffic (receive and transmit) in bits per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Tx.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 74, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "legendFormat": "Rx {{device}}", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "legendFormat": "Tx {{device}} ", + "range": 
true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Basic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Percentage of filesystem space used for each mounted device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 152, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "((node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"} - node_filesystem_avail_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"}) / node_filesystem_size_bytes{instance=\"$node\", job=\"$job\", device!~\"rootfs\"}) * 100", + "format": "time_series", + "legendFormat": "{{mountpoint}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Disk Space Used Basic", + "type": "timeseries" + }, + { + "collapsed": true, + 
"gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 265, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "CPU time usage split by state, normalized across all CPU cores", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 70, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "percent" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Idle - Waiting for something to happen" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Iowait - Waiting for I/O to complete" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Irq - Servicing interrupts" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Nice - Niced processes executing in user mode" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": 
"byName", + "options": "Softirq - Servicing softirqs" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Steal - Time spent in other operating systems when running in a virtualized environment" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#FCE2DE", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "System - Processes executing in kernel mode" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "User - Normal processes executing in user mode" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#5195CE", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Guest CPU usage" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "custom.stacking", + "value": { + "group": "A", + "mode": "none" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 250 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"system\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "interval": "", + "legendFormat": "System - Processes executing in kernel mode", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + 
"expr": "sum(irate(node_cpu_seconds_total{mode=\"user\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "User - Normal processes executing in user mode", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"nice\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Nice - Niced processes executing in user mode", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"iowait\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Iowait - Waiting for I/O to complete", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"irq\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Irq - Servicing interrupts", + "range": true, + "refId": "E", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"softirq\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Softirq - Servicing softirqs", + "range": true, + "refId": "F", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"steal\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / 
scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Steal - Time spent in other operating systems when running in a virtualized environment", + "range": true, + "refId": "G", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum(irate(node_cpu_seconds_total{mode=\"idle\",instance=\"$node\",job=\"$job\"}[$__rate_interval])) / scalar(count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)))", + "format": "time_series", + "legendFormat": "Idle - Waiting for something to happen", + "range": true, + "refId": "H", + "step": 240 + }, + { + "editorMode": "code", + "expr": "sum by(instance) (irate(node_cpu_guest_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])) / on(instance) group_left sum by (instance)((irate(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]))) > 0", + "format": "time_series", + "legendFormat": "Guest CPU usage", + "range": true, + "refId": "I", + "step": 240 + } + ], + "title": "CPU", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Breakdown of physical memory and swap usage. 
Hardware-detected memory errors are also displayed", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Apps" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#629E51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#614D93", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A437C", + "mode": "fixed" + } + } + ] + 
}, + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#584477", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "RAM_Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0F9D7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#806EB7", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E0752D", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap - Swap memory usage" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Free" + }, + "properties": [ + { + "id": 
"color", + "value": { + "fixedColor": "#2F575E", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Unused - Free memory unassigned" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Hardware Corrupted - *./" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"} - node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Apps - Memory used by user-space applications", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "PageTables - Memory used to map between virtual and physical memory addresses", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": 
"node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "SwapCache - Memory that keeps track of pages that have been fetched from swap but not yet been modified", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Slab - Memory used by the kernel to cache data structures for its own use (caches like inode, dentry, etc)", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Cache - Parked file data (file content) cache", + "range": true, + "refId": "E", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Buffers - Block device (e.g. harddisk) cache", + "range": true, + "refId": "F", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Unused - Free memory unassigned", + "range": true, + "refId": "G", + "step": 240 + }, + { + "editorMode": "code", + "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "legendFormat": "Swap - Swap space used", + "range": true, + "refId": "H", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_HardwareCorrupted_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working", + "range": true, + "refId": "I", + "step": 240 + } + ], + "title": "Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${ds_prometheus}" + }, + "description": "Incoming and outgoing network traffic per interface", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 433 + }, + "id": 84, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])*8", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "B", + "step": 240 + } 
+ ], + "title": "Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Network interface utilization as a percentage of its maximum capacity", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 433 + }, + "id": 338, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])\n / ignoring(speed) node_network_speed_bytes{instance=\"$node\",job=\"$job\", speed!=\"-1\"}", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + 
"editorMode": "code", + "expr": "(rate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])\n / ignoring(speed) node_network_speed_bytes{instance=\"$node\",job=\"$job\", speed!=\"-1\"})", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Saturation", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Disk I/O operations per second for each device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "read (-) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "iops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 445 + }, + "id": 229, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": 
"irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\",device=~\"[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+\"}[$__rate_interval])", + "legendFormat": "{{device}} - Read", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\",device=~\"[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+\"}[$__rate_interval])", + "legendFormat": "{{device}} - Write", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Disk IOps", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Disk I/O throughput per device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "read (-) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "Bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read*./" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 445 + }, + "id": 42, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + 
"hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\",device=~\"[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Read", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\",device=~\"[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Write", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Disk Throughput", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Amount of available disk space per mounted filesystem, excluding rootfs. Based on block availability to non-root users", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 457 + }, + "id": 43, + "options": { + "legend": { + "calcs": [ + "min", + 
"mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "legendFormat": "{{mountpoint}}", + "metric": "", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_filesystem_free_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "legendFormat": "{{mountpoint}} - Free", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "legendFormat": "{{mountpoint}} - Size", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Filesystem Space Available", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Disk usage (used = total - available) per mountpoint", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 457 + }, + "id": 156, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} - node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "legendFormat": "{{mountpoint}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Filesystem Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Percentage of time the disk was actively processing I/O operations", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 469 + }, + "id": 127, + "options": { + "legend": { 
+ "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\",device=~\"[a-z]+|nvme[0-9]+n[0-9]+|mmcblk[0-9]+\"} [$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "{{device}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Disk I/O Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "How often tasks experience CPU, memory, or I/O delays. “Some” indicates partial slowdown; “Full” indicates all tasks are stalled. Based on Linux PSI metrics:\nhttps://docs.kernel.org/accounting/psi.html", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "some (-) / full (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Some.*/" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + }, + { + "matcher": { + "id": 
"byRegexp", + "options": "/.*Some.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 469 + }, + "id": 322, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_pressure_cpu_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "CPU - Some", + "range": true, + "refId": "CPU some", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_pressure_memory_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Memory - Some", + "range": true, + "refId": "Memory some", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_pressure_memory_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Memory - Full", + "range": true, + "refId": "Memory full", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_pressure_io_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "I/O - Some", + "range": true, + "refId": "I/O some", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_pressure_io_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "I/O - Full", + "range": true, + "refId": "I/O full", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_pressure_irq_stalled_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "IRQ - Full", + "range": 
true, + "refId": "A", + "step": 240 + } + ], + "title": "Pressure Stall Information", + "type": "timeseries" + } + ], + "title": "CPU / Memory / Net / Disk", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 266, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Displays committed memory usage versus the system's commit limit. Exceeding the limit is allowed under Linux overcommit policies but may increase OOM risks under high load", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*CommitLimit - *./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 732 + }, + "id": 135, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + 
"sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_Committed_AS_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Committed_AS – Memory promised to processes (not necessarily used)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_CommitLimit_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "CommitLimit - Max allowable committed memory", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory Committed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Memory currently dirty (modified but not yet written to disk), being actively written back, or held by writeback buffers. High dirty or writeback memory may indicate disk I/O pressure or delayed flushing", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 732 + }, + "id": 130, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + 
"displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_Writeback_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Writeback – Memory currently being flushed to disk", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_WritebackTmp_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "WritebackTmp – FUSE temporary writeback buffers", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Dirty_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Dirty – Memory marked dirty (pending write to disk)", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_NFS_Unstable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "NFS Unstable – Pages sent to NFS server, awaiting storage commit", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Memory Writeback and Dirty", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Kernel slab memory usage, separated into reclaimable and non-reclaimable categories. 
Reclaimable memory can be freed under memory pressure (e.g., caches), while unreclaimable memory is locked by the kernel for core functions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 932 + }, + "id": 131, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_SUnreclaim_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "SUnreclaim – Non-reclaimable slab memory (kernel objects)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "SReclaimable – Potentially reclaimable slab memory (e.g., inode cache)", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory Slab", + "type": "timeseries" + }, + { + "datasource": { 
+ "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Memory used for mapped files (such as libraries) and shared memory (shmem and tmpfs), including variants backed by huge pages", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 932 + }, + "id": 138, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_Mapped_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Mapped – Memory mapped from files (e.g., libraries, mmap)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Shmem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Shmem – Shared memory used by processes and tmpfs", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + 
"expr": "node_memory_ShmemHugePages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "ShmemHugePages – Shared memory (shmem/tmpfs) allocated with HugePages", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_ShmemPmdMapped_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PMD Mapped – Shmem/tmpfs backed by Transparent HugePages (PMD)", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Memory Shared and Mapped", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Proportion of memory pages in the kernel's active and inactive LRU lists relative to total RAM. Active pages have been recently used, while inactive pages are less recently accessed but still resident in memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Active.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + 
] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Inactive.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 942 + }, + "id": 136, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "(node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}) \n/ \n(node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "legendFormat": "Inactive – Less recently used memory, more likely to be reclaimed", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "(node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}) \n/ \n(node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"})\n", + "format": "time_series", + "legendFormat": "Active – Recently used memory, retained unless under pressure", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory LRU Active / Inactive (%)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Breakdown of memory pages in the kernel's active and inactive LRU lists, separated by anonymous (heap, tmpfs) and file-backed (caches, mmap) pages.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 942 + }, + "id": 191, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_Inactive_file_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Inactive_file - File-backed memory on inactive LRU list", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Inactive_anon_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Inactive_anon – Anonymous memory on inactive LRU (incl. tmpfs & swap cache)", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Active_file_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Active_file - File-backed memory on active LRU list", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Active_anon_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Active_anon – Anonymous memory on active LRU (incl. 
tmpfs & swap cache)", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Memory LRU Active / Inactive Detail", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks kernel memory used for CPU-local structures, per-thread stacks, and bounce buffers used for I/O on DMA-limited devices. These areas are typically small but critical for low-level operations", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 952 + }, + "id": 160, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_KernelStack_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "KernelStack – Kernel stack memory (per-thread, non-reclaimable)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": 
"code", + "expr": "node_memory_Percpu_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PerCPU – Dynamically allocated per-CPU memory (used by kernel modules)", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Bounce_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Bounce Memory – I/O buffer for DMA-limited devices", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Memory Kernel / CPU / IO", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Usage of the kernel's vmalloc area, which provides virtual memory allocations for kernel modules and drivers. Includes total, used, and largest free block sizes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Total.*/" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": 
"color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 952 + }, + "id": 70, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_VmallocChunk_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Vmalloc Free Chunk – Largest available block in vmalloc area", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_VmallocTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Vmalloc Total – Total size of the vmalloc memory area", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_VmallocUsed_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Vmalloc Used – Portion of vmalloc area currently in use", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Memory Vmalloc", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Memory used by anonymous pages (not backed by files), including standard and huge page allocations. 
Includes heap, stack, and memory-mapped anonymous regions", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 962 + }, + "id": 129, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_AnonHugePages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "AnonHugePages – Anonymous memory using HugePages", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_AnonPages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "AnonPages – Anonymous memory (non-file-backed)", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory Anonymous", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Memory that is locked in RAM and 
cannot be swapped out. Includes both kernel-unevictable memory and user-level memory locked with mlock()", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#CFFAFF", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 962 + }, + "id": 137, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_Unevictable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Unevictable – Kernel-pinned memory (not swappable)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_Mlocked_bytes{instance=\"$node\",job=\"$job\"}", + "format": 
"time_series", + "legendFormat": "Mlocked – Application-locked memory via mlock()", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory Unevictable and MLocked", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "How much memory is directly mapped in the kernel using different page sizes (4K, 2M, 1G). Helps monitor large page utilization in the direct map region", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#99440A", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Buffers" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#58140C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6D1F62", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Cached" + }, + "properties": 
[ + { + "id": "color", + "value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Committed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#508642", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Dirty" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#6ED0E0", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Free" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#B7DBAB", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Mapped" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "PageTables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Page_Tables" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#0A50A1", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Slab_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EAB839", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Swap_Cache" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C15C17", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total" + }, + "properties": [ + { + "id": "color", + 
"value": { + "fixedColor": "#511749", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Total RAM + Swap" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#052B51", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "VmallocUsed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EA6460", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 972 + }, + "id": 128, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_DirectMap1G_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "DirectMap 1G – Memory mapped with 1GB pages", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_DirectMap2M_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "DirectMap 2M – Memory mapped with 2MB pages", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_DirectMap4k_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "DirectMap 4K – Memory mapped with 4KB pages", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Memory DirectMap", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Displays HugePages memory usage in bytes, including allocated, free, 
reserved, and surplus memory. All values are calculated based on the number of huge pages multiplied by their configured size", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 972 + }, + "id": 140, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_memory_HugePages_Free{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "HugePages Free – Not yet allocated", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_HugePages_Rsvd{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "HugePages Reserved – Promised but unused", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode":
"code", + "expr": "node_memory_HugePages_Surp{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "HugePages Surplus – Dynamic pool extension", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_memory_HugePages_Total{instance=\"$node\",job=\"$job\"} * node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "HugePages Total – Reserved memory", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Memory HugePages", + "type": "timeseries" + } + ], + "title": "Memory Meminfo", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 267, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of memory pages being read from or written to disk (page-in and page-out operations). 
High page-out may indicate memory pressure or swapping activity", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 733 + }, + "id": 176, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_vmstat_pgpgin{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Pagesin - Page in ops", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_vmstat_pgpgout{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Pagesout - Page out ops", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory Pages In / Out", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate at which memory pages are being swapped in from or out to disk. High swap-out activity may indicate memory pressure", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 733 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_vmstat_pswpin{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Pswpin - Pages swapped in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_vmstat_pswpout{instance=\"$node\",job=\"$job\"}[$__rate_interval])", 
+ "format": "time_series", + "legendFormat": "Pswpout - Pages swapped out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Memory Pages Swap In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of memory page faults, split into total, major (disk-backed), and derived minor (non-disk) faults. High major fault rates may indicate memory pressure or insufficient RAM", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Pgfault - Page major and minor fault ops" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "none" + } + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 913 + }, + "id": 175, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + 
"placement": "bottom", + "showLegend": true, + "width": 350 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Pgfault - Page major and minor fault ops", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Pgmajfault - Major page fault ops", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[$__rate_interval]) - irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Pgminfault - Minor page fault ops", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Memory Page Faults", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of Out-of-Memory (OOM) kill events. 
A non-zero value indicates the kernel has terminated one or more processes due to memory exhaustion", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "OOM Kills" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 913 + }, + "id": 307, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "OOM Kills", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "OOM Killer", + "type": "timeseries" + } + ], + "title": "Memory Vmstat", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 293, + "panels": [ + { 
+ "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks the system clock's estimated and maximum error, as well as its offset from the reference clock (e.g., via NTP). Useful for detecting synchronization drift", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 734 + }, + "id": 260, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_timex_estimated_error_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Estimated error", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_timex_offset_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Offset local vs reference", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + 
"expr": "node_timex_maxerror_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Maximum error", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Time Synchronized Drift", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "NTP phase-locked loop (PLL) time constant used by the kernel to control time adjustments. Lower values mean faster correction but less stability", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 734 + }, + "id": 291, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_timex_loop_time_constant{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PLL Time Constant", + "range": true, + "refId": "A", + "step": 240 + } + ], + 
"title": "Time PLL Adjust", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Shows whether the system clock is synchronized to a reliable time source, and the current frequency correction ratio applied by the kernel to maintain synchronization", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 884 + }, + "id": 168, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_timex_sync_status{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Sync status (1 = ok)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_timex_frequency_adjustment_ratio{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Frequency Adjustment", + "range": 
true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_timex_tick_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "Tick Interval", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_timex_tai_offset_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "TAI Offset", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Time Synchronized Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Displays the PPS signal's frequency offset and stability (jitter) in hertz. Useful for monitoring high-precision time sources like GPS or atomic clocks", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "rothz" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 884 + }, + "id": 333, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + 
"mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_timex_pps_frequency_hertz{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Frequency Offset", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_timex_pps_stability_hertz{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Frequency Stability", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "PPS Frequency / Stability", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks PPS signal timing jitter and shift compared to system clock", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 894 + }, + "id": 334, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_timex_pps_jitter_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Jitter", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_timex_pps_shift_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Shift", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "PPS Time Accuracy", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of PPS synchronization diagnostics including calibration events, jitter violations, errors, and frequency stability exceedances", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 894 + }, + "id": 335, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", 
+ "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_timex_pps_calibration_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Calibrations/sec", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_timex_pps_error_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Errors/sec", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_timex_pps_stability_exceeded_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Stability Exceeded/sec", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_timex_pps_jitter_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "PPS Jitter Events/sec", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "PPS Sync Events", + "type": "timeseries" + } + ], + "title": "System Timesync", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 312, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Processes currently in runnable or blocked states. 
Helps identify CPU contention or I/O wait bottlenecks.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 735 + }, + "id": 62, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_procs_blocked{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Blocked (I/O Wait)", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_procs_running{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Runnable (Ready for CPU)", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Processes Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Current number of processes in each state (e.g., running, sleeping, 
zombie). Requires --collector.processes to be enabled in node_exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "D" + }, + "properties": [ + { + "id": "displayName", + "value": "Uninterruptible Sleeping" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "I" + }, + "properties": [ + { + "id": "displayName", + "value": "Idle Kernel Thread" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "R" + }, + "properties": [ + { + "id": "displayName", + "value": "Running" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "S" + }, + "properties": [ + { + "id": "displayName", + "value": "Interruptible Sleeping" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "T" + }, + "properties": [ + { + "id": "displayName", + "value": "Stopped" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "X" + }, + "properties": [ + { + "id": "displayName", + "value": "Dead" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Z" + }, + "properties": [ + { + "id": "displayName", + "value": "Zombie" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + 
"w": 12, + "x": 12, + "y": 735 + }, + "id": 315, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_processes_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ state }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Processes Detailed States", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of new processes being created on the system (forks/sec).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 765 + }, + "id": 148, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": 
"11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_forks_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Process Forks per second", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Processes Forks", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Shows CPU saturation per core, calculated as the proportion of time spent waiting to run relative to total time demanded (running + waiting).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*waiting.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 765 + }, + "id": 305, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + 
"targets": [ + { + "editorMode": "code", + "expr": "irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "CPU {{ cpu }} - Running", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "CPU {{cpu}} - Waiting Queue", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])\n/\n(irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) + irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]))\n", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{cpu}}", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "CPU Saturation per Core", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of active PIDs on the system and the configured maximum allowed. Useful for detecting PID exhaustion risk. 
Requires --collector.processes in node_exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "PIDs limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 775 + }, + "id": 313, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_processes_pids{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Number of PIDs", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_processes_max_processes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", 
+ "legendFormat": "PIDs limit", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "PIDs Number and Limit", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of active threads on the system and the configured thread limit. Useful for monitoring thread pressure. Requires --collector.processes in node_exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Threads limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 775 + }, + "id": 314, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + 
"editorMode": "code", + "expr": "node_processes_threads{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Allocated threads", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_processes_max_threads{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Threads limit", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Threads Number and Limit", + "type": "timeseries" + } + ], + "title": "System Processes", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 269, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Per-second rate of context switches and hardware interrupts. High values may indicate intense CPU or I/O activity", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 816 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + 
"showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_context_switches_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Context switches", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_intr_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "Interrupts", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Context Switches / Interrupts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "System load average over 1, 5, and 15 minutes. Reflects the number of active or waiting processes. Values above CPU core count may indicate overload", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "CPU Core Count" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + 
"value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 816 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_load1{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Load 1m", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_load5{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Load 5m", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_load15{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Load 15m", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "format": "time_series", + "legendFormat": "CPU Core Count", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "System Load", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Real-time CPU frequency scaling per core, including average minimum and maximum allowed scaling frequencies", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + 
"viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "hertz" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Max" + }, + "properties": [ + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + }, + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": false, + "viz": false + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Min" + }, + "properties": [ + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + }, + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": false, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 826 + }, + "id": 321, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_cpu_scaling_frequency_hertz{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{ cpu }}", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "avg(node_cpu_scaling_frequency_max_hertz{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "interval": "", + 
"legendFormat": "Max", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "avg(node_cpu_scaling_frequency_min_hertz{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "interval": "", + "legendFormat": "Min", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "CPU Frequency Scaling", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of scheduling timeslices executed per CPU. Reflects how frequently the scheduler switches tasks on each core", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 826 + }, + "id": 306, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_schedstat_timeslices_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + 
"legendFormat": "CPU {{ cpu }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "CPU Schedule Timeslices", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Breaks down hardware interrupts by type and device. Useful for diagnosing IRQ load on network, disk, or CPU interfaces. Requires --collector.interrupts to be enabled in node_exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 836 + }, + "id": 259, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_interrupts_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "{{ type }} - {{ info }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "IRQ Detail", + "type": "timeseries" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of bits of entropy currently available to the system's random number generators (e.g., /dev/random). Low values may indicate that random number generation could block or degrade performance of cryptographic operations", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "decbits" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Entropy pool max" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 836 + }, + "id": 151, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": 
"node_entropy_available_bits{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Entropy available", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_entropy_pool_size_bits{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Entropy pool max", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Entropy", + "type": "timeseries" + } + ], + "title": "System Misc", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 304, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Monitors hardware sensor temperatures and critical thresholds as exposed by Linux hwmon. Includes CPU, GPU, and motherboard sensors where available", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "celsius" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Critical*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#E24D42", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + 
"gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 737 + }, + "id": 158, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_hwmon_temp_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }}", + "range": true, + "refId": "A", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_alarm_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }} Critical Alarm", + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_hwmon_temp_crit_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }} Critical", + "range": true, + "refId": "C", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_hyst_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }} Critical Historical", + "refId": "D", + "step": 240 + }, + { + "expr": "node_hwmon_temp_max_celsius{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }} Max", + "refId": "E", 
+ "step": 240 + } + ], + "title": "Hardware Temperature Monitor", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Shows how hard each cooling device (fan/throttle) is working relative to its maximum capacity", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#EF843C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 737 + }, + "id": 300, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "100 * node_cooling_device_cur_state{instance=\"$node\",job=\"$job\"} / node_cooling_device_max_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ 
name }} - {{ type }} ", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Cooling Device Utilization", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Shows the online status of power supplies (e.g., AC, battery). A value of 1-Yes indicates the power supply is active/online", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 747 + }, + "id": 302, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_power_supply_online{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ power_supply }} online", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Power Supply", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "${ds_prometheus}" + }, + "description": "Displays the current fan speeds (RPM) from hardware sensors via the hwmon interface", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "rotrpm" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 747 + }, + "id": 325, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_hwmon_fan_rpm{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }}", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_hwmon_fan_min_rpm{instance=\"$node\",job=\"$job\"} * on(chip) group_left(chip_name) node_hwmon_chip_names{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "legendFormat": "{{ chip_name }} {{ sensor }} 
rpm min", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Hardware Fan Speed", + "type": "timeseries" + } + ], + "title": "Hardware Misc", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 296, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Current number of systemd units in each operational state, such as active, failed, inactive, or transitioning", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Active" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#73BF69", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Activating" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C8F2C2", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Deactivating" + }, + "properties": [ + 
{ + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Inactive" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 4228 + }, + "id": 298, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"activating\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Activating", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"active\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Active", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"deactivating\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Deactivating", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"failed\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Failed", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"inactive\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Inactive", + "range": true, + "refId": "E", + "step": 240 + } + ], + "title": "Systemd Units State", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": 
"Current number of active connections per systemd socket, as reported by the Node Exporter systemd collector", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 4228 + }, + "id": 331, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_systemd_socket_current_connections{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{ name }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Systemd Sockets Current", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of accepted connections per second for each systemd socket", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": 
"text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "eps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 4238 + }, + "id": 297, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_systemd_socket_accepted_connections_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "{{ name }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Systemd Sockets Accepted", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of systemd socket connection refusals per second, typically due to service unavailability or backlog overflow", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": 
false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "eps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 4238 + }, + "id": 332, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_systemd_socket_refused_connections_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "{{ name }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Systemd Sockets Refused", + "type": "timeseries" + } + ], + "title": "Systemd", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 270, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of I/O operations completed per second for the device (after merges), including both reads and writes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "read (–) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + 
"viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "iops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 29 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "legendFormat": "{{device}} - Read", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "legendFormat": "{{device}} - Write", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Disk Read/Write IOps", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of bytes read from or written to the device per second", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "read (–) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "Bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 29 + }, + "id": 33, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Read", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "exemplar": false, + "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "legendFormat": "{{device}} - Write", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Disk Read/Write 
Data", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Average time for requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "read (–) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 389 + }, + "id": 37, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_read_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / 
irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Read", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_write_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval]) / irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Write", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Disk Average Wait Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Average queue length of the requests that were issued to the device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/sda_*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#7EB26D", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 389 + }, + "id": 35, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + 
"displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_io_time_weighted_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Average Queue Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of read and write requests merged per second that were queued to the device", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "read (–) / write (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "iops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Read.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 399 + }, + "id": 133, 
+ "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_reads_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "legendFormat": "{{device}} - Read", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_writes_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "legendFormat": "{{device}} - Write", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Disk R/W Merged", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Percentage of time the disk spent actively processing I/O operations, including general I/O, discards (TRIM), and write cache flushes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": 
"color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 399 + }, + "id": 36, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - General IO", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_discard_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Discard/TRIM", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_flush_requests_time_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Flush (write cache)", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Time Spent Doing I/Os", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Per-second rate of discard (TRIM) and flush (write cache) operations. 
Useful for monitoring low-level disk activity on SSDs and advanced storage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 409 + }, + "id": 301, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_disk_discards_completed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Discards completed", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_disk_discards_merged_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Discards merged", + "range": true, + "refId": "B", + "step": 240 + }, + { + 
"editorMode": "code", + "expr": "irate(node_disk_flush_requests_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} - Flush", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Disk Ops Discards / Flush", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Shows how many disk sectors are discarded (TRIMed) per second. Useful for monitoring SSD behavior and storage efficiency", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 409 + }, + "id": 326, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": 
"irate(node_disk_discarded_sectors_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Disk Sectors Discarded Successfully", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of in-progress I/O requests at the time of sampling (active requests in the disk queue)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/sda.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 419 + }, + "id": 34, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_disk_io_now{instance=\"$node\",job=\"$job\"}", + "interval": 
"", + "legendFormat": "{{device}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Instantaneous Queue Size", + "type": "timeseries" + } + ], + "title": "Storage Disk", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 271, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of file descriptors currently allocated system-wide versus the system limit. Important for detecting descriptor exhaustion risks", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max.*/" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 28, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": 
true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_filefd_maximum{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Max open files", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_filefd_allocated{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "Open files", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "File Descriptor", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of free file nodes (inodes) available per mounted filesystem. A low count may prevent file creation even if disk space is available", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 41, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": 
false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_filesystem_files_free{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "legendFormat": "{{mountpoint}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "File Nodes Free", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Indicates filesystems mounted in read-only mode or reporting device-level I/O errors.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 370 + }, + "id": 44, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_filesystem_readonly{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "legendFormat": 
"{{mountpoint}} - ReadOnly", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_filesystem_device_error{instance=\"$node\",job=\"$job\",device!~'rootfs',fstype!~'tmpfs'}", + "format": "time_series", + "interval": "", + "legendFormat": "{{mountpoint}} - Device error", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Filesystem in ReadOnly / Error", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of file nodes (inodes) available per mounted filesystem. Reflects maximum file capacity regardless of disk size", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 370 + }, + "id": 219, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": 
"node_filesystem_files{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "legendFormat": "{{mountpoint}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "File Nodes Size", + "type": "timeseries" + } + ], + "title": "Storage Filesystem", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 272, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of network packets received and transmitted per second, by interface.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 60, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + 
"targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_packets_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic by Packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of packet-level errors for each network interface. Receive errors may indicate physical or driver issues; transmit errors may reflect collisions or hardware faults", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 31 
+ }, + "id": 142, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_errs_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of dropped packets per network interface. 
Receive drops can indicate buffer overflow or driver issues; transmit drops may result from outbound congestion or queuing limits", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 251 + }, + "id": 143, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_drop_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + 
"range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Drop", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of compressed network packets received and transmitted per interface. These are common in low-bandwidth or special interfaces like PPP or SLIP", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 251 + }, + "id": 141, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, 
+ "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_compressed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Compressed", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of incoming multicast packets received per network interface. Multicast is used by protocols such as mDNS, SSDP, and some streaming or cluster services", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 261 + }, + "id": 146, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_multicast_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Multicast", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of received packets that could not be processed due to missing protocol or handler in the kernel. May indicate unsupported traffic or misconfiguration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 261 + }, + "id": 327, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_nohandler_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic NoHandler", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of frame errors on received packets, typically caused by physical layer issues such as bad cables, duplex mismatches, or hardware problems", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 271 + }, + "id": 145, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_frame_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Frame", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks FIFO buffer overrun errors on network interfaces. These occur when incoming or outgoing packets are dropped due to queue or buffer overflows, often indicating congestion or hardware limits", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 271 + }, + "id": 144, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": 
"multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_receive_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "rate(node_network_transmit_fifo_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Network Traffic Fifo", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of packet collisions detected during transmission. Mostly relevant on half-duplex or legacy Ethernet networks", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 281 + }, + "id": 232, + "options": { + "legend": { + 
"calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_transmit_colls_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Collision", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of carrier errors during transmission. These typically indicate physical layer issues like faulty cabling or duplex mismatches", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 281 + }, + "id": 231, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": 
"none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "rate(node_network_transmit_carrier_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "{{device}} - Tx out", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Network Traffic Carrier Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of ARP entries per interface. Useful for detecting excessive ARP traffic or table growth due to scanning or misconfiguration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 291 + }, + "id": 230, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_arp_entries{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": 
"{{ device }} ARP Table", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "ARP Entries", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Current and maximum connection tracking entries used by Netfilter (nf_conntrack). High usage approaching the limit may cause packet drops or connection issues", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "NF conntrack limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 291 + }, + "id": 61, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": 
"code", + "expr": "node_nf_conntrack_entries{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "NF conntrack entries", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_nf_conntrack_entries_limit{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "NF conntrack limit", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "NF Conntrack", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Operational and physical link status of each network interface. Values are Yes for 'up' or link present, and No for 'down' or no carrier.\"", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bool_yes_no" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 301 + }, + "id": 309, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": 
"code", + "expr": "node_network_up{operstate=\"up\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "legendFormat": "{{interface}} - Operational state UP", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_network_carrier{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "instant": false, + "legendFormat": "{{device}} - Physical link", + "refId": "B" + } + ], + "title": "Network Operational Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Maximum speed of each network interface as reported by the operating system. This is a static hardware capability, not current throughput", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "fieldMinMax": false, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 6, + "x": 12, + "y": 301 + }, + "id": 280, + "options": { + "displayMode": "basic", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 30, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "manual", + "valueMode": "color" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_network_speed_bytes{instance=\"$node\",job=\"$job\"} * 8", + "format": "time_series", + "legendFormat": "{{ device }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Speed", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "MTU (Maximum 
Transmission Unit) in bytes for each network interface. Affects packet size and transmission efficiency", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 6, + "x": 18, + "y": 301 + }, + "id": 288, + "options": { + "displayMode": "basic", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 30, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "manual", + "valueMode": "color" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_network_mtu_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "legendFormat": "{{ device }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "MTU", + "type": "bargauge" + } + ], + "title": "Network Traffic", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 273, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks TCP socket usage and memory per node", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + 
"pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 63, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_TCP_alloc{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Allocated Sockets", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_TCP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "In-Use Sockets", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_TCP_orphan{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Orphaned Sockets", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_TCP_tw{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "TIME_WAIT Sockets", + "range": true, + "refId": "D", + "step": 240 + } + ], + "title": "Sockstat TCP", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of UDP and UDPLite sockets currently in use", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 124, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_UDPLITE_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "UDPLite - In-Use Sockets", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_UDP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "UDP - In-Use Sockets", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Sockstat UDP", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Total number of sockets currently in use across all protocols (TCP, UDP, UNIX, etc.), as reported by /proc/net/sockstat", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, 
+ "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 126, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_sockets_used{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Total sockets", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Sockstat Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of FRAG and RAW sockets currently in use. 
RAW sockets are used for custom protocols or tools like ping; FRAG sockets are used internally for IP packet defragmentation", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 125, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_FRAG_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "FRAG - In-Use Sockets", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_RAW_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "RAW - In-Use Sockets", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "Sockstat FRAG / RAW", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + 
}, + "description": "Kernel memory used by TCP, UDP, and IP fragmentation buffers", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 52 + }, + "id": 220, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_TCP_mem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "TCP", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_UDP_mem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "UDP", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_FRAG_memory{instance=\"$node\",job=\"$job\"}", + "interval": "", + "legendFormat": "Fragmentation", + "range": true, + "refId": "C" + } + ], + "title": 
"Sockstat Memory Size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Average memory used per socket (TCP/UDP). Helps tune net.ipv4.tcp_rmem / tcp_wmem", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 52 + }, + "id": 339, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_TCP_mem_bytes{instance=\"$node\",job=\"$job\"} / node_sockstat_TCP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "TCP", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_UDP_mem_bytes{instance=\"$node\",job=\"$job\"} / node_sockstat_UDP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": 
"UDP", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Sockstat Average Socket Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "TCP/UDP socket memory usage in kernel (in pages)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 62 + }, + "id": 336, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_sockstat_TCP_mem{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "TCP", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_sockstat_UDP_mem{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "UDP", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "TCP/UDP 
Kernel Buffer Memory Pages", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Packets processed and dropped by the softnet network stack per CPU. Drops may indicate CPU saturation or network driver limitations", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "drop (-) / process (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Dropped.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 62 + }, + "id": 290, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_softnet_processed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{cpu}} - Processed", + "range": true, + "refId": "A", + "step": 240 + }, + { + 
"editorMode": "code", + "expr": "irate(node_softnet_dropped_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{cpu}} - Dropped", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Softnet Packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "How often the kernel was unable to process all packets in the softnet queue before time ran out. Frequent squeezes may indicate CPU contention or driver inefficiency", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "eps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 72 + }, + "id": 310, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": 
"irate(node_softnet_times_squeezed_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{cpu}} - Times Squeezed", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Softnet Out of Quota", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks the number of packets processed or dropped by Receive Packet Steering (RPS), a mechanism to distribute packet processing across CPUs", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Dropped.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 72 + }, + "id": 330, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + 
"pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_softnet_received_rps_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{cpu}} - Processed", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_softnet_flow_limit_count_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "CPU {{cpu}} - Dropped", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Softnet RPS", + "type": "timeseries" + } + ], + "title": "Network Sockstat", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 274, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of octets sent and received at the IP layer, as reported by /proc/net/netstat", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "Bps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", 
+ "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 163 + }, + "id": 221, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "width": 300 + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_IpExt_InOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "IP Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_IpExt_OutOctets{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "legendFormat": "IP Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Netstat IP In / Out Octets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of TCP segments sent and received per second, including data and control segments", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + 
}, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": "/.*Snd.*/" + }, + "properties": [] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 163 + }, + "id": 299, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_Tcp_InSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "TCP Rx in", + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Tcp_OutSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "TCP Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "TCP In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of UDP datagrams sent and received per second, based on /proc/net/netstat", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + 
"type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 193 + }, + "id": 55, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_Udp_InDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "UDP Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Udp_OutDatagrams{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "UDP Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "UDP In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of ICMP messages sent and received per second, including error and control messages", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": 
false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 193 + }, + "id": 115, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_Icmp_InMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "ICMP Rx in", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Icmp_OutMsgs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "ICMP Tx out", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "ICMP In / Out", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks various TCP error and congestion-related events, including retransmissions, timeouts, dropped connections, and buffer issues", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": 
false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 203 + }, + "id": 104, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_ListenOverflows{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "Listen Overflows", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_ListenDrops{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "Listen Drops", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_TCPSynRetrans{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "SYN Retransmits", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": 
"irate(node_netstat_Tcp_RetransSegs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "Segment Retransmits", + "range": true, + "refId": "D" + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Tcp_InErrs{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "Receive Errors", + "range": true, + "refId": "E" + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Tcp_OutRsts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "RST Sent", + "range": true, + "refId": "F" + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_TCPRcvQDrop{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "Receive Queue Drops", + "range": true, + "refId": "G" + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_TCPOFOQueue{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "Out-of-order Queued", + "range": true, + "refId": "H" + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_TCPTimeouts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "TCP Timeouts", + "range": true, + "refId": "I" + } + ], + "title": "TCP Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of UDP and UDPLite datagram delivery errors, including missing listeners, buffer overflows, and protocol-specific issues", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 
false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 203 + }, + "id": 109, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_Udp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "UDP Rx in Errors", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Udp_NoPorts{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "UDP No Listener", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_UdpLite_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "interval": "", + "legendFormat": "UDPLite Rx in Errors", + "range": true, + "refId": "C" + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Udp_RcvbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "UDP Rx in Buffer Errors", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Udp_SndbufErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": 
"time_series", + "interval": "", + "legendFormat": "UDP Tx out Buffer Errors", + "range": true, + "refId": "E", + "step": 240 + } + ], + "title": "UDP Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of incoming ICMP messages that contained protocol-specific errors, such as bad checksums or invalid lengths", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "out (-) / in (+)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "pps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*out.*/" + }, + "properties": [ + { + "id": "custom.transform", + "value": "negative-Y" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 213 + }, + "id": 50, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_Icmp_InErrors{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "ICMP Rx 
In", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "ICMP Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of TCP SYN cookies sent, validated, and failed. These are used to protect against SYN flood attacks and manage TCP handshake resources under load", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "eps" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Failed.*/" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 213 + }, + "id": 91, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_SyncookiesFailed{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "SYN Cookies Failed", 
+ "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_SyncookiesRecv{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "SYN Cookies Validated", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_TcpExt_SyncookiesSent{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "SYN Cookies Sent", + "range": true, + "refId": "C", + "step": 240 + } + ], + "title": "TCP SynCookie", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of currently established TCP connections and the system's max supported limit. On Linux, MaxConn may return -1 to indicate a dynamic/unlimited configuration", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + }, + { + "id": 
"custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 223 + }, + "id": 85, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_netstat_Tcp_CurrEstab{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Current Connections", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_netstat_Tcp_MaxConn{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Max Connections", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "TCP Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of UDP packets currently queued in the receive (RX) and transmit (TX) buffers. 
A growing queue may indicate a bottleneck", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 223 + }, + "id": 337, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_udp_queues{instance=\"$node\",job=\"$job\",ip=\"v4\",queue=\"rx\"}", + "format": "time_series", + "interval": "", + "legendFormat": "UDP Rx in Queue", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_udp_queues{instance=\"$node\",job=\"$job\",ip=\"v4\",queue=\"tx\"}", + "format": "time_series", + "interval": "", + "legendFormat": "UDP Tx out Queue", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "UDP Queue", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of TCP connection initiations per second. 
'Active' opens are initiated by this host. 'Passive' opens are accepted from incoming connections", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "eps" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 233 + }, + "id": 82, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(node_netstat_Tcp_ActiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "Active Opens", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "irate(node_netstat_Tcp_PassiveOpens{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "Passive Opens", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "TCP Direct Transition", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + 
}, + "description": "Number of TCP sockets in key connection states. Requires the --collector.tcpstat flag on node_exporter", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 233 + }, + "id": 320, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_tcp_connection_states{state=\"established\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Established", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_tcp_connection_states{state=\"fin_wait2\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "FIN_WAIT2", + "range": true, + "refId": "B", + "step": 240 + }, + { + "editorMode": "code", + "expr": 
"node_tcp_connection_states{state=\"listen\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "Listen", + "range": true, + "refId": "C", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_tcp_connection_states{state=\"time_wait\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "TIME_WAIT", + "range": true, + "refId": "D", + "step": 240 + }, + { + "editorMode": "code", + "expr": "node_tcp_connection_states{state=\"close_wait\", instance=\"$node\", job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "CLOSE_WAIT", + "range": true, + "refId": "E", + "step": 240 + } + ], + "title": "TCP Stat", + "type": "timeseries" + } + ], + "title": "Network Netstat", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 279, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Duration of each individual collector executed during a Node Exporter scrape. 
Useful for identifying slow or failing collectors", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 164 + }, + "id": 40, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_scrape_collector_duration_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{collector}}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Node Exporter Scrape Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Rate of CPU time used by the process exposing this metric (user + system mode)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 164 + }, + "id": 308, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "irate(process_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "legendFormat": "Process CPU Usage", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Exporter Process CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Tracks the memory usage of the process exposing this metric (e.g., node_exporter), including current virtual memory and maximum virtual memory limit", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + 
"legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Virtual Memory Limit" + }, + "properties": [ + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "color", + "value": { + "fixedColor": "dark-red", + "mode": "fixed" + } + } + ] + }, + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "Virtual Memory" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 10, + "x": 0, + "y": 174 + }, + "id": 149, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}", + "interval": "", + "legendFormat": "Virtual Memory", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "process_virtual_memory_max_bytes{instance=\"$node\",job=\"$job\"}", + "interval": "", + "legendFormat": "Virtual Memory Limit", + "range": true, + "refId": "B", + "step": 240 + } + ], + 
"title": "Exporter Processes Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Number of file descriptors used by the exporter process versus its configured limit", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/.*Max*./" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#890F02", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + } + ] + }, + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "Open file descriptors" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 10, + "x": 10, + "y": 174 + }, + "id": 64, + "options": { + "legend": { + "calcs": [ + "min", + "mean", + "max" + ], + "displayMode": "table", + 
"placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "process_max_fds{instance=\"$node\",job=\"$job\"}", + "interval": "", + "legendFormat": "Maximum open file descriptors", + "range": true, + "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "process_open_fds{instance=\"$node\",job=\"$job\"}", + "interval": "", + "legendFormat": "Open file descriptors", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Exporter File Descriptor Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "description": "Shows whether each Node Exporter collector scraped successfully (1 = success, 0 = failure), and whether the textfile collector returned an error.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "dark-red", + "value": 0 + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "bool" + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 4, + "x": 20, + "y": 174 + }, + "id": 157, + "options": { + "displayMode": "basic", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.6.1", + "targets": [ + { + "editorMode": "code", + "expr": "node_scrape_collector_success{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "{{collector}}", + "range": true, 
+ "refId": "A", + "step": 240 + }, + { + "editorMode": "code", + "expr": "1 - node_textfile_scrape_error{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "legendFormat": "textfile", + "range": true, + "refId": "B", + "step": 240 + } + ], + "title": "Node Exporter Scrape", + "type": "bargauge" + } + ], + "title": "Node Exporter", + "type": "row" + } + ], + "refresh": "1m", + "schemaVersion": 41, + "tags": [ + "linux" + ], + "templating": { + "list": [ + { + "current": {}, + "includeAll": false, + "label": "Datasource", + "name": "ds_prometheus", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "definition": "", + "includeAll": false, + "label": "Job", + "name": "job", + "options": [], + "query": { + "query": "label_values(node_uname_info, job)", + "refId": "Prometheus-job-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "definition": "label_values(node_uname_info{job=\"$job\"}, nodename)", + "includeAll": false, + "label": "Nodename", + "name": "nodename", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=\"$job\"}, nodename)", + "refId": "Prometheus-nodename-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${ds_prometheus}" + }, + "definition": "label_values(node_uname_info{job=\"$job\", nodename=\"$nodename\"}, instance)", + "includeAll": false, + "label": "Instance", + "name": "node", + "options": [], + "query": { + "query": "label_values(node_uname_info{job=\"$job\", nodename=\"$nodename\"}, instance)", + "refId": "Prometheus-node-Variable-Query" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + 
} + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Node Exporter Full", + "uid": "rYdddlPWk", + "version": 98, + "weekStart": "", + "gnetId": 1860 +} \ No newline at end of file diff --git a/observability/grafana/provisioning/dashboards/19908_rev1.json b/observability/grafana/provisioning/dashboards/19908_rev1.json new file mode 100644 index 0000000..2632ca6 --- /dev/null +++ b/observability/grafana/provisioning/dashboards/19908_rev1.json @@ -0,0 +1,991 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "Prometheus is used for data collection from cAdvisor", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.2.0" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 3, + "panels": [], + "title": "Basic", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + 
"mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "count(count(container_last_seen) by (name))", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Running Containers", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 20, + "x": 4, + "y": 1 + }, + "id": 1, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + 
"sortDesc": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus_uid"
+          },
+          "editorMode": "code",
+          "expr": "sum(rate(container_cpu_usage_seconds_total{name=~\".+\"}[10m])) by (name) * 100",
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "CPU Usage",
+      "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 10
+      },
+      "id": 4,
+      "panels": [],
+      "title": "Memory",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus_uid"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 1,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "decbytes"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 11
+      },
+      "id": 2,
+      "options": {
+        "legend": {
+          "calcs": [
+            "max",
+            "mean"
+          ],
+          "displayMode": "table",
+          "placement": "right",
+          "showLegend": true,
+          "sortBy": "Mean",
+          "sortDesc": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus_uid"
+          },
+          "editorMode": "code",
+          "expr": "sum(avg_over_time(container_memory_usage_bytes{name=~\".+\"}[10m])) by (name)",
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Memory Usage",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus_uid"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 1,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              }
+            ]
+          },
+          "unit": "decbytes"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 11
+      },
+      "id": 7,
+      "options": {
+        "legend": {
+          "calcs": [
+            "max",
+            "mean"
+          ],
+          "displayMode": "table",
+          "placement": "right",
+          "showLegend": true,
+          "sortBy": "Mean",
+          "sortDesc": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "prometheus_uid"
+          },
+          "editorMode": "code",
+          "expr": "sum(avg_over_time(container_memory_cache{name=~\".+\"}[10m])) by (name)",
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Memory Cached",
+      "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 20
+      },
+      "id": 12,
+      "panels": [],
+      "title": "I/O",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "prometheus_uid"
+      },
+      "fieldConfig": {
+
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "sum(rate(container_fs_reads_bytes_total{name=~\".+\"}[10m])) by (name)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Reads", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "sum(rate(container_fs_writes_bytes_total{name=~\".+\"}[10m])) by (name)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Writes", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 6, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": 
false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "sum(rate(container_network_receive_bytes_total{name=~\".+\"}[10m])) by (name)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Received Network Traffic", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 31 + }, + "id": 8, + 
"options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "sum(rate(container_network_transmit_bytes_total{name=~\".+\"}[10m])) by (name)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Sent Network Traffic", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 11, + "panels": [], + "title": "Details", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + }, + "fieldMinMax": true + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 10, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-red", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "PuRd", + "steps": 30 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto", + "value": "Restarts" + }, + "tooltip": { + "show": true, + "yHistogram": true + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus_uid" + }, + "editorMode": "code", + "expr": "count by(name) (count_over_time(container_last_seen{name=~\".+\"}[$__range]))", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + 
"title": "Container Restarts",
+      "type": "heatmap"
+    }
+  ],
+  "refresh": "5s",
+  "schemaVersion": 38,
+  "tags": [],
+  "templating": {
+    "list": []
+  },
+  "time": {
+    "from": "now-3h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "cAdvisor Docker Insights",
+  "uid": "ae3c41d7-cea5-4cca-a918-5708706b4d1a",
+  "version": 14,
+  "weekStart": "",
+  "gnetId": 19908,
+  "description": "This Grafana dashboard offers a basic overview of key performance metrics for Docker containers in your system."
+}
\ No newline at end of file
diff --git a/observability/grafana/provisioning/dashboards/default.yml b/observability/grafana/provisioning/dashboards/default.yml
new file mode 100644
index 0000000..558f5c5
--- /dev/null
+++ b/observability/grafana/provisioning/dashboards/default.yml
@@ -0,0 +1,27 @@
+apiVersion: 1
+
+# Dashboard providers: one provider per vendored dashboard JSON file. Reference:
+# https://grafana.com/docs/grafana/latest/administration/provisioning/#dashboards
+providers:
+  # https://grafana.com/grafana/dashboards/19908-docker-container-monitoring-with-prometheus-and-cadvisor/
+  - name: cAdvisor
+    type: file
+    disableDeletion: true
+    allowUiUpdates: false
+    options:
+      path: /etc/grafana/provisioning/dashboards/19908_rev1.json  # gnetId 19908, revision 1
+
+  # https://grafana.com/grafana/dashboards/1860-node-exporter-full/
+  - name: Node Exporter
+    type: file
+    disableDeletion: true
+    allowUiUpdates: false
+    options:
+      path: /etc/grafana/provisioning/dashboards/1860_rev42.json  # dashboard 1860, revision 42
+
+  # https://grafana.com/grafana/dashboards/17346-traefik-official-standalone-dashboard/
+  - name: Traefik
+    type: file
+    disableDeletion: true
+    allowUiUpdates: false
+    options:
+      path: /etc/grafana/provisioning/dashboards/17346_rev9.json  # dashboard 17346, revision 9
diff --git a/observability/grafana/provisioning/datasources/datasource.yml b/observability/grafana/provisioning/datasources/datasource.yml
new file mode 100644
index 0000000..2c25a8d
--- /dev/null
+++ b/observability/grafana/provisioning/datasources/datasource.yml
@@ -0,0 +1,18 @@
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    uid: prometheus_uid  # fixed uid; the provisioned cAdvisor dashboard references "prometheus_uid"
+    editable: false
+    isDefault: true
+    access: proxy
+    url: http://prometheus:9090  # `prometheus` service of observability/stack.yaml
+
+  - name: Loki
+    type: loki
+    uid: loki_uid
+    editable: false
+    isDefault: false
+    access: proxy
+    url: http://loki:3100  # matches http_listen_port in loki-config.yaml
diff --git a/observability/loki/Dockerfile b/observability/loki/Dockerfile
new file mode 100644
index 0000000..8663b79
--- /dev/null
+++ b/observability/loki/Dockerfile
@@ -0,0 +1,7 @@
+FROM grafana/loki:latest
+
+COPY loki-config.yaml /etc/loki/local-config.yaml
+
+CMD ["-config.file=/etc/loki/local-config.yaml"]
+
+EXPOSE 3100
diff --git a/observability/loki/loki-config.yaml b/observability/loki/loki-config.yaml
new file mode 100644
index 0000000..fd2a8b2
--- /dev/null
+++ b/observability/loki/loki-config.yaml
@@ -0,0 +1,51 @@
+auth_enabled: false
+
+# https://grafana.com/docs/loki/latest/configure/#server
+server:
+  http_listen_port: 3100
+  grpc_listen_port: 9096
+  log_level: info  # options: debug, info, warn, error
+  grpc_server_max_concurrent_streams: 1000
+
+# https://grafana.com/docs/loki/latest/configure/#common
+common:
+# Loki is composed of multiple microservices that communicate with each other over gRPC.
+# `instance_addr` is the address each microservice uses for that inter-service communication.
+# We run the [monolithic mode](https://grafana.com/docs/loki/latest/get-started/deployment-modes/#monolithic-mode) for simplicity.
+# Therefore, we set the address to 127.0.0.1 so the services can still reach each other
+# even when the container IP changes during upgrades.
+  instance_addr: 127.0.0.1
+  path_prefix: /loki
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+  storage:
+    filesystem:
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+
+schema_config:
+  configs:
+    - from: 2025-01-01
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+frontend:
+  encoding: protobuf
+# By default, Loki will send anonymous, but uniquely-identifiable usage and configuration
+# analytics to Grafana Labs. These statistics are sent to https://stats.grafana.org/
+#
+# Statistics help us better understand how Loki is used, and they show us performance
+# levels for most users. This helps us prioritize features and documentation.
+# For more information on what's sent, look at
+# https://github.com/grafana/loki/blob/main/pkg/analytics/stats.go
+# Refer to the buildReport method to see what goes into a report.
+#
+# If you would like to disable reporting, uncomment the following lines:
+#analytics:
+#  reporting_enabled: false
diff --git a/observability/prometheus/Dockerfile b/observability/prometheus/Dockerfile
new file mode 100644
index 0000000..37f119d
--- /dev/null
+++ b/observability/prometheus/Dockerfile
@@ -0,0 +1,13 @@
+
+# https://github.com/prometheus/prometheus/releases
+FROM prom/prometheus:latest
+
+# Copy the Prometheus configuration; note the .yaml extension, which --config.file must match
+COPY prometheus.yaml /etc/prometheus/prometheus.yaml
+
+# Expose the Prometheus HTTP port (no USER instruction here: the base image's user is kept)
+EXPOSE 9090
+
+# Start Prometheus with the configuration file copied above
+ENTRYPOINT ["prometheus"]
+CMD ["--config.file=/etc/prometheus/prometheus.yaml", "--storage.tsdb.path=/prometheus"]
diff --git a/observability/prometheus/prometheus.yaml b/observability/prometheus/prometheus.yaml
new file mode 100644
index 0000000..2c264a9
--- /dev/null
+++ b/observability/prometheus/prometheus.yaml
@@ -0,0 +1,41 @@
+# Prometheus main configuration
+
+global:
+  scrape_interval: 30s
+  evaluation_interval: 30s
+
+
+
+scrape_configs:
+  # Scrape Prometheus itself
+  - job_name: "prometheus"
+    metrics_path: /metrics
+    dns_sd_configs:
+      - names: ["tasks.prometheus"]  # presumably Swarm's per-task DNS name (tasks.<service>) — TODO confirm
+        type: A
+        refresh_interval: "30s"
+        port: 9090
+
+  # Scrape cAdvisor
+  - job_name: "cadvisor"
+    metrics_path: /metrics
+    dns_sd_configs:
+      - names: ["tasks.cadvisor"]
+        type: A
+        refresh_interval: "30s"
+        port: 8080
+
+  # Scrape Node Exporter
+  - job_name: "node_exporter"
+    metrics_path: /metrics
+    dns_sd_configs:
+      - names: ["tasks.node_exporter"]
+        type: A
+        refresh_interval: "30s"
+        port: 9100
+
+  # Scrape Traefik
+  - job_name: "traefik"
+    metrics_path: /metrics
+    static_configs:
+      - targets: ["traefik:9090"]  # 9090 is Traefik's `metrics` entrypoint (traefik/compose.yaml)
diff --git a/observability/secrets/grafana-admin-password.txt b/observability/secrets/grafana-admin-password.txt
new file mode 100644
index 0000000..f77b004
--- /dev/null
+++ b/observability/secrets/grafana-admin-password.txt
@@ -0,0 +1 @@
+admin
\ No newline at end of file
diff --git a/observability/secrets/grafana-admin-user.txt b/observability/secrets/grafana-admin-user.txt
new file mode 100644
index 0000000..f77b004
--- /dev/null
+++ b/observability/secrets/grafana-admin-user.txt
@@ -0,0 +1 @@
+admin
\ No newline at end of file
diff --git a/observability/stack.yaml b/observability/stack.yaml
new file mode 100644
index 0000000..f5a5e8c
--- /dev/null
+++ b/observability/stack.yaml
@@ -0,0 +1,149 @@
+networks:
+  observability-network:  # scrape/log traffic: prometheus, loki, cadvisor, node_exporter, alloy
+    driver: overlay
+    attachable: false
+    internal: true
+    driver_opts:
+      encrypted: "true"
+
+  grafana-frontend-network:  # grafana <-> prometheus / loki
+    driver: overlay
+    attachable: false
+    internal: true
+    driver_opts:
+      encrypted: "true"
+
+  proxy-swarm-network:  # created outside this stack; also joined by Traefik (traefik/compose.yaml)
+    external: true
+
+secrets:  # NOTE(review): both committed files contain the placeholder value `admin` — replace before deploying
+  grafana-admin-user:
+    file: ./secrets/grafana-admin-user.txt
+  grafana-admin-password:
+    file: ./secrets/grafana-admin-password.txt
+
+volumes:
+  grafana-storage: {}
+  prometheus-storage: {}
+  loki-storage: {}
+
+services:
+  grafana:
+    image: ghcr.io/hyntaria/observability/grafana:v1.0
+    environment:
+      GF_SECURITY_ADMIN_USER__FILE: /run/secrets/grafana-admin-user
+      GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/grafana-admin-password
+    secrets:
+      - grafana-admin-user
+      - grafana-admin-password
+    volumes:
+      - grafana-storage:/var/lib/grafana
+    deploy:
+      mode: replicated
+      replicas: 1
+      restart_policy:
+        condition: any
+      labels:  # NOTE(review): kept under `deploy:` (service labels) on the assumption Traefik's swarm provider reads them there — confirm
+        - "traefik.enable=true"
+        - "traefik.http.routers.grafana.entrypoints=admin"  # `admin` entrypoint = :3030 in traefik/compose.yaml
+        - "traefik.http.routers.grafana.rule=Host(`grafana.localhost`)"
+
+        - "traefik.http.routers.grafana.middlewares=compress-grafana"
+        - "traefik.http.middlewares.compress-grafana.compress=true"
+        - "traefik.http.middlewares.compress-grafana.compress.encodings=zstd, br, gzip"
+
+        - "traefik.http.routers.grafana.service=grafana"
+        - "traefik.http.services.grafana.loadbalancer.server.port=3000"
+    networks:
+      - proxy-swarm-network
+      - grafana-frontend-network
+
+  prometheus:
+    image: ghcr.io/hyntaria/observability/prometheus:v0.2
+    command:  # replaces the image's CMD, so both flags must be repeated here
+      - "--config.file=/etc/prometheus/prometheus.yaml"  # must be the path observability/prometheus/Dockerfile copies the config to (.yaml, not .yml)
+      - "--storage.tsdb.path=/prometheus"
+    volumes:
+      - prometheus-storage:/prometheus
+    deploy:
+      mode: replicated
+      replicas: 1
+      restart_policy:
+        condition: any
+      placement:
+        constraints:
+          - node.role == manager
+      labels:
+        - "traefik.enable=true"
+        - "traefik.http.routers.prometheus.entrypoints=admin"
+        - "traefik.http.routers.prometheus.rule=Host(`prometheus.localhost`)"
+
+        - "traefik.http.routers.prometheus.middlewares=compress-prometheus"
+        - "traefik.http.middlewares.compress-prometheus.compress=true"
+        - "traefik.http.middlewares.compress-prometheus.compress.encodings=zstd, br, gzip"
+
+        - "traefik.http.routers.prometheus.service=prometheus"
+        - "traefik.http.services.prometheus.loadbalancer.server.port=9090"
+    networks:
+      - observability-network
+      - grafana-frontend-network
+      - proxy-swarm-network
+
+  loki:
+    image: ghcr.io/hyntaria/observability/loki:v0.1
+    volumes:
+      - loki-storage:/loki  # same directory as path_prefix in loki-config.yaml
+    deploy:
+      mode: replicated
+      replicas: 1
+      restart_policy:
+        condition: any
+      placement:
+        constraints:
+          - node.role == manager
+    networks:
+      - observability-network
+      - grafana-frontend-network
+
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor:latest  # NOTE(review): unpinned tag — consider pinning a release
+    volumes:
+      - /:/rootfs:ro
+      - /run:/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+      - /dev/disk/:/dev/disk:ro
+    deploy:
+      mode: global
+      endpoint_mode: dnsrr
+      restart_policy:
+        condition: any
+    networks:
+      - observability-network
+
+  node_exporter:
+    image: quay.io/prometheus/node-exporter:latest  # NOTE(review): unpinned tag — consider pinning a release
+    hostname: "{{.Node.Hostname}}"
+    command:
+      - "--path.rootfs=/host"  # matches the /host mount point below
+    volumes:
+      - "/:/host:ro,rslave"
+    deploy:
+      mode: global
+      endpoint_mode: dnsrr
+      restart_policy:
+        condition: any
+    networks:
+      - observability-network
+
+  alloy:
+    image: ghcr.io/hyntaria/observability/alloy:v0.1
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+    deploy:
+      mode: global
+      endpoint_mode: dnsrr
+      restart_policy:
+        condition: any
+    networks:
+      - observability-network
diff --git a/scripts/bind-telemetry.sh b/scripts/bind-telemetry.sh
new file mode 100755
index 0000000..2aa893b
--- /dev/null
+++ b/scripts/bind-telemetry.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Local port on your machine
+LOCAL_PORT=3030
+
+# Remote port on the server: 3030 is Traefik's `admin` entrypoint, published on the server's 127.0.0.1 (traefik/compose.yaml)
+REMOTE_PORT=3030
+
+# SSH host alias defined in ~/.ssh/config
+SSH_ALIAS="hyntaria"
+
+echo "🔐 Establishing SSH tunnel: localhost:${LOCAL_PORT} → ${SSH_ALIAS}:localhost:${REMOTE_PORT}"
+echo "🌐 Access the telemetry dashboard at http://grafana.localhost:${LOCAL_PORT}"
+echo "Press Ctrl+C to close the tunnel."
+
+ssh -N -o ExitOnForwardFailure=yes -L "${LOCAL_PORT}:127.0.0.1:${REMOTE_PORT}" "${SSH_ALIAS}"  # quoted args; exit instead of idling if the forward cannot be set up
+
+
diff --git a/traefik/compose.yaml b/traefik/compose.yaml
new file mode 100644
index 0000000..45a83e7
--- /dev/null
+++ b/traefik/compose.yaml
@@ -0,0 +1,104 @@
+networks:
+
+  # External network created outside of this compose file with:
+  # docker network create proxy-swarm-network --attachable --driver overlay --opt encrypted
+  proxy-swarm-network:
+    external: true
+
+  # External network created outside of this compose file with:
+  # docker network create proxy-docker-network
+  proxy-docker-network:
+    external: true
+
+volumes:
+  traefik-acme: {}
+
+services:
+  traefik:
+    image: traefik:v3.6.7
+    container_name: traefik
+    restart: unless-stopped
+    command:
+      - "--global.checknewversion=true"
+      - "--global.sendanonymoususage=true"  # NOTE(review): opts in to anonymous usage reporting — confirm this is intended
+
+      # Log configuration
+      - "--accesslog=true"  # Enable Access Logs
+      - "--log.level=INFO"  # Set the Log Level e.g INFO, DEBUG
+      # - "--log.format=json"
+
+      # Prometheus Metrics configuration (served on the `metrics` entrypoint defined below)
+      - "--metrics.prometheus=true"
+      - "--metrics.prometheus.entrypoint=metrics"
+      - "--metrics.prometheus.addrouterslabels=true"
+      - "--metrics.prometheus.addserviceslabels=true"
+
+      # Dashboard and API configuration
+      - "--api.dashboard=true"
+      - "--api.insecure=false"
+      - "--api.basepath=/"
+
+      # Docker Provider configuration
+      - "--providers.docker.endpoint=unix:///var/run/docker.sock"
+      - "--providers.docker.watch=true"
+      - "--providers.docker.exposedbydefault=false"
+      - "--providers.docker.network=proxy-docker-network"
+
+      # Swarm Provider configuration
+      - "--providers.swarm.endpoint=unix:///var/run/docker.sock"
+      - "--providers.swarm.watch=true"
+      - "--providers.swarm.exposedbydefault=false"
+      - "--providers.swarm.network=proxy-swarm-network"
+
+      # EntryPoints configuration
+      - "--entrypoints.web.address=:80"
+      - "--entrypoints.web.http.redirections.entrypoint.to=websecure"
+      - "--entrypoints.web.http.redirections.entrypoint.scheme=https"
+      - "--entrypoints.web.http.redirections.entrypoint.permanent=true"
+
+      - "--entrypoints.websecure.address=:443"
+      - "--entrypoints.websecure.http.tls=true"
+
+      - "--entrypoints.admin.address=:3030"  # published on 127.0.0.1 only, see `ports` below
+      - "--entrypoints.metrics.address=:9090"  # not listed under `ports`; scraped by Prometheus as traefik:9090
+
+      # Certificates Resolver configuration
+      - "--certificatesresolvers.default-resolver.acme.email=mart1.guillemot@gmail.com"
+      - "--certificatesresolvers.default-resolver.acme.storage=/etc/traefik/acme/default-resolver-acme.json"
+      - "--certificatesresolvers.default-resolver.acme.tlschallenge=true"
+
+      - "--certificatesresolvers.default-resolver.acme.caserver=https://acme-v02.api.letsencrypt.org/directory"
+      # - "--certificatesresolvers.default-resolver.acme.caserver=https://acme-staging-v02.api.letsencrypt.org/directory"
+
+    labels:
+      - "traefik.enable=true"
+      - "traefik.http.routers.traefik.entrypoints=admin"
+      - "traefik.http.routers.traefik.rule=Host(`traefik.localhost`)"
+      - "traefik.http.routers.traefik.service=api@internal"
+
+      # Add a compression middleware which can be used by other services
+      - "traefik.http.routers.traefik.middlewares=compress-all"
+      - "traefik.http.middlewares.compress-all.compress=true"
+      - "traefik.http.middlewares.compress-all.compress.encodings=zstd, br, gzip"
+
+    # environment:
+    #   - CLOUDFLARE_DNS_API_TOKEN=${CLOUDFLARE_DNS_API_TOKEN}
+
+    #   - OVH_ENDPOINT=ovh-eu
+    #   - OVH_APPLICATION_KEY=${OVH_APPLICATION_KEY}
+    #   - OVH_APPLICATION_SECRET=${OVH_APPLICATION_SECRET}
+    #   - OVH_CONSUMER_KEY=${OVH_CONSUMER_KEY}
+
+    ports:
+      - "80:80"
+      - "443:443"
+      - "127.0.0.1:3030:3030"  # admin entrypoint: loopback only, reached through scripts/bind-telemetry.sh
+    networks:
+      - proxy-docker-network
+      - proxy-swarm-network
+
+    volumes:
+      # So that Traefik can listen to the Docker events
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      # So that Traefik can access the acme.json file
+      - traefik-acme:/etc/traefik/acme/:rw