From 289746c7d1cd17af0e3651c78edb9cad6fe0085f Mon Sep 17 00:00:00 2001 From: Mauritz Uphoff <39736813+h3adex@users.noreply.github.com> Date: Mon, 14 Apr 2025 13:21:30 +0200 Subject: [PATCH] Implement observability alertgroups (#778) * feat: implement observability alertgroups * review changes --- docs/data-sources/observability_alertgroup.md | 47 ++ docs/guides/ske_kube_state_metric_alerts.md | 267 ++++++++ docs/resources/observability_alertgroup.md | 80 +++ .../data-source.tf | 5 + .../resource.tf | 32 + .../observability/alertgroup/datasource.go | 187 ++++++ .../observability/alertgroup/resource.go | 577 ++++++++++++++++++ .../observability/alertgroup/resource_test.go | 366 +++++++++++ .../observability/observability_acc_test.go | 156 ++++- stackit/provider.go | 7 +- .../ske_kube_state_metric_alerts.md.tmpl | 267 ++++++++ 11 files changed, 1987 insertions(+), 4 deletions(-) create mode 100644 docs/data-sources/observability_alertgroup.md create mode 100644 docs/guides/ske_kube_state_metric_alerts.md create mode 100644 docs/resources/observability_alertgroup.md create mode 100644 examples/data-sources/stackit_observability_alertgroup/data-source.tf create mode 100644 examples/resources/stackit_observability_alertgroup/resource.tf create mode 100644 stackit/internal/services/observability/alertgroup/datasource.go create mode 100644 stackit/internal/services/observability/alertgroup/resource.go create mode 100644 stackit/internal/services/observability/alertgroup/resource_test.go create mode 100644 templates/guides/ske_kube_state_metric_alerts.md.tmpl diff --git a/docs/data-sources/observability_alertgroup.md b/docs/data-sources/observability_alertgroup.md new file mode 100644 index 00000000..ad3cd624 --- /dev/null +++ b/docs/data-sources/observability_alertgroup.md @@ -0,0 +1,47 @@ +--- +# generated by https://github.com/hashicorp/terraform-plugin-docs +page_title: "stackit_observability_alertgroup Data Source - stackit" +subcategory: "" +description: |- + 
Observability alert group resource schema. Must have a region specified in the provider configuration. +--- + +# stackit_observability_alertgroup (Data Source) + +Observability alert group resource schema. Must have a `region` specified in the provider configuration. + +## Example Usage + +```terraform +data "stackit_observability_alertgroup" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example-alert-group" +} +``` + + +## Schema + +### Required + +- `instance_id` (String) Observability instance ID to which the alert group is associated. +- `name` (String) The name of the alert group. Is the identifier and must be unique in the group. +- `project_id` (String) STACKIT project ID to which the alert group is associated. + +### Read-Only + +- `id` (String) Terraform's internal resource ID. It is structured as "`project_id`,`instance_id`,`name`". +- `interval` (String) Specifies the frequency at which rules within the group are evaluated. The interval must be at least 60 seconds and defaults to 60 seconds if not set. Supported formats include hours, minutes, and seconds, either singly or in combination. Examples of valid formats are: '5h30m40s', '5h', '5h30m', '60m', and '60s'. +- `rules` (Attributes List) (see [below for nested schema](#nestedatt--rules)) + + +### Nested Schema for `rules` + +Read-Only: + +- `alert` (String) The name of the alert rule. Is the identifier and must be unique in the group. +- `annotations` (Map of String) A map of key:value. Annotations to add or overwrite for each alert +- `expression` (String) The PromQL expression to evaluate. Every evaluation cycle this is evaluated at the current time, and all resultant time series become pending/firing alerts. +- `for` (String) Alerts are considered firing once they have been returned for this long. Alerts which have not yet fired for long enough are considered pending. 
Default is 0s +- `labels` (Map of String) A map of key:value. Labels to add or overwrite for each alert diff --git a/docs/guides/ske_kube_state_metric_alerts.md b/docs/guides/ske_kube_state_metric_alerts.md new file mode 100644 index 00000000..d27e27d5 --- /dev/null +++ b/docs/guides/ske_kube_state_metric_alerts.md @@ -0,0 +1,267 @@ +--- +page_title: "Alerting with Kube-State-Metrics in STACKIT Observability" +--- +# Alerting with Kube-State-Metrics in STACKIT Observability + +## Overview + +This guide explains how to configure the STACKIT Observability product to send alerts using metrics gathered from kube-state-metrics. + +1. **Set Up Providers** + + Begin by configuring the STACKIT and Kubernetes providers to connect to the STACKIT services. + + ```hcl + provider "stackit" { + region = "eu01" + } + + provider "kubernetes" { + host = yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.server + client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-certificate-data) + client_key = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-key-data) + cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.certificate-authority-data) + } + + provider "helm" { + kubernetes { + host = yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.server + client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-certificate-data) + client_key = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-key-data) + cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.certificate-authority-data) + } + } + ``` + +2. **Create SKE Cluster and Kubeconfig Resource** + + Set up a STACKIT SKE Cluster and generate the associated kubeconfig resource. 
+ + ```hcl + resource "stackit_ske_cluster" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example" + kubernetes_version = "1.31" + node_pools = [ + { + name = "standard" + machine_type = "c1.4" + minimum = "3" + maximum = "9" + max_surge = "3" + availability_zones = ["eu01-1", "eu01-2", "eu01-3"] + os_version_min = "4081.2.1" + os_name = "flatcar" + volume_size = 32 + volume_type = "storage_premium_perf6" + } + ] + maintenance = { + enable_kubernetes_version_updates = true + enable_machine_image_version_updates = true + start = "01:00:00Z" + end = "02:00:00Z" + } + } + + resource "stackit_ske_kubeconfig" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + cluster_name = stackit_ske_cluster.example.name + refresh = true + } + ``` + +3. **Create Observability Instance and Credentials** + + Establish a STACKIT Observability instance and its credentials to handle alerts. + + ```hcl + locals { + alert_config = { + route = { + receiver = "EmailStackit", + repeat_interval = "1m", + continue = true + } + receivers = [ + { + name = "EmailStackit", + email_configs = [ + { + to = "" + } + ] + } + ] + } + } + + resource "stackit_observability_instance" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example" + plan_name = "Observability-Large-EU01" + alert_config = local.alert_config + } + + resource "stackit_observability_credential" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = stackit_observability_instance.example.instance_id + } + ``` + +4. **Install Prometheus Operator** + + Use the Prometheus Helm chart to install kube-state-metrics and transfer metrics to the STACKIT Observability instance. Customize the helm values as needed for your deployment. 
+ + ```yaml + # helm values + # save as prom-values.tftpl + prometheus: + enabled: true + agentMode: true + prometheusSpec: + enableRemoteWriteReceiver: true + scrapeInterval: 60s + evaluationInterval: 60s + replicas: 1 + storageSpec: + volumeClaimTemplate: + spec: + storageClassName: premium-perf4-stackit + accessModes: ['ReadWriteOnce'] + resources: + requests: + storage: 80Gi + remoteWrite: + - url: ${metrics_push_url} + queueConfig: + batchSendDeadline: '5s' + # both values need to be configured according to your observability plan + capacity: 30000 + maxSamplesPerSend: 3000 + writeRelabelConfigs: + - sourceLabels: ['__name__'] + regex: 'apiserver_.*|etcd_.*|prober_.*|storage_.*|workqueue_(work|queue)_duration_seconds_bucket|kube_pod_tolerations|kubelet_.*|kubernetes_feature_enabled|instance_scrape_target_status' + action: 'drop' + - sourceLabels: ['namespace'] + regex: 'example' + action: 'keep' + basicAuth: + username: + key: username + name: ${secret_name} + password: + key: password + name: ${secret_name} + + grafana: + enabled: false + + defaultRules: + create: false + + alertmanager: + enabled: false + + nodeExporter: + enabled: true + + kube-state-metrics: + enabled: true + customResourceState: + enabled: true + collectors: + - deployments + - pods + ``` + + ```hcl + resource "kubernetes_namespace" "monitoring" { + metadata { + name = "monitoring" + } + } + + resource "kubernetes_secret" "argus_prometheus_authorization" { + metadata { + name = "argus-prometheus-credentials" + namespace = kubernetes_namespace.monitoring.metadata[0].name + } + + data = { + username = stackit_observability_credential.example.username + password = stackit_observability_credential.example.password + } + } + + resource "helm_release" "prometheus_operator" { + name = "prometheus-operator" + repository = "https://prometheus-community.github.io/helm-charts" + chart = "kube-prometheus-stack" + version = "60.1.0" + namespace = kubernetes_namespace.monitoring.metadata[0].name + + 
values = [ + templatefile("prom-values.tftpl", { + metrics_push_url = stackit_observability_instance.example.metrics_push_url + secret_name = kubernetes_secret.argus_prometheus_authorization.metadata[0].name + }) + ] + } + ``` + +5. **Create Alert Group** + + Define an alert group with a rule to notify when a pod is running in the "example" namespace. + + ```hcl + resource "stackit_observability_alertgroup" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = stackit_observability_instance.example.instance_id + name = "TestAlertGroup" + interval = "2h" + rules = [ + { + alert = "SimplePodCheck" + expression = "sum(kube_pod_status_phase{phase=\"Running\", namespace=\"example\"}) > 0" + for = "60s" + labels = { + severity = "critical" + }, + annotations = { + summary = "Test Alert is working" + description = "Test Alert" + } + }, + ] + } + ``` + +6. **Deploy Test Pod** + + Deploy a test pod; doing so should trigger an email notification, as the deployment satisfies the conditions defined in the alert group rule. In a real-world scenario, you would typically configure alerts to monitor pods for error states instead. + + ```hcl + resource "kubernetes_namespace" "example" { + metadata { + name = "example" + } + } + + resource "kubernetes_pod" "example" { + metadata { + name = "nginx" + namespace = kubernetes_namespace.example.metadata[0].name + labels = { + app = "nginx" + } + } + + spec { + container { + image = "nginx:latest" + name = "nginx" + } + } + } + ``` \ No newline at end of file diff --git a/docs/resources/observability_alertgroup.md b/docs/resources/observability_alertgroup.md new file mode 100644 index 00000000..c6b70859 --- /dev/null +++ b/docs/resources/observability_alertgroup.md @@ -0,0 +1,80 @@ +--- +# generated by https://github.com/hashicorp/terraform-plugin-docs +page_title: "stackit_observability_alertgroup Resource - stackit" +subcategory: "" +description: |- + Observability alert group resource schema. 
Must have a region specified in the provider configuration. +--- + +# stackit_observability_alertgroup (Resource) + +Observability alert group resource schema. Must have a `region` specified in the provider configuration. + +## Example Usage + +```terraform +resource "stackit_observability_alertgroup" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example-alert-group" + interval = "60s" + rules = [ + { + alert = "example-alert-name" + expression = "kube_node_status_condition{condition=\"Ready\", status=\"false\"} > 0" + for = "60s" + labels = { + severity = "critical" + }, + annotations = { + summary : "example summary" + description : "example description" + } + }, + { + alert = "example-alert-name-2" + expression = "kube_node_status_condition{condition=\"Ready\", status=\"false\"} > 0" + for = "1m" + labels = { + severity = "critical" + }, + annotations = { + summary : "example summary" + description : "example description" + } + }, + ] +} +``` + + +## Schema + +### Required + +- `instance_id` (String) Observability instance ID to which the alert group is associated. +- `name` (String) The name of the alert group. Is the identifier and must be unique in the group. +- `project_id` (String) STACKIT project ID to which the alert group is associated. +- `rules` (Attributes List) Rules for the alert group (see [below for nested schema](#nestedatt--rules)) + +### Optional + +- `interval` (String) Specifies the frequency at which rules within the group are evaluated. The interval must be at least 60 seconds and defaults to 60 seconds if not set. Supported formats include hours, minutes, and seconds, either singly or in combination. Examples of valid formats are: '5h30m40s', '5h', '5h30m', '60m', and '60s'. + +### Read-Only + +- `id` (String) Terraform's internal resource ID. It is structured as "`project_id`,`instance_id`,`name`". 
+ + +### Nested Schema for `rules` + +Required: + +- `alert` (String) The name of the alert rule. Is the identifier and must be unique in the group. +- `expression` (String) The PromQL expression to evaluate. Every evaluation cycle this is evaluated at the current time, and all resultant time series become pending/firing alerts. + +Optional: + +- `annotations` (Map of String) A map of key:value. Annotations to add or overwrite for each alert +- `for` (String) Alerts are considered firing once they have been returned for this long. Alerts which have not yet fired for long enough are considered pending. Default is 0s +- `labels` (Map of String) A map of key:value. Labels to add or overwrite for each alert diff --git a/examples/data-sources/stackit_observability_alertgroup/data-source.tf b/examples/data-sources/stackit_observability_alertgroup/data-source.tf new file mode 100644 index 00000000..18dc3c0b --- /dev/null +++ b/examples/data-sources/stackit_observability_alertgroup/data-source.tf @@ -0,0 +1,5 @@ +data "stackit_observability_alertgroup" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example-alert-group" +} diff --git a/examples/resources/stackit_observability_alertgroup/resource.tf b/examples/resources/stackit_observability_alertgroup/resource.tf new file mode 100644 index 00000000..8f9c6e78 --- /dev/null +++ b/examples/resources/stackit_observability_alertgroup/resource.tf @@ -0,0 +1,32 @@ +resource "stackit_observability_alertgroup" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example-alert-group" + interval = "60s" + rules = [ + { + alert = "example-alert-name" + expression = "kube_node_status_condition{condition=\"Ready\", status=\"false\"} > 0" + for = "60s" + labels = { + severity = "critical" + }, + annotations = { + summary : "example summary" + description : "example description" + 
} + }, + { + alert = "example-alert-name-2" + expression = "kube_node_status_condition{condition=\"Ready\", status=\"false\"} > 0" + for = "1m" + labels = { + severity = "critical" + }, + annotations = { + summary : "example summary" + description : "example description" + } + }, + ] +} \ No newline at end of file diff --git a/stackit/internal/services/observability/alertgroup/datasource.go b/stackit/internal/services/observability/alertgroup/datasource.go new file mode 100644 index 00000000..e7a40816 --- /dev/null +++ b/stackit/internal/services/observability/alertgroup/datasource.go @@ -0,0 +1,187 @@ +package alertgroup + +import ( + "context" + "errors" + "fmt" + "net/http" + + "github.com/hashicorp/terraform-plugin-framework-validators/stringvalidator" + "github.com/hashicorp/terraform-plugin-framework/datasource" + "github.com/hashicorp/terraform-plugin-framework/datasource/schema" + "github.com/hashicorp/terraform-plugin-framework/schema/validator" + "github.com/hashicorp/terraform-plugin-framework/types" + "github.com/hashicorp/terraform-plugin-log/tflog" + "github.com/stackitcloud/stackit-sdk-go/core/config" + "github.com/stackitcloud/stackit-sdk-go/core/oapierror" + "github.com/stackitcloud/stackit-sdk-go/services/observability" + "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/core" + "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/validate" +) + +// Ensure the implementation satisfies the expected interfaces. +var ( + _ datasource.DataSource = &alertGroupDataSource{} +) + +// NewAlertGroupDataSource creates a new instance of the alertGroupDataSource. +func NewAlertGroupDataSource() datasource.DataSource { + return &alertGroupDataSource{} +} + +// alertGroupDataSource is the datasource implementation. +type alertGroupDataSource struct { + client *observability.APIClient +} + +// Configure adds the provider configured client to the resource. 
+func (a *alertGroupDataSource) Configure(ctx context.Context, req datasource.ConfigureRequest, resp *datasource.ConfigureResponse) { + // Prevent panic if the provider has not been configured. + if req.ProviderData == nil { + return + } + + providerData, ok := req.ProviderData.(core.ProviderData) + if !ok { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error configuring API client", fmt.Sprintf("Expected configure type stackit.ProviderData, got %T", req.ProviderData)) + return + } + + var apiClient *observability.APIClient + var err error + if providerData.ObservabilityCustomEndpoint != "" { + apiClient, err = observability.NewAPIClient( + config.WithCustomAuth(providerData.RoundTripper), + config.WithEndpoint(providerData.ObservabilityCustomEndpoint), + ) + } else { + apiClient, err = observability.NewAPIClient( + config.WithCustomAuth(providerData.RoundTripper), + config.WithRegion(providerData.GetRegion()), + ) + } + + if err != nil { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error configuring API client", fmt.Sprintf("Configuring client: %v. This is an error related to the provider configuration, not to the resource configuration", err)) + return + } + a.client = apiClient + tflog.Info(ctx, "Observability alert group client configured") +} + +// Metadata provides metadata for the alert group datasource. +func (a *alertGroupDataSource) Metadata(_ context.Context, req datasource.MetadataRequest, resp *datasource.MetadataResponse) { + resp.TypeName = req.ProviderTypeName + "_observability_alertgroup" +} + +// Schema defines the schema for the alert group data source. +func (a *alertGroupDataSource) Schema(_ context.Context, _ datasource.SchemaRequest, resp *datasource.SchemaResponse) { + resp.Schema = schema.Schema{ + Description: "Observability alert group resource schema. 
Must have a `region` specified in the provider configuration.", + Attributes: map[string]schema.Attribute{ + "id": schema.StringAttribute{ + Description: descriptions["id"], + Computed: true, + }, + "project_id": schema.StringAttribute{ + Description: descriptions["project_id"], + Required: true, + Validators: []validator.String{ + validate.UUID(), + validate.NoSeparator(), + }, + }, + "instance_id": schema.StringAttribute{ + Description: descriptions["instance_id"], + Required: true, + Validators: []validator.String{ + validate.UUID(), + validate.NoSeparator(), + }, + }, + "name": schema.StringAttribute{ + Description: descriptions["name"], + Required: true, + Validators: []validator.String{ + validate.NoSeparator(), + stringvalidator.LengthBetween(1, 200), + }, + }, + "interval": schema.StringAttribute{ + Description: descriptions["interval"], + Computed: true, + Validators: []validator.String{ + validate.ValidDurationString(), + }, + }, + "rules": schema.ListNestedAttribute{ + Description: descriptions["rules"], + Computed: true, + NestedObject: schema.NestedAttributeObject{ + Attributes: map[string]schema.Attribute{ + "alert": schema.StringAttribute{ + Description: descriptions["alert"], + Computed: true, + }, + "expression": schema.StringAttribute{ + Description: descriptions["expression"], + Computed: true, + }, + "for": schema.StringAttribute{ + Description: descriptions["for"], + Computed: true, + }, + "labels": schema.MapAttribute{ + Description: descriptions["labels"], + ElementType: types.StringType, + Computed: true, + }, + "annotations": schema.MapAttribute{ + Description: descriptions["annotations"], + ElementType: types.StringType, + Computed: true, + }, + }, + }, + }, + }, + } +} + +func (a *alertGroupDataSource) Read(ctx context.Context, req datasource.ReadRequest, resp *datasource.ReadResponse) { // nolint:gocritic // function signature required by Terraform + var model Model + diags := req.Config.Get(ctx, &model) + 
resp.Diagnostics.Append(diags...) + if resp.Diagnostics.HasError() { + return + } + + projectId := model.ProjectId.ValueString() + instanceId := model.InstanceId.ValueString() + alertGroupName := model.Name.ValueString() + ctx = tflog.SetField(ctx, "project_id", projectId) + ctx = tflog.SetField(ctx, "alert_group_name", alertGroupName) + ctx = tflog.SetField(ctx, "instance_id", instanceId) + + readAlertGroupResp, err := a.client.GetAlertgroup(ctx, alertGroupName, instanceId, projectId).Execute() + if err != nil { + var oapiErr *oapierror.GenericOpenAPIError + ok := errors.As(err, &oapiErr) + if ok && oapiErr.StatusCode == http.StatusNotFound { + resp.State.RemoveResource(ctx) + return + } + core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading alert group", fmt.Sprintf("Calling API: %v", err)) + return + } + + err = mapFields(ctx, readAlertGroupResp.Data, &model) + if err != nil { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading alert group", fmt.Sprintf("Error processing API response: %v", err)) + return + } + + // Set the updated state. + diags = resp.State.Set(ctx, &model) + resp.Diagnostics.Append(diags...) 
+} diff --git a/stackit/internal/services/observability/alertgroup/resource.go b/stackit/internal/services/observability/alertgroup/resource.go new file mode 100644 index 00000000..7ad77b98 --- /dev/null +++ b/stackit/internal/services/observability/alertgroup/resource.go @@ -0,0 +1,577 @@ +package alertgroup + +import ( + "context" + "errors" + "fmt" + "net/http" + "regexp" + "strings" + + "github.com/hashicorp/terraform-plugin-framework-validators/mapvalidator" + "github.com/hashicorp/terraform-plugin-framework-validators/stringvalidator" + "github.com/hashicorp/terraform-plugin-framework/attr" + "github.com/hashicorp/terraform-plugin-framework/path" + "github.com/hashicorp/terraform-plugin-framework/resource" + "github.com/hashicorp/terraform-plugin-framework/resource/schema" + "github.com/hashicorp/terraform-plugin-framework/resource/schema/listplanmodifier" + "github.com/hashicorp/terraform-plugin-framework/resource/schema/planmodifier" + "github.com/hashicorp/terraform-plugin-framework/resource/schema/stringplanmodifier" + "github.com/hashicorp/terraform-plugin-framework/schema/validator" + "github.com/hashicorp/terraform-plugin-framework/types" + "github.com/hashicorp/terraform-plugin-framework/types/basetypes" + "github.com/hashicorp/terraform-plugin-log/tflog" + "github.com/stackitcloud/stackit-sdk-go/core/config" + "github.com/stackitcloud/stackit-sdk-go/core/oapierror" + "github.com/stackitcloud/stackit-sdk-go/services/observability" + "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/conversion" + "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/core" + "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/utils" + "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/validate" +) + +// Ensure the implementation satisfies the expected interfaces. 
+var ( + _ resource.Resource = &alertGroupResource{} + _ resource.ResourceWithConfigure = &alertGroupResource{} + _ resource.ResourceWithImportState = &alertGroupResource{} +) + +type Model struct { + Id types.String `tfsdk:"id"` + ProjectId types.String `tfsdk:"project_id"` + InstanceId types.String `tfsdk:"instance_id"` + Name types.String `tfsdk:"name"` + Interval types.String `tfsdk:"interval"` + Rules types.List `tfsdk:"rules"` +} + +type rule struct { + Alert types.String `tfsdk:"alert"` + Annotations types.Map `tfsdk:"annotations"` + Labels types.Map `tfsdk:"labels"` + Expression types.String `tfsdk:"expression"` + For types.String `tfsdk:"for"` +} + +var ruleTypes = map[string]attr.Type{ + "alert": basetypes.StringType{}, + "annotations": basetypes.MapType{ElemType: types.StringType}, + "labels": basetypes.MapType{ElemType: types.StringType}, + "expression": basetypes.StringType{}, + "for": basetypes.StringType{}, +} + +// Descriptions for the resource and data source schemas are centralized here. +var descriptions = map[string]string{ + "id": "Terraform's internal resource ID. It is structured as \"`project_id`,`instance_id`,`name`\".", + "project_id": "STACKIT project ID to which the alert group is associated.", + "instance_id": "Observability instance ID to which the alert group is associated.", + "name": "The name of the alert group. Is the identifier and must be unique in the group.", + "interval": "Specifies the frequency at which rules within the group are evaluated. The interval must be at least 60 seconds and defaults to 60 seconds if not set. Supported formats include hours, minutes, and seconds, either singly or in combination. Examples of valid formats are: '5h30m40s', '5h', '5h30m', '60m', and '60s'.", + "alert": "The name of the alert rule. Is the identifier and must be unique in the group.", + "expression": "The PromQL expression to evaluate. 
Every evaluation cycle this is evaluated at the current time, and all resultant time series become pending/firing alerts.", + "for": "Alerts are considered firing once they have been returned for this long. Alerts which have not yet fired for long enough are considered pending. Default is 0s", + "labels": "A map of key:value. Labels to add or overwrite for each alert", + "annotations": "A map of key:value. Annotations to add or overwrite for each alert", +} + +// NewAlertGroupResource is a helper function to simplify the provider implementation. +func NewAlertGroupResource() resource.Resource { + return &alertGroupResource{} +} + +// alertGroupResource is the resource implementation. +type alertGroupResource struct { + client *observability.APIClient +} + +// Metadata returns the resource type name. +func (a *alertGroupResource) Metadata(_ context.Context, req resource.MetadataRequest, resp *resource.MetadataResponse) { + resp.TypeName = req.ProviderTypeName + "_observability_alertgroup" +} + +// Configure adds the provider configured client to the resource. +func (a *alertGroupResource) Configure(ctx context.Context, req resource.ConfigureRequest, resp *resource.ConfigureResponse) { + // Prevent panic if the provider has not been configured. 
+ if req.ProviderData == nil { + return + } + + providerData, ok := req.ProviderData.(core.ProviderData) + if !ok { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error configuring API client", fmt.Sprintf("Expected configure type stackit.ProviderData, got %T", req.ProviderData)) + return + } + + var apiClient *observability.APIClient + var err error + if providerData.ObservabilityCustomEndpoint != "" { + apiClient, err = observability.NewAPIClient( + config.WithCustomAuth(providerData.RoundTripper), + config.WithEndpoint(providerData.ObservabilityCustomEndpoint), + ) + } else { + apiClient, err = observability.NewAPIClient( + config.WithCustomAuth(providerData.RoundTripper), + config.WithRegion(providerData.GetRegion()), + ) + } + + if err != nil { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error configuring API client", fmt.Sprintf("Configuring client: %v. This is an error related to the provider configuration, not to the resource configuration", err)) + return + } + a.client = apiClient + tflog.Info(ctx, "Observability alert group client configured") +} + +// Schema defines the schema for the resource. +func (a *alertGroupResource) Schema(_ context.Context, _ resource.SchemaRequest, resp *resource.SchemaResponse) { + resp.Schema = schema.Schema{ + Description: "Observability alert group resource schema. 
Must have a `region` specified in the provider configuration.", + Attributes: map[string]schema.Attribute{ + "id": schema.StringAttribute{ + Description: descriptions["id"], + Computed: true, + }, + "project_id": schema.StringAttribute{ + Description: "STACKIT project ID to which the alert group is associated.", + Required: true, + Validators: []validator.String{ + validate.UUID(), + validate.NoSeparator(), + }, + PlanModifiers: []planmodifier.String{ + stringplanmodifier.RequiresReplace(), + }, + }, + "instance_id": schema.StringAttribute{ + Description: descriptions["instance_id"], + Required: true, + Validators: []validator.String{ + validate.UUID(), + validate.NoSeparator(), + }, + PlanModifiers: []planmodifier.String{ + stringplanmodifier.RequiresReplace(), + }, + }, + "name": schema.StringAttribute{ + Description: descriptions["name"], + Required: true, + Validators: []validator.String{ + validate.NoSeparator(), + stringvalidator.LengthBetween(1, 200), + stringvalidator.RegexMatches( + regexp.MustCompile(`^[a-zA-Z0-9-]+$`), + "must match expression", + ), + }, + PlanModifiers: []planmodifier.String{ + stringplanmodifier.RequiresReplace(), + }, + }, + "interval": schema.StringAttribute{ + Description: descriptions["interval"], + Optional: true, + Validators: []validator.String{ + validate.ValidDurationString(), + }, + PlanModifiers: []planmodifier.String{ + stringplanmodifier.RequiresReplace(), + }, + }, + "rules": schema.ListNestedAttribute{ + Description: "Rules for the alert group", + Required: true, + PlanModifiers: []planmodifier.List{ + listplanmodifier.RequiresReplace(), + }, + NestedObject: schema.NestedAttributeObject{ + Attributes: map[string]schema.Attribute{ + "alert": schema.StringAttribute{ + Description: descriptions["alert"], + Required: true, + Validators: []validator.String{ + stringvalidator.RegexMatches( + regexp.MustCompile(`^[a-zA-Z0-9-]+$`), + "must match expression", + ), + stringvalidator.LengthBetween(1, 200), + }, + }, + 
"expression": schema.StringAttribute{ + Description: descriptions["expression"], + Required: true, + Validators: []validator.String{ + stringvalidator.LengthBetween(1, 600), + }, + }, + "for": schema.StringAttribute{ + Description: descriptions["for"], + Optional: true, + Validators: []validator.String{ + stringvalidator.LengthBetween(2, 8), + validate.ValidDurationString(), + }, + }, + "labels": schema.MapAttribute{ + Description: descriptions["labels"], + Optional: true, + ElementType: types.StringType, + Validators: []validator.Map{ + mapvalidator.KeysAre(stringvalidator.LengthAtMost(200)), + mapvalidator.ValueStringsAre(stringvalidator.LengthAtMost(200)), + mapvalidator.SizeAtMost(10), + }, + }, + "annotations": schema.MapAttribute{ + Description: descriptions["annotations"], + Optional: true, + ElementType: types.StringType, + Validators: []validator.Map{ + mapvalidator.KeysAre(stringvalidator.LengthAtMost(200)), + mapvalidator.ValueStringsAre(stringvalidator.LengthAtMost(200)), + mapvalidator.SizeAtMost(5), + }, + }, + }, + }, + }, + }, + } +} + +// Create creates the resource and sets the initial Terraform state. +func (a *alertGroupResource) Create(ctx context.Context, req resource.CreateRequest, resp *resource.CreateResponse) { // nolint:gocritic // function signature required by Terraform + // Retrieve values from plan + var model Model + diags := req.Plan.Get(ctx, &model) + resp.Diagnostics.Append(diags...) 
+	if resp.Diagnostics.HasError() {
+		return
+	}
+
+	projectId := model.ProjectId.ValueString()
+	instanceId := model.InstanceId.ValueString()
+	alertGroupName := model.Name.ValueString()
+	ctx = tflog.SetField(ctx, "project_id", projectId)
+	ctx = tflog.SetField(ctx, "alert_group_name", alertGroupName)
+	ctx = tflog.SetField(ctx, "instance_id", instanceId)
+
+	payload, err := toCreatePayload(ctx, &model)
+	if err != nil {
+		core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating alertgroup", fmt.Sprintf("Creating API payload: %v", err))
+		return
+	}
+
+	createAlertGroupResp, err := a.client.CreateAlertgroups(ctx, instanceId, projectId).CreateAlertgroupsPayload(*payload).Execute()
+	if err != nil {
+		core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating alertgroup", fmt.Sprintf("Calling API: %v", err))
+		return
+	}
+
+	// all alert groups are returned. We have to search the map for the one corresponding to our name
+	for _, alertGroup := range *createAlertGroupResp.Data {
+		if model.Name.ValueString() != *alertGroup.Name {
+			continue
+		}
+
+		err = mapFields(ctx, &alertGroup, &model)
+		if err != nil {
+			core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating alert group", fmt.Sprintf("Processing API payload: %v", err))
+			return
+		}
+	}
+
+	// Set the state with fully populated data.
+	diags = resp.State.Set(ctx, model)
+	resp.Diagnostics.Append(diags...)
+	if resp.Diagnostics.HasError() {
+		return
+	}
+	tflog.Info(ctx, "alert group created")
+}
+
+// Read refreshes the Terraform state with the latest data.
+func (a *alertGroupResource) Read(ctx context.Context, req resource.ReadRequest, resp *resource.ReadResponse) { // nolint:gocritic // function signature required by Terraform
+	var model Model
+	diags := req.State.Get(ctx, &model)
+	resp.Diagnostics.Append(diags...)
+ if resp.Diagnostics.HasError() { + return + } + + projectId := model.ProjectId.ValueString() + instanceId := model.InstanceId.ValueString() + alertGroupName := model.Name.ValueString() + ctx = tflog.SetField(ctx, "project_id", projectId) + ctx = tflog.SetField(ctx, "alert_group_name", alertGroupName) + ctx = tflog.SetField(ctx, "instance_id", instanceId) + + readAlertGroupResp, err := a.client.GetAlertgroup(ctx, alertGroupName, instanceId, projectId).Execute() + if err != nil { + var oapiErr *oapierror.GenericOpenAPIError + ok := errors.As(err, &oapiErr) + if ok && oapiErr.StatusCode == http.StatusNotFound { + resp.State.RemoveResource(ctx) + return + } + core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading alert group", fmt.Sprintf("Calling API: %v", err)) + return + } + + err = mapFields(ctx, readAlertGroupResp.Data, &model) + if err != nil { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading alert group", fmt.Sprintf("Error processing API response: %v", err)) + return + } + + // Set the updated state. + diags = resp.State.Set(ctx, &model) + resp.Diagnostics.Append(diags...) +} + +// Update attempts to update the resource. In this case, alertgroups cannot be updated. +// The Update function is redundant since any modifications will +// automatically trigger a resource recreation through Terraform's built-in +// lifecycle management. +func (a *alertGroupResource) Update(ctx context.Context, _ resource.UpdateRequest, resp *resource.UpdateResponse) { // nolint:gocritic // function signature required by Terraform + core.LogAndAddError(ctx, &resp.Diagnostics, "Error updating alert group", "Observability alert groups can't be updated") +} + +// Delete deletes the resource and removes the Terraform state on success. 
+func (a *alertGroupResource) Delete(ctx context.Context, req resource.DeleteRequest, resp *resource.DeleteResponse) { // nolint:gocritic // function signature required by Terraform
+	// Retrieve values from state
+	var model Model
+	diags := req.State.Get(ctx, &model)
+	resp.Diagnostics.Append(diags...)
+	if resp.Diagnostics.HasError() {
+		return
+	}
+
+	projectId := model.ProjectId.ValueString()
+	instanceId := model.InstanceId.ValueString()
+	alertGroupName := model.Name.ValueString()
+	ctx = tflog.SetField(ctx, "project_id", projectId)
+	ctx = tflog.SetField(ctx, "alert_group_name", alertGroupName)
+	ctx = tflog.SetField(ctx, "instance_id", instanceId)
+
+	_, err := a.client.DeleteAlertgroup(ctx, alertGroupName, instanceId, projectId).Execute()
+	if err != nil {
+		core.LogAndAddError(ctx, &resp.Diagnostics, "Error deleting alert group", fmt.Sprintf("Calling API: %v", err))
+		return
+	}
+
+	tflog.Info(ctx, "Alert group deleted")
+}
+
+// ImportState imports a resource into the Terraform state on success.
+// The expected format of the resource import identifier is: project_id,instance_id,name
+func (a *alertGroupResource) ImportState(ctx context.Context, req resource.ImportStateRequest, resp *resource.ImportStateResponse) {
+	idParts := strings.Split(req.ID, core.Separator)
+
+	if len(idParts) != 3 || idParts[0] == "" || idParts[1] == "" || idParts[2] == "" {
+		core.LogAndAddError(ctx, &resp.Diagnostics,
+			"Error importing alert group",
+			fmt.Sprintf("Expected import identifier with format: [project_id],[instance_id],[name] Got: %q", req.ID),
+		)
+		return
+	}
+
+	resp.Diagnostics.Append(resp.State.SetAttribute(ctx, path.Root("project_id"), idParts[0])...)
+	resp.Diagnostics.Append(resp.State.SetAttribute(ctx, path.Root("instance_id"), idParts[1])...)
+	resp.Diagnostics.Append(resp.State.SetAttribute(ctx, path.Root("name"), idParts[2])...)
+	tflog.Info(ctx, "Observability alert group state imported")
+}
+
+// toCreatePayload generates the payload to create a new alert group.
+func toCreatePayload(ctx context.Context, model *Model) (*observability.CreateAlertgroupsPayload, error) {
+	if model == nil {
+		return nil, fmt.Errorf("nil model")
+	}
+
+	payload := observability.CreateAlertgroupsPayload{}
+
+	if !utils.IsUndefined(model.Name) {
+		payload.Name = model.Name.ValueStringPointer()
+	}
+
+	if !utils.IsUndefined(model.Interval) {
+		payload.Interval = model.Interval.ValueStringPointer()
+	}
+
+	if !utils.IsUndefined(model.Rules) {
+		rules, err := toRulesPayload(ctx, model)
+		if err != nil {
+			return nil, err
+		}
+		payload.Rules = &rules
+	}
+
+	return &payload, nil
+}
+
+// toRulesPayload generates rules for create payload.
+func toRulesPayload(ctx context.Context, model *Model) ([]observability.UpdateAlertgroupsRequestInnerRulesInner, error) {
+	if model.Rules.Elements() == nil || len(model.Rules.Elements()) == 0 {
+		return []observability.UpdateAlertgroupsRequestInnerRulesInner{}, nil
+	}
+
+	var rules []rule
+	diags := model.Rules.ElementsAs(ctx, &rules, false)
+	if diags.HasError() {
+		return nil, core.DiagsToError(diags)
+	}
+
+	var oarrs []observability.UpdateAlertgroupsRequestInnerRulesInner
+	for i := range rules {
+		rule := &rules[i]
+		oarr := observability.UpdateAlertgroupsRequestInnerRulesInner{}
+
+		if !utils.IsUndefined(rule.Alert) {
+			alert := conversion.StringValueToPointer(rule.Alert)
+			if alert == nil {
+				return nil, fmt.Errorf("found nil alert for rule[%d]", i)
+			}
+			oarr.Alert = alert
+		}
+
+		if !utils.IsUndefined(rule.Expression) {
+			expression := conversion.StringValueToPointer(rule.Expression)
+			if expression == nil {
+				return nil, fmt.Errorf("found nil expression for rule[%d]", i)
+			}
+			oarr.Expr = expression
+		}
+
+		if !utils.IsUndefined(rule.For) {
+			for_ := conversion.StringValueToPointer(rule.For)
+			if for_ == nil {
+				return nil, fmt.Errorf("found nil 'for' duration for rule[%d]", i)
+			}
+			oarr.For = for_
+		}
+
+		if !utils.IsUndefined(rule.Labels) {
+			labels, err := conversion.ToStringInterfaceMap(ctx, rule.Labels)
+			if err != nil {
+				return nil, fmt.Errorf("converting to Go map: %w", err)
+			}
+			oarr.Labels = &labels
+		}
+
+		if !utils.IsUndefined(rule.Annotations) {
+			annotations, err := conversion.ToStringInterfaceMap(ctx, rule.Annotations)
+			if err != nil {
+				return nil, fmt.Errorf("converting to Go map: %w", err)
+			}
+			oarr.Annotations = &annotations
+		}
+
+		oarrs = append(oarrs, oarr)
+	}
+
+	return oarrs, nil
+}
+
+// mapFields maps the alertGroup API response to the model.
+func mapFields(ctx context.Context, alertGroup *observability.AlertGroup, model *Model) error {
+	if alertGroup == nil {
+		return fmt.Errorf("nil alertGroup")
+	}
+
+	if model == nil {
+		return fmt.Errorf("nil model")
+	}
+
+	if utils.IsUndefined(model.Name) {
+		return fmt.Errorf("empty name")
+	}
+
+	if utils.IsUndefined(model.ProjectId) {
+		return fmt.Errorf("empty projectId")
+	}
+
+	if utils.IsUndefined(model.InstanceId) {
+		return fmt.Errorf("empty instanceId")
+	}
+
+	var name string
+	if !utils.IsUndefined(model.Name) {
+		name = model.Name.ValueString()
+	} else if alertGroup.Name != nil {
+		name = *alertGroup.Name
+	} else {
+		return fmt.Errorf("found empty name")
+	}
+
+	model.Name = types.StringValue(name)
+	idParts := []string{model.ProjectId.ValueString(), model.InstanceId.ValueString(), name}
+	model.Id = types.StringValue(strings.Join(idParts, core.Separator))
+
+	var interval string
+	if !utils.IsUndefined(model.Interval) {
+		interval = model.Interval.ValueString()
+	} else if alertGroup.Interval != nil {
+		interval = *alertGroup.Interval
+	} else {
+		return fmt.Errorf("found empty interval")
+	}
+	model.Interval = types.StringValue(interval)
+
+	if alertGroup.Rules != nil {
+		err := mapRules(ctx, alertGroup, model)
+		if err != nil {
+			return fmt.Errorf("map rules: %w", err)
+		}
+	}
+
+	return nil
+}
+
+// mapRules maps alertGroup response to 
the model rules. +func mapRules(_ context.Context, alertGroup *observability.AlertGroup, model *Model) error { + var newRules []attr.Value + + for i, r := range *alertGroup.Rules { + ruleMap := map[string]attr.Value{ + "alert": types.StringPointerValue(r.Alert), + "expression": types.StringPointerValue(r.Expr), + "for": types.StringPointerValue(r.For), + "labels": types.MapNull(types.StringType), + "annotations": types.MapNull(types.StringType), + } + + if r.Labels != nil { + labelElems := map[string]attr.Value{} + for k, v := range *r.Labels { + labelElems[k] = types.StringValue(v) + } + ruleMap["labels"] = types.MapValueMust(types.StringType, labelElems) + } + + if r.Annotations != nil { + annoElems := map[string]attr.Value{} + for k, v := range *r.Annotations { + annoElems[k] = types.StringValue(v) + } + ruleMap["annotations"] = types.MapValueMust(types.StringType, annoElems) + } + + ruleTf, diags := types.ObjectValue(ruleTypes, ruleMap) + if diags.HasError() { + return fmt.Errorf("mapping index %d: %w", i, core.DiagsToError(diags)) + } + newRules = append(newRules, ruleTf) + } + + rulesTf, diags := types.ListValue(types.ObjectType{AttrTypes: ruleTypes}, newRules) + if diags.HasError() { + return core.DiagsToError(diags) + } + + model.Rules = rulesTf + return nil +} diff --git a/stackit/internal/services/observability/alertgroup/resource_test.go b/stackit/internal/services/observability/alertgroup/resource_test.go new file mode 100644 index 00000000..74697421 --- /dev/null +++ b/stackit/internal/services/observability/alertgroup/resource_test.go @@ -0,0 +1,366 @@ +package alertgroup + +import ( + "context" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/hashicorp/terraform-plugin-framework/attr" + "github.com/hashicorp/terraform-plugin-framework/types" + "github.com/stackitcloud/stackit-sdk-go/core/utils" + "github.com/stackitcloud/stackit-sdk-go/services/observability" +) + +func TestToCreatePayload(t *testing.T) { + tests := []struct { + name 
string + input *Model + expect *observability.CreateAlertgroupsPayload + expectErr bool + }{ + { + name: "Nil Model", + input: nil, + expect: nil, + expectErr: true, + }, + { + name: "Empty Model", + input: &Model{ + Name: types.StringNull(), + Interval: types.StringNull(), + Rules: types.ListNull(types.StringType), + }, + expect: &observability.CreateAlertgroupsPayload{}, + expectErr: false, + }, + { + name: "Model with Name and Interval", + input: &Model{ + Name: types.StringValue("test-alertgroup"), + Interval: types.StringValue("5m"), + }, + expect: &observability.CreateAlertgroupsPayload{ + Name: utils.Ptr("test-alertgroup"), + Interval: utils.Ptr("5m"), + }, + expectErr: false, + }, + { + name: "Model with Full Information", + input: &Model{ + Name: types.StringValue("full-alertgroup"), + Interval: types.StringValue("10m"), + Rules: types.ListValueMust( + types.ObjectType{AttrTypes: ruleTypes}, + []attr.Value{ + types.ObjectValueMust( + ruleTypes, + map[string]attr.Value{ + "alert": types.StringValue("alert"), + "expression": types.StringValue("expression"), + "for": types.StringValue("10s"), + "labels": types.MapValueMust( + types.StringType, + map[string]attr.Value{ + "k": types.StringValue("v"), + }, + ), + "annotations": types.MapValueMust( + types.StringType, + map[string]attr.Value{ + "k": types.StringValue("v"), + }, + ), + }, + ), + }, + ), + }, + expect: &observability.CreateAlertgroupsPayload{ + Name: utils.Ptr("full-alertgroup"), + Interval: utils.Ptr("10m"), + Rules: &[]observability.UpdateAlertgroupsRequestInnerRulesInner{ + { + Alert: utils.Ptr("alert"), + Annotations: &map[string]interface{}{ + "k": "v", + }, + Expr: utils.Ptr("expression"), + For: utils.Ptr("10s"), + Labels: &map[string]interface{}{ + "k": "v", + }, + }, + }, + }, + expectErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + got, err := toCreatePayload(ctx, tt.input) + + if (err != nil) != tt.expectErr { + 
t.Fatalf("expected error: %v, got: %v", tt.expectErr, err) + } + + if diff := cmp.Diff(got, tt.expect); diff != "" { + t.Errorf("unexpected result (-got +want):\n%s", diff) + } + }) + } +} + +func TestToRulesPayload(t *testing.T) { + tests := []struct { + name string + input *Model + expect []observability.UpdateAlertgroupsRequestInnerRulesInner + expectErr bool + }{ + { + name: "Nil Rules", + input: &Model{ + Rules: types.ListNull(types.StringType), // Simulates a lack of rules + }, + expect: []observability.UpdateAlertgroupsRequestInnerRulesInner{}, + expectErr: false, + }, + { + name: "Invalid Rule Element Type", + input: &Model{ + Rules: types.ListValueMust(types.StringType, []attr.Value{ + types.StringValue("invalid"), // Should cause a conversion failure + }), + }, + expect: nil, + expectErr: true, + }, + { + name: "Single Valid Rule", + input: &Model{ + Rules: types.ListValueMust(types.ObjectType{AttrTypes: ruleTypes}, []attr.Value{ + types.ObjectValueMust(ruleTypes, map[string]attr.Value{ + "alert": types.StringValue("alert"), + "expression": types.StringValue("expr"), + "for": types.StringValue("5s"), + "labels": types.MapValueMust(types.StringType, map[string]attr.Value{ + "key": types.StringValue("value"), + }), + "annotations": types.MapValueMust(types.StringType, map[string]attr.Value{ + "note": types.StringValue("important"), + }), + }), + }), + }, + expect: []observability.UpdateAlertgroupsRequestInnerRulesInner{ + { + Alert: utils.Ptr("alert"), + Expr: utils.Ptr("expr"), + For: utils.Ptr("5s"), + Labels: &map[string]interface{}{ + "key": "value", + }, + Annotations: &map[string]interface{}{ + "note": "important", + }, + }, + }, + expectErr: false, + }, + { + name: "Multiple Valid Rules", + input: &Model{ + Rules: types.ListValueMust(types.ObjectType{AttrTypes: ruleTypes}, []attr.Value{ + types.ObjectValueMust(ruleTypes, map[string]attr.Value{ + "alert": types.StringValue("alert1"), + "expression": types.StringValue("expr1"), + "for": 
types.StringValue("5s"), + "labels": types.MapNull(types.StringType), + "annotations": types.MapNull(types.StringType), + }), + types.ObjectValueMust(ruleTypes, map[string]attr.Value{ + "alert": types.StringValue("alert2"), + "expression": types.StringValue("expr2"), + "for": types.StringValue("10s"), + "labels": types.MapValueMust(types.StringType, map[string]attr.Value{ + "key": types.StringValue("value"), + }), + "annotations": types.MapValueMust(types.StringType, map[string]attr.Value{ + "note": types.StringValue("important"), + }), + }), + }), + }, + expect: []observability.UpdateAlertgroupsRequestInnerRulesInner{ + { + Alert: utils.Ptr("alert1"), + Expr: utils.Ptr("expr1"), + For: utils.Ptr("5s"), + }, + { + Alert: utils.Ptr("alert2"), + Expr: utils.Ptr("expr2"), + For: utils.Ptr("10s"), + Labels: &map[string]interface{}{ + "key": "value", + }, + Annotations: &map[string]interface{}{ + "note": "important", + }, + }, + }, + expectErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + got, err := toRulesPayload(ctx, tt.input) + + if (err != nil) != tt.expectErr { + t.Fatalf("expected error: %v, got: %v", tt.expectErr, err) + } + + if diff := cmp.Diff(got, tt.expect); diff != "" { + t.Errorf("unexpected result (-got +want):\n%s", diff) + } + }) + } +} + +func TestMapFields(t *testing.T) { + tests := []struct { + name string + alertGroup *observability.AlertGroup + model *Model + expectedName string + expectedID string + expectErr bool + }{ + { + name: "Nil AlertGroup", + alertGroup: nil, + model: &Model{}, + expectErr: true, + }, + { + name: "Nil Model", + alertGroup: &observability.AlertGroup{}, + model: nil, + expectErr: true, + }, + { + name: "Interval Missing", + alertGroup: &observability.AlertGroup{ + Name: utils.Ptr("alert-group-name"), + }, + model: &Model{ + Name: types.StringValue("alert-group-name"), + ProjectId: types.StringValue("project1"), + InstanceId: 
types.StringValue("instance1"), + }, + expectedName: "alert-group-name", + expectedID: "project1,instance1,alert-group-name", + expectErr: true, + }, + { + name: "Name Missing", + alertGroup: &observability.AlertGroup{ + Interval: utils.Ptr("5m"), + }, + model: &Model{ + Name: types.StringValue("model-name"), + InstanceId: types.StringValue("instance1"), + }, + expectErr: true, + }, + { + name: "Complete Model and AlertGroup", + alertGroup: &observability.AlertGroup{ + Name: utils.Ptr("alert-group-name"), + Interval: utils.Ptr("10m"), + }, + model: &Model{ + Name: types.StringValue("alert-group-name"), + ProjectId: types.StringValue("project1"), + InstanceId: types.StringValue("instance1"), + Id: types.StringValue("project1,instance1,alert-group-name"), + Interval: types.StringValue("10m"), + }, + expectedName: "alert-group-name", + expectedID: "project1,instance1,alert-group-name", + expectErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + err := mapFields(ctx, tt.alertGroup, tt.model) + + if (err != nil) != tt.expectErr { + t.Fatalf("expected error: %v, got: %v", tt.expectErr, err) + } + + if !tt.expectErr { + if diff := cmp.Diff(tt.model.Name.ValueString(), tt.expectedName); diff != "" { + t.Errorf("unexpected name (-got +want):\n%s", diff) + } + if diff := cmp.Diff(tt.model.Id.ValueString(), tt.expectedID); diff != "" { + t.Errorf("unexpected ID (-got +want):\n%s", diff) + } + } + }) + } +} + +func TestMapRules(t *testing.T) { + tests := []struct { + name string + alertGroup *observability.AlertGroup + model *Model + expectErr bool + }{ + { + name: "Empty Rules", + alertGroup: &observability.AlertGroup{ + Rules: &[]observability.AlertRuleRecord{}, + }, + model: &Model{}, + expectErr: false, + }, + { + name: "Single Complete Rule", + alertGroup: &observability.AlertGroup{ + Rules: &[]observability.AlertRuleRecord{ + { + Alert: utils.Ptr("HighCPUUsage"), + Expr: utils.Ptr("rate(cpu_usage[5m]) 
> 0.9"), + For: utils.Ptr("2m"), + Labels: &map[string]string{"severity": "critical"}, + Annotations: &map[string]string{"summary": "CPU usage high"}, + Record: utils.Ptr("record1"), + }, + }, + }, + model: &Model{}, + expectErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + err := mapRules(ctx, tt.alertGroup, tt.model) + + if (err != nil) != tt.expectErr { + t.Fatalf("expected error: %v, got: %v", tt.expectErr, err != nil) + } + }) + } +} diff --git a/stackit/internal/services/observability/observability_acc_test.go b/stackit/internal/services/observability/observability_acc_test.go index 8f91c5d3..21a30214 100644 --- a/stackit/internal/services/observability/observability_acc_test.go +++ b/stackit/internal/services/observability/observability_acc_test.go @@ -41,6 +41,17 @@ var scrapeConfigResource = map[string]string{ "saml2_enable_url_parameters": "false", } +var alertGroupResource = map[string]string{ + "name": fmt.Sprintf("alertgroup-%s", acctest.RandStringFromCharSet(7, acctest.CharSetAlphaNum)), + "name_updated": fmt.Sprintf("alertgroup-%s", acctest.RandStringFromCharSet(7, acctest.CharSetAlphaNum)), + "interval": "5h", + "interval_updated": "1h", + "alert": "alert1", + "expression": "expression1", + "expression_updated": "expression2", + "for": "60s", +} + var credentialResource = map[string]string{ "project_id": testutil.ProjectId, } @@ -228,8 +239,31 @@ func credentialResourceConfig() string { }` } -func resourceConfig(acl, metricsRetentionDays, metricsRetentionDays1hDownsampling, metricsRetentionDays5mDownsampling, alertConfig *string, instanceName, planName, target, saml2EnableUrlParameters string) string { - return fmt.Sprintf("%s\n\n%s\n\n%s\n\n%s", +func alertGroupResourceConfig(name, interval, expression string) string { + return fmt.Sprintf( + `resource "stackit_observability_alertgroup" "alertgroup" { + project_id = stackit_observability_instance.instance.project_id + 
instance_id = stackit_observability_instance.instance.instance_id + name = "%s" + interval = "%s" + rules = [ + { + alert = "%s" + expression = "%s" + for = "%s" + } + ] + }`, + name, + interval, + alertGroupResource["alert"], + expression, + alertGroupResource["for"], + ) +} + +func resourceConfig(acl, metricsRetentionDays, metricsRetentionDays1hDownsampling, metricsRetentionDays5mDownsampling, alertConfig *string, instanceName, planName, target, saml2EnableUrlParameters, alertGroupName, alertGroupInterval, alertGroupRule1Expression string) string { + return fmt.Sprintf("%s\n\n%s\n\n%s\n\n%s\n\n%s", testutil.ObservabilityProviderConfig(), instanceResourceConfig(acl, metricsRetentionDays, @@ -240,6 +274,7 @@ func resourceConfig(acl, metricsRetentionDays, metricsRetentionDays1hDownsamplin planName), scrapeConfigResourceConfig(target, saml2EnableUrlParameters), credentialResourceConfig(), + alertGroupResourceConfig(alertGroupName, alertGroupInterval, alertGroupRule1Expression), ) } @@ -265,6 +300,9 @@ func TestAccResource(t *testing.T) { instanceResource["plan_name"], scrapeConfigResource["urls"], scrapeConfigResource["saml2_enable_url_parameters"], + alertGroupResource["name"], + alertGroupResource["interval"], + alertGroupResource["expression"], ), Check: resource.ComposeAggregateTestCheckFunc( // Instance data @@ -349,6 +387,18 @@ func TestAccResource(t *testing.T) { ), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"), + + // alertgroup + resource.TestCheckResourceAttr("stackit_observability_alertgroup.alertgroup", "project_id", credentialResource["project_id"]), + resource.TestCheckResourceAttrPair( + "stackit_observability_instance.instance", "instance_id", + "stackit_observability_alertgroup.alertgroup", "instance_id", + ), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "name"), + 
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "interval"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"), ), }, // Update Alert Config with complete Receiver (email, webhook and opsgenie configs), global options and Route with child routes @@ -368,6 +418,9 @@ func TestAccResource(t *testing.T) { instanceResource["plan_name"], scrapeConfigResource["urls"], scrapeConfigResource["saml2_enable_url_parameters"], + alertGroupResource["name"], + alertGroupResource["interval"], + alertGroupResource["expression"], ), Check: resource.ComposeAggregateTestCheckFunc( // Instance data @@ -461,6 +514,18 @@ func TestAccResource(t *testing.T) { ), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"), + + // alertgroup + resource.TestCheckResourceAttr("stackit_observability_alertgroup.alertgroup", "project_id", credentialResource["project_id"]), + resource.TestCheckResourceAttrPair( + "stackit_observability_instance.instance", "instance_id", + "stackit_observability_alertgroup.alertgroup", "instance_id", + ), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "name"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "interval"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"), ), }, // Update without ACL, partial metrics retention days and NO 
alert configs @@ -475,6 +540,9 @@ func TestAccResource(t *testing.T) { instanceResource["plan_name"], scrapeConfigResource["urls"], scrapeConfigResource["saml2_enable_url_parameters"], + alertGroupResource["name"], + alertGroupResource["interval"], + alertGroupResource["expression"], ), Check: resource.ComposeAggregateTestCheckFunc( // Instance data @@ -530,6 +598,18 @@ func TestAccResource(t *testing.T) { ), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"), + + // alertgroup + resource.TestCheckResourceAttr("stackit_observability_alertgroup.alertgroup", "project_id", credentialResource["project_id"]), + resource.TestCheckResourceAttrPair( + "stackit_observability_instance.instance", "instance_id", + "stackit_observability_alertgroup.alertgroup", "instance_id", + ), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "name"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "interval"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"), ), }, // Update with empty ACL, NO metrics retention days and NO alert configs @@ -544,6 +624,9 @@ func TestAccResource(t *testing.T) { instanceResource["plan_name"], scrapeConfigResource["urls"], scrapeConfigResource["saml2_enable_url_parameters"], + alertGroupResource["name"], + alertGroupResource["interval"], + alertGroupResource["expression"], ), Check: resource.ComposeAggregateTestCheckFunc( // Instance data @@ -599,6 +682,18 @@ func TestAccResource(t *testing.T) { ), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"), 
resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"), + + // alertgroup + resource.TestCheckResourceAttr("stackit_observability_alertgroup.alertgroup", "project_id", credentialResource["project_id"]), + resource.TestCheckResourceAttrPair( + "stackit_observability_instance.instance", "instance_id", + "stackit_observability_alertgroup.alertgroup", "instance_id", + ), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "name"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "interval"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"), ), }, // Data source @@ -616,6 +711,12 @@ func TestAccResource(t *testing.T) { instance_id = stackit_observability_scrapeconfig.scrapeconfig.instance_id name = stackit_observability_scrapeconfig.scrapeconfig.name } + + data "stackit_observability_alertgroup" "alertgroup" { + project_id = stackit_observability_alertgroup.alertgroup.project_id + instance_id = stackit_observability_alertgroup.alertgroup.instance_id + name = stackit_observability_alertgroup.alertgroup.name + } `, resourceConfig( utils.Ptr(fmt.Sprintf( @@ -631,6 +732,9 @@ func TestAccResource(t *testing.T) { instanceResource["plan_name"], scrapeConfigResource["urls"], scrapeConfigResource["saml2_enable_url_parameters"], + alertGroupResource["name"], + alertGroupResource["interval"], + alertGroupResource["expression"], ), ), Check: resource.ComposeAggregateTestCheckFunc( @@ -670,6 +774,18 @@ func TestAccResource(t *testing.T) { resource.TestCheckResourceAttr("data.stackit_observability_scrapeconfig.scrapeconfig", "scrape_interval", scrapeConfigResource["scrape_interval"]), 
resource.TestCheckResourceAttr("stackit_observability_scrapeconfig.scrapeconfig", "sample_limit", scrapeConfigResource["sample_limit"]), resource.TestCheckResourceAttr("data.stackit_observability_scrapeconfig.scrapeconfig", "saml2.enable_url_parameters", scrapeConfigResource["saml2_enable_url_parameters"]), + + // alertgroup + resource.TestCheckResourceAttr("data.stackit_observability_alertgroup.alertgroup", "project_id", credentialResource["project_id"]), + resource.TestCheckResourceAttrPair( + "stackit_observability_instance.instance", "instance_id", + "data.stackit_observability_alertgroup.alertgroup", "instance_id", + ), + resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "name"), + resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "interval"), + resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "rules.0.alert"), + resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "rules.0.expression"), + resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "rules.0.for"), ), }, // Import 1 @@ -711,6 +827,27 @@ func TestAccResource(t *testing.T) { ImportState: true, ImportStateVerify: true, }, + // Import 3 + { + ResourceName: "stackit_observability_alertgroup.alertgroup", + ImportStateIdFunc: func(s *terraform.State) (string, error) { + r, ok := s.RootModule().Resources["stackit_observability_alertgroup.alertgroup"] + if !ok { + return "", fmt.Errorf("couldn't find resource stackit_observability_alertgroup.alertgroup") + } + instanceId, ok := r.Primary.Attributes["instance_id"] + if !ok { + return "", fmt.Errorf("couldn't find attribute instance_id") + } + name, ok := r.Primary.Attributes["name"] + if !ok { + return "", fmt.Errorf("couldn't find attribute name") + } + return fmt.Sprintf("%s,%s,%s", testutil.ProjectId, instanceId, name), nil + }, + ImportState: true, + ImportStateVerify: true, + }, // 
Update { Config: resourceConfig( @@ -727,6 +864,9 @@ func TestAccResource(t *testing.T) { instanceResource["new_plan_name"], "", "true", + alertGroupResource["name_updated"], + alertGroupResource["interval_updated"], + alertGroupResource["expression_updated"], ), Check: resource.ComposeAggregateTestCheckFunc( // Instance @@ -782,6 +922,18 @@ func TestAccResource(t *testing.T) { // Credentials resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"), + + // alertgroup + resource.TestCheckResourceAttr("stackit_observability_alertgroup.alertgroup", "project_id", credentialResource["project_id"]), + resource.TestCheckResourceAttrPair( + "stackit_observability_instance.instance", "instance_id", + "stackit_observability_alertgroup.alertgroup", "instance_id", + ), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "name"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "interval"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"), + resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"), ), }, // Update and remove saml2 attribute diff --git a/stackit/provider.go b/stackit/provider.go index f33b961c..7f4a3002 100644 --- a/stackit/provider.go +++ b/stackit/provider.go @@ -13,7 +13,7 @@ import ( "github.com/hashicorp/terraform-plugin-framework/schema/validator" "github.com/hashicorp/terraform-plugin-framework/types" "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/features" - roleassignments "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/authorization/roleassignments" + roleAssignments 
"github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/authorization/roleassignments" dnsRecordSet "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/dns/recordset" dnsZone "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/dns/zone" iaasAffinityGroup "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/iaas/affinitygroup" @@ -45,6 +45,7 @@ import ( objectStorageBucket "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/objectstorage/bucket" objecStorageCredential "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/objectstorage/credential" objecStorageCredentialsGroup "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/objectstorage/credentialsgroup" + alertGroup "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/alertgroup" observabilityCredential "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/credential" observabilityInstance "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/instance" observabilityScrapeConfig "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/scrapeconfig" @@ -462,6 +463,7 @@ func (p *Provider) Configure(ctx context.Context, req provider.ConfigureRequest, // DataSources defines the data sources implemented in the provider. func (p *Provider) DataSources(_ context.Context) []func() datasource.DataSource { return []func() datasource.DataSource{ + alertGroup.NewAlertGroupDataSource, dnsZone.NewZoneDataSource, dnsRecordSet.NewRecordSetDataSource, iaasAffinityGroup.NewAffinityGroupDatasource, @@ -515,6 +517,7 @@ func (p *Provider) DataSources(_ context.Context) []func() datasource.DataSource // Resources defines the resources implemented in the provider. 
func (p *Provider) Resources(_ context.Context) []func() resource.Resource { resources := []func() resource.Resource{ + alertGroup.NewAlertGroupResource, dnsZone.NewZoneResource, dnsRecordSet.NewRecordSetResource, iaasAffinityGroup.NewAffinityGroupResource, @@ -570,7 +573,7 @@ func (p *Provider) Resources(_ context.Context) []func() resource.Resource { skeCluster.NewClusterResource, skeKubeconfig.NewKubeconfigResource, } - resources = append(resources, roleassignments.NewRoleAssignmentResources()...) + resources = append(resources, roleAssignments.NewRoleAssignmentResources()...) return resources } diff --git a/templates/guides/ske_kube_state_metric_alerts.md.tmpl b/templates/guides/ske_kube_state_metric_alerts.md.tmpl new file mode 100644 index 00000000..d27e27d5 --- /dev/null +++ b/templates/guides/ske_kube_state_metric_alerts.md.tmpl @@ -0,0 +1,267 @@ +--- +page_title: "Alerting with Kube-State-Metrics in STACKIT Observability" +--- +# Alerting with Kube-State-Metrics in STACKIT Observability + +## Overview + +This guide explains how to configure the STACKIT Observability product to send alerts using metrics gathered from kube-state-metrics. + +1. **Set Up Providers** + + Begin by configuring the STACKIT and Kubernetes providers to connect to the STACKIT services. 
+ + ```hcl + provider "stackit" { + region = "eu01" + } + + provider "kubernetes" { + host = yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.server + client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-certificate-data) + client_key = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-key-data) + cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.certificate-authority-data) + } + + provider "helm" { + kubernetes { + host = yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.server + client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-certificate-data) + client_key = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-key-data) + cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.certificate-authority-data) + } + } + ``` + +2. **Create SKE Cluster and Kubeconfig Resource** + + Set up a STACKIT SKE Cluster and generate the associated kubeconfig resource. 
+ + ```hcl + resource "stackit_ske_cluster" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example" + kubernetes_version = "1.31" + node_pools = [ + { + name = "standard" + machine_type = "c1.4" + minimum = "3" + maximum = "9" + max_surge = "3" + availability_zones = ["eu01-1", "eu01-2", "eu01-3"] + os_version_min = "4081.2.1" + os_name = "flatcar" + volume_size = 32 + volume_type = "storage_premium_perf6" + } + ] + maintenance = { + enable_kubernetes_version_updates = true + enable_machine_image_version_updates = true + start = "01:00:00Z" + end = "02:00:00Z" + } + } + + resource "stackit_ske_kubeconfig" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + cluster_name = stackit_ske_cluster.example.name + refresh = true + } + ``` + +3. **Create Observability Instance and Credentials** + + Establish a STACKIT Observability instance and its credentials to handle alerts. + + ```hcl + locals { + alert_config = { + route = { + receiver = "EmailStackit", + repeat_interval = "1m", + continue = true + } + receivers = [ + { + name = "EmailStackit", + email_configs = [ + { + to = "" + } + ] + } + ] + } + } + + resource "stackit_observability_instance" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example" + plan_name = "Observability-Large-EU01" + alert_config = local.alert_config + } + + resource "stackit_observability_credential" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = stackit_observability_instance.example.instance_id + } + ``` + +4. **Install Prometheus Operator** + + Use the Prometheus Helm chart to install kube-state-metrics and transfer metrics to the STACKIT Observability instance. Customize the helm values as needed for your deployment. 
+ + ```yaml + # helm values + # save as prom-values.tftpl + prometheus: + enabled: true + agentMode: true + prometheusSpec: + enableRemoteWriteReceiver: true + scrapeInterval: 60s + evaluationInterval: 60s + replicas: 1 + storageSpec: + volumeClaimTemplate: + spec: + storageClassName: premium-perf4-stackit + accessModes: ['ReadWriteOnce'] + resources: + requests: + storage: 80Gi + remoteWrite: + - url: ${metrics_push_url} + queueConfig: + batchSendDeadline: '5s' + # both values need to be configured according to your observability plan + capacity: 30000 + maxSamplesPerSend: 3000 + writeRelabelConfigs: + - sourceLabels: ['__name__'] + regex: 'apiserver_.*|etcd_.*|prober_.*|storage_.*|workqueue_(work|queue)_duration_seconds_bucket|kube_pod_tolerations|kubelet_.*|kubernetes_feature_enabled|instance_scrape_target_status' + action: 'drop' + - sourceLabels: ['namespace'] + regex: 'example' + action: 'keep' + basicAuth: + username: + key: username + name: ${secret_name} + password: + key: password + name: ${secret_name} + + grafana: + enabled: false + + defaultRules: + create: false + + alertmanager: + enabled: false + + nodeExporter: + enabled: true + + kube-state-metrics: + enabled: true + customResourceState: + enabled: true + collectors: + - deployments + - pods + ``` + + ```hcl + resource "kubernetes_namespace" "monitoring" { + metadata { + name = "monitoring" + } + } + + resource "kubernetes_secret" "argus_prometheus_authorization" { + metadata { + name = "argus-prometheus-credentials" + namespace = kubernetes_namespace.monitoring.metadata[0].name + } + + data = { + username = stackit_observability_credential.example.username + password = stackit_observability_credential.example.password + } + } + + resource "helm_release" "prometheus_operator" { + name = "prometheus-operator" + repository = "https://prometheus-community.github.io/helm-charts" + chart = "kube-prometheus-stack" + version = "60.1.0" + namespace = kubernetes_namespace.monitoring.metadata[0].name + + 
values = [ + templatefile("prom-values.tftpl", { + metrics_push_url = stackit_observability_instance.example.metrics_push_url + secret_name = kubernetes_secret.argus_prometheus_authorization.metadata[0].name + }) + ] + } + ``` + +5. **Create Alert Group** + + Define an alert group with a rule to notify when a pod is running in the "example" namespace. + + ```hcl + resource "stackit_observability_alertgroup" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = stackit_observability_instance.example.instance_id + name = "TestAlertGroup" + interval = "2h" + rules = [ + { + alert = "SimplePodCheck" + expression = "sum(kube_pod_status_phase{phase=\"Running\", namespace=\"example\"}) > 0" + for = "60s" + labels = { + severity = "critical" + }, + annotations = { + summary = "Test Alert is working" + description = "Test Alert" + } + }, + ] + } + ``` + +6. **Deploy Test Pod** + + Deploy a test pod; doing so should trigger an email notification, as the deployment satisfies the conditions defined in the alert group rule. In a real-world scenario, you would typically configure alerts to monitor pods for error states instead. + + ```hcl + resource "kubernetes_namespace" "example" { + metadata { + name = "example" + } + } + + resource "kubernetes_pod" "example" { + metadata { + name = "nginx" + namespace = kubernetes_namespace.example.metadata[0].name + labels = { + app = "nginx" + } + } + + spec { + container { + image = "nginx:latest" + name = "nginx" + } + } + } + ``` \ No newline at end of file