diff --git a/docs/data-sources/observability_alertgroup.md b/docs/data-sources/observability_alertgroup.md index ad3cd624..9fa930a6 100644 --- a/docs/data-sources/observability_alertgroup.md +++ b/docs/data-sources/observability_alertgroup.md @@ -3,12 +3,12 @@ page_title: "stackit_observability_alertgroup Data Source - stackit" subcategory: "" description: |- - Observability alert group resource schema. Must have a region specified in the provider configuration. + Observability alert group datasource schema. Used to create alerts based on metrics (Thanos). Must have a region specified in the provider configuration. --- # stackit_observability_alertgroup (Data Source) -Observability alert group resource schema. Must have a `region` specified in the provider configuration. +Observability alert group datasource schema. Used to create alerts based on metrics (Thanos). Must have a `region` specified in the provider configuration. ## Example Usage diff --git a/docs/data-sources/observability_logalertgroup.md b/docs/data-sources/observability_logalertgroup.md new file mode 100644 index 00000000..a2f7a132 --- /dev/null +++ b/docs/data-sources/observability_logalertgroup.md @@ -0,0 +1,47 @@ +--- +# generated by https://github.com/hashicorp/terraform-plugin-docs +page_title: "stackit_observability_logalertgroup Data Source - stackit" +subcategory: "" +description: |- + Observability log alert group datasource schema. Used to create alerts based on logs (Loki). Must have a region specified in the provider configuration. +--- + +# stackit_observability_logalertgroup (Data Source) + +Observability log alert group datasource schema. Used to create alerts based on logs (Loki). Must have a `region` specified in the provider configuration. 
+ +## Example Usage + +```terraform +data "stackit_observability_logalertgroup" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example-log-alert-group" +} +``` + + +## Schema + +### Required + +- `instance_id` (String) Observability instance ID to which the log alert group is associated. +- `name` (String) The name of the log alert group. Is the identifier and must be unique in the group. +- `project_id` (String) STACKIT project ID to which the log alert group is associated. + +### Read-Only + +- `id` (String) Terraform's internal resource ID. It is structured as "`project_id`,`instance_id`,`name`". +- `interval` (String) Specifies the frequency at which rules within the group are evaluated. The interval must be at least 60 seconds and defaults to 60 seconds if not set. Supported formats include hours, minutes, and seconds, either singly or in combination. Examples of valid formats are: '5h30m40s', '5h', '5h30m', '60m', and '60s'. +- `rules` (Attributes List) (see [below for nested schema](#nestedatt--rules)) + + +### Nested Schema for `rules` + +Read-Only: + +- `alert` (String) The name of the alert rule. Is the identifier and must be unique in the group. +- `annotations` (Map of String) A map of key:value. Annotations to add or overwrite for each alert +- `expression` (String) The LogQL expression to evaluate. Every evaluation cycle this is evaluated at the current time, and all resultant time series become pending/firing alerts. +- `for` (String) Alerts are considered firing once they have been returned for this long. Alerts which have not yet fired for long enough are considered pending. Default is 0s +- `labels` (Map of String) A map of key:value. 
Labels to add or overwrite for each alert diff --git a/docs/guides/ske_log_alerts.md b/docs/guides/ske_log_alerts.md new file mode 100644 index 00000000..3cca0b66 --- /dev/null +++ b/docs/guides/ske_log_alerts.md @@ -0,0 +1,199 @@ +--- +page_title: "SKE Log Alerts with STACKIT Observability" +--- +# SKE Log Alerts with STACKIT Observability + +## Overview + +This guide walks you through setting up log-based alerting in STACKIT Observability using Promtail to ship Kubernetes logs. + +1. **Set Up Providers** + + Begin by configuring the STACKIT and Kubernetes providers to connect to the STACKIT services. + + ```hcl + provider "stackit" { + region = "eu01" + } + + provider "kubernetes" { + host = yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.server + client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-certificate-data) + client_key = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-key-data) + cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.certificate-authority-data) + } + + provider "helm" { + kubernetes { + host = yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.server + client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-certificate-data) + client_key = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-key-data) + cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.certificate-authority-data) + } + } + ``` + +2. **Create SKE Cluster and Kubeconfig Resource** + + Set up a STACKIT SKE Cluster and generate the associated kubeconfig resource. 
+ + ```hcl + resource "stackit_ske_cluster" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example" + kubernetes_version = "1.31" + node_pools = [ + { + name = "standard" + machine_type = "c1.4" + minimum = "3" + maximum = "9" + max_surge = "3" + availability_zones = ["eu01-1", "eu01-2", "eu01-3"] + os_version_min = "4081.2.1" + os_name = "flatcar" + volume_size = 32 + volume_type = "storage_premium_perf6" + } + ] + maintenance = { + enable_kubernetes_version_updates = true + enable_machine_image_version_updates = true + start = "01:00:00Z" + end = "02:00:00Z" + } + } + + resource "stackit_ske_kubeconfig" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + cluster_name = stackit_ske_cluster.example.name + refresh = true + } + ``` + +3. **Create Observability Instance and Credentials** + + Establish a STACKIT Observability instance and its credentials to handle alerts. + + ```hcl + locals { + alert_config = { + route = { + receiver = "EmailStackit", + repeat_interval = "1m", + continue = true + } + receivers = [ + { + name = "EmailStackit", + email_configs = [ + { + to = "" + } + ] + } + ] + } + } + + resource "stackit_observability_instance" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example" + plan_name = "Observability-Large-EU01" + alert_config = local.alert_config + } + + resource "stackit_observability_credential" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = stackit_observability_instance.example.instance_id + } + ``` + +4. **Install Promtail** + + Deploy Promtail via Helm to collect logs and forward them to the STACKIT Observability Loki endpoint. 
+ + ```hcl + resource "helm_release" "promtail" { + name = "promtail" + repository = "https://grafana.github.io/helm-charts" + chart = "promtail" + namespace = kubernetes_namespace.monitoring.metadata.0.name + version = "6.16.4" + + values = [ + <<-EOF + config: + clients: + # To find the Loki push URL, navigate to the observability instance in the portal and select the API tab. + - url: "https://${stackit_observability_credential.example.username}:${stackit_observability_credential.example.password}@/instances/${stackit_observability_instance.example.instance_id}/loki/api/v1/push" + EOF + ] + } + ``` + +5. **Create Alert Group** + + Create a log alert that triggers when a specific pod logs an error message. + + ```hcl + resource "stackit_observability_logalertgroup" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = stackit_observability_instance.example.instance_id + name = "TestLogAlertGroup" + interval = "1m" + rules = [ + { + alert = "SimplePodLogAlertCheck" + expression = "sum(rate({namespace=\"example\", pod=\"logger\"} |= \"Simulated error message\" [1m])) > 0" + for = "60s" + labels = { + severity = "critical" + }, + annotations = { + summary : "Test Log Alert is working" + description : "Test Log Alert" + }, + }, + ] + } + ``` + +6. **Deploy Test Pod** + + Launch a pod that emits simulated error logs. This should trigger the alert if everything is set up correctly. 
+ + ```hcl + resource "kubernetes_namespace" "example" { + metadata { + name = "example" + } + } + + resource "kubernetes_pod" "logger" { + metadata { + name = "logger" + namespace = kubernetes_namespace.example.metadata[0].name + labels = { + app = "logger" + } + } + + spec { + container { + name = "logger" + image = "bash" + command = [ + "bash", + "-c", + <<-EOF + while true; do + echo "Simulated error message" >&2 + done + EOF + ] + } + } + } + ``` \ No newline at end of file diff --git a/docs/resources/observability_alertgroup.md b/docs/resources/observability_alertgroup.md index c6b70859..ea16ea94 100644 --- a/docs/resources/observability_alertgroup.md +++ b/docs/resources/observability_alertgroup.md @@ -3,12 +3,12 @@ page_title: "stackit_observability_alertgroup Resource - stackit" subcategory: "" description: |- - Observability alert group resource schema. Must have a region specified in the provider configuration. + Observability alert group resource schema. Used to create alerts based on metrics (Thanos). Must have a region specified in the provider configuration. --- # stackit_observability_alertgroup (Resource) -Observability alert group resource schema. Must have a `region` specified in the provider configuration. +Observability alert group resource schema. Used to create alerts based on metrics (Thanos). Must have a `region` specified in the provider configuration. ## Example Usage diff --git a/docs/resources/observability_logalertgroup.md b/docs/resources/observability_logalertgroup.md new file mode 100644 index 00000000..630e92c2 --- /dev/null +++ b/docs/resources/observability_logalertgroup.md @@ -0,0 +1,80 @@ +--- +# generated by https://github.com/hashicorp/terraform-plugin-docs +page_title: "stackit_observability_logalertgroup Resource - stackit" +subcategory: "" +description: |- + Observability log alert group resource schema. Used to create alerts based on logs (Loki). Must have a region specified in the provider configuration. 
+--- + +# stackit_observability_logalertgroup (Resource) + +Observability log alert group resource schema. Used to create alerts based on logs (Loki). Must have a `region` specified in the provider configuration. + +## Example Usage + +```terraform +resource "stackit_observability_logalertgroup" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example-log-alert-group" + interval = "60m" + rules = [ + { + alert = "example-log-alert-name" + expression = "sum(rate({namespace=\"example\", pod=\"logger\"} |= \"Simulated error message\" [1m])) > 0" + for = "60s" + labels = { + severity = "critical" + }, + annotations = { + summary : "example summary" + description : "example description" + } + }, + { + alert = "example-log-alert-name-2" + expression = "sum(rate({namespace=\"example\", pod=\"logger\"} |= \"Another error message\" [1m])) > 0" + for = "60s" + labels = { + severity = "critical" + }, + annotations = { + summary : "example summary" + description : "example description" + } + }, + ] +} +``` + + +## Schema + +### Required + +- `instance_id` (String) Observability instance ID to which the log alert group is associated. +- `name` (String) The name of the log alert group. Is the identifier and must be unique in the group. +- `project_id` (String) STACKIT project ID to which the log alert group is associated. +- `rules` (Attributes List) Rules for the log alert group (see [below for nested schema](#nestedatt--rules)) + +### Optional + +- `interval` (String) Specifies the frequency at which rules within the group are evaluated. The interval must be at least 60 seconds and defaults to 60 seconds if not set. Supported formats include hours, minutes, and seconds, either singly or in combination. Examples of valid formats are: '5h30m40s', '5h', '5h30m', '60m', and '60s'. + +### Read-Only + +- `id` (String) Terraform's internal resource ID. 
It is structured as "`project_id`,`instance_id`,`name`". + + +### Nested Schema for `rules` + +Required: + +- `alert` (String) The name of the alert rule. Is the identifier and must be unique in the group. +- `expression` (String) The LogQL expression to evaluate. Every evaluation cycle this is evaluated at the current time, and all resultant time series become pending/firing alerts. + +Optional: + +- `annotations` (Map of String) A map of key:value. Annotations to add or overwrite for each alert +- `for` (String) Alerts are considered firing once they have been returned for this long. Alerts which have not yet fired for long enough are considered pending. Default is 0s +- `labels` (Map of String) A map of key:value. Labels to add or overwrite for each alert diff --git a/examples/data-sources/stackit_observability_logalertgroup/data-source.tf b/examples/data-sources/stackit_observability_logalertgroup/data-source.tf new file mode 100644 index 00000000..fac8e26b --- /dev/null +++ b/examples/data-sources/stackit_observability_logalertgroup/data-source.tf @@ -0,0 +1,5 @@ +data "stackit_observability_logalertgroup" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example-log-alert-group" +} diff --git a/examples/resources/stackit_observability_logalertgroup/resource.tf b/examples/resources/stackit_observability_logalertgroup/resource.tf new file mode 100644 index 00000000..b19ab976 --- /dev/null +++ b/examples/resources/stackit_observability_logalertgroup/resource.tf @@ -0,0 +1,32 @@ +resource "stackit_observability_logalertgroup" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example-log-alert-group" + interval = "60m" + rules = [ + { + alert = "example-log-alert-name" + expression = "sum(rate({namespace=\"example\", pod=\"logger\"} |= \"Simulated error message\" [1m])) > 0" + for = "60s" + labels = { + 
severity = "critical" + }, + annotations = { + summary : "example summary" + description : "example description" + } + }, + { + alert = "example-log-alert-name-2" + expression = "sum(rate({namespace=\"example\", pod=\"logger\"} |= \"Another error message\" [1m])) > 0" + for = "60s" + labels = { + severity = "critical" + }, + annotations = { + summary : "example summary" + description : "example description" + } + }, + ] +} \ No newline at end of file diff --git a/stackit/internal/services/observability/alertgroup/datasource.go b/stackit/internal/services/observability/alertgroup/datasource.go index e7a40816..87f64c49 100644 --- a/stackit/internal/services/observability/alertgroup/datasource.go +++ b/stackit/internal/services/observability/alertgroup/datasource.go @@ -77,7 +77,7 @@ func (a *alertGroupDataSource) Metadata(_ context.Context, req datasource.Metada // Schema defines the schema for the alert group data source. func (a *alertGroupDataSource) Schema(_ context.Context, _ datasource.SchemaRequest, resp *datasource.SchemaResponse) { resp.Schema = schema.Schema{ - Description: "Observability alert group resource schema. Must have a `region` specified in the provider configuration.", + Description: "Observability alert group datasource schema. Used to create alerts based on metrics (Thanos). Must have a `region` specified in the provider configuration.", Attributes: map[string]schema.Attribute{ "id": schema.StringAttribute{ Description: descriptions["id"], diff --git a/stackit/internal/services/observability/alertgroup/resource.go b/stackit/internal/services/observability/alertgroup/resource.go index 7ad77b98..5757f701 100644 --- a/stackit/internal/services/observability/alertgroup/resource.go +++ b/stackit/internal/services/observability/alertgroup/resource.go @@ -129,14 +129,14 @@ func (a *alertGroupResource) Configure(ctx context.Context, req resource.Configu // Schema defines the schema for the resource. 
func (a *alertGroupResource) Schema(_ context.Context, _ resource.SchemaRequest, resp *resource.SchemaResponse) { resp.Schema = schema.Schema{ - Description: "Observability alert group resource schema. Must have a `region` specified in the provider configuration.", + Description: "Observability alert group resource schema. Used to create alerts based on metrics (Thanos). Must have a `region` specified in the provider configuration.", Attributes: map[string]schema.Attribute{ "id": schema.StringAttribute{ Description: descriptions["id"], Computed: true, }, "project_id": schema.StringAttribute{ - Description: "STACKIT project ID to which the alert group is associated.", + Description: descriptions["project_id"], Required: true, Validators: []validator.String{ validate.UUID(), diff --git a/stackit/internal/services/observability/log-alertgroup/datasource.go b/stackit/internal/services/observability/log-alertgroup/datasource.go new file mode 100644 index 00000000..65f1cc66 --- /dev/null +++ b/stackit/internal/services/observability/log-alertgroup/datasource.go @@ -0,0 +1,187 @@ +package logalertgroup + +import ( + "context" + "errors" + "fmt" + "net/http" + + "github.com/hashicorp/terraform-plugin-framework-validators/stringvalidator" + "github.com/hashicorp/terraform-plugin-framework/datasource" + "github.com/hashicorp/terraform-plugin-framework/datasource/schema" + "github.com/hashicorp/terraform-plugin-framework/schema/validator" + "github.com/hashicorp/terraform-plugin-framework/types" + "github.com/hashicorp/terraform-plugin-log/tflog" + "github.com/stackitcloud/stackit-sdk-go/core/config" + "github.com/stackitcloud/stackit-sdk-go/core/oapierror" + "github.com/stackitcloud/stackit-sdk-go/services/observability" + "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/core" + "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/validate" +) + +// Ensure the implementation satisfies the expected interfaces. 
+var ( + _ datasource.DataSource = &logAlertGroupDataSource{} +) + +// NewLogAlertGroupDataSource creates a new instance of the alertGroupDataSource. +func NewLogAlertGroupDataSource() datasource.DataSource { + return &logAlertGroupDataSource{} +} + +// alertGroupDataSource is the datasource implementation. +type logAlertGroupDataSource struct { + client *observability.APIClient +} + +// Configure adds the provider configured client to the resource. +func (l *logAlertGroupDataSource) Configure(ctx context.Context, req datasource.ConfigureRequest, resp *datasource.ConfigureResponse) { + // Prevent panic if the provider has not been configured. + if req.ProviderData == nil { + return + } + + providerData, ok := req.ProviderData.(core.ProviderData) + if !ok { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error configuring API client", fmt.Sprintf("Expected configure type stackit.ProviderData, got %T", req.ProviderData)) + return + } + + var apiClient *observability.APIClient + var err error + if providerData.ObservabilityCustomEndpoint != "" { + apiClient, err = observability.NewAPIClient( + config.WithCustomAuth(providerData.RoundTripper), + config.WithEndpoint(providerData.ObservabilityCustomEndpoint), + ) + } else { + apiClient, err = observability.NewAPIClient( + config.WithCustomAuth(providerData.RoundTripper), + config.WithRegion(providerData.GetRegion()), + ) + } + + if err != nil { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error configuring API client", fmt.Sprintf("Configuring client: %v. This is an error related to the provider configuration, not to the resource configuration", err)) + return + } + l.client = apiClient + tflog.Info(ctx, "Observability log alert group client configured") +} + +// Metadata provides metadata for the log alert group datasource. 
+func (l *logAlertGroupDataSource) Metadata(_ context.Context, req datasource.MetadataRequest, resp *datasource.MetadataResponse) { + resp.TypeName = req.ProviderTypeName + "_observability_logalertgroup" +} + +// Schema defines the schema for the log alert group data source. +func (l *logAlertGroupDataSource) Schema(_ context.Context, _ datasource.SchemaRequest, resp *datasource.SchemaResponse) { + resp.Schema = schema.Schema{ + Description: "Observability log alert group datasource schema. Used to create alerts based on logs (Loki). Must have a `region` specified in the provider configuration.", + Attributes: map[string]schema.Attribute{ + "id": schema.StringAttribute{ + Description: descriptions["id"], + Computed: true, + }, + "project_id": schema.StringAttribute{ + Description: descriptions["project_id"], + Required: true, + Validators: []validator.String{ + validate.UUID(), + validate.NoSeparator(), + }, + }, + "instance_id": schema.StringAttribute{ + Description: descriptions["instance_id"], + Required: true, + Validators: []validator.String{ + validate.UUID(), + validate.NoSeparator(), + }, + }, + "name": schema.StringAttribute{ + Description: descriptions["name"], + Required: true, + Validators: []validator.String{ + validate.NoSeparator(), + stringvalidator.LengthBetween(1, 200), + }, + }, + "interval": schema.StringAttribute{ + Description: descriptions["interval"], + Computed: true, + Validators: []validator.String{ + validate.ValidDurationString(), + }, + }, + "rules": schema.ListNestedAttribute{ + Description: descriptions["rules"], + Computed: true, + NestedObject: schema.NestedAttributeObject{ + Attributes: map[string]schema.Attribute{ + "alert": schema.StringAttribute{ + Description: descriptions["alert"], + Computed: true, + }, + "expression": schema.StringAttribute{ + Description: descriptions["expression"], + Computed: true, + }, + "for": schema.StringAttribute{ + Description: descriptions["for"], + Computed: true, + }, + "labels": 
schema.MapAttribute{ + Description: descriptions["labels"], + ElementType: types.StringType, + Computed: true, + }, + "annotations": schema.MapAttribute{ + Description: descriptions["annotations"], + ElementType: types.StringType, + Computed: true, + }, + }, + }, + }, + }, + } +} + +func (l *logAlertGroupDataSource) Read(ctx context.Context, req datasource.ReadRequest, resp *datasource.ReadResponse) { // nolint:gocritic // function signature required by Terraform + var model Model + diags := req.Config.Get(ctx, &model) + resp.Diagnostics.Append(diags...) + if resp.Diagnostics.HasError() { + return + } + + projectId := model.ProjectId.ValueString() + instanceId := model.InstanceId.ValueString() + alertGroupName := model.Name.ValueString() + ctx = tflog.SetField(ctx, "project_id", projectId) + ctx = tflog.SetField(ctx, "log_alert_group_name", alertGroupName) + ctx = tflog.SetField(ctx, "instance_id", instanceId) + + readAlertGroupResp, err := l.client.GetLogsAlertgroup(ctx, alertGroupName, instanceId, projectId).Execute() + if err != nil { + var oapiErr *oapierror.GenericOpenAPIError + ok := errors.As(err, &oapiErr) + if ok && oapiErr.StatusCode == http.StatusNotFound { + resp.State.RemoveResource(ctx) + return + } + core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading log alert group", fmt.Sprintf("Calling API: %v", err)) + return + } + + err = mapFields(ctx, readAlertGroupResp.Data, &model) + if err != nil { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading log alert group", fmt.Sprintf("Error processing API response: %v", err)) + return + } + + // Set the updated state. + diags = resp.State.Set(ctx, &model) + resp.Diagnostics.Append(diags...) 
+} diff --git a/stackit/internal/services/observability/log-alertgroup/resource.go b/stackit/internal/services/observability/log-alertgroup/resource.go new file mode 100644 index 00000000..2747ef72 --- /dev/null +++ b/stackit/internal/services/observability/log-alertgroup/resource.go @@ -0,0 +1,577 @@ +package logalertgroup + +import ( + "context" + "errors" + "fmt" + "net/http" + "regexp" + "strings" + + "github.com/hashicorp/terraform-plugin-framework-validators/mapvalidator" + "github.com/hashicorp/terraform-plugin-framework-validators/stringvalidator" + "github.com/hashicorp/terraform-plugin-framework/attr" + "github.com/hashicorp/terraform-plugin-framework/path" + "github.com/hashicorp/terraform-plugin-framework/resource" + "github.com/hashicorp/terraform-plugin-framework/resource/schema" + "github.com/hashicorp/terraform-plugin-framework/resource/schema/listplanmodifier" + "github.com/hashicorp/terraform-plugin-framework/resource/schema/planmodifier" + "github.com/hashicorp/terraform-plugin-framework/resource/schema/stringplanmodifier" + "github.com/hashicorp/terraform-plugin-framework/schema/validator" + "github.com/hashicorp/terraform-plugin-framework/types" + "github.com/hashicorp/terraform-plugin-framework/types/basetypes" + "github.com/hashicorp/terraform-plugin-log/tflog" + "github.com/stackitcloud/stackit-sdk-go/core/config" + "github.com/stackitcloud/stackit-sdk-go/core/oapierror" + "github.com/stackitcloud/stackit-sdk-go/services/observability" + "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/conversion" + "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/core" + "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/utils" + "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/validate" +) + +// Ensure the implementation satisfies the expected interfaces. 
+var ( + _ resource.Resource = &logAlertGroupResource{} + _ resource.ResourceWithConfigure = &logAlertGroupResource{} + _ resource.ResourceWithImportState = &logAlertGroupResource{} +) + +type Model struct { + Id types.String `tfsdk:"id"` + ProjectId types.String `tfsdk:"project_id"` + InstanceId types.String `tfsdk:"instance_id"` + Name types.String `tfsdk:"name"` + Interval types.String `tfsdk:"interval"` + Rules types.List `tfsdk:"rules"` +} + +type rule struct { + Alert types.String `tfsdk:"alert"` + Annotations types.Map `tfsdk:"annotations"` + Labels types.Map `tfsdk:"labels"` + Expression types.String `tfsdk:"expression"` + For types.String `tfsdk:"for"` +} + +var ruleTypes = map[string]attr.Type{ + "alert": basetypes.StringType{}, + "annotations": basetypes.MapType{ElemType: types.StringType}, + "labels": basetypes.MapType{ElemType: types.StringType}, + "expression": basetypes.StringType{}, + "for": basetypes.StringType{}, +} + +// Descriptions for the resource and data source schemas are centralized here. +var descriptions = map[string]string{ + "id": "Terraform's internal resource ID. It is structured as \"`project_id`,`instance_id`,`name`\".", + "project_id": "STACKIT project ID to which the log alert group is associated.", + "instance_id": "Observability instance ID to which the log alert group is associated.", + "name": "The name of the log alert group. Is the identifier and must be unique in the group.", + "interval": "Specifies the frequency at which rules within the group are evaluated. The interval must be at least 60 seconds and defaults to 60 seconds if not set. Supported formats include hours, minutes, and seconds, either singly or in combination. Examples of valid formats are: '5h30m40s', '5h', '5h30m', '60m', and '60s'.", + "alert": "The name of the alert rule. Is the identifier and must be unique in the group.", + "expression": "The LogQL expression to evaluate. 
Every evaluation cycle this is evaluated at the current time, and all resultant time series become pending/firing alerts.", + "for": "Alerts are considered firing once they have been returned for this long. Alerts which have not yet fired for long enough are considered pending. Default is 0s", + "labels": "A map of key:value. Labels to add or overwrite for each alert", + "annotations": "A map of key:value. Annotations to add or overwrite for each alert", +} + +// NewLogAlertGroupResource is a helper function to simplify the provider implementation. +func NewLogAlertGroupResource() resource.Resource { + return &logAlertGroupResource{} +} + +// alertGroupResource is the resource implementation. +type logAlertGroupResource struct { + client *observability.APIClient +} + +// Metadata returns the resource type name. +func (l *logAlertGroupResource) Metadata(_ context.Context, req resource.MetadataRequest, resp *resource.MetadataResponse) { + resp.TypeName = req.ProviderTypeName + "_observability_logalertgroup" +} + +// Configure adds the provider configured client to the resource. +func (l *logAlertGroupResource) Configure(ctx context.Context, req resource.ConfigureRequest, resp *resource.ConfigureResponse) { + // Prevent panic if the provider has not been configured. 
+ if req.ProviderData == nil { + return + } + + providerData, ok := req.ProviderData.(core.ProviderData) + if !ok { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error configuring API client", fmt.Sprintf("Expected configure type stackit.ProviderData, got %T", req.ProviderData)) + return + } + + var apiClient *observability.APIClient + var err error + if providerData.ObservabilityCustomEndpoint != "" { + apiClient, err = observability.NewAPIClient( + config.WithCustomAuth(providerData.RoundTripper), + config.WithEndpoint(providerData.ObservabilityCustomEndpoint), + ) + } else { + apiClient, err = observability.NewAPIClient( + config.WithCustomAuth(providerData.RoundTripper), + config.WithRegion(providerData.GetRegion()), + ) + } + + if err != nil { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error configuring API client", fmt.Sprintf("Configuring client: %v. This is an error related to the provider configuration, not to the resource configuration", err)) + return + } + l.client = apiClient + tflog.Info(ctx, "Observability log alert group client configured") +} + +// Schema defines the schema for the resource. +func (l *logAlertGroupResource) Schema(_ context.Context, _ resource.SchemaRequest, resp *resource.SchemaResponse) { + resp.Schema = schema.Schema{ + Description: "Observability log alert group resource schema. Used to create alerts based on logs (Loki). 
Must have a `region` specified in the provider configuration.", + Attributes: map[string]schema.Attribute{ + "id": schema.StringAttribute{ + Description: descriptions["id"], + Computed: true, + }, + "project_id": schema.StringAttribute{ + Description: descriptions["project_id"], + Required: true, + Validators: []validator.String{ + validate.UUID(), + validate.NoSeparator(), + }, + PlanModifiers: []planmodifier.String{ + stringplanmodifier.RequiresReplace(), + }, + }, + "instance_id": schema.StringAttribute{ + Description: descriptions["instance_id"], + Required: true, + Validators: []validator.String{ + validate.UUID(), + validate.NoSeparator(), + }, + PlanModifiers: []planmodifier.String{ + stringplanmodifier.RequiresReplace(), + }, + }, + "name": schema.StringAttribute{ + Description: descriptions["name"], + Required: true, + Validators: []validator.String{ + validate.NoSeparator(), + stringvalidator.LengthBetween(1, 200), + stringvalidator.RegexMatches( + regexp.MustCompile(`^[a-zA-Z0-9-]+$`), + "must match expression", + ), + }, + PlanModifiers: []planmodifier.String{ + stringplanmodifier.RequiresReplace(), + }, + }, + "interval": schema.StringAttribute{ + Description: descriptions["interval"], + Optional: true, + Validators: []validator.String{ + validate.ValidDurationString(), + }, + PlanModifiers: []planmodifier.String{ + stringplanmodifier.RequiresReplace(), + }, + }, + "rules": schema.ListNestedAttribute{ + Description: "Rules for the log alert group", + Required: true, + PlanModifiers: []planmodifier.List{ + listplanmodifier.RequiresReplace(), + }, + NestedObject: schema.NestedAttributeObject{ + Attributes: map[string]schema.Attribute{ + "alert": schema.StringAttribute{ + Description: descriptions["alert"], + Required: true, + Validators: []validator.String{ + stringvalidator.RegexMatches( + regexp.MustCompile(`^[a-zA-Z0-9-]+$`), + "must match expression", + ), + stringvalidator.LengthBetween(1, 200), + }, + }, + "expression": schema.StringAttribute{ + 
Description: descriptions["expression"], + Required: true, + Validators: []validator.String{ + stringvalidator.LengthBetween(1, 600), + }, + }, + "for": schema.StringAttribute{ + Description: descriptions["for"], + Optional: true, + Validators: []validator.String{ + stringvalidator.LengthBetween(2, 8), + validate.ValidDurationString(), + }, + }, + "labels": schema.MapAttribute{ + Description: descriptions["labels"], + Optional: true, + ElementType: types.StringType, + Validators: []validator.Map{ + mapvalidator.KeysAre(stringvalidator.LengthAtMost(200)), + mapvalidator.ValueStringsAre(stringvalidator.LengthAtMost(200)), + mapvalidator.SizeAtMost(10), + }, + }, + "annotations": schema.MapAttribute{ + Description: descriptions["annotations"], + Optional: true, + ElementType: types.StringType, + Validators: []validator.Map{ + mapvalidator.KeysAre(stringvalidator.LengthAtMost(200)), + mapvalidator.ValueStringsAre(stringvalidator.LengthAtMost(200)), + mapvalidator.SizeAtMost(5), + }, + }, + }, + }, + }, + }, + } +} + +// Create creates the resource and sets the initial Terraform state. +func (l *logAlertGroupResource) Create(ctx context.Context, req resource.CreateRequest, resp *resource.CreateResponse) { // nolint:gocritic // function signature required by Terraform + // Retrieve values from plan + var model Model + diags := req.Plan.Get(ctx, &model) + resp.Diagnostics.Append(diags...) 
+ if resp.Diagnostics.HasError() { + return + } + + projectId := model.ProjectId.ValueString() + instanceId := model.InstanceId.ValueString() + alertGroupName := model.Name.ValueString() + ctx = tflog.SetField(ctx, "project_id", projectId) + ctx = tflog.SetField(ctx, "log_alert_group_name", alertGroupName) + ctx = tflog.SetField(ctx, "instance_id", instanceId) + + payload, err := toCreatePayload(ctx, &model) + if err != nil { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating log alert group", fmt.Sprintf("Creating API payload: %v", err)) + return + } + + createAlertGroupResp, err := l.client.CreateLogsAlertgroups(ctx, instanceId, projectId).CreateLogsAlertgroupsPayload(*payload).Execute() + if err != nil { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating log alert group", fmt.Sprintf("Calling API: %v", err)) + return + } + + // all log alert groups are returned. We have to search the map for the one corresponding to our name + for _, alertGroup := range *createAlertGroupResp.Data { + if model.Name.ValueString() != *alertGroup.Name { + continue + } + + err = mapFields(ctx, &alertGroup, &model) + if err != nil { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating log alert group", fmt.Sprintf("Processing API payload: %v", err)) + return + } + } + + // Set the state with fully populated data. + diags = resp.State.Set(ctx, model) + resp.Diagnostics.Append(diags...) + if resp.Diagnostics.HasError() { + return + } + tflog.Info(ctx, "log alert group created") +} + +// Read refreshes the Terraform state with the latest data. +func (l *logAlertGroupResource) Read(ctx context.Context, req resource.ReadRequest, resp *resource.ReadResponse) { // nolint:gocritic // function signature required by Terraform + var model Model + diags := req.State.Get(ctx, &model) + resp.Diagnostics.Append(diags...) 
+ if resp.Diagnostics.HasError() { + return + } + + projectId := model.ProjectId.ValueString() + instanceId := model.InstanceId.ValueString() + alertGroupName := model.Name.ValueString() + ctx = tflog.SetField(ctx, "project_id", projectId) + ctx = tflog.SetField(ctx, "log_alert_group_name", alertGroupName) + ctx = tflog.SetField(ctx, "instance_id", instanceId) + + readAlertGroupResp, err := l.client.GetLogsAlertgroup(ctx, alertGroupName, instanceId, projectId).Execute() + if err != nil { + var oapiErr *oapierror.GenericOpenAPIError + ok := errors.As(err, &oapiErr) + if ok && oapiErr.StatusCode == http.StatusNotFound { + resp.State.RemoveResource(ctx) + return + } + core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading log alert group", fmt.Sprintf("Calling API: %v", err)) + return + } + + err = mapFields(ctx, readAlertGroupResp.Data, &model) + if err != nil { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading log alert group", fmt.Sprintf("Error processing API response: %v", err)) + return + } + + // Set the updated state. + diags = resp.State.Set(ctx, &model) + resp.Diagnostics.Append(diags...) +} + +// Update attempts to update the resource. In this case, alertgroups cannot be updated. +// The Update function is redundant since any modifications will +// automatically trigger a resource recreation through Terraform's built-in +// lifecycle management. +func (l *logAlertGroupResource) Update(ctx context.Context, _ resource.UpdateRequest, resp *resource.UpdateResponse) { // nolint:gocritic // function signature required by Terraform + core.LogAndAddError(ctx, &resp.Diagnostics, "Error updating log alert group", "Observability log alert groups can't be updated") +} + +// Delete deletes the resource and removes the Terraform state on success. 
+func (l *logAlertGroupResource) Delete(ctx context.Context, req resource.DeleteRequest, resp *resource.DeleteResponse) { // nolint:gocritic // function signature required by Terraform + // Retrieve values from state + var model Model + diags := req.State.Get(ctx, &model) + resp.Diagnostics.Append(diags...) + if resp.Diagnostics.HasError() { + return + } + + projectId := model.ProjectId.ValueString() + instanceId := model.InstanceId.ValueString() + alertGroupName := model.Name.ValueString() + ctx = tflog.SetField(ctx, "project_id", projectId) + ctx = tflog.SetField(ctx, "log_alert_group_name", alertGroupName) + ctx = tflog.SetField(ctx, "instance_id", instanceId) + + _, err := l.client.DeleteLogsAlertgroup(ctx, alertGroupName, instanceId, projectId).Execute() + if err != nil { + core.LogAndAddError(ctx, &resp.Diagnostics, "Error deleting log alert group", fmt.Sprintf("Calling API: %v", err)) + return + } + + tflog.Info(ctx, "log alert group deleted") +} + +// ImportState imports a resource into the Terraform state on success. +// The expected format of the resource import identifier is: project_id,instance_id,name +func (l *logAlertGroupResource) ImportState(ctx context.Context, req resource.ImportStateRequest, resp *resource.ImportStateResponse) { + idParts := strings.Split(req.ID, core.Separator) + + if len(idParts) != 3 || idParts[0] == "" || idParts[1] == "" || idParts[2] == "" { + core.LogAndAddError(ctx, &resp.Diagnostics, + "Error importing log alert group", + fmt.Sprintf("Expected import identifier with format: [project_id],[instance_id],[name] Got: %q", req.ID), + ) + return + } + + resp.Diagnostics.Append(resp.State.SetAttribute(ctx, path.Root("project_id"), idParts[0])...) + resp.Diagnostics.Append(resp.State.SetAttribute(ctx, path.Root("instance_id"), idParts[1])...) + resp.Diagnostics.Append(resp.State.SetAttribute(ctx, path.Root("name"), idParts[2])...) 
+ tflog.Info(ctx, "Observability log alert group state imported") +} + +// toCreatePayload generates the payload to create a new log alert group. +func toCreatePayload(ctx context.Context, model *Model) (*observability.CreateLogsAlertgroupsPayload, error) { + if model == nil { + return nil, fmt.Errorf("nil model") + } + + payload := observability.CreateLogsAlertgroupsPayload{} + + if !utils.IsUndefined(model.Name) { + payload.Name = model.Name.ValueStringPointer() + } + + if !utils.IsUndefined(model.Interval) { + payload.Interval = model.Interval.ValueStringPointer() + } + + if !utils.IsUndefined(model.Rules) { + rules, err := toRulesPayload(ctx, model) + if err != nil { + return nil, err + } + payload.Rules = &rules + } + + return &payload, nil +} + +// toRulesPayload generates rules for create payload. +func toRulesPayload(ctx context.Context, model *Model) ([]observability.UpdateAlertgroupsRequestInnerRulesInner, error) { + if model.Rules.Elements() == nil || len(model.Rules.Elements()) == 0 { + return []observability.UpdateAlertgroupsRequestInnerRulesInner{}, nil + } + + var rules []rule + diags := model.Rules.ElementsAs(ctx, &rules, false) + if diags.HasError() { + return nil, core.DiagsToError(diags) + } + + var oarrs []observability.UpdateAlertgroupsRequestInnerRulesInner + for i := range rules { + rule := &rules[i] + oarr := observability.UpdateAlertgroupsRequestInnerRulesInner{} + + if !utils.IsUndefined(rule.Alert) { + alert := conversion.StringValueToPointer(rule.Alert) + if alert == nil { + return nil, fmt.Errorf("found nil alert for rule[%d]", i) + } + oarr.Alert = alert + } + + if !utils.IsUndefined(rule.Expression) { + expression := conversion.StringValueToPointer(rule.Expression) + if expression == nil { + return nil, fmt.Errorf("found nil expression for rule[%d]", i) + } + oarr.Expr = expression + } + + if !utils.IsUndefined(rule.For) { + for_ := conversion.StringValueToPointer(rule.For) + if for_ == nil { + return nil, fmt.Errorf("found nil 
expression for for_[%d]", i) + } + oarr.For = for_ + } + + if !utils.IsUndefined(rule.Labels) { + labels, err := conversion.ToStringInterfaceMap(ctx, rule.Labels) + if err != nil { + return nil, fmt.Errorf("converting to Go map: %w", err) + } + oarr.Labels = &labels + } + + if !utils.IsUndefined(rule.Annotations) { + annotations, err := conversion.ToStringInterfaceMap(ctx, rule.Annotations) + if err != nil { + return nil, fmt.Errorf("converting to Go map: %w", err) + } + oarr.Annotations = &annotations + } + + oarrs = append(oarrs, oarr) + } + + return oarrs, nil +} + +// mapRules maps alertGroup response to the model. +func mapFields(ctx context.Context, alertGroup *observability.AlertGroup, model *Model) error { + if alertGroup == nil { + return fmt.Errorf("nil alertGroup") + } + + if model == nil { + return fmt.Errorf("nil model") + } + + if utils.IsUndefined(model.Name) { + return fmt.Errorf("empty name") + } + + if utils.IsUndefined(model.ProjectId) { + return fmt.Errorf("empty projectId") + } + + if utils.IsUndefined(model.InstanceId) { + return fmt.Errorf("empty instanceId") + } + + var name string + if !utils.IsUndefined(model.Name) { + name = model.Name.ValueString() + } else if alertGroup.Name != nil { + name = *alertGroup.Name + } else { + return fmt.Errorf("found empty name") + } + + model.Name = types.StringValue(name) + idParts := []string{model.ProjectId.ValueString(), model.InstanceId.ValueString(), name} + model.Id = types.StringValue(strings.Join(idParts, core.Separator)) + + var interval string + if !utils.IsUndefined(model.Interval) { + interval = model.Interval.ValueString() + } else if alertGroup.Interval != nil { + interval = *alertGroup.Interval + } else { + return fmt.Errorf("found empty interval") + } + model.Interval = types.StringValue(interval) + + if alertGroup.Rules != nil { + err := mapRules(ctx, alertGroup, model) + if err != nil { + return fmt.Errorf("map rules: %w", err) + } + } + + return nil +} + +// mapRules maps alertGroup 
response rules to the model rules. +func mapRules(_ context.Context, alertGroup *observability.AlertGroup, model *Model) error { + var newRules []attr.Value + + for i, r := range *alertGroup.Rules { + ruleMap := map[string]attr.Value{ + "alert": types.StringPointerValue(r.Alert), + "expression": types.StringPointerValue(r.Expr), + "for": types.StringPointerValue(r.For), + "labels": types.MapNull(types.StringType), + "annotations": types.MapNull(types.StringType), + } + + if r.Labels != nil { + labelElems := map[string]attr.Value{} + for k, v := range *r.Labels { + labelElems[k] = types.StringValue(v) + } + ruleMap["labels"] = types.MapValueMust(types.StringType, labelElems) + } + + if r.Annotations != nil { + annoElems := map[string]attr.Value{} + for k, v := range *r.Annotations { + annoElems[k] = types.StringValue(v) + } + ruleMap["annotations"] = types.MapValueMust(types.StringType, annoElems) + } + + ruleTf, diags := types.ObjectValue(ruleTypes, ruleMap) + if diags.HasError() { + return fmt.Errorf("mapping index %d: %w", i, core.DiagsToError(diags)) + } + newRules = append(newRules, ruleTf) + } + + rulesTf, diags := types.ListValue(types.ObjectType{AttrTypes: ruleTypes}, newRules) + if diags.HasError() { + return core.DiagsToError(diags) + } + + model.Rules = rulesTf + return nil +} diff --git a/stackit/internal/services/observability/log-alertgroup/resource_test.go b/stackit/internal/services/observability/log-alertgroup/resource_test.go new file mode 100644 index 00000000..4f3bd60b --- /dev/null +++ b/stackit/internal/services/observability/log-alertgroup/resource_test.go @@ -0,0 +1,366 @@ +package logalertgroup + +import ( + "context" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/hashicorp/terraform-plugin-framework/attr" + "github.com/hashicorp/terraform-plugin-framework/types" + "github.com/stackitcloud/stackit-sdk-go/core/utils" + "github.com/stackitcloud/stackit-sdk-go/services/observability" +) + +func TestToCreatePayload(t *testing.T) { 
+ tests := []struct { + name string + input *Model + expect *observability.CreateLogsAlertgroupsPayload + expectErr bool + }{ + { + name: "Nil Model", + input: nil, + expect: nil, + expectErr: true, + }, + { + name: "Empty Model", + input: &Model{ + Name: types.StringNull(), + Interval: types.StringNull(), + Rules: types.ListNull(types.StringType), + }, + expect: &observability.CreateLogsAlertgroupsPayload{}, + expectErr: false, + }, + { + name: "Model with Name and Interval", + input: &Model{ + Name: types.StringValue("test-alertgroup"), + Interval: types.StringValue("5m"), + }, + expect: &observability.CreateLogsAlertgroupsPayload{ + Name: utils.Ptr("test-alertgroup"), + Interval: utils.Ptr("5m"), + }, + expectErr: false, + }, + { + name: "Model with Full Information", + input: &Model{ + Name: types.StringValue("full-alertgroup"), + Interval: types.StringValue("10m"), + Rules: types.ListValueMust( + types.ObjectType{AttrTypes: ruleTypes}, + []attr.Value{ + types.ObjectValueMust( + ruleTypes, + map[string]attr.Value{ + "alert": types.StringValue("alert"), + "expression": types.StringValue("expression"), + "for": types.StringValue("10s"), + "labels": types.MapValueMust( + types.StringType, + map[string]attr.Value{ + "k": types.StringValue("v"), + }, + ), + "annotations": types.MapValueMust( + types.StringType, + map[string]attr.Value{ + "k": types.StringValue("v"), + }, + ), + }, + ), + }, + ), + }, + expect: &observability.CreateLogsAlertgroupsPayload{ + Name: utils.Ptr("full-alertgroup"), + Interval: utils.Ptr("10m"), + Rules: &[]observability.UpdateAlertgroupsRequestInnerRulesInner{ + { + Alert: utils.Ptr("alert"), + Annotations: &map[string]interface{}{ + "k": "v", + }, + Expr: utils.Ptr("expression"), + For: utils.Ptr("10s"), + Labels: &map[string]interface{}{ + "k": "v", + }, + }, + }, + }, + expectErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + got, err := toCreatePayload(ctx, 
tt.input) + + if (err != nil) != tt.expectErr { + t.Fatalf("expected error: %v, got: %v", tt.expectErr, err) + } + + if diff := cmp.Diff(got, tt.expect); diff != "" { + t.Errorf("unexpected result (-got +want):\n%s", diff) + } + }) + } +} + +func TestToRulesPayload(t *testing.T) { + tests := []struct { + name string + input *Model + expect []observability.UpdateAlertgroupsRequestInnerRulesInner + expectErr bool + }{ + { + name: "Nil Rules", + input: &Model{ + Rules: types.ListNull(types.StringType), // Simulates a lack of rules + }, + expect: []observability.UpdateAlertgroupsRequestInnerRulesInner{}, + expectErr: false, + }, + { + name: "Invalid Rule Element Type", + input: &Model{ + Rules: types.ListValueMust(types.StringType, []attr.Value{ + types.StringValue("invalid"), // Should cause a conversion failure + }), + }, + expect: nil, + expectErr: true, + }, + { + name: "Single Valid Rule", + input: &Model{ + Rules: types.ListValueMust(types.ObjectType{AttrTypes: ruleTypes}, []attr.Value{ + types.ObjectValueMust(ruleTypes, map[string]attr.Value{ + "alert": types.StringValue("alert"), + "expression": types.StringValue("expr"), + "for": types.StringValue("5s"), + "labels": types.MapValueMust(types.StringType, map[string]attr.Value{ + "key": types.StringValue("value"), + }), + "annotations": types.MapValueMust(types.StringType, map[string]attr.Value{ + "note": types.StringValue("important"), + }), + }), + }), + }, + expect: []observability.UpdateAlertgroupsRequestInnerRulesInner{ + { + Alert: utils.Ptr("alert"), + Expr: utils.Ptr("expr"), + For: utils.Ptr("5s"), + Labels: &map[string]interface{}{ + "key": "value", + }, + Annotations: &map[string]interface{}{ + "note": "important", + }, + }, + }, + expectErr: false, + }, + { + name: "Multiple Valid Rules", + input: &Model{ + Rules: types.ListValueMust(types.ObjectType{AttrTypes: ruleTypes}, []attr.Value{ + types.ObjectValueMust(ruleTypes, map[string]attr.Value{ + "alert": types.StringValue("alert1"), + "expression": 
types.StringValue("expr1"), + "for": types.StringValue("5s"), + "labels": types.MapNull(types.StringType), + "annotations": types.MapNull(types.StringType), + }), + types.ObjectValueMust(ruleTypes, map[string]attr.Value{ + "alert": types.StringValue("alert2"), + "expression": types.StringValue("expr2"), + "for": types.StringValue("10s"), + "labels": types.MapValueMust(types.StringType, map[string]attr.Value{ + "key": types.StringValue("value"), + }), + "annotations": types.MapValueMust(types.StringType, map[string]attr.Value{ + "note": types.StringValue("important"), + }), + }), + }), + }, + expect: []observability.UpdateAlertgroupsRequestInnerRulesInner{ + { + Alert: utils.Ptr("alert1"), + Expr: utils.Ptr("expr1"), + For: utils.Ptr("5s"), + }, + { + Alert: utils.Ptr("alert2"), + Expr: utils.Ptr("expr2"), + For: utils.Ptr("10s"), + Labels: &map[string]interface{}{ + "key": "value", + }, + Annotations: &map[string]interface{}{ + "note": "important", + }, + }, + }, + expectErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + got, err := toRulesPayload(ctx, tt.input) + + if (err != nil) != tt.expectErr { + t.Fatalf("expected error: %v, got: %v", tt.expectErr, err) + } + + if diff := cmp.Diff(got, tt.expect); diff != "" { + t.Errorf("unexpected result (-got +want):\n%s", diff) + } + }) + } +} + +func TestMapFields(t *testing.T) { + tests := []struct { + name string + alertGroup *observability.AlertGroup + model *Model + expectedName string + expectedID string + expectErr bool + }{ + { + name: "Nil AlertGroup", + alertGroup: nil, + model: &Model{}, + expectErr: true, + }, + { + name: "Nil Model", + alertGroup: &observability.AlertGroup{}, + model: nil, + expectErr: true, + }, + { + name: "Interval Missing", + alertGroup: &observability.AlertGroup{ + Name: utils.Ptr("alert-group-name"), + }, + model: &Model{ + Name: types.StringValue("alert-group-name"), + ProjectId: types.StringValue("project1"), + 
InstanceId: types.StringValue("instance1"), + }, + expectedName: "alert-group-name", + expectedID: "project1,instance1,alert-group-name", + expectErr: true, + }, + { + name: "Name Missing", + alertGroup: &observability.AlertGroup{ + Interval: utils.Ptr("5m"), + }, + model: &Model{ + Name: types.StringValue("model-name"), + InstanceId: types.StringValue("instance1"), + }, + expectErr: true, + }, + { + name: "Complete Model and AlertGroup", + alertGroup: &observability.AlertGroup{ + Name: utils.Ptr("alert-group-name"), + Interval: utils.Ptr("10m"), + }, + model: &Model{ + Name: types.StringValue("alert-group-name"), + ProjectId: types.StringValue("project1"), + InstanceId: types.StringValue("instance1"), + Id: types.StringValue("project1,instance1,alert-group-name"), + Interval: types.StringValue("10m"), + }, + expectedName: "alert-group-name", + expectedID: "project1,instance1,alert-group-name", + expectErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + err := mapFields(ctx, tt.alertGroup, tt.model) + + if (err != nil) != tt.expectErr { + t.Fatalf("expected error: %v, got: %v", tt.expectErr, err) + } + + if !tt.expectErr { + if diff := cmp.Diff(tt.model.Name.ValueString(), tt.expectedName); diff != "" { + t.Errorf("unexpected name (-got +want):\n%s", diff) + } + if diff := cmp.Diff(tt.model.Id.ValueString(), tt.expectedID); diff != "" { + t.Errorf("unexpected ID (-got +want):\n%s", diff) + } + } + }) + } +} + +func TestMapRules(t *testing.T) { + tests := []struct { + name string + alertGroup *observability.AlertGroup + model *Model + expectErr bool + }{ + { + name: "Empty Rules", + alertGroup: &observability.AlertGroup{ + Rules: &[]observability.AlertRuleRecord{}, + }, + model: &Model{}, + expectErr: false, + }, + { + name: "Single Complete Rule", + alertGroup: &observability.AlertGroup{ + Rules: &[]observability.AlertRuleRecord{ + { + Alert: utils.Ptr("HighCPUUsage"), + Expr: 
utils.Ptr("rate(cpu_usage[5m]) > 0.9"), + For: utils.Ptr("2m"), + Labels: &map[string]string{"severity": "critical"}, + Annotations: &map[string]string{"summary": "CPU usage high"}, + Record: utils.Ptr("record1"), + }, + }, + }, + model: &Model{}, + expectErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx := context.Background() + err := mapRules(ctx, tt.alertGroup, tt.model) + + if (err != nil) != tt.expectErr { + t.Fatalf("expected error: %v, got: %v", tt.expectErr, err != nil) + } + }) + } +} diff --git a/stackit/internal/services/observability/observability_acc_test.go b/stackit/internal/services/observability/observability_acc_test.go index 21a30214..82687b5c 100644 --- a/stackit/internal/services/observability/observability_acc_test.go +++ b/stackit/internal/services/observability/observability_acc_test.go @@ -20,8 +20,8 @@ import ( var instanceResource = map[string]string{ "project_id": testutil.ProjectId, "name": testutil.ResourceNameWithDateTime("observability"), - "plan_name": "Observability-Monitoring-Basic-EU01", - "new_plan_name": "Observability-Monitoring-Medium-EU01", + "plan_name": "Observability-Medium-EU01", + "new_plan_name": "Observability-Large-EU01", "acl-0": "1.2.3.4/32", "acl-1": "111.222.111.222/32", "acl-1-updated": "111.222.111.125/32", @@ -47,8 +47,19 @@ var alertGroupResource = map[string]string{ "interval": "5h", "interval_updated": "1h", "alert": "alert1", - "expression": "expression1", - "expression_updated": "expression2", + "expression": `sum(kube_pod_status_phase{phase=\"Running\"}) > 0`, + "expression_updated": `sum(kube_pod_status_phase{phase=\"Error\"}) > 0`, + "for": "60s", +} + +var logAlertGroupResource = map[string]string{ + "name": fmt.Sprintf("alertgroup-%s", acctest.RandStringFromCharSet(7, acctest.CharSetAlphaNum)), + "name_updated": fmt.Sprintf("alertgroup-%s", acctest.RandStringFromCharSet(7, acctest.CharSetAlphaNum)), + "interval": "5h", + "interval_updated": "1h", + 
"alert": "alert1", + "expression": `sum(rate({namespace=\"example\"} |= \"Simulated error message\" [1m])) > 0`, + "expression_updated": `sum(rate({namespace=\"example\"} |= \"Another error message\" [1m])) > 0`, "for": "60s", } @@ -262,8 +273,31 @@ func alertGroupResourceConfig(name, interval, expression string) string { ) } -func resourceConfig(acl, metricsRetentionDays, metricsRetentionDays1hDownsampling, metricsRetentionDays5mDownsampling, alertConfig *string, instanceName, planName, target, saml2EnableUrlParameters, alertGroupName, alertGroupInterval, alertGroupRule1Expression string) string { - return fmt.Sprintf("%s\n\n%s\n\n%s\n\n%s\n\n%s", +func logAlertGroupResourceConfig(name, interval, expression string) string { + return fmt.Sprintf( + `resource "stackit_observability_logalertgroup" "logalertgroup" { + project_id = stackit_observability_instance.instance.project_id + instance_id = stackit_observability_instance.instance.instance_id + name = "%s" + interval = "%s" + rules = [ + { + alert = "%s" + expression = "%s" + for = "%s" + } + ] + }`, + name, + interval, + logAlertGroupResource["alert"], + expression, + logAlertGroupResource["for"], + ) +} + +func resourceConfig(acl, metricsRetentionDays, metricsRetentionDays1hDownsampling, metricsRetentionDays5mDownsampling, alertConfig *string, instanceName, planName, target, saml2EnableUrlParameters, alertGroupName, alertGroupInterval, alertGroupRule1Expression, logAlertGroupName, logAlertGroupInterval, logAlertGroupRule1Expression string) string { + return fmt.Sprintf("%s\n\n%s\n\n%s\n\n%s\n\n%s\n\n%s", testutil.ObservabilityProviderConfig(), instanceResourceConfig(acl, metricsRetentionDays, @@ -275,6 +309,7 @@ func resourceConfig(acl, metricsRetentionDays, metricsRetentionDays1hDownsamplin scrapeConfigResourceConfig(target, saml2EnableUrlParameters), credentialResourceConfig(), alertGroupResourceConfig(alertGroupName, alertGroupInterval, alertGroupRule1Expression), + 
logAlertGroupResourceConfig(logAlertGroupName, logAlertGroupInterval, logAlertGroupRule1Expression), ) } @@ -303,6 +338,9 @@ func TestAccResource(t *testing.T) { alertGroupResource["name"], alertGroupResource["interval"], alertGroupResource["expression"], + logAlertGroupResource["name"], + logAlertGroupResource["interval"], + logAlertGroupResource["expression"], ), Check: resource.ComposeAggregateTestCheckFunc( // Instance data @@ -399,6 +437,18 @@ func TestAccResource(t *testing.T) { resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"), resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"), resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"), + + // logalertgroup + resource.TestCheckResourceAttr("stackit_observability_logalertgroup.logalertgroup", "project_id", credentialResource["project_id"]), + resource.TestCheckResourceAttrPair( + "stackit_observability_instance.instance", "instance_id", + "stackit_observability_logalertgroup.logalertgroup", "instance_id", + ), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "name"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "interval"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.alert"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.expression"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.for"), ), }, // Update Alert Config with complete Receiver (email, webhook and opsgenie configs), global options and Route with child routes @@ -421,6 +471,9 @@ func TestAccResource(t *testing.T) { alertGroupResource["name"], alertGroupResource["interval"], alertGroupResource["expression"], + logAlertGroupResource["name"], + logAlertGroupResource["interval"], + 
logAlertGroupResource["expression"], ), Check: resource.ComposeAggregateTestCheckFunc( // Instance data @@ -526,6 +579,18 @@ func TestAccResource(t *testing.T) { resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"), resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"), resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"), + + // logalertgroup + resource.TestCheckResourceAttr("stackit_observability_logalertgroup.logalertgroup", "project_id", credentialResource["project_id"]), + resource.TestCheckResourceAttrPair( + "stackit_observability_instance.instance", "instance_id", + "stackit_observability_logalertgroup.logalertgroup", "instance_id", + ), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "name"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "interval"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.alert"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.expression"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.for"), ), }, // Update without ACL, partial metrics retention days and NO alert configs @@ -543,6 +608,9 @@ func TestAccResource(t *testing.T) { alertGroupResource["name"], alertGroupResource["interval"], alertGroupResource["expression"], + logAlertGroupResource["name"], + logAlertGroupResource["interval"], + logAlertGroupResource["expression"], ), Check: resource.ComposeAggregateTestCheckFunc( // Instance data @@ -610,6 +678,18 @@ func TestAccResource(t *testing.T) { resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"), resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"), 
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"), + + // logalertgroup + resource.TestCheckResourceAttr("stackit_observability_logalertgroup.logalertgroup", "project_id", credentialResource["project_id"]), + resource.TestCheckResourceAttrPair( + "stackit_observability_instance.instance", "instance_id", + "stackit_observability_logalertgroup.logalertgroup", "instance_id", + ), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "name"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "interval"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.alert"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.expression"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.for"), ), }, // Update with empty ACL, NO metrics retention days and NO alert configs @@ -627,6 +707,9 @@ func TestAccResource(t *testing.T) { alertGroupResource["name"], alertGroupResource["interval"], alertGroupResource["expression"], + logAlertGroupResource["name"], + logAlertGroupResource["interval"], + logAlertGroupResource["expression"], ), Check: resource.ComposeAggregateTestCheckFunc( // Instance data @@ -694,6 +777,18 @@ func TestAccResource(t *testing.T) { resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"), resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"), resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"), + + // logalertgroup + resource.TestCheckResourceAttr("stackit_observability_logalertgroup.logalertgroup", "project_id", credentialResource["project_id"]), + resource.TestCheckResourceAttrPair( + "stackit_observability_instance.instance", "instance_id", + 
"stackit_observability_logalertgroup.logalertgroup", "instance_id", + ), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "name"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "interval"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.alert"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.expression"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.for"), ), }, // Data source @@ -717,6 +812,12 @@ func TestAccResource(t *testing.T) { instance_id = stackit_observability_alertgroup.alertgroup.instance_id name = stackit_observability_alertgroup.alertgroup.name } + + data "stackit_observability_logalertgroup" "logalertgroup" { + project_id = stackit_observability_logalertgroup.logalertgroup.project_id + instance_id = stackit_observability_logalertgroup.logalertgroup.instance_id + name = stackit_observability_logalertgroup.logalertgroup.name + } `, resourceConfig( utils.Ptr(fmt.Sprintf( @@ -735,6 +836,9 @@ func TestAccResource(t *testing.T) { alertGroupResource["name"], alertGroupResource["interval"], alertGroupResource["expression"], + logAlertGroupResource["name"], + logAlertGroupResource["interval"], + logAlertGroupResource["expression"], ), ), Check: resource.ComposeAggregateTestCheckFunc( @@ -786,6 +890,18 @@ func TestAccResource(t *testing.T) { resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "rules.0.alert"), resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "rules.0.expression"), resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "rules.0.for"), + + // logalertgroup + resource.TestCheckResourceAttr("data.stackit_observability_logalertgroup.logalertgroup", "project_id", credentialResource["project_id"]), + 
resource.TestCheckResourceAttrPair( + "stackit_observability_instance.instance", "instance_id", + "data.stackit_observability_logalertgroup.logalertgroup", "instance_id", + ), + resource.TestCheckResourceAttrSet("data.stackit_observability_logalertgroup.logalertgroup", "name"), + resource.TestCheckResourceAttrSet("data.stackit_observability_logalertgroup.logalertgroup", "interval"), + resource.TestCheckResourceAttrSet("data.stackit_observability_logalertgroup.logalertgroup", "rules.0.alert"), + resource.TestCheckResourceAttrSet("data.stackit_observability_logalertgroup.logalertgroup", "rules.0.expression"), + resource.TestCheckResourceAttrSet("data.stackit_observability_logalertgroup.logalertgroup", "rules.0.for"), ), }, // Import 1 @@ -848,6 +964,27 @@ func TestAccResource(t *testing.T) { ImportState: true, ImportStateVerify: true, }, + // Import 4 + { + ResourceName: "stackit_observability_logalertgroup.logalertgroup", + ImportStateIdFunc: func(s *terraform.State) (string, error) { + r, ok := s.RootModule().Resources["stackit_observability_logalertgroup.logalertgroup"] + if !ok { + return "", fmt.Errorf("couldn't find resource stackit_observability_logalertgroup.logalertgroup") + } + instanceId, ok := r.Primary.Attributes["instance_id"] + if !ok { + return "", fmt.Errorf("couldn't find attribute instance_id") + } + name, ok := r.Primary.Attributes["name"] + if !ok { + return "", fmt.Errorf("couldn't find attribute name") + } + return fmt.Sprintf("%s,%s,%s", testutil.ProjectId, instanceId, name), nil + }, + ImportState: true, + ImportStateVerify: true, + }, // Update { Config: resourceConfig( @@ -867,6 +1004,9 @@ func TestAccResource(t *testing.T) { alertGroupResource["name_updated"], alertGroupResource["interval_updated"], alertGroupResource["expression_updated"], + logAlertGroupResource["name_updated"], + logAlertGroupResource["interval_updated"], + logAlertGroupResource["expression_updated"], ), Check: resource.ComposeAggregateTestCheckFunc( // Instance @@ 
-934,6 +1074,18 @@ func TestAccResource(t *testing.T) { resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"), resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"), resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"), + + // logalertgroup + resource.TestCheckResourceAttr("stackit_observability_logalertgroup.logalertgroup", "project_id", credentialResource["project_id"]), + resource.TestCheckResourceAttrPair( + "stackit_observability_instance.instance", "instance_id", + "stackit_observability_logalertgroup.logalertgroup", "instance_id", + ), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "name"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "interval"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.alert"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.expression"), + resource.TestCheckResourceAttrSet("stackit_observability_logalertgroup.logalertgroup", "rules.0.for"), ), }, // Update and remove saml2 attribute diff --git a/stackit/provider.go b/stackit/provider.go index 7f4a3002..3d6b679a 100644 --- a/stackit/provider.go +++ b/stackit/provider.go @@ -48,6 +48,7 @@ import ( alertGroup "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/alertgroup" observabilityCredential "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/credential" observabilityInstance "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/instance" + logAlertGroup "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/log-alertgroup" observabilityScrapeConfig 
"github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/scrapeconfig" openSearchCredential "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/opensearch/credential" openSearchInstance "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/opensearch/instance" @@ -482,6 +483,7 @@ func (p *Provider) DataSources(_ context.Context) []func() datasource.DataSource loadBalancer.NewLoadBalancerDataSource, logMeInstance.NewInstanceDataSource, logMeCredential.NewCredentialDataSource, + logAlertGroup.NewLogAlertGroupDataSource, mariaDBInstance.NewInstanceDataSource, mariaDBCredential.NewCredentialDataSource, mongoDBFlexInstance.NewInstanceDataSource, @@ -540,6 +542,7 @@ func (p *Provider) Resources(_ context.Context) []func() resource.Resource { loadBalancerObservabilityCredential.NewObservabilityCredentialResource, logMeInstance.NewInstanceResource, logMeCredential.NewCredentialResource, + logAlertGroup.NewLogAlertGroupResource, mariaDBInstance.NewInstanceResource, mariaDBCredential.NewCredentialResource, modelServingToken.NewTokenResource, diff --git a/templates/guides/ske_log_alerts.md.tmpl b/templates/guides/ske_log_alerts.md.tmpl new file mode 100644 index 00000000..3cca0b66 --- /dev/null +++ b/templates/guides/ske_log_alerts.md.tmpl @@ -0,0 +1,199 @@ +--- +page_title: "SKE Log Alerts with STACKIT Observability" +--- +# SKE Log Alerts with STACKIT Observability + +## Overview + +This guide walks you through setting up log-based alerting in STACKIT Observability using Promtail to ship Kubernetes logs. + +1. **Set Up Providers** + + Begin by configuring the STACKIT and Kubernetes providers to connect to the STACKIT services. 
+ + ```hcl + provider "stackit" { + region = "eu01" + } + + provider "kubernetes" { + host = yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.server + client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-certificate-data) + client_key = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-key-data) + cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.certificate-authority-data) + } + + provider "helm" { + kubernetes { + host = yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.server + client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-certificate-data) + client_key = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-key-data) + cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.certificate-authority-data) + } + } + ``` + +2. **Create SKE Cluster and Kubeconfig Resource** + + Set up a STACKIT SKE Cluster and generate the associated kubeconfig resource. 
+ + ```hcl + resource "stackit_ske_cluster" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example" + kubernetes_version = "1.31" + node_pools = [ + { + name = "standard" + machine_type = "c1.4" + minimum = "3" + maximum = "9" + max_surge = "3" + availability_zones = ["eu01-1", "eu01-2", "eu01-3"] + os_version_min = "4081.2.1" + os_name = "flatcar" + volume_size = 32 + volume_type = "storage_premium_perf6" + } + ] + maintenance = { + enable_kubernetes_version_updates = true + enable_machine_image_version_updates = true + start = "01:00:00Z" + end = "02:00:00Z" + } + } + + resource "stackit_ske_kubeconfig" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + cluster_name = stackit_ske_cluster.example.name + refresh = true + } + ``` + +3. **Create Observability Instance and Credentials** + + Establish a STACKIT Observability instance and its credentials to handle alerts. + + ```hcl + locals { + alert_config = { + route = { + receiver = "EmailStackit", + repeat_interval = "1m", + continue = true + } + receivers = [ + { + name = "EmailStackit", + email_configs = [ + { + to = "" + } + ] + } + ] + } + } + + resource "stackit_observability_instance" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + name = "example" + plan_name = "Observability-Large-EU01" + alert_config = local.alert_config + } + + resource "stackit_observability_credential" "example" { + project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" + instance_id = stackit_observability_instance.example.instance_id + } + ``` + +4. **Install Promtail** + + Deploy Promtail via Helm to collect logs and forward them to the STACKIT Observability Loki endpoint. 
+
+   ```hcl
+   resource "helm_release" "promtail" {
+     name       = "promtail"
+     repository = "https://grafana.github.io/helm-charts"
+     chart      = "promtail"
+     namespace  = kubernetes_namespace.example.metadata.0.name
+     version    = "6.16.4"
+
+     values = [
+       <<-EOF
+       config:
+         clients:
+           # To find the Loki push URL, navigate to the observability instance in the portal and select the API tab,
+           # then replace <loki-push-endpoint> below with the listed Loki host.
+           - url: "https://${stackit_observability_credential.example.username}:${stackit_observability_credential.example.password}@<loki-push-endpoint>/instances/${stackit_observability_instance.example.instance_id}/loki/api/v1/push"
+       EOF
+     ]
+   }
+   ```
+
+5. **Create Alert Group**
+
+   Create a log alert that triggers when a specific pod logs an error message.
+
+   ```hcl
+   resource "stackit_observability_logalertgroup" "example" {
+     project_id  = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+     instance_id = stackit_observability_instance.example.instance_id
+     name        = "TestLogAlertGroup"
+     interval    = "1m"
+     rules = [
+       {
+         alert      = "SimplePodLogAlertCheck"
+         expression = "sum(rate({namespace=\"example\", pod=\"logger\"} |= \"Simulated error message\" [1m])) > 0"
+         for        = "60s"
+         labels = {
+           severity = "critical"
+         },
+         annotations = {
+           summary     = "Test Log Alert is working"
+           description = "Test Log Alert"
+         },
+       },
+     ]
+   }
+   ```
+
+6. **Deploy Test Pod**
+
+   Launch a pod that emits simulated error logs. This should trigger the alert if everything is set up correctly.
+
+   ```hcl
+   resource "kubernetes_namespace" "example" {
+     metadata {
+       name = "example"
+     }
+   }
+
+   resource "kubernetes_pod" "logger" {
+     metadata {
+       name      = "logger"
+       namespace = kubernetes_namespace.example.metadata[0].name
+       labels = {
+         app = "logger"
+       }
+     }
+
+     spec {
+       container {
+         name  = "logger"
+         image = "bash"
+         command = [
+           "bash",
+           "-c",
+           <<-EOF
+           while true; do
+             echo "Simulated error message" >&2
+             sleep 10
+           done
+           EOF
+         ]
+       }
+     }
+   }
+   ```
\ No newline at end of file