Implement observability alertgroups (#778)

* feat: implement observability alertgroups

* review changes
This commit is contained in:
Mauritz Uphoff 2025-04-14 13:21:30 +02:00 committed by GitHub
parent 44103a1ffd
commit 289746c7d1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 1987 additions and 4 deletions

View file

@ -0,0 +1,47 @@
---
# generated by https://github.com/hashicorp/terraform-plugin-docs
page_title: "stackit_observability_alertgroup Data Source - stackit"
subcategory: ""
description: |-
Observability alert group data source schema. Must have a region specified in the provider configuration.
---
# stackit_observability_alertgroup (Data Source)
Observability alert group data source schema. Must have a `region` specified in the provider configuration.
## Example Usage
```terraform
data "stackit_observability_alertgroup" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
name = "example-alert-group"
}
```
<!-- schema generated by tfplugindocs -->
## Schema
### Required
- `instance_id` (String) Observability instance ID to which the alert group is associated.
- `name` (String) The name of the alert group. Is the identifier and must be unique in the group.
- `project_id` (String) STACKIT project ID to which the alert group is associated.
### Read-Only
- `id` (String) Terraform's internal resource ID. It is structured as "`project_id`,`instance_id`,`name`".
- `interval` (String) Specifies the frequency at which rules within the group are evaluated. The interval must be at least 60 seconds and defaults to 60 seconds if not set. Supported formats include hours, minutes, and seconds, either singly or in combination. Examples of valid formats are: '5h30m40s', '5h', '5h30m', '60m', and '60s'.
- `rules` (Attributes List) Rules for the alert group (see [below for nested schema](#nestedatt--rules))
<a id="nestedatt--rules"></a>
### Nested Schema for `rules`
Read-Only:
- `alert` (String) The name of the alert rule. Is the identifier and must be unique in the group.
- `annotations` (Map of String) A map of key:value. Annotations to add or overwrite for each alert
- `expression` (String) The PromQL expression to evaluate. Every evaluation cycle this is evaluated at the current time, and all resultant time series become pending/firing alerts.
- `for` (String) Alerts are considered firing once they have been returned for this long. Alerts which have not yet fired for long enough are considered pending. Default is 0s
- `labels` (Map of String) A map of key:value. Labels to add or overwrite for each alert

View file

@ -0,0 +1,267 @@
---
page_title: "Alerting with Kube-State-Metrics in STACKIT Observability"
---
# Alerting with Kube-State-Metrics in STACKIT Observability
## Overview
This guide explains how to configure the STACKIT Observability product to send alerts using metrics gathered from kube-state-metrics.
1. **Set Up Providers**
Begin by configuring the STACKIT and Kubernetes providers to connect to the STACKIT services.
```hcl
provider "stackit" {
region = "eu01"
}
provider "kubernetes" {
host = yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.server
client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-certificate-data)
client_key = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-key-data)
cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.certificate-authority-data)
}
provider "helm" {
kubernetes {
host = yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.server
client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-certificate-data)
client_key = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-key-data)
cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.certificate-authority-data)
}
}
```
2. **Create SKE Cluster and Kubeconfig Resource**
Set up a STACKIT SKE Cluster and generate the associated kubeconfig resource.
```hcl
resource "stackit_ske_cluster" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
name = "example"
kubernetes_version = "1.31"
node_pools = [
{
name = "standard"
machine_type = "c1.4"
minimum = "3"
maximum = "9"
max_surge = "3"
availability_zones = ["eu01-1", "eu01-2", "eu01-3"]
os_version_min = "4081.2.1"
os_name = "flatcar"
volume_size = 32
volume_type = "storage_premium_perf6"
}
]
maintenance = {
enable_kubernetes_version_updates = true
enable_machine_image_version_updates = true
start = "01:00:00Z"
end = "02:00:00Z"
}
}
resource "stackit_ske_kubeconfig" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
cluster_name = stackit_ske_cluster.example.name
refresh = true
}
```
3. **Create Observability Instance and Credentials**
Establish a STACKIT Observability instance and its credentials to handle alerts.
```hcl
locals {
alert_config = {
route = {
receiver = "EmailStackit",
repeat_interval = "1m",
continue = true
}
receivers = [
{
name = "EmailStackit",
email_configs = [
{
to = "<email>"
}
]
}
]
}
}
resource "stackit_observability_instance" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
name = "example"
plan_name = "Observability-Large-EU01"
alert_config = local.alert_config
}
resource "stackit_observability_credential" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
instance_id = stackit_observability_instance.example.instance_id
}
```
4. **Install Prometheus Operator**
Use the Prometheus Helm chart to install kube-state-metrics and transfer metrics to the STACKIT Observability instance. Customize the helm values as needed for your deployment.
```yaml
# helm values
# save as prom-values.tftpl
prometheus:
enabled: true
agentMode: true
prometheusSpec:
enableRemoteWriteReceiver: true
scrapeInterval: 60s
evaluationInterval: 60s
replicas: 1
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: premium-perf4-stackit
accessModes: ['ReadWriteOnce']
resources:
requests:
storage: 80Gi
remoteWrite:
- url: ${metrics_push_url}
queueConfig:
batchSendDeadline: '5s'
# both values need to be configured according to your observability plan
capacity: 30000
maxSamplesPerSend: 3000
writeRelabelConfigs:
- sourceLabels: ['__name__']
regex: 'apiserver_.*|etcd_.*|prober_.*|storage_.*|workqueue_(work|queue)_duration_seconds_bucket|kube_pod_tolerations|kubelet_.*|kubernetes_feature_enabled|instance_scrape_target_status'
action: 'drop'
- sourceLabels: ['namespace']
regex: 'example'
action: 'keep'
basicAuth:
username:
key: username
name: ${secret_name}
password:
key: password
name: ${secret_name}
grafana:
enabled: false
defaultRules:
create: false
alertmanager:
enabled: false
nodeExporter:
enabled: true
kube-state-metrics:
enabled: true
customResourceState:
enabled: true
collectors:
- deployments
- pods
```
```hcl
resource "kubernetes_namespace" "monitoring" {
metadata {
name = "monitoring"
}
}
resource "kubernetes_secret" "argus_prometheus_authorization" {
metadata {
name = "argus-prometheus-credentials"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
data = {
username = stackit_observability_credential.example.username
password = stackit_observability_credential.example.password
}
}
resource "helm_release" "prometheus_operator" {
name = "prometheus-operator"
repository = "https://prometheus-community.github.io/helm-charts"
chart = "kube-prometheus-stack"
version = "60.1.0"
namespace = kubernetes_namespace.monitoring.metadata[0].name
values = [
templatefile("prom-values.tftpl", {
metrics_push_url = stackit_observability_instance.example.metrics_push_url
secret_name = kubernetes_secret.argus_prometheus_authorization.metadata[0].name
})
]
}
```
5. **Create Alert Group**
Define an alert group with a rule to notify when a pod is running in the "example" namespace.
```hcl
resource "stackit_observability_alertgroup" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
instance_id = stackit_observability_instance.example.instance_id
name = "TestAlertGroup"
interval = "2h"
rules = [
{
alert = "SimplePodCheck"
expression = "sum(kube_pod_status_phase{phase=\"Running\", namespace=\"example\"}) > 0"
for = "60s"
labels = {
severity = "critical"
},
annotations = {
summary = "Test Alert is working"
description = "Test Alert"
}
},
]
}
```
6. **Deploy Test Pod**
Deploy a test pod; doing so should trigger an email notification, as the deployment satisfies the conditions defined in the alert group rule. In a real-world scenario, you would typically configure alerts to monitor pods for error states instead.
```hcl
resource "kubernetes_namespace" "example" {
metadata {
name = "example"
}
}
resource "kubernetes_pod" "example" {
metadata {
name = "nginx"
namespace = kubernetes_namespace.example.metadata[0].name
labels = {
app = "nginx"
}
}
spec {
container {
image = "nginx:latest"
name = "nginx"
}
}
}
```

View file

@ -0,0 +1,80 @@
---
# generated by https://github.com/hashicorp/terraform-plugin-docs
page_title: "stackit_observability_alertgroup Resource - stackit"
subcategory: ""
description: |-
Observability alert group resource schema. Must have a region specified in the provider configuration.
---
# stackit_observability_alertgroup (Resource)
Observability alert group resource schema. Must have a `region` specified in the provider configuration.
## Example Usage
```terraform
resource "stackit_observability_alertgroup" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
name = "example-alert-group"
interval = "60s"
rules = [
{
alert = "example-alert-name"
expression = "kube_node_status_condition{condition=\"Ready\", status=\"false\"} > 0"
for = "60s"
labels = {
severity = "critical"
},
annotations = {
summary : "example summary"
description : "example description"
}
},
{
alert = "example-alert-name-2"
expression = "kube_node_status_condition{condition=\"Ready\", status=\"false\"} > 0"
for = "1m"
labels = {
severity = "critical"
},
annotations = {
summary : "example summary"
description : "example description"
}
},
]
}
```
<!-- schema generated by tfplugindocs -->
## Schema
### Required
- `instance_id` (String) Observability instance ID to which the alert group is associated.
- `name` (String) The name of the alert group. Is the identifier and must be unique in the group.
- `project_id` (String) STACKIT project ID to which the alert group is associated.
- `rules` (Attributes List) Rules for the alert group (see [below for nested schema](#nestedatt--rules))
### Optional
- `interval` (String) Specifies the frequency at which rules within the group are evaluated. The interval must be at least 60 seconds and defaults to 60 seconds if not set. Supported formats include hours, minutes, and seconds, either singly or in combination. Examples of valid formats are: '5h30m40s', '5h', '5h30m', '60m', and '60s'.
### Read-Only
- `id` (String) Terraform's internal resource ID. It is structured as "`project_id`,`instance_id`,`name`".
<a id="nestedatt--rules"></a>
### Nested Schema for `rules`
Required:
- `alert` (String) The name of the alert rule. Is the identifier and must be unique in the group.
- `expression` (String) The PromQL expression to evaluate. Every evaluation cycle this is evaluated at the current time, and all resultant time series become pending/firing alerts.
Optional:
- `annotations` (Map of String) A map of key:value. Annotations to add or overwrite for each alert
- `for` (String) Alerts are considered firing once they have been returned for this long. Alerts which have not yet fired for long enough are considered pending. Default is 0s
- `labels` (Map of String) A map of key:value. Labels to add or overwrite for each alert

View file

@ -0,0 +1,5 @@
# Looks up an existing alert group; project_id, instance_id and name together identify it.
data "stackit_observability_alertgroup" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
name = "example-alert-group"
}

View file

@ -0,0 +1,32 @@
# Alert group with two rules. Both rules use the same expression and differ
# only in their alert name and in how the "for" duration is written.
resource "stackit_observability_alertgroup" "example" {
  project_id  = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
  instance_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
  name        = "example-alert-group"
  interval    = "60s"
  rules = [
    {
      alert      = "example-alert-name"
      expression = "kube_node_status_condition{condition=\"Ready\", status=\"false\"} > 0"
      for        = "60s"
      labels = {
        severity = "critical"
      },
      annotations = {
        summary     = "example summary"
        description = "example description"
      }
    },
    {
      alert      = "example-alert-name-2"
      expression = "kube_node_status_condition{condition=\"Ready\", status=\"false\"} > 0"
      for        = "1m"
      labels = {
        severity = "critical"
      },
      annotations = {
        summary     = "example summary"
        description = "example description"
      }
    },
  ]
}

View file

@ -0,0 +1,187 @@
package alertgroup
import (
"context"
"errors"
"fmt"
"net/http"
"github.com/hashicorp/terraform-plugin-framework-validators/stringvalidator"
"github.com/hashicorp/terraform-plugin-framework/datasource"
"github.com/hashicorp/terraform-plugin-framework/datasource/schema"
"github.com/hashicorp/terraform-plugin-framework/schema/validator"
"github.com/hashicorp/terraform-plugin-framework/types"
"github.com/hashicorp/terraform-plugin-log/tflog"
"github.com/stackitcloud/stackit-sdk-go/core/config"
"github.com/stackitcloud/stackit-sdk-go/core/oapierror"
"github.com/stackitcloud/stackit-sdk-go/services/observability"
"github.com/stackitcloud/terraform-provider-stackit/stackit/internal/core"
"github.com/stackitcloud/terraform-provider-stackit/stackit/internal/validate"
)
// Ensure the implementation satisfies the expected interfaces.
// DataSourceWithConfigure is asserted as well because this type implements
// Configure; the compile-time check guards against signature drift.
var (
	_ datasource.DataSource              = &alertGroupDataSource{}
	_ datasource.DataSourceWithConfigure = &alertGroupDataSource{}
)
// NewAlertGroupDataSource returns a fresh alert group data source whose API
// client has not been set yet.
func NewAlertGroupDataSource() datasource.DataSource {
	ds := new(alertGroupDataSource)
	return ds
}
// alertGroupDataSource is the datasource implementation.
// It carries the Observability API client that Configure assigns and Read uses.
type alertGroupDataSource struct {
// client is assigned by Configure; it stays nil until that call succeeds.
client *observability.APIClient
}
// Configure builds the Observability API client from the provider data and
// stores it on the data source. A custom endpoint, when set, takes precedence
// over the provider region.
func (a *alertGroupDataSource) Configure(ctx context.Context, req datasource.ConfigureRequest, resp *datasource.ConfigureResponse) {
	// Nothing to do while the provider itself has not been configured.
	if req.ProviderData == nil {
		return
	}

	providerData, isProviderData := req.ProviderData.(core.ProviderData)
	if !isProviderData {
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error configuring API client", fmt.Sprintf("Expected configure type stackit.ProviderData, got %T", req.ProviderData))
		return
	}

	// Authentication is always the same; only the endpoint selection differs.
	clientOptions := []config.ConfigurationOption{
		config.WithCustomAuth(providerData.RoundTripper),
	}
	if customEndpoint := providerData.ObservabilityCustomEndpoint; customEndpoint != "" {
		clientOptions = append(clientOptions, config.WithEndpoint(customEndpoint))
	} else {
		clientOptions = append(clientOptions, config.WithRegion(providerData.GetRegion()))
	}

	apiClient, err := observability.NewAPIClient(clientOptions...)
	if err != nil {
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error configuring API client", fmt.Sprintf("Configuring client: %v. This is an error related to the provider configuration, not to the resource configuration", err))
		return
	}

	a.client = apiClient
	tflog.Info(ctx, "Observability alert group client configured")
}
// Metadata sets the data source type name: the provider type name followed by
// the alert group suffix.
func (a *alertGroupDataSource) Metadata(_ context.Context, req datasource.MetadataRequest, resp *datasource.MetadataResponse) {
	const typeSuffix = "_observability_alertgroup"
	resp.TypeName = req.ProviderTypeName + typeSuffix
}
// Schema defines the schema for the alert group data source.
//
// project_id, instance_id and name are the required lookup keys; every other
// attribute is computed. Attribute descriptions come from the package-level
// descriptions map.
func (a *alertGroupDataSource) Schema(_ context.Context, _ datasource.SchemaRequest, resp *datasource.SchemaResponse) {
	resp.Schema = schema.Schema{
		// This is the data source, so the description must not call it a resource schema.
		Description: "Observability alert group data source schema. Must have a `region` specified in the provider configuration.",
		Attributes: map[string]schema.Attribute{
			"id": schema.StringAttribute{
				Description: descriptions["id"],
				Computed:    true,
			},
			"project_id": schema.StringAttribute{
				Description: descriptions["project_id"],
				Required:    true,
				Validators: []validator.String{
					validate.UUID(),
					validate.NoSeparator(),
				},
			},
			"instance_id": schema.StringAttribute{
				Description: descriptions["instance_id"],
				Required:    true,
				Validators: []validator.String{
					validate.UUID(),
					validate.NoSeparator(),
				},
			},
			"name": schema.StringAttribute{
				Description: descriptions["name"],
				Required:    true,
				Validators: []validator.String{
					validate.NoSeparator(),
					stringvalidator.LengthBetween(1, 200),
				},
			},
			"interval": schema.StringAttribute{
				Description: descriptions["interval"],
				Computed:    true,
				// NOTE(review): this attribute is computed-only, so the validator
				// presumably never sees a configured value — confirm and drop if so.
				Validators: []validator.String{
					validate.ValidDurationString(),
				},
			},
			"rules": schema.ListNestedAttribute{
				Description: descriptions["rules"],
				Computed:    true,
				NestedObject: schema.NestedAttributeObject{
					Attributes: map[string]schema.Attribute{
						"alert": schema.StringAttribute{
							Description: descriptions["alert"],
							Computed:    true,
						},
						"expression": schema.StringAttribute{
							Description: descriptions["expression"],
							Computed:    true,
						},
						"for": schema.StringAttribute{
							Description: descriptions["for"],
							Computed:    true,
						},
						"labels": schema.MapAttribute{
							Description: descriptions["labels"],
							ElementType: types.StringType,
							Computed:    true,
						},
						"annotations": schema.MapAttribute{
							Description: descriptions["annotations"],
							ElementType: types.StringType,
							Computed:    true,
						},
					},
				},
			},
		},
	}
}
// Read looks up the alert group identified by project_id, instance_id and name
// in the data source configuration and writes the mapped result to state.
//
// A 404 from the API is reported as an explicit "not found" error. Removing
// the state without a diagnostic would leave the user without any message
// naming the alert group that could not be found.
func (a *alertGroupDataSource) Read(ctx context.Context, req datasource.ReadRequest, resp *datasource.ReadResponse) { // nolint:gocritic // function signature required by Terraform
	var model Model
	diags := req.Config.Get(ctx, &model)
	resp.Diagnostics.Append(diags...)
	if resp.Diagnostics.HasError() {
		return
	}

	projectId := model.ProjectId.ValueString()
	instanceId := model.InstanceId.ValueString()
	alertGroupName := model.Name.ValueString()
	ctx = tflog.SetField(ctx, "project_id", projectId)
	ctx = tflog.SetField(ctx, "alert_group_name", alertGroupName)
	ctx = tflog.SetField(ctx, "instance_id", instanceId)

	readAlertGroupResp, err := a.client.GetAlertgroup(ctx, alertGroupName, instanceId, projectId).Execute()
	if err != nil {
		var oapiErr *oapierror.GenericOpenAPIError
		if errors.As(err, &oapiErr) && oapiErr.StatusCode == http.StatusNotFound {
			// Tell the user what is missing, then clear the state as before.
			core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading alert group", fmt.Sprintf("Alert group %q not found in instance %q of project %q", alertGroupName, instanceId, projectId))
			resp.State.RemoveResource(ctx)
			return
		}
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading alert group", fmt.Sprintf("Calling API: %v", err))
		return
	}

	err = mapFields(ctx, readAlertGroupResp.Data, &model)
	if err != nil {
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading alert group", fmt.Sprintf("Error processing API response: %v", err))
		return
	}

	// Set the updated state.
	diags = resp.State.Set(ctx, &model)
	resp.Diagnostics.Append(diags...)
}

View file

@ -0,0 +1,577 @@
package alertgroup
import (
"context"
"errors"
"fmt"
"net/http"
"regexp"
"strings"
"github.com/hashicorp/terraform-plugin-framework-validators/mapvalidator"
"github.com/hashicorp/terraform-plugin-framework-validators/stringvalidator"
"github.com/hashicorp/terraform-plugin-framework/attr"
"github.com/hashicorp/terraform-plugin-framework/path"
"github.com/hashicorp/terraform-plugin-framework/resource"
"github.com/hashicorp/terraform-plugin-framework/resource/schema"
"github.com/hashicorp/terraform-plugin-framework/resource/schema/listplanmodifier"
"github.com/hashicorp/terraform-plugin-framework/resource/schema/planmodifier"
"github.com/hashicorp/terraform-plugin-framework/resource/schema/stringplanmodifier"
"github.com/hashicorp/terraform-plugin-framework/schema/validator"
"github.com/hashicorp/terraform-plugin-framework/types"
"github.com/hashicorp/terraform-plugin-framework/types/basetypes"
"github.com/hashicorp/terraform-plugin-log/tflog"
"github.com/stackitcloud/stackit-sdk-go/core/config"
"github.com/stackitcloud/stackit-sdk-go/core/oapierror"
"github.com/stackitcloud/stackit-sdk-go/services/observability"
"github.com/stackitcloud/terraform-provider-stackit/stackit/internal/conversion"
"github.com/stackitcloud/terraform-provider-stackit/stackit/internal/core"
"github.com/stackitcloud/terraform-provider-stackit/stackit/internal/utils"
"github.com/stackitcloud/terraform-provider-stackit/stackit/internal/validate"
)
// Compile-time checks that *alertGroupResource implements every framework
// interface the provider relies on.
var (
	_ resource.Resource                = (*alertGroupResource)(nil)
	_ resource.ResourceWithConfigure   = (*alertGroupResource)(nil)
	_ resource.ResourceWithImportState = (*alertGroupResource)(nil)
)
// Model is the Terraform representation of an alert group. The same struct is
// decoded from plan, state and data source configuration; the tfsdk tags match
// the attribute names declared in the schemas.
type Model struct {
Id types.String `tfsdk:"id"` // described in the schema as "project_id,instance_id,name"
ProjectId types.String `tfsdk:"project_id"`
InstanceId types.String `tfsdk:"instance_id"`
Name types.String `tfsdk:"name"`
Interval types.String `tfsdk:"interval"`
Rules types.List `tfsdk:"rules"` // list of objects; see rule / ruleTypes for the element shape
}
// rule is the Go view of one element of Model.Rules; its tfsdk tags match the
// nested attribute names of the "rules" schema attribute.
type rule struct {
Alert types.String `tfsdk:"alert"`
Annotations types.Map `tfsdk:"annotations"`
Labels types.Map `tfsdk:"labels"`
Expression types.String `tfsdk:"expression"`
For types.String `tfsdk:"for"`
}
// ruleTypes lists the attribute types of a single rule object, keyed by the
// same names as the tfsdk tags on rule.
// NOTE(review): presumably used when building the rules list value from the
// API response — the consumer is not visible in this part of the file.
var ruleTypes = map[string]attr.Type{
"alert": basetypes.StringType{},
"annotations": basetypes.MapType{ElemType: types.StringType},
"labels": basetypes.MapType{ElemType: types.StringType},
"expression": basetypes.StringType{},
"for": basetypes.StringType{},
}
// Descriptions for the resource and data source schemas are centralized here.
// Every key a schema looks up must exist: a missing key yields an empty
// description in the generated documentation.
var descriptions = map[string]string{
	"id":          "Terraform's internal resource ID. It is structured as \"`project_id`,`instance_id`,`name`\".",
	"project_id":  "STACKIT project ID to which the alert group is associated.",
	"instance_id": "Observability instance ID to which the alert group is associated.",
	"name":        "The name of the alert group. Is the identifier and must be unique in the group.",
	"interval":    "Specifies the frequency at which rules within the group are evaluated. The interval must be at least 60 seconds and defaults to 60 seconds if not set. Supported formats include hours, minutes, and seconds, either singly or in combination. Examples of valid formats are: '5h30m40s', '5h', '5h30m', '60m', and '60s'.",
	// The data source schema reads descriptions["rules"]; without this entry it rendered empty.
	"rules":       "Rules for the alert group",
	"alert":       "The name of the alert rule. Is the identifier and must be unique in the group.",
	"expression":  "The PromQL expression to evaluate. Every evaluation cycle this is evaluated at the current time, and all resultant time series become pending/firing alerts.",
	"for":         "Alerts are considered firing once they have been returned for this long. Alerts which have not yet fired for long enough are considered pending. Default is 0s",
	"labels":      "A map of key:value. Labels to add or overwrite for each alert",
	"annotations": "A map of key:value. Annotations to add or overwrite for each alert",
}
// NewAlertGroupResource returns a fresh alert group resource whose API client
// has not been set yet.
func NewAlertGroupResource() resource.Resource {
	res := new(alertGroupResource)
	return res
}
// alertGroupResource is the resource implementation.
// It carries the Observability API client that Configure assigns and that the
// Create, Read and Delete methods use.
type alertGroupResource struct {
// client is assigned by Configure; it stays nil until that call succeeds.
client *observability.APIClient
}
// Metadata sets the resource type name: the provider type name followed by
// the alert group suffix.
func (a *alertGroupResource) Metadata(_ context.Context, req resource.MetadataRequest, resp *resource.MetadataResponse) {
	const typeSuffix = "_observability_alertgroup"
	resp.TypeName = req.ProviderTypeName + typeSuffix
}
// Configure builds the Observability API client from the provider data and
// stores it on the resource. A custom endpoint, when set, takes precedence
// over the provider region.
func (a *alertGroupResource) Configure(ctx context.Context, req resource.ConfigureRequest, resp *resource.ConfigureResponse) {
	// Nothing to do while the provider itself has not been configured.
	if req.ProviderData == nil {
		return
	}

	providerData, isProviderData := req.ProviderData.(core.ProviderData)
	if !isProviderData {
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error configuring API client", fmt.Sprintf("Expected configure type stackit.ProviderData, got %T", req.ProviderData))
		return
	}

	// Authentication is always the same; only the endpoint selection differs.
	clientOptions := []config.ConfigurationOption{
		config.WithCustomAuth(providerData.RoundTripper),
	}
	if customEndpoint := providerData.ObservabilityCustomEndpoint; customEndpoint != "" {
		clientOptions = append(clientOptions, config.WithEndpoint(customEndpoint))
	} else {
		clientOptions = append(clientOptions, config.WithRegion(providerData.GetRegion()))
	}

	apiClient, err := observability.NewAPIClient(clientOptions...)
	if err != nil {
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error configuring API client", fmt.Sprintf("Configuring client: %v. This is an error related to the provider configuration, not to the resource configuration", err))
		return
	}

	a.client = apiClient
	tflog.Info(ctx, "Observability alert group client configured")
}
// Schema declares the alert group resource schema. Every configurable
// top-level attribute carries a RequiresReplace plan modifier, so any change
// to it plans a replacement instead of an in-place update.
func (a *alertGroupResource) Schema(_ context.Context, _ resource.SchemaRequest, resp *resource.SchemaResponse) {
	// Group names and alert names share the same allowed character set.
	identifierPattern := regexp.MustCompile(`^[a-zA-Z0-9-]+$`)
	identifierValidator := stringvalidator.RegexMatches(identifierPattern, "must match expression")

	// Shared by all top-level string attributes.
	replaceOnChange := []planmodifier.String{
		stringplanmodifier.RequiresReplace(),
	}

	ruleAttributes := map[string]schema.Attribute{
		"alert": schema.StringAttribute{
			Description: descriptions["alert"],
			Required:    true,
			Validators: []validator.String{
				identifierValidator,
				stringvalidator.LengthBetween(1, 200),
			},
		},
		"expression": schema.StringAttribute{
			Description: descriptions["expression"],
			Required:    true,
			Validators: []validator.String{
				stringvalidator.LengthBetween(1, 600),
			},
		},
		"for": schema.StringAttribute{
			Description: descriptions["for"],
			Optional:    true,
			Validators: []validator.String{
				stringvalidator.LengthBetween(2, 8),
				validate.ValidDurationString(),
			},
		},
		"labels": schema.MapAttribute{
			Description: descriptions["labels"],
			Optional:    true,
			ElementType: types.StringType,
			Validators: []validator.Map{
				mapvalidator.KeysAre(stringvalidator.LengthAtMost(200)),
				mapvalidator.ValueStringsAre(stringvalidator.LengthAtMost(200)),
				mapvalidator.SizeAtMost(10),
			},
		},
		"annotations": schema.MapAttribute{
			Description: descriptions["annotations"],
			Optional:    true,
			ElementType: types.StringType,
			Validators: []validator.Map{
				mapvalidator.KeysAre(stringvalidator.LengthAtMost(200)),
				mapvalidator.ValueStringsAre(stringvalidator.LengthAtMost(200)),
				mapvalidator.SizeAtMost(5),
			},
		},
	}

	topLevelAttributes := map[string]schema.Attribute{
		"id": schema.StringAttribute{
			Description: descriptions["id"],
			Computed:    true,
		},
		"project_id": schema.StringAttribute{
			Description: "STACKIT project ID to which the alert group is associated.",
			Required:    true,
			Validators: []validator.String{
				validate.UUID(),
				validate.NoSeparator(),
			},
			PlanModifiers: replaceOnChange,
		},
		"instance_id": schema.StringAttribute{
			Description: descriptions["instance_id"],
			Required:    true,
			Validators: []validator.String{
				validate.UUID(),
				validate.NoSeparator(),
			},
			PlanModifiers: replaceOnChange,
		},
		"name": schema.StringAttribute{
			Description: descriptions["name"],
			Required:    true,
			Validators: []validator.String{
				validate.NoSeparator(),
				stringvalidator.LengthBetween(1, 200),
				identifierValidator,
			},
			PlanModifiers: replaceOnChange,
		},
		// NOTE(review): the description mentions a 60s default, but the attribute
		// is Optional without Computed — confirm how an unset interval is mapped
		// back from the API response.
		"interval": schema.StringAttribute{
			Description: descriptions["interval"],
			Optional:    true,
			Validators: []validator.String{
				validate.ValidDurationString(),
			},
			PlanModifiers: replaceOnChange,
		},
		"rules": schema.ListNestedAttribute{
			Description: "Rules for the alert group",
			Required:    true,
			PlanModifiers: []planmodifier.List{
				listplanmodifier.RequiresReplace(),
			},
			NestedObject: schema.NestedAttributeObject{
				Attributes: ruleAttributes,
			},
		},
	}

	resp.Schema = schema.Schema{
		Description: "Observability alert group resource schema. Must have a `region` specified in the provider configuration.",
		Attributes:  topLevelAttributes,
	}
}
// Create creates the alert group described by the plan and stores the
// resulting object in the Terraform state.
//
// The create endpoint returns all alert groups of the instance, so the
// response is searched for the entry whose name matches the plan. A missing
// response body or a missing entry is reported as an error instead of being
// dereferenced or silently written to state.
func (a *alertGroupResource) Create(ctx context.Context, req resource.CreateRequest, resp *resource.CreateResponse) { // nolint:gocritic // function signature required by Terraform
	// Retrieve values from plan
	var model Model
	diags := req.Plan.Get(ctx, &model)
	resp.Diagnostics.Append(diags...)
	if resp.Diagnostics.HasError() {
		return
	}

	projectId := model.ProjectId.ValueString()
	instanceId := model.InstanceId.ValueString()
	alertGroupName := model.Name.ValueString()
	ctx = tflog.SetField(ctx, "project_id", projectId)
	ctx = tflog.SetField(ctx, "alert_group_name", alertGroupName)
	ctx = tflog.SetField(ctx, "instance_id", instanceId)

	payload, err := toCreatePayload(ctx, &model)
	if err != nil {
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating alertgroup", fmt.Sprintf("Creating API payload: %v", err))
		return
	}

	createAlertGroupResp, err := a.client.CreateAlertgroups(ctx, instanceId, projectId).CreateAlertgroupsPayload(*payload).Execute()
	if err != nil {
		// This is the API call failing, not the payload construction.
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating alertgroup", fmt.Sprintf("Calling API: %v", err))
		return
	}
	if createAlertGroupResp == nil || createAlertGroupResp.Data == nil {
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating alert group", "Processing API payload: response contains no alert groups")
		return
	}

	// all alert groups are returned. We have to search the list for the one corresponding to our name
	found := false
	for _, alertGroup := range *createAlertGroupResp.Data {
		// Entries without a name cannot be the one just created.
		if alertGroup.Name == nil || *alertGroup.Name != alertGroupName {
			continue
		}
		err = mapFields(ctx, &alertGroup, &model)
		if err != nil {
			core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating alert group", fmt.Sprintf("Processing API payload: %v", err))
			return
		}
		found = true
		break
	}
	if !found {
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error creating alert group", fmt.Sprintf("Processing API payload: alert group %q not found in API response", alertGroupName))
		return
	}

	// Set the state with fully populated data.
	diags = resp.State.Set(ctx, model)
	resp.Diagnostics.Append(diags...)
	if resp.Diagnostics.HasError() {
		return
	}
	tflog.Info(ctx, "alert group created")
}
// Read re-fetches the alert group named in the current state and overwrites
// the state with the API's view. A 404 removes the resource from state; any
// other API error is reported as a diagnostic.
func (a *alertGroupResource) Read(ctx context.Context, req resource.ReadRequest, resp *resource.ReadResponse) { // nolint:gocritic // function signature required by Terraform
	var state Model
	resp.Diagnostics.Append(req.State.Get(ctx, &state)...)
	if resp.Diagnostics.HasError() {
		return
	}

	var (
		projectId      = state.ProjectId.ValueString()
		instanceId     = state.InstanceId.ValueString()
		alertGroupName = state.Name.ValueString()
	)
	ctx = tflog.SetField(ctx, "project_id", projectId)
	ctx = tflog.SetField(ctx, "alert_group_name", alertGroupName)
	ctx = tflog.SetField(ctx, "instance_id", instanceId)

	apiResp, err := a.client.GetAlertgroup(ctx, alertGroupName, instanceId, projectId).Execute()
	if err != nil {
		var oapiErr *oapierror.GenericOpenAPIError
		if errors.As(err, &oapiErr) && oapiErr.StatusCode == http.StatusNotFound {
			// The alert group no longer exists remotely.
			resp.State.RemoveResource(ctx)
			return
		}
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading alert group", fmt.Sprintf("Calling API: %v", err))
		return
	}

	if mapErr := mapFields(ctx, apiResp.Data, &state); mapErr != nil {
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error reading alert group", fmt.Sprintf("Error processing API response: %v", mapErr))
		return
	}

	// Persist the refreshed values.
	resp.Diagnostics.Append(resp.State.Set(ctx, &state)...)
}
// Update always reports an error: alert groups cannot be modified in place.
// Every configurable attribute in the schema requires replacement, so
// Terraform plans a destroy-and-create for any change and this method is not
// expected to be reached.
func (a *alertGroupResource) Update(ctx context.Context, _ resource.UpdateRequest, resp *resource.UpdateResponse) { // nolint:gocritic // function signature required by Terraform
	const (
		summary = "Error updating alert group"
		detail  = "Observability alert groups can't be updated"
	)
	core.LogAndAddError(ctx, &resp.Diagnostics, summary, detail)
}
// Delete removes the alert group identified by the current state through the
// API. Any API error is reported as a diagnostic.
func (a *alertGroupResource) Delete(ctx context.Context, req resource.DeleteRequest, resp *resource.DeleteResponse) { // nolint:gocritic // function signature required by Terraform
	// Identify the alert group from state.
	var state Model
	resp.Diagnostics.Append(req.State.Get(ctx, &state)...)
	if resp.Diagnostics.HasError() {
		return
	}

	var (
		projectId      = state.ProjectId.ValueString()
		instanceId     = state.InstanceId.ValueString()
		alertGroupName = state.Name.ValueString()
	)
	ctx = tflog.SetField(ctx, "project_id", projectId)
	ctx = tflog.SetField(ctx, "alert_group_name", alertGroupName)
	ctx = tflog.SetField(ctx, "instance_id", instanceId)

	if _, err := a.client.DeleteAlertgroup(ctx, alertGroupName, instanceId, projectId).Execute(); err != nil {
		core.LogAndAddError(ctx, &resp.Diagnostics, "Error deleting alert group", fmt.Sprintf("Calling API: %v", err))
		return
	}

	tflog.Info(ctx, "Alert group deleted")
}
// ImportState imports an existing alert group into the Terraform state.
//
// The import identifier must have the format project_id,instance_id,name,
// i.e. exactly three non-empty parts joined by core.Separator. On a malformed
// identifier an error diagnostic is added and no attribute is written.
// Otherwise the three parts are stored in the project_id, instance_id and name
// attributes of the state.
func (a *alertGroupResource) ImportState(ctx context.Context, req resource.ImportStateRequest, resp *resource.ImportStateResponse) {
	idParts := strings.Split(req.ID, core.Separator)
	if len(idParts) != 3 || idParts[0] == "" || idParts[1] == "" || idParts[2] == "" {
		// The summary names the alert group resource; it previously referred to
		// "scrape config", a leftover from the resource this code was copied from.
		core.LogAndAddError(ctx, &resp.Diagnostics,
			"Error importing alert group",
			fmt.Sprintf("Expected import identifier with format: [project_id],[instance_id],[name] Got: %q", req.ID),
		)
		return
	}

	resp.Diagnostics.Append(resp.State.SetAttribute(ctx, path.Root("project_id"), idParts[0])...)
	resp.Diagnostics.Append(resp.State.SetAttribute(ctx, path.Root("instance_id"), idParts[1])...)
	resp.Diagnostics.Append(resp.State.SetAttribute(ctx, path.Root("name"), idParts[2])...)
	tflog.Info(ctx, "Observability alert group state imported")
}
// toCreatePayload builds the API payload for creating an alert group from the
// Terraform model. Only attributes that are defined in the model (neither null
// nor unknown, as decided by utils.IsUndefined) are copied into the payload.
// It returns an error for a nil model or when the rules cannot be converted.
func toCreatePayload(ctx context.Context, model *Model) (*observability.CreateAlertgroupsPayload, error) {
	if model == nil {
		return nil, fmt.Errorf("nil model")
	}

	result := &observability.CreateAlertgroupsPayload{}

	if !utils.IsUndefined(model.Name) {
		result.Name = model.Name.ValueStringPointer()
	}
	if !utils.IsUndefined(model.Interval) {
		result.Interval = model.Interval.ValueStringPointer()
	}

	// Without rules in the model the payload is complete at this point.
	if utils.IsUndefined(model.Rules) {
		return result, nil
	}

	rulesPayload, err := toRulesPayload(ctx, model)
	if err != nil {
		return nil, err
	}
	result.Rules = &rulesPayload

	return result, nil
}
// toRulesPayload converts the rules list of the model into the SDK rule
// payload used when creating an alert group.
//
// A rules list without elements yields an empty, non-nil slice. For every rule
// only the attributes that are defined (see utils.IsUndefined) are copied;
// labels and annotations are converted with conversion.ToStringInterfaceMap.
//
// An error is returned when the list elements cannot be decoded into rule
// values, when a defined string attribute unexpectedly converts to a nil
// pointer, or when a labels/annotations map cannot be converted.
func toRulesPayload(ctx context.Context, model *Model) ([]observability.UpdateAlertgroupsRequestInnerRulesInner, error) {
	// len() of a nil slice is 0, so a separate nil check is not needed.
	if len(model.Rules.Elements()) == 0 {
		return []observability.UpdateAlertgroupsRequestInnerRulesInner{}, nil
	}

	var rules []rule
	diags := model.Rules.ElementsAs(ctx, &rules, false)
	if diags.HasError() {
		return nil, core.DiagsToError(diags)
	}

	var oarrs []observability.UpdateAlertgroupsRequestInnerRulesInner
	for i := range rules {
		rule := &rules[i]
		oarr := observability.UpdateAlertgroupsRequestInnerRulesInner{}

		if !utils.IsUndefined(rule.Alert) {
			alert := conversion.StringValueToPointer(rule.Alert)
			if alert == nil {
				return nil, fmt.Errorf("found nil alert for rule[%d]", i)
			}
			oarr.Alert = alert
		}

		if !utils.IsUndefined(rule.Expression) {
			expression := conversion.StringValueToPointer(rule.Expression)
			if expression == nil {
				return nil, fmt.Errorf("found nil expression for rule[%d]", i)
			}
			oarr.Expr = expression
		}

		if !utils.IsUndefined(rule.For) {
			for_ := conversion.StringValueToPointer(rule.For)
			if for_ == nil {
				// Name the attribute that is actually nil ("for") and index the
				// rule, in line with the alert/expression messages above.
				return nil, fmt.Errorf("found nil for for rule[%d]", i)
			}
			oarr.For = for_
		}

		if !utils.IsUndefined(rule.Labels) {
			labels, err := conversion.ToStringInterfaceMap(ctx, rule.Labels)
			if err != nil {
				return nil, fmt.Errorf("converting to Go map: %w", err)
			}
			oarr.Labels = &labels
		}

		if !utils.IsUndefined(rule.Annotations) {
			annotations, err := conversion.ToStringInterfaceMap(ctx, rule.Annotations)
			if err != nil {
				return nil, fmt.Errorf("converting to Go map: %w", err)
			}
			oarr.Annotations = &annotations
		}

		oarrs = append(oarrs, oarr)
	}

	return oarrs, nil
}
// mapFields maps an alert group API response onto the Terraform model.
//
// The model must already carry name, project_id and instance_id; they are used
// to build the internal ID "project_id,instance_id,name" (joined with
// core.Separator). The name is always taken from the model and therefore left
// as it is. The interval keeps the model's value when one is defined and
// otherwise falls back to the value from the API response. Rules are mapped
// via mapRules only when the response contains a rules list; otherwise the
// model's rules are left unchanged.
//
// An error is returned for a nil alertGroup or model, for a missing name,
// project ID or instance ID in the model, when neither the model nor the
// response provides an interval, or when mapping the rules fails.
func mapFields(ctx context.Context, alertGroup *observability.AlertGroup, model *Model) error {
	if alertGroup == nil {
		return fmt.Errorf("nil alertGroup")
	}
	if model == nil {
		return fmt.Errorf("nil model")
	}
	if utils.IsUndefined(model.Name) {
		return fmt.Errorf("empty name")
	}
	if utils.IsUndefined(model.ProjectId) {
		return fmt.Errorf("empty projectId")
	}
	if utils.IsUndefined(model.InstanceId) {
		return fmt.Errorf("empty instanceId")
	}

	// model.Name is guaranteed to be defined by the guard above, so no fallback
	// to alertGroup.Name is needed (the former fallback branch was unreachable).
	name := model.Name.ValueString()

	idParts := []string{model.ProjectId.ValueString(), model.InstanceId.ValueString(), name}
	model.Id = types.StringValue(strings.Join(idParts, core.Separator))

	// A value already present in the model takes precedence over the value
	// reported by the API.
	var interval string
	switch {
	case !utils.IsUndefined(model.Interval):
		interval = model.Interval.ValueString()
	case alertGroup.Interval != nil:
		interval = *alertGroup.Interval
	default:
		return fmt.Errorf("found empty interval")
	}
	model.Interval = types.StringValue(interval)

	if alertGroup.Rules != nil {
		if err := mapRules(ctx, alertGroup, model); err != nil {
			return fmt.Errorf("map rules: %w", err)
		}
	}

	return nil
}
// mapRules converts the rules of an alert group API response into a Terraform
// list of rule objects and stores that list in model.Rules.
//
// alertGroup.Rules is dereferenced without a nil check, so callers must make
// sure it is set. Labels and annotations that are absent in the response are
// mapped to null maps. An error is returned when a rule object or the final
// list cannot be built.
func mapRules(_ context.Context, alertGroup *observability.AlertGroup, model *Model) error {
	// stringMap turns an optional Go string map into a Terraform string map,
	// using a null map when the source is absent.
	stringMap := func(src *map[string]string) types.Map {
		if src == nil {
			return types.MapNull(types.StringType)
		}
		elems := map[string]attr.Value{}
		for key, val := range *src {
			elems[key] = types.StringValue(val)
		}
		return types.MapValueMust(types.StringType, elems)
	}

	records := *alertGroup.Rules

	var mapped []attr.Value
	for idx := range records {
		record := records[idx]

		attributes := map[string]attr.Value{
			"alert":       types.StringPointerValue(record.Alert),
			"expression":  types.StringPointerValue(record.Expr),
			"for":         types.StringPointerValue(record.For),
			"labels":      stringMap(record.Labels),
			"annotations": stringMap(record.Annotations),
		}

		object, objDiags := types.ObjectValue(ruleTypes, attributes)
		if objDiags.HasError() {
			return fmt.Errorf("mapping index %d: %w", idx, core.DiagsToError(objDiags))
		}
		mapped = append(mapped, object)
	}

	list, listDiags := types.ListValue(types.ObjectType{AttrTypes: ruleTypes}, mapped)
	if listDiags.HasError() {
		return core.DiagsToError(listDiags)
	}

	model.Rules = list
	return nil
}

View file

@ -0,0 +1,366 @@
package alertgroup
import (
"context"
"testing"
"github.com/google/go-cmp/cmp"
"github.com/hashicorp/terraform-plugin-framework/attr"
"github.com/hashicorp/terraform-plugin-framework/types"
"github.com/stackitcloud/stackit-sdk-go/core/utils"
"github.com/stackitcloud/stackit-sdk-go/services/observability"
)
func TestToCreatePayload(t *testing.T) {
tests := []struct {
name string
input *Model
expect *observability.CreateAlertgroupsPayload
expectErr bool
}{
{
name: "Nil Model",
input: nil,
expect: nil,
expectErr: true,
},
{
name: "Empty Model",
input: &Model{
Name: types.StringNull(),
Interval: types.StringNull(),
Rules: types.ListNull(types.StringType),
},
expect: &observability.CreateAlertgroupsPayload{},
expectErr: false,
},
{
name: "Model with Name and Interval",
input: &Model{
Name: types.StringValue("test-alertgroup"),
Interval: types.StringValue("5m"),
},
expect: &observability.CreateAlertgroupsPayload{
Name: utils.Ptr("test-alertgroup"),
Interval: utils.Ptr("5m"),
},
expectErr: false,
},
{
name: "Model with Full Information",
input: &Model{
Name: types.StringValue("full-alertgroup"),
Interval: types.StringValue("10m"),
Rules: types.ListValueMust(
types.ObjectType{AttrTypes: ruleTypes},
[]attr.Value{
types.ObjectValueMust(
ruleTypes,
map[string]attr.Value{
"alert": types.StringValue("alert"),
"expression": types.StringValue("expression"),
"for": types.StringValue("10s"),
"labels": types.MapValueMust(
types.StringType,
map[string]attr.Value{
"k": types.StringValue("v"),
},
),
"annotations": types.MapValueMust(
types.StringType,
map[string]attr.Value{
"k": types.StringValue("v"),
},
),
},
),
},
),
},
expect: &observability.CreateAlertgroupsPayload{
Name: utils.Ptr("full-alertgroup"),
Interval: utils.Ptr("10m"),
Rules: &[]observability.UpdateAlertgroupsRequestInnerRulesInner{
{
Alert: utils.Ptr("alert"),
Annotations: &map[string]interface{}{
"k": "v",
},
Expr: utils.Ptr("expression"),
For: utils.Ptr("10s"),
Labels: &map[string]interface{}{
"k": "v",
},
},
},
},
expectErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
ctx := context.Background()
got, err := toCreatePayload(ctx, tt.input)
if (err != nil) != tt.expectErr {
t.Fatalf("expected error: %v, got: %v", tt.expectErr, err)
}
if diff := cmp.Diff(got, tt.expect); diff != "" {
t.Errorf("unexpected result (-got +want):\n%s", diff)
}
})
}
}
func TestToRulesPayload(t *testing.T) {
tests := []struct {
name string
input *Model
expect []observability.UpdateAlertgroupsRequestInnerRulesInner
expectErr bool
}{
{
name: "Nil Rules",
input: &Model{
Rules: types.ListNull(types.StringType), // Simulates a lack of rules
},
expect: []observability.UpdateAlertgroupsRequestInnerRulesInner{},
expectErr: false,
},
{
name: "Invalid Rule Element Type",
input: &Model{
Rules: types.ListValueMust(types.StringType, []attr.Value{
types.StringValue("invalid"), // Should cause a conversion failure
}),
},
expect: nil,
expectErr: true,
},
{
name: "Single Valid Rule",
input: &Model{
Rules: types.ListValueMust(types.ObjectType{AttrTypes: ruleTypes}, []attr.Value{
types.ObjectValueMust(ruleTypes, map[string]attr.Value{
"alert": types.StringValue("alert"),
"expression": types.StringValue("expr"),
"for": types.StringValue("5s"),
"labels": types.MapValueMust(types.StringType, map[string]attr.Value{
"key": types.StringValue("value"),
}),
"annotations": types.MapValueMust(types.StringType, map[string]attr.Value{
"note": types.StringValue("important"),
}),
}),
}),
},
expect: []observability.UpdateAlertgroupsRequestInnerRulesInner{
{
Alert: utils.Ptr("alert"),
Expr: utils.Ptr("expr"),
For: utils.Ptr("5s"),
Labels: &map[string]interface{}{
"key": "value",
},
Annotations: &map[string]interface{}{
"note": "important",
},
},
},
expectErr: false,
},
{
name: "Multiple Valid Rules",
input: &Model{
Rules: types.ListValueMust(types.ObjectType{AttrTypes: ruleTypes}, []attr.Value{
types.ObjectValueMust(ruleTypes, map[string]attr.Value{
"alert": types.StringValue("alert1"),
"expression": types.StringValue("expr1"),
"for": types.StringValue("5s"),
"labels": types.MapNull(types.StringType),
"annotations": types.MapNull(types.StringType),
}),
types.ObjectValueMust(ruleTypes, map[string]attr.Value{
"alert": types.StringValue("alert2"),
"expression": types.StringValue("expr2"),
"for": types.StringValue("10s"),
"labels": types.MapValueMust(types.StringType, map[string]attr.Value{
"key": types.StringValue("value"),
}),
"annotations": types.MapValueMust(types.StringType, map[string]attr.Value{
"note": types.StringValue("important"),
}),
}),
}),
},
expect: []observability.UpdateAlertgroupsRequestInnerRulesInner{
{
Alert: utils.Ptr("alert1"),
Expr: utils.Ptr("expr1"),
For: utils.Ptr("5s"),
},
{
Alert: utils.Ptr("alert2"),
Expr: utils.Ptr("expr2"),
For: utils.Ptr("10s"),
Labels: &map[string]interface{}{
"key": "value",
},
Annotations: &map[string]interface{}{
"note": "important",
},
},
},
expectErr: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
ctx := context.Background()
got, err := toRulesPayload(ctx, tt.input)
if (err != nil) != tt.expectErr {
t.Fatalf("expected error: %v, got: %v", tt.expectErr, err)
}
if diff := cmp.Diff(got, tt.expect); diff != "" {
t.Errorf("unexpected result (-got +want):\n%s", diff)
}
})
}
}
// TestMapFields checks the guard clauses of mapFields (nil inputs, missing
// name, missing project ID, missing interval) and, for a complete model and
// response, the resulting name and internal ID.
func TestMapFields(t *testing.T) {
	tests := []struct {
		name         string
		alertGroup   *observability.AlertGroup
		model        *Model
		expectedName string
		expectedID   string
		expectErr    bool
	}{
		{
			name:       "Nil AlertGroup",
			alertGroup: nil,
			model:      &Model{},
			expectErr:  true,
		},
		{
			name:       "Nil Model",
			alertGroup: &observability.AlertGroup{},
			model:      nil,
			expectErr:  true,
		},
		{
			// Neither the model nor the response provides an interval.
			name: "Interval Missing",
			alertGroup: &observability.AlertGroup{
				Name: utils.Ptr("alert-group-name"),
			},
			model: &Model{
				Name:       types.StringValue("alert-group-name"),
				ProjectId:  types.StringValue("project1"),
				InstanceId: types.StringValue("instance1"),
			},
			expectErr: true,
		},
		{
			// The model carries no name, so the name guard must reject it even
			// though the response has one.
			name: "Name Missing",
			alertGroup: &observability.AlertGroup{
				Name:     utils.Ptr("alert-group-name"),
				Interval: utils.Ptr("5m"),
			},
			model: &Model{
				ProjectId:  types.StringValue("project1"),
				InstanceId: types.StringValue("instance1"),
			},
			expectErr: true,
		},
		{
			// Formerly labeled "Name Missing": the name is set here, the error
			// is caused by the missing project ID.
			name: "ProjectId Missing",
			alertGroup: &observability.AlertGroup{
				Interval: utils.Ptr("5m"),
			},
			model: &Model{
				Name:       types.StringValue("model-name"),
				InstanceId: types.StringValue("instance1"),
			},
			expectErr: true,
		},
		{
			name: "Complete Model and AlertGroup",
			alertGroup: &observability.AlertGroup{
				Name:     utils.Ptr("alert-group-name"),
				Interval: utils.Ptr("10m"),
			},
			model: &Model{
				Name:       types.StringValue("alert-group-name"),
				ProjectId:  types.StringValue("project1"),
				InstanceId: types.StringValue("instance1"),
				Id:         types.StringValue("project1,instance1,alert-group-name"),
				Interval:   types.StringValue("10m"),
			},
			expectedName: "alert-group-name",
			expectedID:   "project1,instance1,alert-group-name",
			expectErr:    false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			ctx := context.Background()
			err := mapFields(ctx, tt.alertGroup, tt.model)
			if (err != nil) != tt.expectErr {
				t.Fatalf("expected error: %v, got: %v", tt.expectErr, err)
			}

			// The mapped values are only meaningful when mapping succeeded.
			if !tt.expectErr {
				if diff := cmp.Diff(tt.model.Name.ValueString(), tt.expectedName); diff != "" {
					t.Errorf("unexpected name (-got +want):\n%s", diff)
				}
				if diff := cmp.Diff(tt.model.Id.ValueString(), tt.expectedID); diff != "" {
					t.Errorf("unexpected ID (-got +want):\n%s", diff)
				}
			}
		})
	}
}
// TestMapRules checks that mapRules succeeds for an empty rules list and for a
// single fully populated rule, and that the model ends up with the expected
// number of rules.
func TestMapRules(t *testing.T) {
	tests := []struct {
		name          string
		alertGroup    *observability.AlertGroup
		model         *Model
		expectedRules int
		expectErr     bool
	}{
		{
			name: "Empty Rules",
			alertGroup: &observability.AlertGroup{
				Rules: &[]observability.AlertRuleRecord{},
			},
			model:         &Model{},
			expectedRules: 0,
			expectErr:     false,
		},
		{
			name: "Single Complete Rule",
			alertGroup: &observability.AlertGroup{
				Rules: &[]observability.AlertRuleRecord{
					{
						Alert:       utils.Ptr("HighCPUUsage"),
						Expr:        utils.Ptr("rate(cpu_usage[5m]) > 0.9"),
						For:         utils.Ptr("2m"),
						Labels:      &map[string]string{"severity": "critical"},
						Annotations: &map[string]string{"summary": "CPU usage high"},
						Record:      utils.Ptr("record1"),
					},
				},
			},
			model:         &Model{},
			expectedRules: 1,
			expectErr:     false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			ctx := context.Background()
			err := mapRules(ctx, tt.alertGroup, tt.model)
			if (err != nil) != tt.expectErr {
				// Report the error itself (not just whether one occurred), as
				// the other tests in this file do.
				t.Fatalf("expected error: %v, got: %v", tt.expectErr, err)
			}
			if tt.expectErr {
				return
			}

			// Verify that the rules were actually written to the model.
			if got := len(tt.model.Rules.Elements()); got != tt.expectedRules {
				t.Errorf("unexpected number of rules: got %d, want %d", got, tt.expectedRules)
			}
		})
	}
}

View file

@ -41,6 +41,17 @@ var scrapeConfigResource = map[string]string{
"saml2_enable_url_parameters": "false", "saml2_enable_url_parameters": "false",
} }
// alertGroupResource holds the attribute values used to render the
// stackit_observability_alertgroup configuration in the acceptance test.
// The two name values get a random 7-character alphanumeric suffix. The
// "*_updated" entries are the values passed to the config in the update step.
var alertGroupResource = map[string]string{
"name": fmt.Sprintf("alertgroup-%s", acctest.RandStringFromCharSet(7, acctest.CharSetAlphaNum)),
"name_updated": fmt.Sprintf("alertgroup-%s", acctest.RandStringFromCharSet(7, acctest.CharSetAlphaNum)),
"interval": "5h",
"interval_updated": "1h",
"alert": "alert1",
"expression": "expression1",
"expression_updated": "expression2",
"for": "60s",
}
var credentialResource = map[string]string{ var credentialResource = map[string]string{
"project_id": testutil.ProjectId, "project_id": testutil.ProjectId,
} }
@ -228,8 +239,31 @@ func credentialResourceConfig() string {
}` }`
} }
func resourceConfig(acl, metricsRetentionDays, metricsRetentionDays1hDownsampling, metricsRetentionDays5mDownsampling, alertConfig *string, instanceName, planName, target, saml2EnableUrlParameters string) string { func alertGroupResourceConfig(name, interval, expression string) string {
return fmt.Sprintf("%s\n\n%s\n\n%s\n\n%s", return fmt.Sprintf(
`resource "stackit_observability_alertgroup" "alertgroup" {
project_id = stackit_observability_instance.instance.project_id
instance_id = stackit_observability_instance.instance.instance_id
name = "%s"
interval = "%s"
rules = [
{
alert = "%s"
expression = "%s"
for = "%s"
}
]
}`,
name,
interval,
alertGroupResource["alert"],
expression,
alertGroupResource["for"],
)
}
func resourceConfig(acl, metricsRetentionDays, metricsRetentionDays1hDownsampling, metricsRetentionDays5mDownsampling, alertConfig *string, instanceName, planName, target, saml2EnableUrlParameters, alertGroupName, alertGroupInterval, alertGroupRule1Expression string) string {
return fmt.Sprintf("%s\n\n%s\n\n%s\n\n%s\n\n%s",
testutil.ObservabilityProviderConfig(), testutil.ObservabilityProviderConfig(),
instanceResourceConfig(acl, instanceResourceConfig(acl,
metricsRetentionDays, metricsRetentionDays,
@ -240,6 +274,7 @@ func resourceConfig(acl, metricsRetentionDays, metricsRetentionDays1hDownsamplin
planName), planName),
scrapeConfigResourceConfig(target, saml2EnableUrlParameters), scrapeConfigResourceConfig(target, saml2EnableUrlParameters),
credentialResourceConfig(), credentialResourceConfig(),
alertGroupResourceConfig(alertGroupName, alertGroupInterval, alertGroupRule1Expression),
) )
} }
@ -265,6 +300,9 @@ func TestAccResource(t *testing.T) {
instanceResource["plan_name"], instanceResource["plan_name"],
scrapeConfigResource["urls"], scrapeConfigResource["urls"],
scrapeConfigResource["saml2_enable_url_parameters"], scrapeConfigResource["saml2_enable_url_parameters"],
alertGroupResource["name"],
alertGroupResource["interval"],
alertGroupResource["expression"],
), ),
Check: resource.ComposeAggregateTestCheckFunc( Check: resource.ComposeAggregateTestCheckFunc(
// Instance data // Instance data
@ -349,6 +387,18 @@ func TestAccResource(t *testing.T) {
), ),
resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"),
resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"),
// alertgroup
resource.TestCheckResourceAttr("stackit_observability_alertgroup.alertgroup", "project_id", credentialResource["project_id"]),
resource.TestCheckResourceAttrPair(
"stackit_observability_instance.instance", "instance_id",
"stackit_observability_alertgroup.alertgroup", "instance_id",
),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "name"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "interval"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"),
), ),
}, },
// Update Alert Config with complete Receiver (email, webhook and opsgenie configs), global options and Route with child routes // Update Alert Config with complete Receiver (email, webhook and opsgenie configs), global options and Route with child routes
@ -368,6 +418,9 @@ func TestAccResource(t *testing.T) {
instanceResource["plan_name"], instanceResource["plan_name"],
scrapeConfigResource["urls"], scrapeConfigResource["urls"],
scrapeConfigResource["saml2_enable_url_parameters"], scrapeConfigResource["saml2_enable_url_parameters"],
alertGroupResource["name"],
alertGroupResource["interval"],
alertGroupResource["expression"],
), ),
Check: resource.ComposeAggregateTestCheckFunc( Check: resource.ComposeAggregateTestCheckFunc(
// Instance data // Instance data
@ -461,6 +514,18 @@ func TestAccResource(t *testing.T) {
), ),
resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"),
resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"),
// alertgroup
resource.TestCheckResourceAttr("stackit_observability_alertgroup.alertgroup", "project_id", credentialResource["project_id"]),
resource.TestCheckResourceAttrPair(
"stackit_observability_instance.instance", "instance_id",
"stackit_observability_alertgroup.alertgroup", "instance_id",
),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "name"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "interval"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"),
), ),
}, },
// Update without ACL, partial metrics retention days and NO alert configs // Update without ACL, partial metrics retention days and NO alert configs
@ -475,6 +540,9 @@ func TestAccResource(t *testing.T) {
instanceResource["plan_name"], instanceResource["plan_name"],
scrapeConfigResource["urls"], scrapeConfigResource["urls"],
scrapeConfigResource["saml2_enable_url_parameters"], scrapeConfigResource["saml2_enable_url_parameters"],
alertGroupResource["name"],
alertGroupResource["interval"],
alertGroupResource["expression"],
), ),
Check: resource.ComposeAggregateTestCheckFunc( Check: resource.ComposeAggregateTestCheckFunc(
// Instance data // Instance data
@ -530,6 +598,18 @@ func TestAccResource(t *testing.T) {
), ),
resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"),
resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"),
// alertgroup
resource.TestCheckResourceAttr("stackit_observability_alertgroup.alertgroup", "project_id", credentialResource["project_id"]),
resource.TestCheckResourceAttrPair(
"stackit_observability_instance.instance", "instance_id",
"stackit_observability_alertgroup.alertgroup", "instance_id",
),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "name"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "interval"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"),
), ),
}, },
// Update with empty ACL, NO metrics retention days and NO alert configs // Update with empty ACL, NO metrics retention days and NO alert configs
@ -544,6 +624,9 @@ func TestAccResource(t *testing.T) {
instanceResource["plan_name"], instanceResource["plan_name"],
scrapeConfigResource["urls"], scrapeConfigResource["urls"],
scrapeConfigResource["saml2_enable_url_parameters"], scrapeConfigResource["saml2_enable_url_parameters"],
alertGroupResource["name"],
alertGroupResource["interval"],
alertGroupResource["expression"],
), ),
Check: resource.ComposeAggregateTestCheckFunc( Check: resource.ComposeAggregateTestCheckFunc(
// Instance data // Instance data
@ -599,6 +682,18 @@ func TestAccResource(t *testing.T) {
), ),
resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"),
resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"),
// alertgroup
resource.TestCheckResourceAttr("stackit_observability_alertgroup.alertgroup", "project_id", credentialResource["project_id"]),
resource.TestCheckResourceAttrPair(
"stackit_observability_instance.instance", "instance_id",
"stackit_observability_alertgroup.alertgroup", "instance_id",
),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "name"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "interval"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"),
), ),
}, },
// Data source // Data source
@ -616,6 +711,12 @@ func TestAccResource(t *testing.T) {
instance_id = stackit_observability_scrapeconfig.scrapeconfig.instance_id instance_id = stackit_observability_scrapeconfig.scrapeconfig.instance_id
name = stackit_observability_scrapeconfig.scrapeconfig.name name = stackit_observability_scrapeconfig.scrapeconfig.name
} }
data "stackit_observability_alertgroup" "alertgroup" {
project_id = stackit_observability_alertgroup.alertgroup.project_id
instance_id = stackit_observability_alertgroup.alertgroup.instance_id
name = stackit_observability_alertgroup.alertgroup.name
}
`, `,
resourceConfig( resourceConfig(
utils.Ptr(fmt.Sprintf( utils.Ptr(fmt.Sprintf(
@ -631,6 +732,9 @@ func TestAccResource(t *testing.T) {
instanceResource["plan_name"], instanceResource["plan_name"],
scrapeConfigResource["urls"], scrapeConfigResource["urls"],
scrapeConfigResource["saml2_enable_url_parameters"], scrapeConfigResource["saml2_enable_url_parameters"],
alertGroupResource["name"],
alertGroupResource["interval"],
alertGroupResource["expression"],
), ),
), ),
Check: resource.ComposeAggregateTestCheckFunc( Check: resource.ComposeAggregateTestCheckFunc(
@ -670,6 +774,18 @@ func TestAccResource(t *testing.T) {
resource.TestCheckResourceAttr("data.stackit_observability_scrapeconfig.scrapeconfig", "scrape_interval", scrapeConfigResource["scrape_interval"]), resource.TestCheckResourceAttr("data.stackit_observability_scrapeconfig.scrapeconfig", "scrape_interval", scrapeConfigResource["scrape_interval"]),
resource.TestCheckResourceAttr("stackit_observability_scrapeconfig.scrapeconfig", "sample_limit", scrapeConfigResource["sample_limit"]), resource.TestCheckResourceAttr("stackit_observability_scrapeconfig.scrapeconfig", "sample_limit", scrapeConfigResource["sample_limit"]),
resource.TestCheckResourceAttr("data.stackit_observability_scrapeconfig.scrapeconfig", "saml2.enable_url_parameters", scrapeConfigResource["saml2_enable_url_parameters"]), resource.TestCheckResourceAttr("data.stackit_observability_scrapeconfig.scrapeconfig", "saml2.enable_url_parameters", scrapeConfigResource["saml2_enable_url_parameters"]),
// alertgroup
resource.TestCheckResourceAttr("data.stackit_observability_alertgroup.alertgroup", "project_id", credentialResource["project_id"]),
resource.TestCheckResourceAttrPair(
"stackit_observability_instance.instance", "instance_id",
"data.stackit_observability_alertgroup.alertgroup", "instance_id",
),
resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "name"),
resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "interval"),
resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "rules.0.alert"),
resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "rules.0.expression"),
resource.TestCheckResourceAttrSet("data.stackit_observability_alertgroup.alertgroup", "rules.0.for"),
), ),
}, },
// Import 1 // Import 1
@ -711,6 +827,27 @@ func TestAccResource(t *testing.T) {
ImportState: true, ImportState: true,
ImportStateVerify: true, ImportStateVerify: true,
}, },
// Import 3
{
ResourceName: "stackit_observability_alertgroup.alertgroup",
ImportStateIdFunc: func(s *terraform.State) (string, error) {
r, ok := s.RootModule().Resources["stackit_observability_alertgroup.alertgroup"]
if !ok {
return "", fmt.Errorf("couldn't find resource stackit_observability_alertgroup.alertgroup")
}
instanceId, ok := r.Primary.Attributes["instance_id"]
if !ok {
return "", fmt.Errorf("couldn't find attribute instance_id")
}
name, ok := r.Primary.Attributes["name"]
if !ok {
return "", fmt.Errorf("couldn't find attribute name")
}
return fmt.Sprintf("%s,%s,%s", testutil.ProjectId, instanceId, name), nil
},
ImportState: true,
ImportStateVerify: true,
},
// Update // Update
{ {
Config: resourceConfig( Config: resourceConfig(
@ -727,6 +864,9 @@ func TestAccResource(t *testing.T) {
instanceResource["new_plan_name"], instanceResource["new_plan_name"],
"", "",
"true", "true",
alertGroupResource["name_updated"],
alertGroupResource["interval_updated"],
alertGroupResource["expression_updated"],
), ),
Check: resource.ComposeAggregateTestCheckFunc( Check: resource.ComposeAggregateTestCheckFunc(
// Instance // Instance
@ -782,6 +922,18 @@ func TestAccResource(t *testing.T) {
// Credentials // Credentials
resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "username"),
resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"), resource.TestCheckResourceAttrSet("stackit_observability_credential.credential", "password"),
// alertgroup
resource.TestCheckResourceAttr("stackit_observability_alertgroup.alertgroup", "project_id", credentialResource["project_id"]),
resource.TestCheckResourceAttrPair(
"stackit_observability_instance.instance", "instance_id",
"stackit_observability_alertgroup.alertgroup", "instance_id",
),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "name"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "interval"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.alert"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.expression"),
resource.TestCheckResourceAttrSet("stackit_observability_alertgroup.alertgroup", "rules.0.for"),
), ),
}, },
// Update and remove saml2 attribute // Update and remove saml2 attribute

View file

@ -13,7 +13,7 @@ import (
"github.com/hashicorp/terraform-plugin-framework/schema/validator" "github.com/hashicorp/terraform-plugin-framework/schema/validator"
"github.com/hashicorp/terraform-plugin-framework/types" "github.com/hashicorp/terraform-plugin-framework/types"
"github.com/stackitcloud/terraform-provider-stackit/stackit/internal/features" "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/features"
roleassignments "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/authorization/roleassignments" roleAssignements "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/authorization/roleassignments"
dnsRecordSet "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/dns/recordset" dnsRecordSet "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/dns/recordset"
dnsZone "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/dns/zone" dnsZone "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/dns/zone"
iaasAffinityGroup "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/iaas/affinitygroup" iaasAffinityGroup "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/iaas/affinitygroup"
@ -45,6 +45,7 @@ import (
objectStorageBucket "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/objectstorage/bucket" objectStorageBucket "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/objectstorage/bucket"
objecStorageCredential "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/objectstorage/credential" objecStorageCredential "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/objectstorage/credential"
objecStorageCredentialsGroup "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/objectstorage/credentialsgroup" objecStorageCredentialsGroup "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/objectstorage/credentialsgroup"
alertGroup "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/alertgroup"
observabilityCredential "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/credential" observabilityCredential "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/credential"
observabilityInstance "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/instance" observabilityInstance "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/instance"
observabilityScrapeConfig "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/scrapeconfig" observabilityScrapeConfig "github.com/stackitcloud/terraform-provider-stackit/stackit/internal/services/observability/scrapeconfig"
@ -462,6 +463,7 @@ func (p *Provider) Configure(ctx context.Context, req provider.ConfigureRequest,
// DataSources defines the data sources implemented in the provider. // DataSources defines the data sources implemented in the provider.
func (p *Provider) DataSources(_ context.Context) []func() datasource.DataSource { func (p *Provider) DataSources(_ context.Context) []func() datasource.DataSource {
return []func() datasource.DataSource{ return []func() datasource.DataSource{
alertGroup.NewAlertGroupDataSource,
dnsZone.NewZoneDataSource, dnsZone.NewZoneDataSource,
dnsRecordSet.NewRecordSetDataSource, dnsRecordSet.NewRecordSetDataSource,
iaasAffinityGroup.NewAffinityGroupDatasource, iaasAffinityGroup.NewAffinityGroupDatasource,
@ -515,6 +517,7 @@ func (p *Provider) DataSources(_ context.Context) []func() datasource.DataSource
// Resources defines the resources implemented in the provider. // Resources defines the resources implemented in the provider.
func (p *Provider) Resources(_ context.Context) []func() resource.Resource { func (p *Provider) Resources(_ context.Context) []func() resource.Resource {
resources := []func() resource.Resource{ resources := []func() resource.Resource{
alertGroup.NewAlertGroupResource,
dnsZone.NewZoneResource, dnsZone.NewZoneResource,
dnsRecordSet.NewRecordSetResource, dnsRecordSet.NewRecordSetResource,
iaasAffinityGroup.NewAffinityGroupResource, iaasAffinityGroup.NewAffinityGroupResource,
@ -570,7 +573,7 @@ func (p *Provider) Resources(_ context.Context) []func() resource.Resource {
skeCluster.NewClusterResource, skeCluster.NewClusterResource,
skeKubeconfig.NewKubeconfigResource, skeKubeconfig.NewKubeconfigResource,
} }
resources = append(resources, roleassignments.NewRoleAssignmentResources()...) resources = append(resources, roleAssignements.NewRoleAssignmentResources()...)
return resources return resources
} }

View file

@ -0,0 +1,267 @@
---
page_title: "Alerting with Kube-State-Metrics in STACKIT Observability"
---
# Alerting with Kube-State-Metrics in STACKIT Observability
## Overview
This guide explains how to configure the STACKIT Observability product to send alerts using metrics gathered from kube-state-metrics.
1. **Set Up Providers**
Begin by configuring the STACKIT, Kubernetes, and Helm providers. The Kubernetes and Helm providers read their connection details from the SKE kubeconfig created in the next step.
```hcl
provider "stackit" {
region = "eu01"
}
provider "kubernetes" {
host = yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.server
client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-certificate-data)
client_key = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-key-data)
cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.certificate-authority-data)
}
provider "helm" {
kubernetes {
host = yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.server
client_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-certificate-data)
client_key = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).users.0.user.client-key-data)
cluster_ca_certificate = base64decode(yamldecode(stackit_ske_kubeconfig.example.kube_config).clusters.0.cluster.certificate-authority-data)
}
}
```
2. **Create SKE Cluster and Kubeconfig Resource**
Set up a STACKIT SKE Cluster and generate the associated kubeconfig resource.
```hcl
resource "stackit_ske_cluster" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
name = "example"
kubernetes_version = "1.31"
node_pools = [
{
name = "standard"
machine_type = "c1.4"
minimum = "3"
maximum = "9"
max_surge = "3"
availability_zones = ["eu01-1", "eu01-2", "eu01-3"]
os_version_min = "4081.2.1"
os_name = "flatcar"
volume_size = 32
volume_type = "storage_premium_perf6"
}
]
maintenance = {
enable_kubernetes_version_updates = true
enable_machine_image_version_updates = true
start = "01:00:00Z"
end = "02:00:00Z"
}
}
resource "stackit_ske_kubeconfig" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
cluster_name = stackit_ske_cluster.example.name
refresh = true
}
```
3. **Create Observability Instance and Credentials**
Establish a STACKIT Observability instance and its credentials to handle alerts.
```hcl
locals {
alert_config = {
route = {
receiver = "EmailStackit",
repeat_interval = "1m",
continue = true
}
receivers = [
{
name = "EmailStackit",
email_configs = [
{
to = "<email>"
}
]
}
]
}
}
resource "stackit_observability_instance" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
name = "example"
plan_name = "Observability-Large-EU01"
alert_config = local.alert_config
}
resource "stackit_observability_credential" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
instance_id = stackit_observability_instance.example.instance_id
}
```
4. **Install Prometheus Operator**
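Use the `kube-prometheus-stack` Helm chart to install the Prometheus Operator together with kube-state-metrics, and forward the collected metrics to the STACKIT Observability instance via remote write. Save the Helm values below as `prom-values.tftpl` and customize them as needed for your deployment.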
```yaml
# helm values
# save as prom-values.tftpl
prometheus:
enabled: true
agentMode: true
prometheusSpec:
enableRemoteWriteReceiver: true
scrapeInterval: 60s
evaluationInterval: 60s
replicas: 1
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: premium-perf4-stackit
accessModes: ['ReadWriteOnce']
resources:
requests:
storage: 80Gi
remoteWrite:
- url: ${metrics_push_url}
queueConfig:
batchSendDeadline: '5s'
# both values need to be configured according to your observability plan
capacity: 30000
maxSamplesPerSend: 3000
writeRelabelConfigs:
- sourceLabels: ['__name__']
regex: 'apiserver_.*|etcd_.*|prober_.*|storage_.*|workqueue_(work|queue)_duration_seconds_bucket|kube_pod_tolerations|kubelet_.*|kubernetes_feature_enabled|instance_scrape_target_status'
action: 'drop'
- sourceLabels: ['namespace']
regex: 'example'
action: 'keep'
basicAuth:
username:
key: username
name: ${secret_name}
password:
key: password
name: ${secret_name}
grafana:
enabled: false
defaultRules:
create: false
alertmanager:
enabled: false
nodeExporter:
enabled: true
kube-state-metrics:
enabled: true
customResourceState:
enabled: true
collectors:
- deployments
- pods
```
```hcl
resource "kubernetes_namespace" "monitoring" {
metadata {
name = "monitoring"
}
}
resource "kubernetes_secret" "argus_prometheus_authorization" {
metadata {
name = "argus-prometheus-credentials"
namespace = kubernetes_namespace.monitoring.metadata[0].name
}
data = {
username = stackit_observability_credential.example.username
password = stackit_observability_credential.example.password
}
}
resource "helm_release" "prometheus_operator" {
name = "prometheus-operator"
repository = "https://prometheus-community.github.io/helm-charts"
chart = "kube-prometheus-stack"
version = "60.1.0"
namespace = kubernetes_namespace.monitoring.metadata[0].name
values = [
templatefile("prom-values.tftpl", {
metrics_push_url = stackit_observability_instance.example.metrics_push_url
secret_name = kubernetes_secret.argus_prometheus_authorization.metadata[0].name
})
]
}
```
5. **Create Alert Group**
Define an alert group with a rule that fires when at least one pod is in the `Running` phase in the "example" namespace.
```hcl
resource "stackit_observability_alertgroup" "example" {
project_id = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
instance_id = stackit_observability_instance.example.instance_id
name = "TestAlertGroup"
interval = "2h"
rules = [
{
alert = "SimplePodCheck"
expression = "sum(kube_pod_status_phase{phase=\"Running\", namespace=\"example\"}) > 0"
for = "60s"
labels = {
severity = "critical"
},
annotations = {
summary = "Test Alert is working"
description = "Test Alert"
}
},
]
}
```
6. **Deploy Test Pod**
Deploy a test pod; doing so should trigger an email notification, as the deployment satisfies the conditions defined in the alert group rule. In a real-world scenario, you would typically configure alerts to monitor pods for error states instead.
```hcl
resource "kubernetes_namespace" "example" {
metadata {
name = "example"
}
}
resource "kubernetes_pod" "example" {
metadata {
name = "nginx"
namespace = kubernetes_namespace.example.metadata[0].name
labels = {
app = "nginx"
}
}
spec {
container {
image = "nginx:latest"
name = "nginx"
}
}
}
```