diff --git a/stackit/internal/wait/postgresflexalpha/wait.go b/stackit/internal/wait/postgresflexalpha/wait.go index 26f2d729..198f075d 100644 --- a/stackit/internal/wait/postgresflexalpha/wait.go +++ b/stackit/internal/wait/postgresflexalpha/wait.go @@ -59,6 +59,8 @@ func CreateInstanceWaitHandler( maxWait := time.Minute * 45 startTime := time.Now() extendedTimeout := 0 + maxFailedCount := 3 + failedCount := 0 handler := wait.New( func() (waitFinished bool, response *v3alpha1api.GetInstanceResponse, err error) { @@ -132,8 +134,17 @@ func CreateInstanceWaitHandler( instanceCreated = true instanceGetResponse = s case InstanceStateFailed: - tflog.Warn(ctx, fmt.Sprintf("Wait handler got status FAILURE for instance: %s", instanceId)) - return false, nil, nil + if failedCount < maxFailedCount { + failedCount++ + tflog.Warn( + ctx, "got failed status from API retry", map[string]interface{}{ + "failedCount": failedCount, + }, + ) + time.Sleep(5 * time.Second) + return false, nil, nil + } + return true, s, fmt.Errorf("update got status FAILURE for instance with id %s", instanceId) // API responds with FAILURE for some seconds and then the instance goes to READY // return true, s, fmt.Errorf("create failed for instance with id %s", instanceId) } @@ -170,6 +181,8 @@ func PartialUpdateInstanceWaitHandler( ctx context.Context, a APIClientInstanceInterface, projectID, region, instanceID string, ) *wait.AsyncActionHandler[v3alpha1api.GetInstanceResponse] { + maxFailedCount := 3 + failedCount := 0 handler := wait.New( func() (waitFinished bool, response *v3alpha1api.GetInstanceResponse, err error) { s, err := a.GetInstanceRequest(ctx, projectID, region, instanceID).Execute() @@ -195,6 +208,16 @@ func PartialUpdateInstanceWaitHandler( case InstanceStateUnknown: return false, nil, nil case InstanceStateFailed: + if failedCount < maxFailedCount { + failedCount++ + tflog.Warn( + ctx, "got failed status from API retry", map[string]interface{}{ + "failedCount": failedCount, + }, + ) + time.Sleep(5 * time.Second) + return false, nil, nil + } return true, s, fmt.Errorf("update got status FAILURE for instance with id %s", instanceID) } }, @@ -295,6 +318,8 @@ func DeleteInstanceWaitHandler( instanceID string, timeout, sleepBeforeWait time.Duration, ) error { + maxFailedCount := 3 + failedCount := 0 handler := wait.New( func() (waitFinished bool, response *v3alpha1api.GetInstanceResponse, err error) { s, err := a.GetInstanceRequest(ctx, projectID, region, instanceID).Execute() @@ -314,6 +339,16 @@ func DeleteInstanceWaitHandler( case InstanceStateEmpty, InstanceStatePending, InstanceStateUnknown, InstanceStateProgressing, InstanceStateSuccess: return false, nil, nil case InstanceStateFailed: + if failedCount < maxFailedCount { + failedCount++ + tflog.Warn( + ctx, "got failed status from API retry", map[string]interface{}{ + "failedCount": failedCount, + }, + ) + time.Sleep(5 * time.Second) + return false, nil, nil + } return true, nil, fmt.Errorf("wait handler got status FAILURE for instance: %s", instanceID) default: return true, s, fmt.Errorf("instance with id %s has unexpected status %s", instanceID, s.Status)