Commit e49688e1 authored by Pawel Rozlach's avatar Pawel Rozlach
Browse files

feat: add storage retries metric for Azure, S3 and improve existing metric fro GCS

parent 085df286
Loading
Loading
Loading
Loading
+22 −0
Original line number Diff line number Diff line
@@ -10,12 +10,14 @@ import (
	"errors"
	"fmt"
	"io"
	"net/http"
	"strings"
	"sync/atomic"
	"time"

	"github.com/Azure/azure-sdk-for-go/sdk/azcore"
	"github.com/Azure/azure-sdk-for-go/sdk/azcore/policy"
	azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
	"github.com/Azure/azure-sdk-for-go/sdk/azcore/streaming"
	"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
	"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/appendblob"
@@ -27,6 +29,7 @@ import (
	storagedriver "github.com/docker/distribution/registry/storage/driver"
	"github.com/docker/distribution/registry/storage/driver/azure/common"
	"github.com/docker/distribution/registry/storage/driver/base"
	"github.com/docker/distribution/registry/storage/internal/metrics"
)

var ErrCorruptedData = errors.New("corrupted data found in the uploaded data")
@@ -117,6 +120,25 @@ func (d *driver) GetContent(ctx context.Context, targetPath string) ([]byte, err
			TryTimeout:    d.retryTryTimeout,
			RetryDelay:    d.retryDelay,
			MaxRetryDelay: d.maxRetryDelay,
			ShouldRetry: func(resp *http.Response, err error) bool {
				// NOTE(prozlach): azure-sdk-go does the same thing, we only
				// piggy-back the metric calculation. List of codes can be found
				// in the godoc of the policy.RetryOptions object
				statusCodes := []int{
					http.StatusRequestTimeout,      // 408
					http.StatusTooManyRequests,     // 429
					http.StatusInternalServerError, // 500
					http.StatusBadGateway,          // 502
					http.StatusServiceUnavailable,  // 503
					http.StatusGatewayTimeout,      // 504
				}
				shouldRetry := err != nil || azruntime.HasStatusCode(resp, statusCodes...)
				if shouldRetry {
					// NOTE(prozlach): Azure has only native retries
					metrics.StorageBackendRetry(true)
				}
				return shouldRetry
			},
		},
	)

+4 −7
Original line number Diff line number Diff line
@@ -149,20 +149,17 @@ func ShouldRetry(err error) bool {
// The interface used by ShouldRetry func is determined by 3rd party package,
// so we wrap the function in wrapper that passes correct retry type.
func shouldRetryImpl(nativeRetry bool, err error) bool {
	metrics.StorageBackendRetry(nativeRetry)

	// Context cancelation/expiry is fatal, do not try to retry:
	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
		return false
	}

	// NOTE(prozlach): http2 stream errors are also retryable
	var streamError http2.StreamError
	if errors.As(err, &streamError) {
		return true
	shouldRetry := errors.As(err, new(http2.StreamError)) || storage.ShouldRetry(err)
	if shouldRetry {
		metrics.StorageBackendRetry(nativeRetry)
	}

	return storage.ShouldRetry(err)
	return shouldRetry
}

func retry(req request) error {
+8 −1
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@ import (
	"github.com/aws/aws-sdk-go-v2/aws/retry"
	"github.com/cenkalti/backoff/v4"
	"github.com/docker/distribution/registry/storage/driver/s3-aws/common"
	"github.com/docker/distribution/registry/storage/internal/metrics"
	log "github.com/sirupsen/logrus"
	"golang.org/x/time/rate"
)
@@ -84,7 +85,13 @@ func (cr *customRetryer) GetRetryToken(ctx context.Context, opErr error) (func(e
		return nil, fmt.Errorf("throtling GetRetryToken request: %w", err)
	}

	return cr.RetryerV2.GetRetryToken(ctx, opErr)
	releaseTokenF, err := cr.RetryerV2.GetRetryToken(ctx, opErr)
	if err == nil {
		// NOTE(prozlach): On AWS, all retries are done using native mechanism,
		// unlinke on GCS
		metrics.StorageBackendRetry(true)
	}
	return releaseTokenF, err
}

func (cr *customRetryer) GetAttemptToken(ctx context.Context) (func(error) error, error) {
+1 −1
Original line number Diff line number Diff line
@@ -46,7 +46,7 @@ const (
	storageBackendRetriesTotalDesc = "A counter of retires made while communicating with storage backend."
	// `native` - done using native retry mechanism
	// `custom` -done using our own/custom retry mechanism
	storageBackendRetriesTypeLabel = "type"
	storageBackendRetriesTypeLabel = "retry_type"

	urlCacheRequestsTotalName = "urlcache_requests_total"
	urlCacheRequestsTotalDesc = "A counter of the URL cache middleware requests."
+2 −2
Original line number Diff line number Diff line
@@ -166,8 +166,8 @@ func TestStorageBackendRetry(t *testing.T) {
	_, err := expected.WriteString(`
# HELP registry_storage_storage_backend_retries_total A counter of retires made while communicating with storage backend.
# TYPE registry_storage_storage_backend_retries_total counter
registry_storage_storage_backend_retries_total{type="custom"} 3
registry_storage_storage_backend_retries_total{type="native"} 2
registry_storage_storage_backend_retries_total{retry_type="custom"} 3
registry_storage_storage_backend_retries_total{retry_type="native"} 2
`)
	require.NoError(t, err)
	totalFullName := fmt.Sprintf("%s_%s_%s", metrics.NamespacePrefix, subsystem, storageBackendRetriesTotalName)