Upgrade failed, rollback failed. Partially deployed release?
Not sure who to mention, hope you don't mind the pings.
/cc @WarheadsSE @twk3
Summary
As my task-runner pod keeps getting evicted, I decided to add persistent storage for the task runner in my values.yaml. A syntax error in another change I was making at the same time caused the upgrade to partially fail. The rollback also failed, I'm guessing because the previously deployed release did not contain a PVC for the task runner.
Steps to reproduce
From chart version 2.4.6
- Pull latest chart repo for up to date values.yaml, merge into local branch containing my custom changes
helm repo update
- Add persistence for the task-runner and the Prometheus server to my values.yaml, following the documentation on enabling persistent storage for the Prometheus server and the documentation about pod eviction issues during backup
helm upgrade --install gitlab -f values.yaml gitlab/gitlab
Configuration used
Apologies, this is in diff format rather than yaml because I'm attempting to keep track of changes to the chart's values.yaml (both upstream and mine). I should probably create a separate values-mine.yaml instead at some point, but here's what I used. (Based on a74ded6f)
The differences since the last deployed release are shown separately below the full diff, just above the "Current behavior" section.
diff --git a/values.yaml b/values.yaml
index e0fcd083..0d44fc04 100644
--- a/values.yaml
+++ b/values.yaml
@@ -28,7 +28,7 @@ global:
autoPause: true
## doc/installation/deployment.md#deploy-the-community-edition
- # edition: ee
+ edition: ce
## doc/charts/globals.md#gitlab-version
# gitlabVersion: master
@@ -40,10 +40,10 @@ global:
allowClusterRoles: true
## doc/charts/globals.md#configure-host-settings
hosts:
- domain: example.com
+ domain: apps.mydomain.com
# hostSuffix:
https: true
- externalIP:
+ #externalIP: 192.168.1.45
ssh: ~
gitlab: {}
minio: {}
@@ -54,9 +54,9 @@ global:
configureCertmanager: true
annotations: {}
enabled: true
- tls: {}
- # enabled: true
- # secretName:
+ class: nginx # set this class rather than the default gitlab-nginx, see https://gitlab.com/charts/gitlab/issues/1348
+ tls:
+ enabled: true
gitlab:
## Enterprise license for this GitLab installation
@@ -113,6 +113,8 @@ global:
enabled: true
credentials: {}
# secret:
+ persistence:
+ size: 50Gi
## doc/charts/globals.md#configure-grafana-integration
grafana:
@@ -243,15 +245,15 @@ global:
## doc/charts/globals.md#incoming-email-settings
## doc/installation/deployment.md#incoming-email
incomingEmail:
- enabled: false
- address: ""
- host: "imap.gmail.com"
+ enabled: true
+ address: "gitlab@mydomain.com"
+ host: "mail.mydomain.com"
port: 993
ssl: true
startTls: false
- user: ""
+ user: "gitlab@mydomain.com"
password:
- secret: ""
+ secret: "email-password"
key: password
mailbox: inbox
idleTimeout: 60
@@ -347,25 +349,25 @@ global:
## doc/installation/deployment.md#outgoing-email
## Outgoing email server settings
smtp:
- enabled: false
- address: smtp.mailgun.org
- port: 2525
- user_name: ""
+ enabled: true
+ address: mail.mydomain.com
+ port: 25
+ user_name: "gitlab@mydomain.com"
## doc/installation/secrets.md#smtp-password
password:
- secret: ""
+ secret: "email-password"
key: password
# domain:
- authentication: "plain"
- starttls_auto: false
+ authentication: "login"
+ starttls_auto: true
openssl_verify_mode: "peer"
## doc/installation/deployment.md#outgoing-email
## Email persona used in email sent by GitLab
email:
- from: ''
- display_name: GitLab
- reply_to: ''
+ from: 'gitlab@mydomain.com'
+ display_name: My GitLab
+ reply_to: 'gitlab@mydomain.com'
subject_suffix: ''
smime:
enabled: false
@@ -424,10 +426,10 @@ upgradeCheck:
cpu: 50m
## Settings to for the Let's Encrypt ACME Issuer
-# certmanager-issuer:
+certmanager-issuer:
## The email address to register certificates requested from Let's Encrypt.
## Required if using Let's Encrypt.
- # email: email@example.com
+ email: myname@mydomain.com
## Installation & configuration of stable/cert-manager
## See requirements.yaml for current version
@@ -439,6 +441,10 @@ certmanager:
# See https://github.com/kubernetes/charts/tree/master/stable/cert-manager#configuration
rbac:
create: true
+ # set default issuer and kind to a cluster issuer so it can manage certificates for ingresses in all namespaces
+ ingressShim:
+ defaultIssuer: gitlab-cluster-issuer
+ defaultIssuerKind: ClusterIssuer
## doc/charts/nginx/index.md
## doc/architecture/decisions.md#nginx-ingress
@@ -451,7 +457,7 @@ nginx-ingress:
hsts-include-subdomains: "false"
server-name-hash-bucket-size: "256"
enable-vts-status: "true"
- use-http2: "false"
+ use-http2: "true"
ssl-ciphers: "ECDHE-RSA-AES256-GCM-SHA384:ECDHE-RSA-AES128-GCM-SHA256:ECDHE-RSA-AES256-SHA384:ECDHE-RSA-AES128-SHA256:ECDHE-RSA-AES256-SHA:ECDHE-RSA-AES128-SHA:AES256-GCM-SHA384:AES128-GCM-SHA256:AES256-SHA256:AES128-SHA256:AES256-SHA:AES128-SHA:!aNULL:!eNULL:!EXPORT:!DES:!MD5:!PSK:!RC4"
ssl-protocols: "TLSv1.3 TLSv1.2"
server-tokens: "false"
@@ -468,7 +474,7 @@ nginx-ingress:
replicaCount: 3
minAvailable: 2
scope:
- enabled: true
+ enabled: false # don't limit scope to the default namespace. see https://gitlab.com/charts/gitlab/issues/1348
stats:
enabled: true
metrics:
@@ -500,13 +506,17 @@ prometheus:
alertmanagerFiles:
alertmanager.yml: {}
kubeStateMetrics:
- enabled: false
+ enabled: true
nodeExporter:
enabled: false
pushgateway:
enabled: false
server:
retention: 15d
+ persistentVolume:
+ enabled: true
+ size: 8GiB
+
## Configuration of Redis
## doc/architecture/decisions.md#redis
@@ -536,7 +546,10 @@ postgresql:
## Installation & configuration charts/registry
## doc/architecture/decisions.md#registry
## doc/charts/registry/
-# registry:
+registry:
+ ingress:
+ tls:
+ enabled: true
# enabled: false
@@ -556,6 +569,7 @@ gitlab-runner:
create: true
runners:
locked: false
+ privileged: true
cache:
cacheType: s3
s3BucketName: runner-cache
@@ -629,3 +643,20 @@ grafana:
# enabled: false
## doc/charts/gitlab/gitlab-grafana
# gitlab-grafana:
+
+
+# https://gitlab.com/gitlab-org/charts/gitlab/issues/1647
+gitlab:
+ unicorn:
+ extraVolumes: |
+ - name: shared-tmp
+ emptyDir:
+ medium: "Memory"
+ extraVolumeMounts: |
+ - name: shared-tmp
+ mountPath: /tmp
+ readOnly: false
+ task-runner:
+ persistence:
+ enabled: true
+ size: 50Gi
Notable differences since last deployed release:
diff --git a/values.yaml b/values.yaml
index a84b30ae..cf43c110 100644
--- a/values.yaml
+++ b/values.yaml
@@ -443,13 +443,17 @@ prometheus:
alertmanagerFiles:
alertmanager.yml: {}
kubeStateMetrics:
- enabled: false
+ enabled: true
nodeExporter:
enabled: false
pushgateway:
enabled: false
server:
retention: 15d
+ persistentVolume:
+ enabled: true
+ size: 8GiB
+
## Configuration of Redis
## doc/architecture/decisions.md#redis
@@ -589,3 +593,7 @@ gitlab:
- name: shared-tmp
mountPath: /tmp
readOnly: false
+ task-runner:
+ persistence:
+ enabled: true
+ size: 50Gi
Current behavior
The `8GiB` value in `prometheus.server.persistentVolume.size` is rejected and helm rolls back. I fix it to `8Gi` and retry, but helm times out waiting for the condition and rolls back again. Further attempts result in an error regarding the PersistentVolumeClaim for task-runner. Despite this, deployment of 12.5.3 appears to have succeeded, as the gitlab pods are running 12.5.3. The Helm deployment now appears to be in an ambiguous state, and I fear I may not be able to upgrade again until I figure it out.
I probably should've tried `--dry-run`, `--verify`, or something along those lines instead, validating values.yaml before applying, but I've done this at least a dozen times before and it Worked Every Time™ or at least never got this bad.
edit: submitted a separate issue/MR for doc fix so other blindly copypasting noobs like me don't run into this. See !1065 (merged)
Expected behavior
If there's a syntax error, I was hoping the upgrade would be atomic, i.e. that none of it would be applied. I should then be able to fix the error and retry the upgrade. At worst, if it applies some of the upgrade but not all, it should be able to upgrade over the top of the failed deploy, or I should have some way of recovering.
Versions
- Chart: From 2.4.6 to 2.5.4 via `helm repo update`
- Platform:
- Self-hosted: kubespray v2.10.4
- Kubernetes: (`kubectl version`)
- Client Version: version.Info{Major:"1", Minor:"14", GitVersion:"v1.14.3", GitCommit:"5e53fd6bc17c0dec8434817e69b04a25d8ae0ff0", GitTreeState:"clean", BuildDate:"2019-06-06T01:36:19Z", GoVersion:"go1.12.5", Compiler:"gc", Platform:"linux/amd64"}
- Server Version: version.Info{Major:"1", Minor:"14", GitVersion:"v1.14.3", GitCommit:"5e53fd6bc17c0dec8434817e69b04a25d8ae0ff0", GitTreeState:"clean", BuildDate:"2019-06-06T01:36:19Z", GoVersion:"go1.12.5", Compiler:"gc", Platform:"linux/amd64"}
- Helm: (`helm version`)
- Client: &version.Version{SemVer:"v2.13.1", GitCommit:"618447cbf203d147601b4b9bd7f8c37a5d39fbb4", GitTreeState:"clean"}
- Server: &version.Version{SemVer:"v2.13.1", GitCommit:"618447cbf203d147601b4b9bd7f8c37a5d39fbb4", GitTreeState:"clean"}
Relevant logs
me@apollo:~/gitlab/gitlab-charts$ helm upgrade --install gitlab -f values.yaml gitlab/gitlab
UPGRADE FAILED
ROLLING BACK
Error: v1.PersistentVolumeClaim.Spec: v1.PersistentVolumeClaimSpec.StorageClassName: Resources: v1.ResourceRequirements.Requests: unmarshalerDecoder: quantities must match the regular expression '^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$', error found in #10 byte of ...|ge":"8GiB"}},"storag|..., bigger context ...|teOnce"],"resources":{"requests":{"storage":"8GiB"}},"storageClassName":"rook-ceph-block","volumeMod|...
Error: UPGRADE FAILED: v1.PersistentVolumeClaim.Spec: v1.PersistentVolumeClaimSpec.StorageClassName: Resources: v1.ResourceRequirements.Requests: unmarshalerDecoder: quantities must match the regular expression '^([+-]?[0-9.]+)([eEinumkKMGTP]*[-+]?[0-9]*)$', error found in #10 byte of ...|ge":"8GiB"}},"storag|..., bigger context ...|teOnce"],"resources":{"requests":{"storage":"8GiB"}},"storageClassName":"rook-ceph-block","volumeMod|...
me@apollo:~/gitlab/gitlab-charts$ nano values.yaml
me@apollo:~/gitlab/gitlab-charts$ git add values.yaml
me@apollo:~/gitlab/gitlab-charts$ git commit -m "Fix typo"
[mybranch 30eefa81] Fix typo
1 file changed, 1 insertion(+), 1 deletion(-)
me@apollo:~/gitlab/gitlab-charts$ helm upgrade --install gitlab -f values.yaml gitlab/gitlab
UPGRADE FAILED
ROLLING BACK
Error: timed out waiting for the condition
Error: UPGRADE FAILED: timed out waiting for the condition
me@apollo:~/gitlab/gitlab-charts$ helm ls
NAME REVISION UPDATED STATUS CHART APP VERSION NAMESPACE
gitlab 17 Wed Dec 4 19:17:56 2019 FAILED gitlab-2.5.4 12.5.3 default
nextcloud 6 Tue Jul 30 09:31:56 2019 DEPLOYED nextcloud-1.6.2 16.0.3 default
nextcloud2 5 Tue Oct 22 16:23:48 2019 DEPLOYED nextcloud-1.7.2 16.0.3 default
me@apollo:~/gitlab/gitlab-charts$ helm upgrade --install gitlab -f values.yaml gitlab/gitlab
UPGRADE FAILED
ROLLING BACK
Error: no PersistentVolumeClaim with the name "gitlab-task-runner-tmp" found
Error: UPGRADE FAILED: no PersistentVolumeClaim with the name "gitlab-task-runner-tmp" found
me@apollo:~/gitlab/gitlab-charts$ helm history gitlab
REVISION UPDATED STATUS CHART DESCRIPTION
1 Wed Jul 24 08:37:09 2019 SUPERSEDED gitlab-2.1.0 Install complete
2 Fri Jul 26 13:48:58 2019 SUPERSEDED gitlab-2.1.0 Upgrade complete
3 Fri Jul 26 14:00:40 2019 SUPERSEDED gitlab-2.1.0 Upgrade complete
4 Fri Jul 26 14:08:52 2019 SUPERSEDED gitlab-2.1.0 Upgrade complete
5 Thu Aug 1 09:15:26 2019 SUPERSEDED gitlab-2.1.0 Upgrade complete
6 Thu Aug 1 14:30:44 2019 SUPERSEDED gitlab-2.1.3 Upgrade complete
7 Thu Aug 29 11:46:08 2019 SUPERSEDED gitlab-2.1.3 Upgrade complete
8 Thu Aug 29 11:54:13 2019 SUPERSEDED gitlab-2.2.1 Upgrade complete
9 Tue Oct 1 04:03:24 2019 SUPERSEDED gitlab-2.3.3 Upgrade complete
10 Mon Oct 7 11:29:20 2019 SUPERSEDED gitlab-2.3.7 Upgrade complete
11 Wed Oct 9 10:42:58 2019 SUPERSEDED gitlab-2.3.7 Upgrade complete
12 Wed Oct 23 14:17:42 2019 SUPERSEDED gitlab-2.4.0 Upgrade complete
13 Wed Oct 23 14:50:20 2019 SUPERSEDED gitlab-2.4.0 Upgrade complete
14 Thu Oct 24 08:27:20 2019 SUPERSEDED gitlab-2.4.1 Upgrade complete
15 Wed Oct 30 15:34:50 2019 SUPERSEDED gitlab-2.4.4 Upgrade complete
16 Fri Nov 8 15:45:31 2019 DEPLOYED gitlab-2.4.6 Upgrade complete
17 Wed Dec 4 19:17:56 2019 FAILED gitlab-2.5.4 Upgrade "gitlab" failed: v1.PersistentVolumeClaim.Spec: v...
18 Wed Dec 4 19:19:30 2019 PENDING_UPGRADE gitlab-2.5.4 Preparing upgrade
19 Wed Dec 4 19:28:15 2019 FAILED gitlab-2.5.4 Upgrade "gitlab" failed: no PersistentVolumeClaim with th...
me@apollo:~/gitlab/gitlab-charts$ kubectl get pvc
NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
data-nextcloud-mariadb-master-0 Bound pvc-1293fa62-ae10-11e9-91e8-94c69117162f 8Gi RWO rook-ceph-block 133d
data-nextcloud-mariadb-slave-0 Bound pvc-129c284c-ae10-11e9-91e8-94c69117162f 8Gi RWO rook-ceph-block 133d
data-nextcloud2-mariadb-master-0 Bound pvc-798c2002-f508-11e9-b3d7-94c69116a34a 8Gi RWO rook-ceph-block 43d
data-nextcloud2-mariadb-slave-0 Bound pvc-7990c3e9-f508-11e9-b3d7-94c69116a34a 8Gi RWO rook-ceph-block 43d
gitlab-minio Bound pvc-eb3d6745-ae0f-11e9-91e8-94c69117162f 51Gi RWO rook-ceph-block 133d
gitlab-postgresql Bound pvc-eb3e8a0e-ae0f-11e9-91e8-94c69117162f 8Gi RWO rook-ceph-block 133d
gitlab-prometheus-server Bound pvc-eb3fa8b4-ae0f-11e9-91e8-94c69117162f 8Gi RWO rook-ceph-block 133d
gitlab-redis Bound pvc-eb40d7ad-ae0f-11e9-91e8-94c69117162f 5Gi RWO rook-ceph-block 133d
gitlab-task-runner-tmp Bound pvc-cabd0b49-16f4-11ea-9d13-94c69116a34a 50Gi RWO rook-ceph-block 16m
nextcloud-nextcloud Bound pvc-127ddc7c-ae10-11e9-91e8-94c69117162f 30Gi RWO rook-ceph-block 133d
nextcloud2-nextcloud Bound pvc-7980a833-f508-11e9-95de-94c69117162f 30Gi RWO rook-ceph-block 43d
repo-data-gitlab-gitaly-0 Bound pvc-eb8dfc9e-ae0f-11e9-91e8-94c69117162f 50Gi RWO rook-ceph-block 133d
During this time, monitoring `kubectl get po` in another window revealed, based on the age of the pods, that the upgrade had actually been applied:
$ kubectl get po
NAME READY STATUS RESTARTS AGE
gitlab-certmanager-78d7f58454-w6w4b 1/1 Running 1 97d
gitlab-gitaly-0 1/1 Running 0 19m
gitlab-gitlab-exporter-6ff5d6548-h22px 1/1 Running 0 30m
gitlab-gitlab-runner-84d8bd9fc4-2g7lp 1/1 Running 0 30m
gitlab-gitlab-shell-5c4f866999-svqzf 1/1 Running 0 30m
gitlab-gitlab-shell-5c4f866999-zknsb 1/1 Running 0 30m
gitlab-issuer.17-sg987 1/1 Running 0 30m
gitlab-mailroom-69d47cbf67-9227s 1/1 Running 0 30m
gitlab-migrations.16-crtmz 0/1 Completed 0 26d
gitlab-migrations.17-bbp4n 0/1 Completed 0 30m
gitlab-minio-5746f7f7c7-8j2t8 1/1 Running 0 27d
gitlab-minio-create-buckets.16-tsxds 0/1 Completed 0 26d
gitlab-minio-create-buckets.17-kl9fh 0/1 Completed 0 30m
gitlab-nginx-ingress-controller-7d6b6677fd-527lr 1/1 Running 4 133d
gitlab-nginx-ingress-controller-7d6b6677fd-ct2ps 1/1 Running 1 131d
gitlab-nginx-ingress-controller-7d6b6677fd-xl7ln 1/1 Running 10 131d
gitlab-nginx-ingress-default-backend-5c4b7cd5-5v6mt 1/1 Running 2 133d
gitlab-nginx-ingress-default-backend-5c4b7cd5-w4w98 1/1 Running 2 133d
gitlab-postgresql-554d9fc6d5-pht82 2/2 Running 2 133d
gitlab-prometheus-kube-state-metrics-6dc9f5c569-8cd52 1/1 Running 0 30m
gitlab-prometheus-server-c5d5f6d5d-r896p 2/2 Running 0 30m
gitlab-redis-84c4f74bb9-k8kf8 2/2 Running 2 133d
gitlab-registry-7cc49bcfc4-7qcmn 1/1 Running 0 30m
gitlab-registry-7cc49bcfc4-pjblv 1/1 Running 0 30m
gitlab-sidekiq-all-in-1-7b84fcc485-vgdh9 1/1 Running 0 30m
gitlab-task-runner-5687c9f6b4-k9rzr 1/1 Running 0 30m
gitlab-task-runner-885bd9987-8bv5m 0/1 Evicted 0 26d
gitlab-unicorn-7bd6d99d98-2mdr2 2/2 Running 0 16m
gitlab-unicorn-7bd6d99d98-6hvl2 2/2 Running 0 17m
nextcloud-1575505800-29wd2 0/1 Completed 0 19m
nextcloud-1575506700-mls4k 0/1 Completed 0 4m24s
nextcloud-76d4f689c4-s47lk 1/1 Running 24 50d
nextcloud-mariadb-master-0 1/1 Running 0 42d
nextcloud-mariadb-slave-0 1/1 Running 0 42d
nextcloud2-1575505800-d56wl 0/1 Completed 0 19m
nextcloud2-1575506700-j7klq 0/1 Completed 0 4m24s
nextcloud2-846c59cb7c-5b7dc 1/1 Running 1 42d
nextcloud2-mariadb-master-0 1/1 Running 0 43d
nextcloud2-mariadb-slave-0 1/1 Running 0 43d