GitLab restore hangs / is stuck on epoll_wait

Summary

When I try to restore a backup of GitLab, the restore process is stuck forever, no error, no warning, no way to know what's happening. Also, the process always hangs at the same place, with the same repository.

Note that gitaly is separated from rails in this case.

Steps to reproduce

Here is what I'm doing. Backup a 100 GB instance, backup is 85 GB. Restore the backup on another instance. The process will run, until a certain point where it hangs. The restore process is executed with the --trace option, but logs are the same, there's nothing more than the repo, and [DONE] when it's done.

Example Project

N/A

What is the current bug behavior?

The restore process hangs.

What is the expected correct behavior?

Restore should work, or at least tell what's happening, and if something is not going well.

Relevant logs and/or screenshots

The logs of the restore process until it hangs (anonymized data)

...
 * group-f/repo-f ... [DONE]
 * group-g/repo-g ... [DONE]
 * group-h/repo-h ... [DONE]
 * group-i/poc/infrastructure ... [DONE]
 * group-j/provisioning ... [DONE]
 * failing-group/successful-repo ... [DONE]
 * failing-group/failing-repo ... 

The strace command is showing continuously the same sequence, with various time data. The first lines are the last lines that are not epoll_wait:

sendmsg(19, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\0@\0\0\0\0\0\r\227\0\0\2\0\4", iov_len=14}, {iov_base="\22\200\200\10\31 \252\312\227:?\243\267m\1\256\254a\237\257\367\307\316\217w\373\217\374JsW\235"..., iov_len=16379}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="$P=g\31\5\371\373\357\256kq\26\346~\362d\37\343=\274\242\7LZ\277\32\271\2s\315\33"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="!\202\315\203A\364\312\251\365\v4\235}\360a\215n\3203\236\212o\343(\372\0\204\1]\340\324\201"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="\350\32\25\333\17&\362)=\216~Q\253Z\24\370\6\263V\326U\353\200\26\337k\213r\332\204c\232"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="\266\303[\360\304\244\332\243\346T\243y\33.\342\242\361\214\252q\241\222d\374\r\321\270\3151\337C:"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="\3109\3075\2\364\220\234z(\332[\320\3-\255l&4i\360\221D(\364\357]>$\313)\252"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="\372\17\314\21\33\357\234\265Ibugr\367\366\\y\35\270\305\337\1\27_\377\273\241\4x\23434"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="\357h\212\306\215\3231\35!\5\272\30\270\215\v4Aq)\364Q\4\35U\2\31\272\301ooC\346"..., iov_len=16384}, {iov_base="\0\0\t\0\0\0\0\r\227", iov_len=9}, {iov_base="5\233\221(\0079\370\223\33", iov_len=9}], msg_iovlen=18, msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 131162
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=85544140}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=85563083}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=85592502}) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=87935, tv_nsec=74701448}) = 0
read(20, "5HA\372rd\315\7kM\327\313\6\351\313xM\34c\323\214\213\344\215\24\346\210gf$\347\311"..., 131072) = 57197
read(20, "", 73875)                     = 0
getpid()                                = 22180
sendmsg(19, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\0@\0\0\0\0\0\r\227\0\0\0\337q", iov_len=14}, {iov_base="\22\355\276\0035HA\372rd\315\7kM\327\313\6\351\313xM\34c\323\214\213\344\215\24\346\210g"..., iov_len=16379}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="gIf\220\354I\3563\3465e\314K\212f\332\305[\273qi\\\255\\\376\323N\246\17\306\354\371"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="W&\327\270'%\362&$EP\327\321S<\243\347\271gY\316)Q\326_\265\335\21\2773@\31"..., iov_len=16384}, {iov_base="\0\37v\0\0\0\0\r\227", iov_len=9}, {iov_base="\220\315\224\347\254\211\362\2345S\36\343A\373\234\2\25\254\271\362H\24X\353+\316r=\3051r\362"..., iov_len=8054}], msg_iovlen=8, msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 57242
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=85878996}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=85897156}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=85912731}) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=87935, tv_nsec=75021105}) = 0
read(20, "", 131072)                    = 0
close(20)                               = 0
getpid()                                = 22180
sendmsg(19, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\0\0\4\10\0\0\0\r\227\0\0\0\5\0\0\0\0\1\0\0\r\227\0", iov_len=23}, {iov_base="\0\4\10\0\0\0\0\0\0\0\0\5", iov_len=12}], msg_iovlen=2, msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 35
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=86091347}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=86108662}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=86124739}) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=87935, tv_nsec=75233103}) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=87935, tv_nsec=75249037}) = 0
...
epoll_wait(17, [], 100, 21)             = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=94686, tv_nsec=709762257}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580821008, tv_nsec=720690726}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580821008, tv_nsec=720706186}) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=94686, tv_nsec=709813549}) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=94686, tv_nsec=709828955}) = 0

On Gitaly side, we have a return code send for the failing-repo.The log sequence is the as than the previous repository restored. Here are the 2 3-steps sequences (anonymized)

{"grpc.code":"OK","grpc.meta.auth_version":"v2","grpc.meta.client_name":"gitlab-web","grpc.method":"RemoveRepository","grpc.request.deadline":"2020-02-04T17:05:55Z","grpc.request.fullMethod":"/gitaly.RepositoryService/RemoveRepository","grpc.request.glProjectPath":"failing-group/successful-repo","grpc.request.glRepository":"project-1204","grpc.request.repoPath":"failing-group/successful-repo.git","grpc.request.repoStorage":"default","grpc.request.topLevelGroup":"failing-group","grpc.service":"gitaly.RepositoryService","grpc.start_time":"2020-02-04T11:04:15Z","grpc.time_ms":0.662,"level":"info","msg":"finished unary call with code OK","peer.address":"192.168.0.70:42072","pid":11645,"span.kind":"server","system":"grpc","time":"2020-02-04T11:04:15Z"}
{"grpc.meta.auth_version":"v2","grpc.meta.client_name":"gitlab-web","grpc.method":"CreateRepositoryFromBundle","grpc.request.deadline":"2020-02-04T17:05:55Z","grpc.request.fullMethod":"/gitaly.RepositoryService/CreateRepositoryFromBundle","grpc.request.glProjectPath":"failing-group/successful-repo","grpc.request.glRepository":"project-1204","grpc.request.repoPath":"failing-group/successful-repo.git","grpc.request.repoStorage":"default","grpc.request.topLevelGroup":"failing-group","grpc.service":"gitaly.RepositoryService","grpc.start_time":"2020-02-04T11:04:15Z","level":"error","msg":"Cloning into bare repository '/var/opt/gitlab/git-data/repositories/failing-group/successful-repo.git'...\\n","peer.address":"192.168.0.70:42072","pid":11645,"span.kind":"server","system":"grpc","time":"2020-02-04T11:04:16Z"}
{"grpc.code":"OK","grpc.meta.auth_version":"v2","grpc.meta.client_name":"gitlab-web","grpc.method":"CreateRepositoryFromBundle","grpc.request.deadline":"2020-02-04T17:05:55Z","grpc.request.fullMethod":"/gitaly.RepositoryService/CreateRepositoryFromBundle","grpc.request.glProjectPath":"failing-group/successful-repo","grpc.request.glRepository":"project-1204","grpc.request.repoPath":"failing-group/successful-repo.git","grpc.request.repoStorage":"default","grpc.request.topLevelGroup":"failing-group","grpc.service":"gitaly.RepositoryService","grpc.start_time":"2020-02-04T11:04:15Z","grpc.time_ms":688.593,"level":"info","msg":"finished streaming call with code OK","peer.address":"192.168.0.70:42072","pid":11645,"span.kind":"server","system":"grpc","time":"2020-02-04T11:04:16Z"}
{"grpc.code":"OK","grpc.meta.auth_version":"v2","grpc.meta.client_name":"gitlab-web","grpc.method":"RemoveRepository","grpc.request.deadline":"2020-02-04T17:05:56Z","grpc.request.fullMethod":"/gitaly.RepositoryService/RemoveRepository","grpc.request.glProjectPath":"failing-group/failing-repo","grpc.request.glRepository":"project-1205","grpc.request.repoPath":"failing-group/failing-repo.git","grpc.request.repoStorage":"default","grpc.request.topLevelGroup":"failing-group","grpc.service":"gitaly.RepositoryService","grpc.start_time":"2020-02-04T11:04:16Z","grpc.time_ms":0.091,"level":"info","msg":"finished unary call with code OK","peer.address":"192.168.0.70:42072","pid":11645,"span.kind":"server","system":"grpc","time":"2020-02-04T11:04:16Z"}
{"grpc.meta.auth_version":"v2","grpc.meta.client_name":"gitlab-web","grpc.method":"CreateRepositoryFromBundle","grpc.request.deadline":"2020-02-04T17:04:16Z","grpc.request.fullMethod":"/gitaly.RepositoryService/CreateRepositoryFromBundle","grpc.request.glProjectPath":"failing-group/failing-repo","grpc.request.glRepository":"project-1205","grpc.request.repoPath":"failing-group/failing-repo.git","grpc.request.repoStorage":"default","grpc.request.topLevelGroup":"failing-group","grpc.service":"gitaly.RepositoryService","grpc.start_time":"2020-02-04T11:04:16Z","level":"error","msg":"Cloning into bare repository '/var/opt/gitlab/git-data/repositories/failing-group/failing-repo.git'...\\n","peer.address":"192.168.0.70:42072","pid":11645,"span.kind":"server","system":"grpc","time":"2020-02-04T11:19:40Z"}
{"grpc.code":"OK","grpc.meta.auth_version":"v2","grpc.meta.client_name":"gitlab-web","grpc.method":"CreateRepositoryFromBundle","grpc.request.deadline":"2020-02-04T17:04:16Z","grpc.request.fullMethod":"/gitaly.RepositoryService/CreateRepositoryFromBundle","grpc.request.glProjectPath":"failing-group/failing-repo","grpc.request.glRepository":"project-1205","grpc.request.repoPath":"failing-group/failing-repo.git","grpc.request.repoStorage":"default","grpc.request.topLevelGroup":"failing-group","grpc.service":"gitaly.RepositoryService","grpc.start_time":"2020-02-04T11:04:16Z","grpc.time_ms":924027.2,"level":"info","msg":"finished streaming call with code OK","peer.address":"192.168.0.70:42072","pid":11645,"span.kind":"server","system":"grpc","time":"2020-02-04T11:19:40Z"}

Output of checks

(If you are reporting a bug on GitLab.com, write: This bug happens on GitLab.com)

Results of GitLab environment info

Expand for output related to GitLab environment info
System information
System:		Ubuntu 18.04
Current User:	git
Using RVM:	no
Ruby Version:	2.6.3p62
Gem Version:	2.7.9
Bundler Version:1.17.3
Rake Version:	12.3.3
Redis Version:	3.2.12
Git Version:	2.24.1
Sidekiq Version:5.2.7
Go Version:	unknown

GitLab information
Version:	12.6.4
Revision:	70900054dfe
Directory:	/opt/gitlab/embedded/service/gitlab-rails
DB Adapter:	PostgreSQL
DB Version:	9.6.13
URL:		https://nope.com
HTTP Clone URL:	https://nope.com/some-group/some-project.git
SSH Clone URL:	git@nope.com:some-group/some-project.git
Using LDAP:	yes
Using Omniauth:	yes
Omniauth Providers: saml

GitLab Shell
Version:	10.3.0
Repository storage paths:
- default: 	/var/opt/gitlab/git-data/repositories
GitLab Shell path:		/opt/gitlab/embedded/service/gitlab-shell
Git:		/opt/gitlab/embedded/bin/git

Results of GitLab application Check

Expand for output related to the GitLab application check

Restore process is stuck, can't do a sanitize.

Assignee Loading
Time tracking Loading