GitLab restore hangs / is stuck on epoll_wait
Summary
When I try to restore a backup of GitLab, the restore process is stuck forever, no error, no warning, no way to know what's happening. Also, the process always hangs at the same place, with the same repository.
Note that gitaly is separated from rails in this case.
Steps to reproduce
Here is what I'm doing.
Backup a 100 GB instance, backup is 85 GB.
Restore the backup on another instance. The process will run, until a certain point where it hangs.
The restore process is executed with the --trace option, but logs are the same, there's nothing more than the repo, and [DONE] when it's done.
Example Project
N/A
What is the current bug behavior?
The restore process hangs.
What is the expected correct behavior?
Restore should work, or at least tell what's happening, and if something is not going well.
Relevant logs and/or screenshots
The logs of the restore process until it hangs (anonymized data)
...
* group-f/repo-f ... [DONE]
* group-g/repo-g ... [DONE]
* group-h/repo-h ... [DONE]
* group-i/poc/infrastructure ... [DONE]
* group-j/provisioning ... [DONE]
* failing-group/successful-repo ... [DONE]
* failing-group/failing-repo ...
The strace command is showing continuously the same sequence, with various time data. The first lines are the last lines that are not epoll_wait:
sendmsg(19, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\0@\0\0\0\0\0\r\227\0\0\2\0\4", iov_len=14}, {iov_base="\22\200\200\10\31 \252\312\227:?\243\267m\1\256\254a\237\257\367\307\316\217w\373\217\374JsW\235"..., iov_len=16379}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="$P=g\31\5\371\373\357\256kq\26\346~\362d\37\343=\274\242\7LZ\277\32\271\2s\315\33"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="!\202\315\203A\364\312\251\365\v4\235}\360a\215n\3203\236\212o\343(\372\0\204\1]\340\324\201"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="\350\32\25\333\17&\362)=\216~Q\253Z\24\370\6\263V\326U\353\200\26\337k\213r\332\204c\232"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="\266\303[\360\304\244\332\243\346T\243y\33.\342\242\361\214\252q\241\222d\374\r\321\270\3151\337C:"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="\3109\3075\2\364\220\234z(\332[\320\3-\255l&4i\360\221D(\364\357]>$\313)\252"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="\372\17\314\21\33\357\234\265Ibugr\367\366\\y\35\270\305\337\1\27_\377\273\241\4x\23434"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="\357h\212\306\215\3231\35!\5\272\30\270\215\v4Aq)\364Q\4\35U\2\31\272\301ooC\346"..., iov_len=16384}, {iov_base="\0\0\t\0\0\0\0\r\227", iov_len=9}, {iov_base="5\233\221(\0079\370\223\33", iov_len=9}], msg_iovlen=18, msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 131162
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=85544140}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=85563083}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=85592502}) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=87935, tv_nsec=74701448}) = 0
read(20, "5HA\372rd\315\7kM\327\313\6\351\313xM\34c\323\214\213\344\215\24\346\210gf$\347\311"..., 131072) = 57197
read(20, "", 73875) = 0
getpid() = 22180
sendmsg(19, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\0@\0\0\0\0\0\r\227\0\0\0\337q", iov_len=14}, {iov_base="\22\355\276\0035HA\372rd\315\7kM\327\313\6\351\313xM\34c\323\214\213\344\215\24\346\210g"..., iov_len=16379}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="gIf\220\354I\3563\3465e\314K\212f\332\305[\273qi\\\255\\\376\323N\246\17\306\354\371"..., iov_len=16384}, {iov_base="\0@\0\0\0\0\0\r\227", iov_len=9}, {iov_base="W&\327\270'%\362&$EP\327\321S<\243\347\271gY\316)Q\326_\265\335\21\2773@\31"..., iov_len=16384}, {iov_base="\0\37v\0\0\0\0\r\227", iov_len=9}, {iov_base="\220\315\224\347\254\211\362\2345S\36\343A\373\234\2\25\254\271\362H\24X\353+\316r=\3051r\362"..., iov_len=8054}], msg_iovlen=8, msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 57242
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=85878996}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=85897156}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=85912731}) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=87935, tv_nsec=75021105}) = 0
read(20, "", 131072) = 0
close(20) = 0
getpid() = 22180
sendmsg(19, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\0\0\4\10\0\0\0\r\227\0\0\0\5\0\0\0\0\1\0\0\r\227\0", iov_len=23}, {iov_base="\0\4\10\0\0\0\0\0\0\0\0\5", iov_len=12}], msg_iovlen=2, msg_controllen=0, msg_flags=0}, MSG_NOSIGNAL) = 35
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=86091347}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=86108662}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580814257, tv_nsec=86124739}) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=87935, tv_nsec=75233103}) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=87935, tv_nsec=75249037}) = 0
...
epoll_wait(17, [], 100, 21) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=94686, tv_nsec=709762257}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580821008, tv_nsec=720690726}) = 0
clock_gettime(CLOCK_REALTIME, {tv_sec=1580821008, tv_nsec=720706186}) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=94686, tv_nsec=709813549}) = 0
clock_gettime(CLOCK_MONOTONIC, {tv_sec=94686, tv_nsec=709828955}) = 0
On Gitaly side, we have a return code send for the failing-repo.The log sequence is the as than the previous repository restored. Here are the 2 3-steps sequences (anonymized)
{"grpc.code":"OK","grpc.meta.auth_version":"v2","grpc.meta.client_name":"gitlab-web","grpc.method":"RemoveRepository","grpc.request.deadline":"2020-02-04T17:05:55Z","grpc.request.fullMethod":"/gitaly.RepositoryService/RemoveRepository","grpc.request.glProjectPath":"failing-group/successful-repo","grpc.request.glRepository":"project-1204","grpc.request.repoPath":"failing-group/successful-repo.git","grpc.request.repoStorage":"default","grpc.request.topLevelGroup":"failing-group","grpc.service":"gitaly.RepositoryService","grpc.start_time":"2020-02-04T11:04:15Z","grpc.time_ms":0.662,"level":"info","msg":"finished unary call with code OK","peer.address":"192.168.0.70:42072","pid":11645,"span.kind":"server","system":"grpc","time":"2020-02-04T11:04:15Z"}
{"grpc.meta.auth_version":"v2","grpc.meta.client_name":"gitlab-web","grpc.method":"CreateRepositoryFromBundle","grpc.request.deadline":"2020-02-04T17:05:55Z","grpc.request.fullMethod":"/gitaly.RepositoryService/CreateRepositoryFromBundle","grpc.request.glProjectPath":"failing-group/successful-repo","grpc.request.glRepository":"project-1204","grpc.request.repoPath":"failing-group/successful-repo.git","grpc.request.repoStorage":"default","grpc.request.topLevelGroup":"failing-group","grpc.service":"gitaly.RepositoryService","grpc.start_time":"2020-02-04T11:04:15Z","level":"error","msg":"Cloning into bare repository '/var/opt/gitlab/git-data/repositories/failing-group/successful-repo.git'...\\n","peer.address":"192.168.0.70:42072","pid":11645,"span.kind":"server","system":"grpc","time":"2020-02-04T11:04:16Z"}
{"grpc.code":"OK","grpc.meta.auth_version":"v2","grpc.meta.client_name":"gitlab-web","grpc.method":"CreateRepositoryFromBundle","grpc.request.deadline":"2020-02-04T17:05:55Z","grpc.request.fullMethod":"/gitaly.RepositoryService/CreateRepositoryFromBundle","grpc.request.glProjectPath":"failing-group/successful-repo","grpc.request.glRepository":"project-1204","grpc.request.repoPath":"failing-group/successful-repo.git","grpc.request.repoStorage":"default","grpc.request.topLevelGroup":"failing-group","grpc.service":"gitaly.RepositoryService","grpc.start_time":"2020-02-04T11:04:15Z","grpc.time_ms":688.593,"level":"info","msg":"finished streaming call with code OK","peer.address":"192.168.0.70:42072","pid":11645,"span.kind":"server","system":"grpc","time":"2020-02-04T11:04:16Z"}
{"grpc.code":"OK","grpc.meta.auth_version":"v2","grpc.meta.client_name":"gitlab-web","grpc.method":"RemoveRepository","grpc.request.deadline":"2020-02-04T17:05:56Z","grpc.request.fullMethod":"/gitaly.RepositoryService/RemoveRepository","grpc.request.glProjectPath":"failing-group/failing-repo","grpc.request.glRepository":"project-1205","grpc.request.repoPath":"failing-group/failing-repo.git","grpc.request.repoStorage":"default","grpc.request.topLevelGroup":"failing-group","grpc.service":"gitaly.RepositoryService","grpc.start_time":"2020-02-04T11:04:16Z","grpc.time_ms":0.091,"level":"info","msg":"finished unary call with code OK","peer.address":"192.168.0.70:42072","pid":11645,"span.kind":"server","system":"grpc","time":"2020-02-04T11:04:16Z"}
{"grpc.meta.auth_version":"v2","grpc.meta.client_name":"gitlab-web","grpc.method":"CreateRepositoryFromBundle","grpc.request.deadline":"2020-02-04T17:04:16Z","grpc.request.fullMethod":"/gitaly.RepositoryService/CreateRepositoryFromBundle","grpc.request.glProjectPath":"failing-group/failing-repo","grpc.request.glRepository":"project-1205","grpc.request.repoPath":"failing-group/failing-repo.git","grpc.request.repoStorage":"default","grpc.request.topLevelGroup":"failing-group","grpc.service":"gitaly.RepositoryService","grpc.start_time":"2020-02-04T11:04:16Z","level":"error","msg":"Cloning into bare repository '/var/opt/gitlab/git-data/repositories/failing-group/failing-repo.git'...\\n","peer.address":"192.168.0.70:42072","pid":11645,"span.kind":"server","system":"grpc","time":"2020-02-04T11:19:40Z"}
{"grpc.code":"OK","grpc.meta.auth_version":"v2","grpc.meta.client_name":"gitlab-web","grpc.method":"CreateRepositoryFromBundle","grpc.request.deadline":"2020-02-04T17:04:16Z","grpc.request.fullMethod":"/gitaly.RepositoryService/CreateRepositoryFromBundle","grpc.request.glProjectPath":"failing-group/failing-repo","grpc.request.glRepository":"project-1205","grpc.request.repoPath":"failing-group/failing-repo.git","grpc.request.repoStorage":"default","grpc.request.topLevelGroup":"failing-group","grpc.service":"gitaly.RepositoryService","grpc.start_time":"2020-02-04T11:04:16Z","grpc.time_ms":924027.2,"level":"info","msg":"finished streaming call with code OK","peer.address":"192.168.0.70:42072","pid":11645,"span.kind":"server","system":"grpc","time":"2020-02-04T11:19:40Z"}
Output of checks
(If you are reporting a bug on GitLab.com, write: This bug happens on GitLab.com)
Results of GitLab environment info
Expand for output related to GitLab environment info
System information System: Ubuntu 18.04 Current User: git Using RVM: no Ruby Version: 2.6.3p62 Gem Version: 2.7.9 Bundler Version:1.17.3 Rake Version: 12.3.3 Redis Version: 3.2.12 Git Version: 2.24.1 Sidekiq Version:5.2.7 Go Version: unknown GitLab information Version: 12.6.4 Revision: 70900054dfe Directory: /opt/gitlab/embedded/service/gitlab-rails DB Adapter: PostgreSQL DB Version: 9.6.13 URL: https://nope.com HTTP Clone URL: https://nope.com/some-group/some-project.git SSH Clone URL: git@nope.com:some-group/some-project.git Using LDAP: yes Using Omniauth: yes Omniauth Providers: saml GitLab Shell Version: 10.3.0 Repository storage paths: - default: /var/opt/gitlab/git-data/repositories GitLab Shell path: /opt/gitlab/embedded/service/gitlab-shell Git: /opt/gitlab/embedded/bin/git
Results of GitLab application Check
Expand for output related to the GitLab application check
Restore process is stuck, can't do a sanitize.