Docker and Shared memory

Hello everybody,

I am facing a weird issue with docker.
Context:
A node in a cluster that uses InfiniBand for computation.

Versions:
[root@node168 ~]# uname -r
3.10.0-229.20.1.el7.x86_64
[root@node168 ~]# cat /etc/redhat-release
CentOS Linux release 7.1.1503 (Core)
[root@node168 ~]# rpm -qa | grep docker
docker-1.12.6-28.git1398f24.el7.centos.x86_64
docker-client-1.12.6-28.git1398f24.el7.centos.x86_64
docker-common-1.12.6-28.git1398f24.el7.centos.x86_64

I am having issues with Docker and the shared memory (as in the title).
In more detail: when my application is executed inside the container, it uses different system calls to set up its shared memory than it does on bare metal, and I think this leads to the memory-related issues.

The application is a simple MPI application and here is the strace with the crucial part:
17:35:08.474880 open("/proc/self/status", O_RDONLY) = 7
17:35:08.474939 fstat(7, {st_mode=S_IFREG|0444, st_size=0, …}) = 0
17:35:08.474979 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2ab818d60000
17:35:08.475024 read(7, “Name:\ta.ibm\nState:\tR (running)\nTgid:\t1294\nNgid:\t0\nPid:\t1294\nPPid:\t1257\nTracerPid:\t1257\nUid:\t1053\t1053\t1053\t1053\nGid:\t1053\t1053\t1053\t1053\nFDSize:\t64\nGroups:\t151 1053 \nVmPeak:\t 282124 kB\nVmSize:\t 282124 kB\nVmLck:\t 0 kB\nVmPin:\t 0 kB\nVmHWM:\t 8352 kB\nVmRSS:\t 8352 kB\nVmData:\t 104580 kB\nVmStk:\t 136 kB\nVmExe:\t 4 kB\nVmLib:\t 7064 kB\nVmPTE:\t 256 kB\nVmSwap:\t 0 kB\nThreads:\t1\nSigQ:\t0/1031345\nSigPnd:\t0000000000000000\nShdPnd:\t0000000000000000\nSigBlk:\t0000000000000000\nSigIgn:\t0000000001000000\nSigCgt:\t0000000180000000\nCapInh:\t00000000a80c25fb\nCapPrm:\t0000000000000000\nCapEff:\t0000000000000000\nCapBnd:\t00000000a80c25fb\nSeccomp:\t0\nCpus_allowed:\tffffffff\nCpus_allowed_list:\t0-31\nMems_allowed:\t00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000003\nMems_allowed_l”, 1024) = 1024
17:35:08.475128 read(7, “ist:\t0-1\nvoluntary_ctxt_switches:\t52247\nnonvoluntary_ctxt_switches:\t131\n”, 1024) = 72
17:35:08.475166 read(7, “”, 1024) = 0
17:35:08.475208 close(7) = 0
17:35:08.475248 munmap(0x2ab818d60000, 4096) = 0
17:35:08.475290 munmap(0x2ab818d65000, 26694) = 0
17:35:08.475341 mbind(0x2ab8235b6000, 102236160, MPOL_INTERLEAVE, 0x1db3460, 65, 0) = 0
17:35:08.730559 madvise(0x2ab8235b6000, 102236160, MADV_DONTFORK) = 0
17:35:08.730896 write(5, “\t\0\0\0\f\0\3\0\320\354aC\374\177\0\0\0[#\270*\0\0\0\0\30\6\0\0\0\0\0[#\270*\0\0\1\0\0\0\17\0\0\0”, 48) = 48
17:35:08.747211 write(5, “\32\0\0\0\36\0\0\0\300\215bC\374\177\0\0\300\215bC\374\177\0\0\260\215bC\234\0\0\0\374\177\0\0\0\0\0\0019e\276\30\270*\0\0\0\20\0\0\0\0\0\0X\277\340\30\2\0\0\0\270*\0\0\0\0\0\0\2\0\0\0\201\221\22\0\0\0\0\0\0\0\0\0\1\0\0\0\360|\26\0X\235bC\v\0\0\0\2\270\5\0\0\0\20\30\24v\304\0\0\0\0\0”, 120) = 120
17:35:08.747328 write(5, “\32\0\0\0\36\0\0\0\300\215bC\374\177\0\0\300\215bC\374\177\0\0\260\215bC\234\0\0\0\374\177\0\0\0\0\0\0019e\276\30\270*\0\0\0\20\0\0\0\0\0\0X\277\340\30\2\0\0\0\270*\0\0\0\0\0\0\2\0\0\0\1.\1\0\0\0\0\0\0\0\0\0\0\0\0\0\360|\26\0X\235bC\v\0\0\0\3\270\5\0\0\20\20\30\24\22\7\7\0\0\0\0”, 120) = 120
17:35:08.747401 write(5, “\32\0\0\0\36\0\0\0\300\215bC\374\177\0\0\300\215bC\374\177\0\0\260\215bC’\0\0\0\374\177\0\0\0\0\0\0019e\276\30\270*\0\0\0\20\0\0\0\0\0\0X\277\340\30\2\0\0\0\270*\0\0\0\0\0\0\3\0\0\0\201\221\22\0\0\0\0\0\0\0\0\0\0\0\0\0\20d”\0X\235bC\v\0\0\0\2\270\5\0\0\20\20\30\24\22\7\7\0\0\0\0", 120) = 120
17:35:08.747462 write(5, “\32\0\0\0\36\0\0\0\300\215bC\374\177\0\0\300\215bC\374\177\0\0\260\215bC’\0\0\0\374\177\0\0\0\0\0\0019e\276\30\270*\0\0\0\20\0\0\0\0\0\0X\277\340\30\2\0\0\0\270*\0\0\0\0\0\0\3\0\0\0\1.\1\0\0\0\0\0\0\0\0\0\0\0\0\0\20d”\0X\235bC\v\0\0\0\3\270\5\0\0\20\20\30\24\22\7\7\0\0\0\0", 120) = 120
17:35:08.747551 write(5, “\30\0\0\0\20\0\n\0\200\212bC\374\177\0\0 \350\334\1\0\0\0\0\1\0\0\0\4\0\0\0\4\0\0\0\0\0\0\0008\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\200\0\0\0\0\2\0\0”, 64) = 64
17:35:08.747697 mmap(NULL, 24, PROT_READ|PROT_WRITE, MAP_SHARED, 5, 0x12ea000) = 0x2ab818d60000
17:35:08.747791 write(5, “\32\0\0\0\36\0\0\0\0*\261\1\0\0\0\0\200,\261\1\0\0\0\0\300\215bC\300\215\0\0\374\177\0bC\374\177\0\260\215bC\374\177\0\0’\0\0\0\0\0\1\0009e\276\30\0\20\0\0\270*\0\0\0\0\0\0\326\0\0\0009\0\0\0\3\0\0\0\270*\0\0\5\0\0\0\0\0\0\0\17\0\0\0\0\0\340\30\1\0\330\270\270\0\0\2\1\0\0\0\0\0\0\0”, 120) = 120
17:35:08.747868 write(5, “\30\0\0\0\20\0\n\0\200\212bC\374\177\0\0\340\375\334\1\0\0\0\0\1\0\0\0\4\0\0\0\4\0\0\0\0\0\0\0008\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\200\0\0\0\0\2\0\0”, 64) = 64
17:35:08.748029 mmap(NULL, 24, PROT_READ|PROT_WRITE, MAP_SHARED, 5, 0x12f1000) = 0x2ab818d65000

17:35:08.957396 mmap(NULL, 24, PROT_READ|PROT_WRITE, MAP_SHARED, 5, 0x51f9000) = 0x2ab829cc5000
17:35:08.957444 write(5, “\32\0\0\0\36\0\0\0\0*\261\1\0\0\0\0\200,\261\1\0\0\0\0\300\215bC\300\215\0\0\374\177\0bC\374\177\0\260\215bC\374\177\0\0’\0\0\0\0\0\1\0009e\276\30\0\20\0\0\270*\0\0\0\0\0\0\345?\0\0009\0\0\0\3\0\0\0\270*\0\0\5\0\0\0\0\0\0\0\17\0\0\0\0\0\340\30\1\0\330\270\270\0\0\2\1\0\0\0\0\0\0\0”, 120) = 120
17:35:08.957481 write(5, “\30\0\0\0\20\0\n\0\200\212bC\374\177\0\0P\230\367\1\0\0\0\0\1\0\0\0\4\0\0\0\4\0\0\0\0\0\0\0008\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\200\0\0\0\0\2\0\0”, 64) = -1 ENOMEM (Cannot allocate memory)
17:35:08.957935 open("/usr/share/locale/C/hpmpi", O_RDONLY) = -1 ENOENT (No such file or directory)
17:35:08.958013 open("/usr/share/locale/C/LC_MESSAGES/hpmpi", O_RDONLY) = -1 ENOENT (No such file or directory)
17:35:08.958081 open("/usr/share/locale/C/hpmpi", O_RDONLY) = -1 ENOENT (No such file or directory)
17:35:08.958138 open("/usr/share/locale/C/LC_MESSAGES/hpmpi", O_RDONLY) = -1 ENOENT (No such file or directory)
17:35:08.958195 fstat(2, {st_mode=S_IFIFO|0600, st_size=0, …}) = 0
17:35:08.958258 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2ab829cc6000
17:35:08.958310 write(2, “strace: Rank 0:99: MPI_Init: ibv_create_qp() failed\n”, 52) = 52
17:35:08.958364 write(2, “strace: Rank 0:99: MPI_Init: probably you need to increase pinnable memory in /etc/security/limits.conf\n”, 104) = 104
17:35:08.958427 write(2, “strace: Rank 0:99: MPI_Init: ibv_create_procqp() failed\n”, 56) = 56
17:35:08.958492 write(2, “strace: Rank 0:99: MPI_Init: Internal Error: Processes cannot connect to rdma device\n”, 85) = 85

The file descriptor used by the write() calls (5) is opened earlier, before any of the above happens, and it appears to be the IB card:
17:35:06.583508 futex(0x2ab821e0b534, FUTEX_WAKE_PRIVATE, 2147483647) = 0
17:35:06.583538 open("/dev/infiniband/uverbs0", O_RDWR|O_CLOEXEC) = 5
17:35:06.583571 write(5, “\0\0\0\0\4\0\2\0`tbC\374\177\0\0”, 16) = 16
17:35:06.583603 write(5, “\1\0\0\0\4\0,\0\240sbC\374\177\0\0”, 16) = 16
17:35:06.583632 write(5, “\2\0\0\0\6\0\n\0000tbC\374\177\0\0\1\0\0\0\0\0\0\0”, 24) = 24
17:35:06.583662 open("/sys/class/infiniband/qib0/ports/1/pkeys/0", O_RDONLY|O_CLOEXEC) = 7
17:35:06.583697 read(7, “0xffff\n”, 8) = 7
17:35:06.583724 close(7) = 0
17:35:06.583753 open("/sys/class/infiniband/qib0/ports/1/pkeys/1", O_RDONLY|O_CLOEXEC) = 7
17:35:06.583786 read(7, “0x0000\n”, 8) = 7
17:35:06.583813 close(7) = 0
17:35:06.583841 open("/sys/class/infiniband/qib0/ports/1/pkeys/2", O_RDONLY|O_CLOEXEC) = 7
17:35:06.583874 read(7, “0x0000\n”, 8) = 7
17:35:06.583899 close(7) = 0
17:35:06.583927 open("/sys/class/infiniband/qib0/ports/1/pkeys/3", O_RDONLY|O_CLOEXEC) = 7
17:35:06.583959 read(7, “0x0000\n”, 8) = 7
17:35:06.583986 close(7) = 0
17:35:06.584018 open("/sys/class/infiniband/qib0/ports/1/gids/0", O_RDONLY|O_CLOEXEC) = 7
17:35:06.584051 read(7, “fe80:0000:0000:0000:0011:7500:006f:0d9a\n”, 41) = 40
17:35:06.584079 close(7) = 0

So it seems to try to use the memory on the IB card and then it runs out of memory?

The behavior is different on bare metal:
13:45:13.486233 open("/proc/self/status", O_RDONLY) = 9
13:45:13.486298 fstat(9, {st_mode=S_IFREG|0444, st_size=0, …}) = 0
13:45:13.486341 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2acef8669000
13:45:13.486378 read(9, “Name:\ta.ibm\nState:\tR (running)\nTgid:\t6547\nNgid:\t6472\nPid:\t6547\nPPid:\t6387\nTracerPid:\t6387\nUid:\t1053\t1053\t1053\t1053\nGid:\t1053\t1053\t1053\t1053\nFD
Size:\t64\nGroups:\t1053 \nVmPeak:\t 1880804 kB\nVmSize:\t 1824548 kB\nVmLck:\t 0 kB\nVmPin:\t 0 kB\nVmHWM:\t 46684 kB\nVmRSS:\t 46684 kB\nVmData:\t 94064 kB\nVmStk:\t 1
36 kB\nVmExe:\t 4 kB\nVmLib:\t 8088 kB\nVmPTE:\t 504 kB\nVmSwap:\t 0 kB\nThreads:\t2\nSigQ:\t0/1031345\nSigPnd:\t0000000000000000\nShdPnd:\t0000000000000000\nSigBlk:\t00000
00000000000\nSigIgn:\t0000000000000000\nSigCgt:\t000000018000446a\nCapInh:\t0000000000000000\nCapPrm:\t0000000000000000\nCapEff:\t0000000000000000\nCapBnd:\t0000001fffffffff\nSeccomp:\t0\nCpu
s_allowed:\t80000000\nCpus_allowed_list:\t31\nMems_allowed:\t00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,0000
0000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000000,00000003\nMems_allowed_list”, 1024) = 1024
13:45:13.486492 read(9, “:\t0-1\nvoluntary_ctxt_switches:\t82409\nnonvoluntary_ctxt_switches:\t389\n”, 1024) = 69
13:45:13.486554 read(9, “”, 1024) = 0
13:45:13.486603 close(9) = 0
13:45:13.486659 munmap(0x2acef8669000, 4096) = 0
13:45:13.486709 munmap(0x2acef86ee000, 41438) = 0
13:45:13.486761 mbind(0x2acf61c42000, 524288, MPOL_INTERLEAVE, 0x2e53060, 65, 0) = 0
13:45:13.486836 shmget(IPC_PRIVATE, 655360, IPC_CREAT|0600) = 15171630
13:45:13.486926 shmat(15171630, 0, 0) = 0x2acf620e4000
13:45:13.486967 shmctl(15171630, IPC_RMID, 0) = 0
13:45:13.487009 mbind(0x2acf620e4000, 655360, MPOL_INTERLEAVE, 0x2e53060, 65, 0) = 0
13:45:13.487136 shmget(IPC_PRIVATE, 819200, IPC_CREAT|0600) = 15499320
13:45:13.487188 shmat(15499320, 0, 0) = 0x2acf62184000
13:45:13.487240 shmctl(15499320, IPC_RMID, 0) = 0
13:45:13.487298 mbind(0x2acf62184000, 819200, MPOL_INTERLEAVE, 0x2e53060, 65, 0) = 0
13:45:13.487407 shmget(IPC_PRIVATE, 1024000, IPC_CREAT|0600) = 15958086
13:45:13.487460 shmat(15958086, 0, 0) = 0x2acf6224c000
13:45:13.487503 shmctl(15958086, IPC_RMID, 0) = 0
13:45:13.487559 mbind(0x2acf6224c000, 1024000, MPOL_INTERLEAVE, 0x2e53060, 65, 0) = 0
13:45:13.487679 shmget(IPC_PRIVATE, 1282048, IPC_CREAT|0600) = 16482390
13:45:13.487726 shmat(16482390, 0, 0) = 0x2acf62346000
13:45:13.487814 shmctl(16482390, IPC_RMID, 0) = 0
13:45:13.487884 mbind(0x2acf62346000, 1282048, MPOL_INTERLEAVE, 0x2e53060, 65, 0) = 0
13:45:13.488017 shmget(IPC_PRIVATE, 1605632, IPC_CREAT|0600) = 17039463
13:45:13.488058 shmat(17039463, 0, 0) = 0x2acf6247f000
13:45:13.488103 shmctl(17039463, IPC_RMID, 0) = 0

Here I can clearly see it using the shm* system calls (shmget, shmat, shmctl), and it works fine.

My application is a simple MPI application.
Ulimits are correct, and the command used to start the container is the following:
docker run \
  --net=none \
  --device=/dev/ipath \
  --device=/dev/infiniband/issm0 \
  --device=/dev/infiniband/rdma_cm \
  --device=/dev/infiniband/ucm0 \
  --device=/dev/infiniband/umad0 \
  --device=/dev/infiniband/uverbs0 \
  -v /root:/root \
  -v /etc/localtime:/etc/localtime:ro \
  --hostname=${NAME} \
  -d controller:5050/${IMAGE}

I have tried playing with the --ipc option to share the shared memory with the bare-metal host, but it didn't change much.

Any ideas?

Regards,