Closed rhiswell closed 5 years ago
Could you provide more details? How did you create the container?
I just followed the QuickStart in the README on the master branch. Here are some details:
// ffrouter/ffrouter.h
const char HOST_LIST[HOST_NUM][16] = {
"192.168.1.140", // ser140
"192.168.1.141" // ser141
}
// ffrouter/ffrouter.cpp
this->vip_map["10.40.0.3"] = "192.168.1.140"; // node1 in ser140, FFR_ID=10
this->vip_map["10.32.0.2"] = "192.168.1.141"; // node1 in ser141, FFR_ID=10
this->vip_map["10.32.0.3"] = "192.168.1.141"; // node2 in ser141, FFR_ID=11
Here, ser140 and ser141 are each equipped with a ConnectX-3 adapter card. This is the driver installed on the host:
root@ser141:~# ofed_info -s
MLNX_OFED_LINUX-4.4-2.0.7.0:
root@ser141:~# ibv_devinfo
hca_id: mlx4_0
transport: InfiniBand (0)
fw_ver: 2.42.5000
node_guid: 7cfe:9003:0016:4d00
sys_image_guid: 7cfe:9003:0016:4d03
vendor_id: 0x02c9
vendor_part_id: 4099
hw_ver: 0x1
board_id: MT_1090120019
phys_port_cnt: 2
Device ports:
port: 1
state: PORT_ACTIVE (4)
max_mtu: 4096 (5)
active_mtu: 4096 (5)
sm_lid: 2
port_lid: 1
port_lmc: 0x00
link_layer: InfiniBand
port: 2
state: PORT_DOWN (1)
max_mtu: 4096 (5)
active_mtu: 4096 (5)
sm_lid: 0
port_lid: 0
port_lmc: 0x00
link_layer: InfiniBand
docker inspect [node1|router1]
where node1 and router1 are on the same host:
root@ser141:~# docker inspect router1
[
{
"Id": "5ede70d69ea8677c77a8766dcaa4986498634d22b06b5d5763f869b1bfe59371",
"Created": "2018-10-18T06:38:40.691817844Z",
"Path": "/bin/bash",
"Args": [],
"State": {
"Status": "running",
"Running": true,
"Paused": false,
"Restarting": false,
"OOMKilled": false,
"Dead": false,
"Pid": 19865,
"ExitCode": 0,
"Error": "",
"StartedAt": "2018-11-15T02:15:31.205168317Z",
"FinishedAt": "2018-11-01T13:33:45.731067322Z"
},
"Image": "sha256:dc9b0ca4639465d4baabb5a046fd04f1d99af953b8b166835e0fbe02b26a3b8b",
"ResolvConfPath": "/var/lib/docker/containers/5ede70d69ea8677c77a8766dcaa4986498634d22b06b5d5763f869b1bfe59371/resolv.conf",
"HostnamePath": "/var/lib/docker/containers/5ede70d69ea8677c77a8766dcaa4986498634d22b06b5d5763f869b1bfe59371/hostname",
"HostsPath": "/var/lib/docker/containers/5ede70d69ea8677c77a8766dcaa4986498634d22b06b5d5763f869b1bfe59371/hosts",
"LogPath": "/var/lib/docker/containers/5ede70d69ea8677c77a8766dcaa4986498634d22b06b5d5763f869b1bfe59371/5ede70d69ea8677c77a8766dcaa4986498634d22b06b5d5763f869b1bfe59371-json.log",
"Name": "/router1",
"RestartCount": 0,
"Driver": "aufs",
"MountLabel": "",
"ProcessLabel": "",
"AppArmorProfile": "",
"ExecIDs": [
"31edbf07c661923532adeacc7a646344a2b3ebf52de47743e79ef1c3042f146b"
],
"HostConfig": {
"Binds": [
"/sys/class/:/sys/class/",
"/freeflow:/freeflow",
"/dev/:/dev/"
],
"ContainerIDFile": "",
"LogConfig": {
"Type": "json-file",
"Config": {}
},
"NetworkMode": "host",
"PortBindings": {},
"RestartPolicy": {
"Name": "no",
"MaximumRetryCount": 0
},
"AutoRemove": false,
"VolumeDriver": "",
"VolumesFrom": null,
"CapAdd": null,
"CapDrop": null,
"Dns": [],
"DnsOptions": [],
"DnsSearch": [],
"ExtraHosts": null,
"GroupAdd": null,
"IpcMode": "",
"Cgroup": "",
"Links": null,
"OomScoreAdj": 0,
"PidMode": "",
"Privileged": true,
"PublishAllPorts": false,
"ReadonlyRootfs": false,
"SecurityOpt": [
"label=disable"
],
"UTSMode": "",
"UsernsMode": "",
"ShmSize": 67108864,
"Runtime": "runc",
"ConsoleSize": [
0,
0
],
"Isolation": "",
"CpuShares": 0,
"Memory": 0,
"NanoCpus": 0,
"CgroupParent": "",
"BlkioWeight": 0,
"BlkioWeightDevice": null,
"BlkioDeviceReadBps": null,
"BlkioDeviceWriteBps": null,
"BlkioDeviceReadIOps": null,
"BlkioDeviceWriteIOps": null,
"CpuPeriod": 0,
"CpuQuota": 0,
"CpuRealtimePeriod": 0,
"CpuRealtimeRuntime": 0,
"CpusetCpus": "",
"CpusetMems": "",
"Devices": [],
"DiskQuota": 0,
"KernelMemory": 0,
"MemoryReservation": 0,
"MemorySwap": 0,
"MemorySwappiness": -1,
"OomKillDisable": false,
"PidsLimit": 0,
"Ulimits": null,
"CpuCount": 0,
"CpuPercent": 0,
"IOMaximumIOps": 0,
"IOMaximumBandwidth": 0
},
"GraphDriver": {
"Name": "aufs",
"Data": null
},
"Mounts": [
{
"Type": "bind",
"Source": "/sys/class",
"Destination": "/sys/class",
"Mode": "",
"RW": true,
"Propagation": "rprivate"
},
{
"Type": "bind",
"Source": "/dev",
"Destination": "/dev",
"Mode": "",
"RW": true,
"Propagation": "rprivate"
},
{
"Type": "bind",
"Source": "/freeflow",
"Destination": "/freeflow",
"Mode": "",
"RW": true,
"Propagation": "rprivate"
}
],
"Config": {
"Hostname": "ser141",
"Domainname": "",
"User": "",
"AttachStdin": false,
"AttachStdout": false,
"AttachStderr": false,
"Tty": true,
"OpenStdin": true,
"StdinOnce": false,
"Env": [
"FFR_NAME=router1",
"LD_LIBRARY_PATH=/usr/lib/:/usr/local/lib/:/usr/lib64/"
],
"Cmd": [
"/bin/bash"
],
"Image": "ubuntu:14.04",
"Volumes": null,
"WorkingDir": "",
"Entrypoint": null,
"OnBuild": null,
"Labels": {}
},
"NetworkSettings": {
"Bridge": "",
"SandboxID": "e5a5f9521624d33dd75a509eeafc6b0a2facedf4031d57e407e33e4f085e6853",
"HairpinMode": false,
"LinkLocalIPv6Address": "",
"LinkLocalIPv6PrefixLen": 0,
"Ports": {},
"SandboxKey": "/var/run/docker/netns/default",
"SecondaryIPAddresses": null,
"SecondaryIPv6Addresses": null,
"EndpointID": "",
"Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"IPAddress": "",
"IPPrefixLen": 0,
"IPv6Gateway": "",
"MacAddress": "",
"Networks": {
"host": {
"IPAMConfig": null,
"Links": null,
"Aliases": null,
"NetworkID": "92e1bdd47db91583fca14c61fac122cc8add5aec5cb37b6424816b390f28e081",
"EndpointID": "3fa24d7ed61ada5c3efab3de2a48fa02ea462ed10d28b31f6cb7c5f056333256",
"Gateway": "",
"IPAddress": "",
"IPPrefixLen": 0,
"IPv6Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"MacAddress": ""
}
}
}
}
]
root@ser141:~# docker inspect node1
[
{
"Id": "5be5d2c8374fd658fd54ae946a717152e39e155857ba595d5ff310e7be7f7a48",
"Created": "2018-10-31T08:02:34.271801795Z",
"Path": "/bin/bash",
"Args": [],
"State": {
"Status": "running",
"Running": true,
"Paused": false,
"Restarting": false,
"OOMKilled": false,
"Dead": false,
"Pid": 20749,
"ExitCode": 0,
"Error": "",
"StartedAt": "2018-11-15T02:29:16.419775568Z",
"FinishedAt": "2018-11-15T02:29:08.469359022Z"
},
"Image": "sha256:dc9b0ca4639465d4baabb5a046fd04f1d99af953b8b166835e0fbe02b26a3b8b",
"ResolvConfPath": "/var/lib/docker/containers/5be5d2c8374fd658fd54ae946a717152e39e155857ba595d5ff310e7be7f7a48/resolv.conf",
"HostnamePath": "/var/lib/docker/containers/5be5d2c8374fd658fd54ae946a717152e39e155857ba595d5ff310e7be7f7a48/hostname",
"HostsPath": "/var/lib/docker/containers/5be5d2c8374fd658fd54ae946a717152e39e155857ba595d5ff310e7be7f7a48/hosts",
"LogPath": "/var/lib/docker/containers/5be5d2c8374fd658fd54ae946a717152e39e155857ba595d5ff310e7be7f7a48/5be5d2c8374fd658fd54ae946a717152e39e155857ba595d5ff310e7be7f7a48-json.log",
"Name": "/node1",
"RestartCount": 0,
"Driver": "aufs",
"MountLabel": "",
"ProcessLabel": "",
"AppArmorProfile": "",
"ExecIDs": null,
"HostConfig": {
"Binds": [
"/dev/:/dev/",
"/sys/class/:/sys/class/",
"/freeflow:/freeflow"
],
"ContainerIDFile": "",
"LogConfig": {
"Type": "json-file",
"Config": {}
},
"NetworkMode": "weave",
"PortBindings": {},
"RestartPolicy": {
"Name": "no",
"MaximumRetryCount": 0
},
"AutoRemove": false,
"VolumeDriver": "",
"VolumesFrom": null,
"CapAdd": null,
"CapDrop": null,
"Dns": [],
"DnsOptions": [],
"DnsSearch": [],
"ExtraHosts": null,
"GroupAdd": null,
"IpcMode": "",
"Cgroup": "",
"Links": null,
"OomScoreAdj": 0,
"PidMode": "",
"Privileged": true,
"PublishAllPorts": false,
"ReadonlyRootfs": false,
"SecurityOpt": [
"label=disable"
],
"UTSMode": "",
"UsernsMode": "",
"ShmSize": 67108864,
"Runtime": "runc",
"ConsoleSize": [
0,
0
],
"Isolation": "",
"CpuShares": 0,
"Memory": 0,
"NanoCpus": 0,
"CgroupParent": "",
"BlkioWeight": 0,
"BlkioWeightDevice": null,
"BlkioDeviceReadBps": null,
"BlkioDeviceWriteBps": null,
"BlkioDeviceReadIOps": null,
"BlkioDeviceWriteIOps": null,
"CpuPeriod": 0,
"CpuQuota": 0,
"CpuRealtimePeriod": 0,
"CpuRealtimeRuntime": 0,
"CpusetCpus": "",
"CpusetMems": "",
"Devices": [
{
"PathOnHost": "/dev/infiniband/uverbs0",
"PathInContainer": "/dev/infiniband/uverbs0",
"CgroupPermissions": "rwm"
},
{
"PathOnHost": "/dev/infiniband/rdma_cm",
"PathInContainer": "/dev/infiniband/rdma_cm",
"CgroupPermissions": "rwm"
}
],
"DiskQuota": 0,
"KernelMemory": 0,
"MemoryReservation": 0,
"MemorySwap": 0,
"MemorySwappiness": -1,
"OomKillDisable": false,
"PidsLimit": 0,
"Ulimits": null,
"CpuCount": 0,
"CpuPercent": 0,
"IOMaximumIOps": 0,
"IOMaximumBandwidth": 0
},
"GraphDriver": {
"Name": "aufs",
"Data": null
},
"Mounts": [
{
"Type": "bind",
"Source": "/dev",
"Destination": "/dev",
"Mode": "",
"RW": true,
"Propagation": "rprivate"
},
{
"Type": "bind",
"Source": "/freeflow",
"Destination": "/freeflow",
"Mode": "",
"RW": true,
"Propagation": "rprivate"
},
{
"Type": "bind",
"Source": "/sys/class",
"Destination": "/sys/class",
"Mode": "",
"RW": true,
"Propagation": "rprivate"
}
],
"Config": {
"Hostname": "5be5d2c8374f",
"Domainname": "",
"User": "",
"AttachStdin": false,
"AttachStdout": false,
"AttachStderr": false,
"Tty": true,
"OpenStdin": true,
"StdinOnce": false,
"Env": [
"FFR_NAME=router1-ser141",
"FFR_ID=10",
"LD_LIBRARY_PATH=/usr/lib",
"--ipc=container:router1"
],
"Cmd": [
"/bin/bash"
],
"Image": "ubuntu:14.04",
"Volumes": null,
"WorkingDir": "",
"Entrypoint": null,
"OnBuild": null,
"Labels": {}
},
"NetworkSettings": {
"Bridge": "",
"SandboxID": "4eba2d8029b936b0ae0c7375cfc17faf93b86ff5409b38f37897ea75182ca0ed",
"HairpinMode": false,
"LinkLocalIPv6Address": "",
"LinkLocalIPv6PrefixLen": 0,
"Ports": {},
"SandboxKey": "/var/run/docker/netns/4eba2d8029b9",
"SecondaryIPAddresses": null,
"SecondaryIPv6Addresses": null,
"EndpointID": "",
"Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"IPAddress": "",
"IPPrefixLen": 0,
"IPv6Gateway": "",
"MacAddress": "",
"Networks": {
"weave": {
"IPAMConfig": null,
"Links": null,
"Aliases": [
"5be5d2c8374f"
],
"NetworkID": "8b16395c24a45909fa9653aab84ccf13dd885ecea8983f843a3d3aac1d3e7651",
"EndpointID": "28d311a7d98a0f63fdbe46341d82e5346c905992d451fb75ffa05c7336a4de84",
"Gateway": "",
"IPAddress": "10.32.0.2",
"IPPrefixLen": 12,
"IPv6Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"MacAddress": "8e:5c:6d:dc:ca:b4"
}
}
}
}
]
After starting ffrouter in container router1, running ib_send_*
in the application node triggers the failure:
root@ser141:~# docker exec -it node1 bash
root@5be5d2c8374f:/# ib_send_bw
### FreeFlow ###
context->qp_table_mask=2047
mlx4: Warning: BlueFlame available, but failed to mmap() BlueFlame page.
*** Error in `ib_send_bw': malloc(): memory corruption: 0x0000000000ef9a10 ***
Aborted (core dumped)
I also noticed that the libmempool test does not pass in the container (node1):
root@5be5d2c8374f:/freeflow/libmempool# make && make install && ldconfig
g++ -c -g -O3 -Wall -Werror -fPIC -std=c++11 libmempool.cpp MemoryPool.tcc
g++ -g -shared -o libmempool.so libmempool.o
cp libmempool.so /usr/lib/libmempool.so
root@5be5d2c8374f:/freeflow/libmempool# bash test.sh && ./test
Segmentation fault (core dumped)
Your OFED version does not seem right. Please use exactly "MLNX_OFED_LINUX-4.0-2.0.0.1-ubuntu14.04-x86_64.tgz". Are you still facing the memory pool problem?
Thanks for your reply.
Your OFED version does not seem right. Please use exactly "MLNX_OFED_LINUX-4.0-2.0.0.1-ubuntu14.04-x86_64.tgz".
The memory corruption error went away after changing the host's driver from MLNX_OFED_LINUX-4.0-2.0.0.1-ubuntu14.04-x86_64.tgz
to MLNX_OFED_LINUX-4.0-2.0.0.1-ubuntu16.04-x86_64.tgz
(we run Ubuntu 16.04 on the host here). However, ib_send_bw
now gets stuck after the connection is successfully established. Any idea about this problem? Here is the screenshot:
More details can be found in the logs of ffrouter and ib_send_bw:
ser140-node1-ib_send_bw-client.txt ser140-router1.txt ser141-node1-ib_send_bw-server.txt ser141-router1.txt
Updated vip mapping:
// ffrouter/ffrouter.h
const char HOST_LIST[HOST_NUM][16] = {
"192.168.10.140", // ser140
"192.168.10.141" // ser141
}
// ffrouter/ffrouter.cpp
this->vip_map["10.40.0.6"] = "192.168.10.140"; // node1 in ser140, FFR_ID=10
this->vip_map["10.32.0.4"] = "192.168.10.141"; // node1 in ser141, FFR_ID=10
Are you still facing the memory pool problem?
No. Using mempool_insert
instead of mempool_get
in this line fixes the segmentation fault.
Many thanks.
-e --ipc=container:router1
makes --ipc=container:router1
an environment variable instead of reusing router1's IPC namespace, so ib_send_bw
gets stuck when ffrouter is working in fast-path mode.
Just remove -e
before --ipc=container:router1
to fix it.
When I run
ib_send_bw
in a container connected to an ffrouter, I get the following failure. Any idea about this failure?