direct-code-execution / ns-3-dce

Run real programs in the discrete time simulator ns3
http://www.nsnam.org/projects/direct-code-execution/
75 stars 46 forks source link

[DCE 1.11] Symbol lookup error [rumpns_vzalloc] for a very large topology #144

Closed markverick closed 2 months ago

markverick commented 2 months ago

Installed using bake with this configuration: ./bake.py configure -e dce-linux-1.11 -e dce-quagga-1.11

Description of the problem

In Linux mode, the simulation aborts with the following error when the p2p link count exceeds 252 or the interface count exceeds 504: symbol lookup error: elf-cache/0/liblinux.so: undefined symbol: rumpns_vzalloc

Running with fewer than 252 links does not trigger this problem.

According to the backtrace below, it seems that the FIB trie code (net/ipv4/fib_trie.c) in the Linux kernel tries to resize the trie, but it cannot allocate more memory.

/home/vagrant/bake/source/ns-3-dce/build/myscripts/ns-3-dce-quagga/bin/dce-quagga-ospfd-leo: symbol lookup error: elf-cache/0/liblinux.so: undefined symbol: rumpns_vzalloc

Catchpoint 1 (call to syscall exit_group), 0x00007ffff7df2548 in __GI__exit (status=127) at ../sysdeps/unix/sysv/linux/_exit.c:31
31      ../sysdeps/unix/sysv/linux/_exit.c: No such file or directory.
(gdb) bt
#0  0x00007ffff7df2548 in __GI__exit (status=127) at ../sysdeps/unix/sysv/linux/_exit.c:31
#1  0x00007ffff7de747d in _dl_signal_error (errcode=errcode@entry=0, objname=objname@entry=0x892000 "elf-cache/0/liblinux.so",
    occation=occation@entry=0x7ffff7df6c9d "symbol lookup error",
    errstring=errstring@entry=0x7fffdf38e4e0 "undefined symbol: rumpns_vzalloc") at dl-error.c:125
#2  0x00007ffff7de7523 in _dl_signal_cerror (errcode=0, objname=0x892000 "elf-cache/0/liblinux.so",
    occation=0x7ffff7df6c9d "symbol lookup error", errstring=0x7fffdf38e4e0 "undefined symbol: rumpns_vzalloc") at dl-error.c:155
#3  0x00007ffff7de2301 in _dl_lookup_symbol_x (undef_name=0x7ffff22b5e2c "rumpns_vzalloc", undef_map=<optimized out>,
    ref=ref@entry=0x7fffdf38e628, symbol_scope=0xed2b18, version=<optimized out>, type_class=type_class@entry=1, flags=1,
    skip_map=0x0) at dl-lookup.c:871
#4  0x00007ffff7de6b06 in _dl_fixup (l=<optimized out>, reloc_arg=<optimized out>) at ../elf/dl-runtime.c:111
#5  0x00007ffff7deeeca in _dl_runtime_resolve_xsave () at ../sysdeps/x86_64/dl-trampoline.h:129
#6  0x00007ffff243d14c in tnode_new (key=167772160, pos=1, bits=9) at net/ipv4/fib_trie.c:359
#7  0x00007ffff243e0dc in inflate (t=t@entry=0x7fffeb6a4360, oldtnode=oldtnode@entry=0x7fffcf167c10) at net/ipv4/fib_trie.c:519
#8  0x00007ffff243dd56 in resize (t=t@entry=0x7fffeb6a4360, tn=0x7fffcf167c10) at net/ipv4/fib_trie.c:840
#9  0x00007ffff243df18 in trie_rebalance (t=t@entry=0x7fffeb6a4360, tn=<optimized out>) at net/ipv4/fib_trie.c:992
#10 0x00007ffff243e4ab in fib_insert_node (t=0x7fffeb6a4360, tp=<optimized out>, new=new@entry=0x7fffcf168bf0,
    key=key@entry=167773180) at net/ipv4/fib_trie.c:1035
#11 0x00007ffff243e6d0 in fib_insert_alias (t=t@entry=0x7fffeb6a4360, tp=<optimized out>, l=l@entry=0x0,
    new=new@entry=0x7fffcf168bf0, fa=fa@entry=0x0, key=key@entry=167773180) at net/ipv4/fib_trie.c:1049
#12 0x00007ffff243e857 in fib_table_insert (tb=0x7fffeb6a4330, cfg=0x7fffdf38eba0) at net/ipv4/fib_trie.c:1238
#13 0x00007ffff2438ab4 in inet_rtm_newroute (skb=<optimized out>, nlh=<optimized out>) at net/ipv4/fib_frontend.c:750
#14 0x00007ffff23b7dc5 in rtnetlink_rcv_msg (skb=0x7fffcf163590, nlh=0x7fffeb6a4130) at net/core/rtnetlink.c:3406
#15 0x00007ffff23c6c07 in netlink_rcv_skb (skb=0x7fffcf163590, cb=0x7ffff23b7cc0 <rtnetlink_rcv_msg>)
    at net/netlink/af_netlink.c:3016
#16 0x00007ffff23b5278 in rtnetlink_rcv (skb=0x7fffcf163590) at net/core/rtnetlink.c:3412
#17 0x00007ffff23c459e in netlink_unicast_kernel (sk=sk@entry=0x7fffdba98990, skb=skb@entry=0x7fffcf163590,
    ssk=ssk@entry=0x7fffdb96e970) at net/netlink/af_netlink.c:1834
#18 0x00007ffff23c65ae in netlink_unicast (ssk=ssk@entry=0x7fffdb96e970, skb=0x7fffcf163590, portid=portid@entry=0,
    nonblock=<optimized out>) at net/netlink/af_netlink.c:1860
#19 0x00007ffff23c68e2 in netlink_sendmsg (sock=<optimized out>, msg=0x7fffdf38edd0, len=52) at net/netlink/af_netlink.c:2511
#20 0x00007ffff234465c in lib_sock_sendmsg (socket=socket@entry=0x7fffecafc010, msg=msg@entry=0x7fffdf38f230, flags=flags@entry=0)
    at arch/lib/lib-socket.c:114
#21 0x00007ffff2343cb0 in lib_sock_sendmsg_forwarder (v0=0x7fffecafc010, v1=0x7fffdf38f230, v2=0) at arch/lib/lib.c:99
#22 0x00007ffff78fa586 in ns3::KernelSocketFdFactory::Sendmsg (this=0x818b70, socket=0x7fffecafc010, msg=0x7fffdf38f230, flags=0)
    at ../model/kernel-socket-fd-factory.cc:724
#23 0x00007ffff7903a06 in ns3::KernelSocketFd::Sendmsg (this=0x3445bc0, msg=0x7fffdf38f230, flags=0)
    at ../model/kernel-socket-fd.cc:77
#24 0x00007ffff782dcde in dce_sendmsg (fd=6, msg=0x7fffdf38f230, flags=0) at ../model/dce-fd.cc:272
#25 0x00007fffdab4ff93 in sendmsg () at ../model/libc-ns3.h:222
#26 0x00007fffd9d09767 in netlink_talk (n=0x7fffdf38f2e0, nl=0x7fffd9f65a60 <netlink_cmd>) at rt_netlink.c:1205
#27 0x00007fffd9d0a43e in netlink_route_multipath (cmd=24, p=0x7fffcf1c40e8, rib=0x7fffcf1c3f68, family=2) at rt_netlink.c:1723
#28 0x00007fffd9d0a48a in kernel_add_ipv4 (p=0x7fffcf1c40e8, rib=0x7fffcf1c3f68) at rt_netlink.c:1729
#29 0x00007fffd9cf3c8c in rib_install_kernel (rn=0x7fffcf1c40e8, rib=0x7fffcf1c3f68) at zebra_rib.c:914
#30 0x00007fffd9cf43a5 in rib_process (rn=0x7fffcf1c40e8) at zebra_rib.c:1150
---Type <return> to continue, or q <return> to quit---
#31 0x00007fffd9cf4517 in process_subq (subq=0x7fffdf35f048, qindex=2 '\002') at zebra_rib.c:1183
#32 0x00007fffd9cf45b0 in meta_queue_process (dummy=0x7fffdf36ee88, data=0x7fffdf36ef68) at zebra_rib.c:1211
#33 0x00007fffd9d37150 in work_queue_run (thread=0x7fffdf38fe10) at workqueue.c:289
#34 0x00007fffd9d1e14e in thread_call (thread=0x7fffdf38fe10) at thread.c:1177
#35 0x00007fffd9cee4dd in main (argc=5, argv=0x3372430) at main.c:398
#36 0x00007ffff77c957f in ns3::DceManager::DoStartProcess (context=0x3441240) at ../model/dce-manager.cc:298
#37 0x00007ffff786c532 in ns3::TaskManager::Trampoline (context=0x33bafd0) at ../model/task-manager.cc:275
#38 0x00007ffff78659b5 in ns3::UcontextFiberManager::Trampoline (a0=32767, a1=-142162706, a2=0, a3=54243280)
    at ../model/ucontext-fiber-manager.cc:199
#39 0x00007ffff3a565e0 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#40 0x0000000000000000 in ?? ()
(gdb) 

output of ./waf configure

 >> Building dce-linux-1.11 -  /usr/local/bin/python3 /home/vagrant/bake/source/ns-3-dce/waf configure --prefix=/home/vagrant/bake/build --with-ns3=/home/vagrant/bake/build --with-elf-loader=/home/vagrant/bake/build/lib --with-libaspect=/home/vagrant/bake/build --enable-kernel-stack=/home/vagrant/bake/source/ns-3-dce/../net-next-nuse-4.4.0/arch --disable-python dir=/home/vagrant/bake/source/ns-3-dce
Setting top to                           : /home/vagrant/bake/source/ns-3-dce 
Setting out to                           : /home/vagrant/bake/source/ns-3-dce/build 
Checking for 'gcc' (C compiler)          : /usr/bin/gcc 
Checking for cc version                  : 8.4.0 
Checking for 'g++' (C++ compiler)        : /usr/bin/g++ 
Checking for program 'pkg-config'        : /usr/bin/pkg-config 
Checking for pkg-config version >= '0.0.0' : yes 
Checking for -Wl,--soname=foo              : yes 
Checking for libns3.34-core-debug (mandatory) : yes 
Checking for libns3.34-network-debug (mandatory) : yes 
Checking for libns3.34-internet-debug (mandatory) : yes 
Checking for libns3.34-point-to-point-debug (optional) : yes 
Checking for libns3.34-tap-bridge-debug (optional)     : yes 
Checking for libns3.34-netanim-debug (optional)        : yes 
Checking for libns3.34-wifi-debug (optional)           : yes 
Checking for libns3.34-csma-debug (optional)           : yes 
Checking for libns3.34-mobility-debug (optional)       : yes 
Checking for libns3.34-point-to-point-layout-debug (optional) : yes 
Checking for libns3.34-mpi-debug (optional)                   : not found 
Checking for libns3.34-lte-debug (optional)                   : yes 
Checking for libns3.34-visualizer-debug (optional)            : not found 
Checking for libns3.34-applications-debug (optional)          : yes 
Checking for libns3.34-fd-net-device-debug (optional)         : yes 
Checking for header stdint.h                                  : yes 
Checking for header inttypes.h                                : yes 
Checking for header sys/inttypes.h                            : not found 
Checking for header sys/types.h                               : yes 
Checking for header sys/stat.h                                : yes 
Checking for header dirent.h                                  : yes 
Checking for library dl                                       : yes 
Checking for glibc get_cpu_features                           : yes 
Checking for glibc secure_getenv                              : yes 
Checking for glibc explicit_bzero                             : no 
Checking for header valgrind/valgrind.h                       : not found 
Checking for header valgrind/memcheck.h                       : not found 
Checking for header sim.h                                     : yes 
Checking for libns3.34-topology-read-debug (optional)         : yes 
Checking for libns3.34-internet-apps-debug (optional)         : yes 
Checking for libns3.34-visualizer-debug (optional)            : not found 
Checking for library dl                                       : yes 
---- Summary of optional NS-3 features:
Static build                  : not enabled (option --enable-static not selected)
Logging                       : enabled
Assert checks                 : enabled
Code coverage                 : not enabled (option --enable-gcov not selected)
Example programs              : enabled
Test programs                 : enabled
Debug Symbols                 : enabled
Checking for libns3.34-flow-monitor-debug (mandatory)         : yes 
Checking for libns3.34-visualizer-debug (optional)            : not found 
Checking for library dl                                       : yes 
---- Summary of optional NS-3 features:
Static build                  : not enabled (option --enable-static not selected)
Logging                       : enabled
Assert checks                 : enabled
Code coverage                 : not enabled (option --enable-gcov not selected)
Example programs              : enabled
Test programs                 : enabled
Debug Symbols                 : enabled
Checking for 'gcc' (C compiler)                               : /usr/bin/gcc 
Checking for 'g++' (C++ compiler)                             : /usr/bin/g++ 
Checking for 'gcc' (C compiler)                               : /usr/bin/gcc 
Checking for 'g++' (C++ compiler)                             : /usr/bin/g++ 
Checking for header hook-manager.h                            : yes 
Checking for program 'valgrind'                               : not found 
Checking for program 'doxygen'                                : not found 
Checking for header netinet/sctp.h                            : not found 
Checking for 'gcc' (C compiler)                               : /usr/bin/gcc 
Checking for 'g++' (C++ compiler)                             : /usr/bin/g++ 
---- Summary of optional NS-3 features:
Static build                  : not enabled (option --enable-static not selected)
Logging                       : enabled
Assert checks                 : enabled
Code coverage                 : not enabled (option --enable-gcov not selected)
Example programs              : enabled
Test programs                 : enabled
Debug Symbols                 : enabled
ELF magic loader              : enabled
Aspect-based tracing          : enabled
sctp-tools-dev                : not enabled (sctp-tools (netinet/sctp.h) not found)
Python Bindings               : not enabled (disabled by user request)
'configure' finished successfully (1.372s)

Steps to reproduce

  1. Install https://gitlab.com/nsnam/bake and its dependency according to Tom Henderson
  2. Create more than 252 point-to-point links and run Quagga's OSPFD application on the nodes.
  3. The symbol lookup error occurs at a simulation time of about 60 s, before the routes converge.

Source code that produces this error:

/* -*- Mode:C++; c-file-style:"gnu"; indent-tabs-mode:nil; -*- */
/*
 * Copyright (c) 2012 Hajime Tazaki, NICT
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation;
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Author: Hajime Tazaki <tazaki@nict.go.jp>
 */

#include "ns3/network-module.h"
#include "ns3/core-module.h"
#include "ns3/internet-module.h"
#include "ns3/dce-module.h"
#include "ns3/quagga-helper.h"
#include "ns3/point-to-point-helper.h"
#include "ns3/topology-read-module.h"
#include "ns3/v4ping.h"
#include "ns3/ipv4.h"

#include <cstdio>
#include <memory>
#include <vector>

#include <sys/resource.h>
#undef NS3_MPI
#ifdef NS3_MPI
#include <mpi.h>
#include "ns3/mpi-interface.h"
#endif
using namespace ns3;

// Log component name used by this script.
NS_LOG_COMPONENT_DEFINE ("quagga-ospfd-leo");

// Parameters
// Simulation stop time in seconds. main() exposes it as the "stopTime"
// command-line option and skips Simulator::Stop when the value is 0.
uint32_t stopTime = 120;

// Static functions for linux stack
// Install the "ip" binary as a DCE application on `node`, with `str` parsed as
// its command-line arguments, and start it at simulation time `at`.
static void RunIp (Ptr<Node> node, Time at, std::string str)
{
  DceApplicationHelper ipHelper;
  ipHelper.SetBinary ("ip");
  ipHelper.SetStackSize (1 << 16);
  // Drop any previously configured arguments before parsing the new ones.
  ipHelper.ResetArguments ();
  ipHelper.ParseArguments (str.c_str ());

  ApplicationContainer installed = ipHelper.Install (node);
  installed.Start (at);
}

// Schedule "ip -f inet addr add <address> dev <name>" on `node` at time `at`.
static void AddAddress (Ptr<Node> node, Time at, const char *name, const char *address)
{
  std::string args = "-f inet addr add ";
  args += address;
  args += " dev ";
  args += name;
  RunIp (node, at, args);
}

// Generate the two host addresses of the /30 subnet that belongs to a link.
// Link N owns the four offsets [4N, 4N+3] inside 10.0.0.0/8; the returned pair
// holds offsets 4N+1 and 4N+2, each formatted as "10.a.b.c/30".
std::pair<std::string, std::string> RawAddressHelper(int link_id) {
  // Render an offset inside 10.0.0.0/8 as dotted-quad text with a /30 suffix.
  const auto render = [](int offset) {
    std::stringstream text;
    text << "10." << offset / (256 * 256)
         << "." << (offset / 256) % 256
         << "." << offset % 256
         << "/30";
    return text.str();
  };

  const int first_host = link_id * 4 + 1;
  return std::make_pair(render(first_host), render(first_host + 1));
}

void AssignIP(int ms, int link_id, NetDeviceContainer nd, int* if_count, bool enabled) {
  // Assert size
  auto node1 = nd.Get(0)->GetNode();
  auto node2 = nd.Get(1)->GetNode();
  std::string if1 = "sim" + std::to_string(if_count[node1->GetId()]++);
  std::string if2 = "sim" + std::to_string(if_count[node2->GetId()]++);
  std::string cmd1 = "link set " + if1 +" up";
  std::string cmd2 = "link set " + if2 +" up";
  AddAddress (node1, MilliSeconds (ms), if1.c_str(), RawAddressHelper(link_id).first.c_str());
  if (enabled) {
    RunIp (node1, MilliSeconds (ms + 10), "link set lo up");
    RunIp (node1, MilliSeconds (ms + 10), cmd1.c_str());

  }
  AddAddress (node2, MilliSeconds (ms), if2.c_str(), RawAddressHelper(link_id).second.c_str());
  if (enabled) {
    RunIp (node2, MilliSeconds (ms + 10), "link set lo up");
    RunIp (node2, MilliSeconds (ms + 10), cmd2.c_str());
  }
  printf("Assigned addresses: %s %s\n", RawAddressHelper(link_id).first.c_str(), RawAddressHelper(link_id).second.c_str());
  printf("Assigned addresses: %s %s\n", cmd1.c_str(), cmd2.c_str());
}

// Schedule three "ip" dumps on `node`, starting at simulated second `t`:
// the link list at t, all routing tables 10 ms later, addresses 20 ms later.
void PrintRouteAt(int t, Ptr<Node> node) {
  static const struct {
    int delay_ms;
    const char *ip_args;
  } kDumps[] = {
    {0, "link show"},
    {10, "route show table all"},
    {20, "addr list"},
  };

  for (const auto &dump : kDumps) {
    RunIp (node, MilliSeconds (t * 1000 + dump.delay_ms), dump.ip_args);
  }
}

void PrintAllRouteAt(int t, NodeContainer nc) {
  for (int i = 0; i < nc.GetN(); i++) {
    RunIp (nc.Get(i), MilliSeconds (t * 1000), "link show");
    RunIp (nc.Get(i), MilliSeconds (t * 1000 + 10), "route show table all");
    RunIp (nc.Get(i), MilliSeconds (t * 1000 + 20), "addr list");
  }
}

int
main (int argc, char *argv[])
{
  //  LogComponentEnable ("quagga-ospfd-rocketfuel", LOG_LEVEL_INFO);
  int row = 128;
  int col = 1;
  CommandLine cmd;
  cmd.AddValue ("stopTime", "Time to stop(seconds)", stopTime);
  cmd.Parse (argc,argv);

  Ptr<TopologyReader> inFile = 0;

  NodeContainer nodes;
  nodes.Create(row * col);
  // Prepare topology
  int i, j, k = 0;
  int link_count = 0;

  // Set up topology
  NetDeviceContainer ndc[row * col * 2];
  PointToPointHelper p2p;
  int if_count[row * col];
  memset(if_count, 0, sizeof if_count);
  p2p.SetChannelAttribute ("Delay", StringValue ("2ms"));
  p2p.SetDeviceAttribute ("DataRate", StringValue ("5Mbps"));

  for (i = 0; i < row; i++) {
    for (j = 0; j < col; j++) {
      int id = i * col + j;
      int id1 = i * col + (j+1)%col;
      int id2 = ((i+1)%row) * col + j;
      // printf("Node %d %d - %d\n",i, j, time);
      ndc[link_count++].Add(p2p.Install (nodes.Get(id), nodes.Get(id1)));
      ndc[link_count++].Add(p2p.Install (nodes.Get(id), nodes.Get(id2)));
      //   AssignIP(1000, link_count++, nodes, id, id1, true);
      //   AssignIP(1000, link_count++, nodes, id, id2, true);
      // Simulator::Schedule (Seconds(0), &AddISL, ipv4AddrHelper,
      //                     nodes, id, id2);
    }
  }

  // Internet stack installation
  DceManagerHelper processManager;
  processManager.SetTaskManagerAttribute ("FiberManagerType",
                                              EnumValue (0));
  processManager.SetNetworkStack ("ns3::LinuxSocketFdFactory",
                                  "Library", StringValue ("liblinux.so"));
  QuaggaHelper quagga;
  processManager.Install (nodes);

  // IP Configuration
  for (int i = 0; i < link_count; i++) {
    AssignIP(100, i, ndc[i], if_count, true);
  }

  // Install Quagga
  quagga.EnableOspf (nodes, "10.0.0.0/8");
  quagga.Install (nodes);

  // Enable pcap
  p2p.EnablePcapAll ("leo-linux-test");

  // Debug
  PrintAllRouteAt(10, nodes);
  PrintAllRouteAt(80, nodes);

  //
  // Step 9
  // Now It's ready to GO!
  //
  if (stopTime != 0)
    {
      Simulator::Stop (Seconds (stopTime));
    }
  Simulator::Run ();
  Simulator::Destroy ();

  return 0;
}
markverick commented 2 months ago

Fixed by always using kzalloc, even when size > PAGE_SIZE (instead of falling back to vzalloc): https://github.com/libos-nuse/net-next-nuse/blob/46e2206969943ba3fb87441dee0b433624daf35c/net/ipv4/fib_trie.c#L312