namhyung / uftrace

Function graph tracer for C/C++/Rust/Python
https://uftrace.github.io/slide/
GNU General Public License v2.0

Segmentation fault #1945

Open kangheng opened 4 weeks ago

kangheng commented 4 weeks ago

I've recorded the following PyTorch program (conv2d.py) with uftrace.

import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
#from apex import amp
from torch.profiler import profile, record_function, ProfilerActivity
#from torchsummary import summary
import logging
logging.basicConfig(level='INFO')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of nodes (default: 1)')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=1, type=int, metavar='N',
                        help='number of total epochs to run')
    parser.add_argument('--bsize', default=8, type=int, metavar='N',
                        help='number of batch_size')

    args = parser.parse_args()
    #args.world_size = args.gpus * args.nodes
    args.world_size = 1
    torch.set_num_threads(1)

    os.environ["OMP_NUM_THREADS"] = '1'
    os.environ["MKL__NUM_THREADS"] = '1'
    os.environ['MASTER_ADDR'] = '109.105.115.156'
    os.environ['MASTER_PORT'] = '8087'
    #os.environ['CUDA_LAUNCH_BLOCKING']='1'   
    logging.info("main args")
    #mp.spawn(train, nprocs=args.gpus, args=(args,))
    mp.spawn(train, nprocs=1, args=(args,))
    logging.info("main spawn")
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))

        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))

        self.fc = nn.Linear(4096*32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

def train(gpu, args):
    #print('calnum begin',flush=True)
    #rank = args.nr * args.gpus + gpu
    rank = gpu
    dist.init_process_group(backend='gloo', init_method='env://', world_size=args.world_size, rank=rank)
    torch.manual_seed(0)
    model = ConvNet()
    #torch.cuda.set_device(gpu) 
    #model.cuda(gpu)

    batch_size = args.bsize
    # define loss function (criterion) and optimizer
    #criterion = nn.CrossEntropyLoss().cuda(gpu)
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Wrap the model 
    logging.info("nn.parallel.DistributedDataParallel")
    #model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    model = nn.parallel.DistributedDataParallel(model)
    # Data loading code
    images=torch.randn(batch_size,1,256,256)
    labels=torch.randn(batch_size,10)

    start = datetime.now()

    for epoch in range(args.epochs):
    #train_sampler.set_epoch(epoch) #shuffle
        #logging.info("calnum begin")
        #print('enumerate begin',flush=True)
        logging.info("load_begin_0") 
        #for i, (images, labels) in enumerate(train_loader):
        # load                  
        if epoch > 1: break
        #images = images.cuda(non_blocking=True)
        #labels = labels.cuda(non_blocking=True) 

        logging.info("load_over_%d",epoch)          
        # Forward
        logging.info("Forward_begin_%d",epoch)
        outputs = model(images)
        loss = criterion(outputs, labels)
        logging.info("Forward_over_%d",epoch)
        # Backward and optimize
        logging.info("Backward_begin_%d",epoch)
        optimizer.zero_grad()  
        loss.backward()   
        optimizer.step()
        logging.info("Backward_over_%d",epoch)
        logging.info("load_begin_%d",epoch+1)
        if (epoch + 1) % 100 == 0 and gpu == 0:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, args.epochs, loss.item()))
        #print('enumerate over',flush=True)

    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))

    #print('calnum over',flush=True)  
if __name__ == '__main__':
    logging.info("main_begin") 
    main()

It works fine without uftrace.

$ python conv2d.py

INFO:root:main_begin
INFO:root:main args

INFO:torch.distributed.distributed_c10d:Added key: store_based_barrier_key:1 to store for rank: 0
INFO:torch.distributed.distributed_c10d:Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes.
INFO:root:nn.parallel.DistributedDataParallel
INFO:root:load_begin_0
INFO:root:load_over_0
INFO:root:Forward_begin_0
INFO:root:Forward_over_0
INFO:root:Backward_begin_0
INFO:root:Backward_over_0
INFO:root:load_begin_1
Training complete in: 0:00:04.732139
INFO:root:main spawn

But it fails when running with uftrace as follows.

$ uftrace record -P . python conv2d.py

WARN: process crashed by signal 11: Segmentation fault (si_code: 128)
WARN:  if this happens only with uftrace, please consider -e/--estimate-return option.

WARN: Backtrace from uftrace v0.16 ( x86_64 dwarf python3 luajit tui perf sched dynamic )
WARN: =====================================
WARN: [34] (call_unbound_noarg[55e3876b6485] <= slot_tp_repr[55e387785095])
WARN: [33] (lookup_maybe_method.cold.1789[55e3876211f3] <= PyId___repr__.15069[55e3878d70a0])
WARN: [32] (lookup_maybe_method[55e3876e8135] <= slot_tp_repr[55e38778507a])
WARN: [31] (<7fc1baa3bdce>[7fc1baa3bdce] <= <7fc1baa41335>[7fc1baa41335])
WARN: [30] (<7fc1baa41188>[7fc1baa41188] <= <7fc1bb1762ab>[7fc1bb1762ab])
WARN: [29] (<7fc1bb1761a5>[7fc1bb1761a5] <= <7fc1bb16e518>[7fc1bb16e518])
WARN: [28] (<7fc1bb16e497>[7fc1bb16e497] <= <7fc1bb161e8d>[7fc1bb161e8d])
WARN: [27] (<7fc1bb161dbb>[7fc1bb161dbb] <= <7fc1bb15de94>[7fc1bb15de94])
WARN: [26] (<7fc1bb15d80d>[7fc1bb15d80d] <= <7fc1baf7663d>[7fc1baf7663d])
WARN: [25] (<7fc1baf73b5c>[7fc1baf73b5c] <= <7fc1bae05221>[7fc1bae05221])
WARN: [24] (<7fc1bae04eea>[7fc1bae04eea] <= <7fc1bd7ba118>[7fc1bd7ba118])
WARN: [23] (<7fc1bd7ba113>[7fc1bd7ba113] <= _PyImport_LoadDynamicModuleWithSpec[55e3877c86da])
WARN: [22] (PyEval_EvalFrameEx[55e3876973d5] <= _PyEval_EvalCodeWithName[55e387704c32])
WARN: [21] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [20] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [19] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [18] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [17] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [16] (_PyObject_CallMethodIdObjArgs[55e3876f7615] <= PyImport_ImportModuleLevelObject[55e3876a85e6])
WARN: [15] (PyEval_EvalFrameEx[55e3876973d5] <= _PyEval_EvalCodeWithName[55e387704c32])
WARN: [14] (PyEval_EvalCode[55e387798f45] <= builtin_exec[55e3877d1f33])
WARN: [13] (PyEval_EvalFrameEx[55e3876973d5] <= _PyEval_EvalCodeWithName[55e387704c32])
WARN: [12] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [11] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [10] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [9] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [8] (_PyObject_CallMethodIdObjArgs[55e3876f7615] <= PyImport_ImportModuleLevelObject[55e3876a85e6])
WARN: [7] (builtin___import__[55e387744315] <= PyCFunction_Call[55e3876f9c96])
WARN: [6] (PyEval_EvalFrameEx[55e3876973d5] <= _PyEval_EvalCodeWithName[55e387704c32])
WARN: [5] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [4] (PyEval_EvalFrameEx[55e3876973d5] <= _PyFunction_Vectorcall[55e387705b1b])
WARN: [3] (_PyObject_CallMethodIdObjArgs[55e3876f7615] <= PyImport_ImportModuleLevelObject[55e3876a85e6])
WARN: [2] (PyEval_EvalFrameEx[55e3876973d5] <= _PyEval_EvalCodeWithName[55e387704c32])
WARN: [1] (PyEval_EvalCode[55e387798f45] <= run_eval_code_obj[55e387799004])
WARN: [0] (Py_BytesMain[55e3877d1b95] <= __libc_start_main[7fc1c029b493])

Please report this bug to https://github.com/namhyung/uftrace/issues.

WARN: child terminated by signal: 11: Segmentation fault
m9rco commented 4 weeks ago

+1

yihong0618 commented 4 weeks ago

Can you add a shebang to your Python script?

#!/usr/bin/env python3

chmod +x conv2d.py

then

uftrace record conv2d.py

kangheng commented 4 weeks ago

Can you add a shebang to your Python script?

#!/usr/bin/env python3

chmod +x conv2d.py

then

uftrace record conv2d.py

I have added the shebang to my Python script, but it still fails:

$ uftrace record -P . ./conv2d.py
WARN: Segmentation fault: address not mapped (addr: (nil))
WARN:  if this happens only with uftrace, please consider -e/--estimate-return option.

WARN: Backtrace from uftrace v0.16 ( x86_64 dwarf python3 luajit tui perf sched dynamic )
WARN: =====================================
WARN: [10] (call_trampoline[557f2695897c] <= profile_trampoline[557f26958ab7])
WARN: [9] (call_trace_protected[557f26909514] <= _PyEval_EvalFrameDefault.cold.2787[557f2699c00c])
WARN: [8] (PyEval_EvalFrameEx[557f269b43d5] <= _PyEval_EvalCodeWithName[557f26a21c32])
WARN: [7] (PyEval_EvalCode[557f26ab5f45] <= run_eval_code_obj[557f26ab6004])
WARN: [6] (PyRun_StringFlags[557f26aee575] <= builtin_exec[557f26aeeffc])
WARN: [5] (trace_call_function[557f2691c0ac] <= _PyEval_EvalFrameDefault.cold.2787[557f2699c417])
WARN: [4] (PyEval_EvalFrameEx[557f269b43d5] <= _PyEval_EvalCodeWithName[557f26a21c32])
WARN: [3] (PyEval_EvalCode[557f26ab5f45] <= builtin_exec[557f26aeef33])
WARN: [2] (PyEval_EvalFrameEx[557f269b43d5] <= _PyEval_EvalCodeWithName[557f26a21c32])
WARN: [1] (PyEval_EvalFrameEx[557f269b43d5] <= _PyEval_EvalCodeWithName[557f26a21c32])
WARN: [0] (Py_BytesMain[557f26aeeb95] <= __libc_start_main[7f1166c1a493])

Please report this bug to https://github.com/namhyung/uftrace/issues.

WARN: child terminated by signal: 11: Segmentation fault
honggyukim commented 4 weeks ago

You should remove -P . in this case.

Do you want to trace Python functions, or the Python interpreter itself, which is written in C?
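
For reference, the two modes would be invoked roughly like this (a sketch based on the commands in this thread; exact behavior depends on how uftrace and Python were built):

$ uftrace record ./conv2d.py             # trace Python-level functions (needs the shebang and python3 support in uftrace)
$ uftrace record -P . python conv2d.py   # dynamically patch the python interpreter binary, i.e. its C functions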

kangheng commented 4 weeks ago

You should remove -P . in this case.

Do you want to trace Python functions, or the Python interpreter itself, which is written in C?

I want to trace the C/C++ interface functions in PyTorch.

honggyukim commented 4 weeks ago

Then it will need more fine-tuned argument usage. Running python conv2d.py mostly shows the Python interpreter, so you should hide those functions; the interpreter will eventually call some native functions of PyTorch. But -P . is applied only to the python interpreter, so you won't be able to see the native functions of PyTorch. First, you need to find which binary file contains the PyTorch native functions.
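
One way to locate those binaries (an illustrative sketch, not from this thread; the paths and library names vary by installation):

$ python3 -c 'import torch, os; print(os.path.dirname(torch.__file__))'
/usr/lib/python3/dist-packages/torch     # hypothetical install path
$ ls /usr/lib/python3/dist-packages/torch/lib
libtorch.so  libtorch_cpu.so  libc10.so  ...    # the native PyTorch functions live in shared objects like these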

kangheng commented 3 weeks ago

Then it will need more fine-tuned argument usage. Running python conv2d.py mostly shows the Python interpreter, so you should hide those functions; the interpreter will eventually call some native functions of PyTorch. But -P . is applied only to the python interpreter, so you won't be able to see the native functions of PyTorch. First, you need to find which binary file contains the PyTorch native functions.

When I added the -e option, I was able to trace the function interface of PyTorch, although I couldn't record the exit times.

$ uftrace record -e -P . python conv2d.py
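
That matches what -e/--estimate-return is for: as its name suggests, uftrace skips hooking function returns and estimates the exit times instead, which avoids the crash at the cost of accurate return timestamps. The recorded data can then be inspected as usual, for example:

$ uftrace replay    # print the recorded function call trace
$ uftrace report    # show per-function time statistics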