Open · CatalysTim opened this issue 4 years ago
@CatalysTim thanks for reporting this, and sorry for such a late reply. I've just seen your message.
First, for some reason the data type of theta in this dataset is float64, while the algorithm needs float32. If you set theta = theta.astype('float32'), the result will be non-zero. Second, yes, the algorithm is memory-consuming, and it is typically used only for the time frames where motion artifacts appear in reconstructions by standard methods. For instance, if there are artifacts at time frames 30-33, it is enough to run the algorithm on 8 time frames (8 or 16 basis functions) around frame 32 or so. In your test I would consider larger chunk sizes (4-8), because the algorithm also performs 3D total variation, which involves the derivative in z. I would also recommend binning your data to half its size in order to play with the regularization parameters.
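As a rough sketch of the casting and binning I mean (the tomopy.downsample calls and the factor of 2 are just a suggestion, applied to the normalized data before the axis swap):

# sketch only: cast the angles and bin the normalized data by 2 in both
# detector dimensions before calling the solver
theta = theta.astype('float32')                    # rectv_gpu expects float32 angles
data = tomopy.downsample(data, level=1, axis=1)    # bin along z (sinogram rows)
data = tomopy.downsample(data, level=1, axis=2)    # bin along the detector columns
rot_center = rot_center / 2                        # rotation center scales with the binning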
The following code works for me on a similar GPU, although I didn't play with the regularization parameters. Let me know if you still have problems and I will look at this data in more detail.
import rectv_gpu
import numpy as np
import dxchange
import tomopy
import os
def takephi(ntheta, m):
    [x, y] = np.meshgrid(np.arange(-ntheta//2, ntheta//2), np.arange(-m//2, m//2))
    phi = np.zeros([m, 2*ntheta], dtype='float32')
    phi[:, ::2] = np.cos(2*np.pi*x*y/ntheta)/np.sqrt(ntheta)
    phi[:, 1::2] = np.sin(2*np.pi*x*y/ntheta)/np.sqrt(ntheta)
    return phi
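# Note: phi is an (m, 2*ntheta) float32 array holding the m basis functions,
# with cosine values in the even columns and sine values in the odd columns,
# sampled over the ntheta projection indices and normalized by sqrt(ntheta).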
h5fname = "fuelcell_i2.h5"
nproj = 301
ntimesteps = 8
rot_center = 702
chunksize = 8
frame = 30
data_size = [18060, 1100, 1440]
# Select sinogram range to reconstruct.
sino_start = 0
sino_end = data_size[1]
# Split sinograms into chunks to reconstruct to accommodate limited RAM
chunks = int(data_size[1] / chunksize)
nSino_per_chunk = (sino_end - sino_start) / chunks
strt = 0
for iChunk in range(0, chunks):
    sino_chunk_start = sino_start + nSino_per_chunk * iChunk
    sino_chunk_end = sino_start + nSino_per_chunk * (iChunk+1)
    if sino_chunk_end > sino_end:
        break
    sino = (int(sino_chunk_start), int(sino_chunk_end))
    proj, flat, dark, theta = dxchange.read_aps_32id(h5fname, sino=sino)
    if int(frame - ntimesteps) > 0:
        proj = proj[(frame - ntimesteps // 2) * nproj:(frame + ntimesteps // 2) * nproj, :, :]
    data = tomopy.normalize(proj, flat, dark, cutoff=1.4)
    data = tomopy.minus_log(data)
    data = tomopy.remove_nan(data, val=0.0)
    data = tomopy.remove_neg(data, val=0.00)
    data[np.where(data == np.inf)] = 0.00
    theta = np.linspace(0, np.pi * ntimesteps, nproj * ntimesteps, endpoint=False)
    [ntheta, nz, n] = data.shape
    lambda0 = pow(2, -9)
    lambda1 = pow(2, 2)
    niter = 16
    titer = 4
    nzp = 2
    ngpus = 1
    m = ntimesteps  # number of basis functions
    # reorder input data to (nz, ntheta, n) for compatibility
    data = data.swapaxes(0, 1)
    # take basis functions
    phi = takephi(ntheta, m)
    theta = theta.astype('float32')
    with rectv_gpu.Solver(n, ntheta, m, nz, nzp, ngpus) as cl:
        rtv = cl.recon(data, theta, phi, rot_center=rot_center, lambda0=lambda0,
                       lambda1=lambda1, niter=niter, titer=titer)
    # write each time frame as its own tiff stack
    for time_frame in range(0, m):
        fname = os.path.dirname(os.path.abspath(h5fname)) + '/' + os.path.splitext(os.path.basename(h5fname))[0] + '_rec_full/' + 'recon' + str(time_frame) + '_'
        dxchange.write_tiff_stack(rtv[:, time_frame], fname=fname, start=strt)
    strt += sino[1] - sino[0]
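As a quick sanity check (not part of the script above), printing the norm of the result right after the solver call should give a non-zero value once theta has been cast to float32:

print(np.linalg.norm(rtv))  # non-zero once theta is float32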
Hi, I've been trying to make use of the package on the operando fuel cell dataset provided by TomoBank HERE, just below the foam data that was demonstrated in the Nikitin et al. paper. The dataset has the dimensions (ntheta, nz, n) = (18060, 1100, 1440), with ntheta = nprojections × ntimesteps = 301 × 60. The sample recon script provided HERE circumvents limited-RAM issues for the TomoPy/ASTRA Toolbox recon algorithms by processing the dataset in chunks of 8 sinograms/slices at a time, i.e. (18060, 8, 1440) per iteration. This has worked fine with the CUDA-accelerated iterative SIRT/MLEM algorithms.
I've adapted the now-deprecated rectv parts of the script to the current build of rectv_gpu and am attempting to run it on a 16 GB Tesla V100 (with 50+ GB of system RAM). However, even down to a chunk size of 1, I'm still maxing out GPU memory and getting a "Segmentation fault (core dumped)" error at the rectv_gpu.Solver.recon step, so I'm considering batching by time steps as well to further reduce the memory footprint (a rough sketch of what I mean is below). Also, when I do get outputs, they're all zeros, i.e. black images. For reference, I've tested the foam data notebook example and it works fine.
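Roughly what I have in mind for batching by time steps (just a sketch; the frame window values are placeholders, with nproj = 301 projections per time step as above):

# hypothetical time-step batch: keep only a window of time frames around a
# frame of interest before reconstruction, instead of all 60 at once
frame = 30                                  # placeholder frame of interest
batch = 8                                   # placeholder number of time frames per batch
t0, t1 = frame - batch // 2, frame + batch // 2
proj = proj[t0 * nproj:t1 * nproj]          # projections for the selected time frames
theta = theta[t0 * nproj:t1 * nproj]        # matching angles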
Am I doing anything wrong in my workflow? Would batching by time steps affect the performance of this algorithm if I technically have motion at all time steps? And if not, any suggestions for what I should be doing instead?