Open woensug-choi opened 2 years ago
Note, -cuda-dev rocker
-c dockwater
docker run --rm -it -v /home/woensug/.gitconfig:/home/None/.gitconfig:ro -v /home/woensug:/home/woensug --name noetic_runtime -e DISPLAY -e TERM -e QT_X11_NO_MITSHM=1 -e XAUTHORITY=/tmp/.docker7ua80rmz.xauth -v /tmp/.docker7ua80rmz.xauth:/tmp/.docker7ua80rmz.xauth -v /tmp/.X11-unix:/tmp/.X11-unix -v /etc/localtime:/etc/localtime:ro 5f75307ef91f
install gpgpu dependencies
install gcc 5
https://askubuntu.com/questions/1235819/ubuntu-20-04-gcc-version-lower-than-gcc-7
add gcc 5 and 9
https://www.fosslinux.com/39386/how-to-install-multiple-versions-of-gcc-and-g-on-ubuntu-20-04.htm
select gcc 5
add PATHs
source gpgpu
compile gpgpu
link cudart.so at cmakelist target_link_libraries(nps_multibeam_sonar_ros_plugin "libcudart.so")
select gcc 9
catkin_make
cp config files to ~/.ros
Segmentation fault when running it. Probably caused by catkin_make compiled with gcc-9 and gpgpu-sim compiled with gcc-5. Can't compile with same gcc version..
* Copyright 2020 Naval Postgraduate School
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
#include <cuda.h>
#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <thrust/complex.h>
#include <stdio.h>
#include <iostream>
#include <complex>
#include <valarray>
#include <opencv2/core.hpp>
#include <opencv2/core/core.hpp>
// #include <math.h>
#include <assert.h>
// For complex numbers
#include <thrust/complex.h>
#include <cuComplex.h>
// For rand() function
#include <unistd.h>
#include <curand.h>
#include <curand_kernel.h>
// For FFT
#include <cufft.h>
#include <cufftw.h>
#include <thrust/device_vector.h>
#include <list>
#include <chrono>
#define BLOCK_SIZE 32
typedef std::complex<float> Complex;
typedef std::valarray<Complex> CArray;
typedef std::valarray<CArray> CArray2D;
__global__ void cuda_hello(){
printf("Hello World from GPU!\n");
}
static inline void _safe_cuda_call(cudaError err, const char *msg,
const char *file_name, const int line_number)
{
if (err != cudaSuccess)
{
fprintf(stderr, "%s\n\nFile: %s\n\nLine Number: %d\n\nReason: %s\n",
msg, file_name, line_number, cudaGetErrorString(err));
std::cin.get();
exit(EXIT_FAILURE);
}
}
#define SAFE_CALL(call, msg) _safe_cuda_call((call), (msg), __FILE__, __LINE__)
///////////////////////////////////////////////////////////////////////////
// Incident Angle Calculation Function
// incidence angle is target's normal angle accounting for the ray's azimuth
// and elevation
__device__ float compute_incidence(float azimuth, float elevation, float *normal)
{
// ray normal from camera azimuth and elevation
float camera_x = cosf(-azimuth) * cosf(elevation);
float camera_y = sinf(-azimuth) * cosf(elevation);
float camera_z = sinf(elevation);
float ray_normal[3] = {camera_x, camera_y, camera_z};
// target normal with axes compensated to camera axes
float target_normal[3] = {normal[2], -normal[0], -normal[1]};
// dot product
float dot_product = ray_normal[0] * target_normal[0]
+ ray_normal[1] * target_normal[1]
+ ray_normal[2] * target_normal[2];
return M_PI - acosf(dot_product);
}
///////////////////////////////////////////////////////////////////////////
__device__ __host__ float unnormalized_sinc(float t)
{
if (abs(t) < 1E-8)
return 1.0;
else
return sin(t) / t;
}
///////////////////////////////////////////////////////////////////////////
template <typename T>
__global__ void column_sums_reduce(const T *__restrict__ in, T *__restrict__ out, size_t width, size_t height)
{
__shared__ T sdata[BLOCK_SIZE][BLOCK_SIZE + 1];
size_t idx = threadIdx.x + blockDim.x * blockIdx.x;
size_t width_stride = gridDim.x * blockDim.x;
size_t full_width = (width & (~((unsigned long long)(BLOCK_SIZE - 1)))) + ((width & (BLOCK_SIZE - 1)) ? BLOCK_SIZE : 0); // round up to next block
for (size_t w = idx; w < full_width; w += width_stride)
{ // grid-stride loop across matrix width
sdata[threadIdx.y][threadIdx.x] = 0;
size_t in_ptr = w + threadIdx.y * width;
for (size_t h = threadIdx.y; h < height; h += BLOCK_SIZE)
{ // block-stride loop across matrix height
sdata[threadIdx.y][threadIdx.x] += (w < width) ? in[in_ptr] : 0;
in_ptr += width * BLOCK_SIZE;
}
__syncthreads();
T my_val = sdata[threadIdx.x][threadIdx.y];
for (int i = warpSize >> 1; i > 0; i >>= 1) // warp-wise parallel sum reduction
my_val += __shfl_xor_sync(0xFFFFFFFFU, my_val, i);
__syncthreads();
if (threadIdx.x == 0)
sdata[0][threadIdx.y] = my_val;
__syncthreads();
if ((threadIdx.y == 0) && ((w) < width))
out[w] = sdata[0][threadIdx.x];
}
}
__global__ void gpu_matrix_mult(float *a, float *b, float *c, int m, int n, int k)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
float sum = 0;
if (col < k && row < m)
{
for (int i = 0; i < n; i++)
{
sum += a[row * n + i] * b[i * k + col];
}
c[row * k + col] = sum;
}
}
__global__ void gpu_diag_matrix_mult(float *Val, int *RowPtr, float *diagVals, int total_rows)
{
const int row = threadIdx.x + blockIdx.x * blockDim.x;
if (row < total_rows)
{
for (int i = RowPtr[row]; i < RowPtr[row + 1]; i++)
{
Val[i] = diagVals[row] * Val[i];
}
}
}
///////////////////////////////////////////////////////////////////////////
// Sonar Claculation Function
__global__ void sonar_calculation(thrust::complex<float> *P_Beams,
float *depth_image,
float *normal_image,
int width,
int height,
int depth_image_step,
int normal_image_step,
float *rand_image,
int rand_image_step,
float *reflectivity_image,
int reflectivity_image_step,
float hPixelSize,
float vPixelSize,
float hFOV,
float vFOV,
float beam_azimuthAngleWidth,
float beam_elevationAngleWidth,
float ray_azimuthAngleWidth,
float ray_elevationAngleWidth,
float soundSpeed,
float sourceTerm,
int nBeams, int nRays,
int raySkips,
float sonarFreq, float delta_f,
int nFreq, float bandwidth,
float maxDistance,
float attenuation,
float area_scaler)
{
// 2D Index of current thread
const int beam = blockIdx.x * blockDim.x + threadIdx.x;
const int ray = blockIdx.y * blockDim.y + threadIdx.y;
//Only valid threads perform memory I/O
if ((beam < width) && (ray < height) && (ray % raySkips == 0))
{
// Location of the image pixel
const int depth_index = ray * depth_image_step / sizeof(float) + beam;
const int normal_index = ray * normal_image_step / sizeof(float) + (3 * beam);
const int rand_index = ray * rand_image_step / sizeof(float) + (2 * beam);
const int reflectivity_index = ray * reflectivity_image_step / sizeof(float) + beam;
// Input parameters for ray processing
float distance = depth_image[depth_index] * 1.0f;
float normal[3] = {normal_image[normal_index],
normal_image[normal_index + 1],
normal_image[normal_index + 2]};
// Calculate ray angles
double fl = static_cast<double>(width) / (2.0 * tan(hFOV/2.0));
float ray_azimuthAngle = atan2(static_cast<double>(beam) -
0.5 * static_cast<double>(width-1), fl);
float ray_elevationAngle = atan2(static_cast<double>(ray) -
0.5 * static_cast<double>(height-1), fl);
// Beam pattern
// float azimuthBeamPattern = abs(unnormalized_sinc(M_PI * 0.884
// / ray_azimuthAngleWidth * sin(ray_azimuthAngle)));
// only one column of rays for each beam at beam center, interference calculated later
float azimuthBeamPattern = 1.0;
float elevationBeamPattern = abs(unnormalized_sinc(M_PI * 0.884
/ (beam_elevationAngleWidth) * sin(ray_elevationAngle)));
// incidence angle
float incidence = acos(normal[2]); // compute_incidence(ray_azimuthAngle, ray_elevationAngle, normal);
// ----- Point scattering model ------ //
// Gaussian noise generated using opencv RNG
float xi_z = rand_image[rand_index];
float xi_y = rand_image[rand_index + 1];
// Calculate amplitude
thrust::complex<float> randomAmps = thrust::complex<float>(xi_z / sqrt(2.0), xi_y / sqrt(2.0));
thrust::complex<float> lambert_sqrt =
thrust::complex<float>(sqrt(reflectivity_image[reflectivity_index]) * cos(incidence), 0.0);
thrust::complex<float> beamPattern =
thrust::complex<float>(azimuthBeamPattern * elevationBeamPattern, 0.0);
thrust::complex<float> targetArea_sqrt = thrust::complex<float>(sqrt(distance * area_scaler), 0.0);
thrust::complex<float> propagationTerm =
thrust::complex<float>(1.0 / (distance*distance) * exp(-2.0 * attenuation * distance), 0.0);
thrust::complex<float> amplitude = randomAmps * thrust::complex<float>(sourceTerm, 0.0)
* propagationTerm * beamPattern * lambert_sqrt * targetArea_sqrt;
// Max distance cut-off
if (distance > maxDistance)
amplitude = thrust::complex<float>(0.0, 0.0);
// Summation of Echo returned from a signal (frequency domain)
for (size_t f = 0; f < nFreq; f++)
{
float freq;
if (nFreq % 2 == 0)
freq = delta_f * (-nFreq / 2.0 + f*1.0f + 1.0);
else
freq = delta_f * (-(nFreq - 1) / 2.0 + f*1.0f + 1.0);
float kw = 2.0 * M_PI * freq / soundSpeed; // wave vector
// Transmit spectrum, frequency domain
thrust::complex<float> kernel = exp(thrust::complex<float>(0.0f, 2.0f * distance * kw)) * amplitude;
P_Beams[beam * nFreq * (int)(nRays / raySkips) + (int)(ray / raySkips) * nFreq + f] =
thrust::complex<float>(kernel.real() , kernel.imag());
}
}
}
///////////////////////////////////////////////////////////////////////////
// Sonar Claculation main function
int main()
{
int height = 50;
int width = 10;
cv::Mat depth_image = cv::Mat::ones(height, width, CV_32FC1);
cv::Mat normal_image = cv::Mat::ones(height, width, CV_32FC3);
cv::Mat rand_image = cv::Mat::ones(height, width, CV_32FC2);
double _hFOV = 0.2;
double _vFOV = 0.2;
double _soundSpeed = 1500;
double _maxDistance = 2.0;
double _sourceLevel = 150.0;
int _nBeams = height; int _nRays = width;
int _raySkips = 1;
double _vPixelSize = _vFOV / height;
double _hPixelSize =_hFOV / width;
double _beam_azimuthAngleWidth = _hPixelSize;
double _beam_elevationAngleWidth = _vFOV/180*3.141592;
double _ray_azimuthAngleWidth = _hPixelSize;
double _ray_elevationAngleWidth = _vPixelSize*(_raySkips+1);
double _sonarFreq = 900e3;
double _bandwidth = 29.5e4;
float max_T = _maxDistance*2.0/_soundSpeed;
float _delta_f = 1.0/max_T;
int _nFreq = ceil(_bandwidth/_delta_f);
cv::Mat reflectivity_image = cv::Mat::ones(height, width, CV_32FC1);
double _attenuation = 0.1;
float *window;
window = new float[_nFreq];
float **beamCorrector;
beamCorrector = new float*[_nBeams];
for (int i = 0; i < _nBeams; i++)
beamCorrector[i] = new float[_nBeams];
float beamCorrectorSum = 10;
bool debugFlag = false;
auto start = std::chrono::high_resolution_clock::now();
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
if (debugFlag)
start = std::chrono::high_resolution_clock::now();
// ---- Allocation of properties parameters ---- //
const float hPixelSize = (float)_hPixelSize;
const float vPixelSize = (float)_vPixelSize;
const float hFOV = (float)_hFOV;
const float vFOV = (float)_vFOV;
const float beam_elevationAngleWidth = (float)_beam_elevationAngleWidth;
const float beam_azimuthAngleWidth = (float)_beam_azimuthAngleWidth;
const float ray_elevationAngleWidth = (float)_ray_elevationAngleWidth;
const float ray_azimuthAngleWidth = (float)_ray_azimuthAngleWidth;
const float soundSpeed = (float)_soundSpeed;
const float maxDistance = (float)_maxDistance;
const float sonarFreq = (float)_sonarFreq;
const float bandwidth = (float)_bandwidth;
const float attenuation = (float)_attenuation;
const int nBeams = _nBeams;
const int nRays = _nRays;
const int nFreq = _nFreq;
const int raySkips = _raySkips;
//#######################################################//
//############### Sonar Calculation ################//
//#######################################################//
// --------- Calculation parameters --------- //
const float max_distance = maxDistance;
// Signal
const float delta_f = bandwidth/nFreq;
// Precalculation
const float area_scaler = ray_azimuthAngleWidth * ray_elevationAngleWidth;
const float sourceLevel = (float)_sourceLevel; // db re 1 muPa;
const float pref = 1e-6; // 1 micro pascal (muPa);
const float sourceTerm = sqrt(pow(10, (sourceLevel / 10))) * pref; // source term
// --------- Allocate GPU memory for image --------- //
//Calculate total number of bytes of input and output image
const int depth_image_Bytes = depth_image.step * depth_image.rows;
const int normal_image_Bytes = normal_image.step * normal_image.rows;
const int rand_image_Bytes = rand_image.step * rand_image.rows;
const int reflectivity_image_Bytes = reflectivity_image.step * reflectivity_image.rows;
//Allocate device memory
float *d_depth_image, *d_normal_image, *d_rand_image, *d_reflectivity_image;
SAFE_CALL(cudaMalloc((void **)&d_depth_image, depth_image_Bytes), "CUDA Malloc Failed");
SAFE_CALL(cudaMalloc((void **)&d_normal_image, normal_image_Bytes), "CUDA Malloc Failed");
SAFE_CALL(cudaMalloc((void **)&d_rand_image, rand_image_Bytes), "CUDA Malloc Failed");
SAFE_CALL(cudaMalloc((void **)&d_reflectivity_image, reflectivity_image_Bytes), "CUDA Malloc Failed");
//Copy data from OpenCV input image to device memory
SAFE_CALL(cudaMemcpy(d_depth_image, depth_image.ptr(), depth_image_Bytes,
cudaMemcpyHostToDevice), "CUDA Memcpy Failed");
SAFE_CALL(cudaMemcpy(d_normal_image, normal_image.ptr(), normal_image_Bytes,
cudaMemcpyHostToDevice),"CUDA Memcpy Failed");
SAFE_CALL(cudaMemcpy(d_rand_image, rand_image.ptr(), rand_image_Bytes,
cudaMemcpyHostToDevice),"CUDA Memcpy Failed");
SAFE_CALL(cudaMemcpy(d_reflectivity_image, reflectivity_image.ptr(), reflectivity_image_Bytes,
cudaMemcpyHostToDevice), "CUDA Memcpy Failed");
//Specify a reasonable block size
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
//Calculate grid size to cover the whole image
const dim3 grid((depth_image.cols + block.x - 1) / block.x,
(depth_image.rows + block.y - 1) / block.y);
// Beam data array
thrust::complex<float> *P_Beams;
thrust::complex<float> *d_P_Beams;
const int P_Beams_N = nBeams * (int)(nRays / raySkips) * (nFreq + 1);
const int P_Beams_Bytes = sizeof(thrust::complex<float>) * P_Beams_N;
SAFE_CALL(cudaMallocHost((void **)&P_Beams, P_Beams_Bytes), "CUDA Malloc Failed");
SAFE_CALL(cudaMalloc((void **)&d_P_Beams, P_Beams_Bytes), "CUDA Malloc Failed");
//Launch the beamor conversion kernel
sonar_calculation<<<grid, block>>>(d_P_Beams,
d_depth_image,
d_normal_image,
normal_image.cols,
normal_image.rows,
depth_image.step,
normal_image.step,
d_rand_image,
rand_image.step,
d_reflectivity_image,
reflectivity_image.step,
hPixelSize,
vPixelSize,
hFOV,
vFOV,
beam_azimuthAngleWidth,
beam_elevationAngleWidth,
ray_azimuthAngleWidth,
ray_elevationAngleWidth,
soundSpeed,
sourceTerm,
nBeams, nRays,
raySkips,
sonarFreq, delta_f,
nFreq, bandwidth,
max_distance,
attenuation,
area_scaler);
// Synchronize to check for any kernel launch errors
SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed");
//Copy back data from destination device meory to OpenCV output image
SAFE_CALL(cudaMemcpy(P_Beams, d_P_Beams, P_Beams_Bytes,
cudaMemcpyDeviceToHost), "CUDA Memcpy Failed");
// Free GPU memory
cudaFree(d_depth_image);
cudaFree(d_normal_image);
cudaFree(d_rand_image);
cudaFree(d_reflectivity_image);
cudaFree(d_P_Beams);
// For calc time measure
if (debugFlag)
{
stop = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
printf("GPU Sonar Computation Time %lld/100 [s]\n",
static_cast<long long int>(duration.count() / 10000));
start = std::chrono::high_resolution_clock::now();
}
//########################################################//
//######### Summation, Culling and windowing #########//
//########################################################//
// Preallocate an array for return
CArray2D P_Beams_F(CArray(nFreq), nBeams);
// GPU grids and rows
unsigned int grid_rows, grid_cols;
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
// GPU Ray summation using column sum
float *P_Ray_real, *P_Ray_imag;
float *d_P_Ray_real, *d_P_Ray_imag;
const int P_Ray_N = (int)(nRays / raySkips) * (nFreq);
const int P_Ray_Bytes = sizeof(float) * P_Ray_N;
float *P_Ray_F_real, *P_Ray_F_imag;
float *d_P_Ray_F_real, *d_P_Ray_F_imag;
const int P_Ray_F_N = (nFreq)*1;
const int P_Ray_F_Bytes = sizeof(float) * P_Ray_F_N;
cudaMallocHost((void **)&P_Ray_real, P_Ray_Bytes);
cudaMallocHost((void **)&P_Ray_imag, P_Ray_Bytes);
cudaMallocHost((void **)&P_Ray_F_real, P_Ray_F_Bytes);
cudaMallocHost((void **)&P_Ray_F_imag, P_Ray_F_Bytes);
SAFE_CALL(cudaMalloc((void **)&d_P_Ray_real, P_Ray_Bytes), "CUDA Malloc Failed");
SAFE_CALL(cudaMalloc((void **)&d_P_Ray_imag, P_Ray_Bytes), "CUDA Malloc Failed");
SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_real, P_Ray_F_Bytes), "CUDA Malloc Failed");
SAFE_CALL(cudaMalloc((void **)&d_P_Ray_F_imag, P_Ray_F_Bytes), "CUDA Malloc Failed");
dim3 dimGrid_Ray((nFreq + BLOCK_SIZE - 1) / BLOCK_SIZE);
for (size_t beam = 0; beam < nBeams; beam ++)
{
for (size_t ray = 0; ray < (int)(nRays / raySkips); ray++)
{
for (size_t f = 0; f < nFreq; f++)
{
P_Ray_real[ray * nFreq + f] =
P_Beams[beam * nFreq * (int)(nRays / raySkips) + ray * nFreq + f].real();
P_Ray_imag[ray * nFreq + f] =
P_Beams[beam * nFreq * (int)(nRays / raySkips) + ray * nFreq + f].imag();
}
}
SAFE_CALL(cudaMemcpy(d_P_Ray_real, P_Ray_real, P_Ray_Bytes, cudaMemcpyHostToDevice),
"CUDA Memcpy Failed");
SAFE_CALL(cudaMemcpy(d_P_Ray_imag, P_Ray_imag, P_Ray_Bytes, cudaMemcpyHostToDevice),
"CUDA Memcpy Failed");
column_sums_reduce<<<dimGrid_Ray, dimBlock>>>(d_P_Ray_real, d_P_Ray_F_real,
nFreq, (int)(nRays / raySkips));
column_sums_reduce<<<dimGrid_Ray, dimBlock>>>(d_P_Ray_imag, d_P_Ray_F_imag,
nFreq, (int)(nRays / raySkips));
SAFE_CALL(cudaMemcpy(P_Ray_F_real, d_P_Ray_F_real, P_Ray_F_Bytes,
cudaMemcpyDeviceToHost), "CUDA Memcpy Failed");
SAFE_CALL(cudaMemcpy(P_Ray_F_imag, d_P_Ray_F_imag, P_Ray_F_Bytes,
cudaMemcpyDeviceToHost), "CUDA Memcpy Failed");
// SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed");
for (size_t f = 0; f < nFreq; f++)
P_Beams_F[beam][f] = Complex(P_Ray_F_real[f], P_Ray_F_imag[f]);
}
// free memory
cudaFreeHost(P_Beams);
cudaFreeHost(P_Ray_real);
cudaFreeHost(P_Ray_imag);
cudaFreeHost(P_Ray_F_real);
cudaFreeHost(P_Ray_F_imag);
cudaFree(d_P_Ray_real);
cudaFree(d_P_Ray_imag);
cudaFree(d_P_Ray_F_real);
cudaFree(d_P_Ray_F_imag);
if (debugFlag)
{
stop = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
printf("Sonar Ray Summation %lld/100 [s]\n",
static_cast<long long int>(duration.count() / 10000));
start = std::chrono::high_resolution_clock::now();
}
// -------------- Beam culling correction -----------------//
// beamCorrector and beamCorrectorSum is precalculated at parent cpp
float *P_Beams_Cor_real, *P_Beams_Cor_imag;
// float *P_Beams_Cor_F_real, *P_Beams_Cor_F_imag;
float *P_Beams_Cor_real_tmp, *P_Beams_Cor_imag_tmp;
float *d_P_Beams_Cor_real, *d_P_Beams_Cor_imag;
float *d_P_Beams_Cor_F_real, *d_P_Beams_Cor_F_imag;
const int P_Beams_Cor_N = nBeams * nFreq;
const int P_Beams_Cor_Bytes = sizeof(float) * P_Beams_Cor_N;
cudaMallocHost((void **)&P_Beams_Cor_real, P_Beams_Cor_Bytes);
cudaMallocHost((void **)&P_Beams_Cor_imag, P_Beams_Cor_Bytes);
cudaMallocHost((void **)&P_Beams_Cor_real_tmp, P_Beams_Cor_Bytes);
cudaMallocHost((void **)&P_Beams_Cor_imag_tmp, P_Beams_Cor_Bytes);
// cudaMallocHost((void **)&P_Beams_Cor_F_real, P_Beams_Cor_Bytes);
// cudaMallocHost((void **)&P_Beams_Cor_F_imag, P_Beams_Cor_Bytes);
SAFE_CALL(cudaMalloc((void **)&d_P_Beams_Cor_real, P_Beams_Cor_Bytes), "CUDA Malloc Failed");
SAFE_CALL(cudaMalloc((void **)&d_P_Beams_Cor_imag, P_Beams_Cor_Bytes), "CUDA Malloc Failed");
SAFE_CALL(cudaMalloc((void **)&d_P_Beams_Cor_F_real, P_Beams_Cor_Bytes), "CUDA Malloc Failed");
SAFE_CALL(cudaMalloc((void **)&d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes), "CUDA Malloc Failed");
float *beamCorrector_lin, *d_beamCorrector_lin;
const int beamCorrector_lin_N = nBeams * nBeams;
const int beamCorrector_lin_Bytes = sizeof(float) * beamCorrector_lin_N;
cudaMallocHost((void **)&beamCorrector_lin, beamCorrector_lin_Bytes);
SAFE_CALL(cudaMalloc((void **)&d_beamCorrector_lin, beamCorrector_lin_Bytes), "CUDA Malloc Failed");
// (nfreq x nBeams) * (nBeams x nBeams) = (nfreq x nBeams)
for (size_t beam = 0; beam < nBeams; beam ++)
{
for (size_t f = 0; f < nFreq; f++)
{
P_Beams_Cor_real[f * nBeams + beam] = P_Beams_F[beam][f].real() * 1.0f;
P_Beams_Cor_imag[f * nBeams + beam] = P_Beams_F[beam][f].imag() * 1.0f;
}
for (size_t beam_other = 0; beam_other < nBeams; beam_other ++)
beamCorrector_lin[beam_other * nBeams + beam] = beamCorrector[beam][beam_other];
}
SAFE_CALL(cudaMemcpy(d_P_Beams_Cor_real, P_Beams_Cor_real, P_Beams_Cor_Bytes,
cudaMemcpyHostToDevice),
"CUDA Memcpy Failed");
SAFE_CALL(cudaMemcpy(d_P_Beams_Cor_imag, P_Beams_Cor_imag, P_Beams_Cor_Bytes,
cudaMemcpyHostToDevice),
"CUDA Memcpy Failed");
SAFE_CALL(cudaMemcpy(d_beamCorrector_lin, beamCorrector_lin, beamCorrector_lin_Bytes,
cudaMemcpyHostToDevice),
"CUDA Memcpy Failed");
grid_rows = (nFreq + BLOCK_SIZE - 1) / BLOCK_SIZE;
grid_cols = (nBeams + BLOCK_SIZE - 1) / BLOCK_SIZE;
dim3 dimGrid_Beam(grid_cols, grid_rows);
gpu_matrix_mult<<<dimGrid_Beam, dimBlock>>>(d_P_Beams_Cor_real, d_beamCorrector_lin,
d_P_Beams_Cor_F_real, nFreq, nBeams, nBeams);
// SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed");
gpu_matrix_mult<<<dimGrid_Beam, dimBlock>>>(d_P_Beams_Cor_imag, d_beamCorrector_lin,
d_P_Beams_Cor_F_imag, nFreq, nBeams, nBeams);
// SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed");
//Copy back data from destination device meory
SAFE_CALL(cudaMemcpy(P_Beams_Cor_real_tmp, d_P_Beams_Cor_F_real, P_Beams_Cor_Bytes,
cudaMemcpyDeviceToHost),
"CUDA Memcpy Failed");
SAFE_CALL(cudaMemcpy(P_Beams_Cor_imag_tmp, d_P_Beams_Cor_F_imag, P_Beams_Cor_Bytes,
cudaMemcpyDeviceToHost),
"CUDA Memcpy Failed");
// SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed");
// --------------- Windowing ----------------- //
// float *window_diag, *d_window;
// const int window_N = nFreq * 1;
// const int window_Bytes = sizeof(float) * window_N;
// window_diag = (float *)malloc(window_Bytes);
// SAFE_CALL(cudaMalloc((void **)&d_window, window_Bytes), "CUDA Malloc Failed");
// int *diag_ptr, *d_diag_ptr;
// const int diag_ptr_N = nBeams * 1;
// const int diag_ptr_Bytes = sizeof(int) * diag_ptr_N;
// diag_ptr = (int *)malloc(diag_ptr_Bytes);
// SAFE_CALL(cudaMalloc((void **)&d_diag_ptr, diag_ptr_Bytes), "CUDA Malloc Failed");
// // (nBeams x nfreq) * (1 x nFreq) = (nBeams x nFreq)
// for (size_t beam = 0; beam < nBeams; beam ++)
// {
// for (size_t f = 0; f < nFreq; f++)
// { // Transpose
// P_Beams_Cor_real[beam * nFreq + f] = P_Beams_Cor_real_tmp[f * nBeams + beam];
// P_Beams_Cor_imag[beam * nFreq + f] = P_Beams_Cor_imag_tmp[f * nBeams + beam];
// window_diag[f] = window[f];
// }
// diag_ptr[beam] = (int)beam;
// }
// SAFE_CALL(cudaMemcpy(d_P_Beams_Cor_real, P_Beams_Cor_real, P_Beams_Cor_Bytes,
// cudaMemcpyHostToDevice),
// "CUDA Memcpy Failed");
// SAFE_CALL(cudaMemcpy(d_P_Beams_Cor_imag, P_Beams_Cor_imag, P_Beams_Cor_Bytes,
// cudaMemcpyHostToDevice),
// "CUDA Memcpy Failed");
// SAFE_CALL(cudaMemcpy(d_window, window_diag, window_Bytes,
// cudaMemcpyHostToDevice),
// "CUDA Memcpy Failed");
// SAFE_CALL(cudaMemcpy(d_diag_ptr, diag_ptr, diag_ptr_Bytes,
// cudaMemcpyHostToDevice),
// "CUDA Memcpy Failed");
// grid_rows = (nFreq + BLOCK_SIZE - 1) / BLOCK_SIZE;
// grid_cols = (nFreq + BLOCK_SIZE - 1) / BLOCK_SIZE;
// dim3 dimGrid_window(grid_cols, grid_rows);
// gpu_diag_matrix_mult<<<dimGrid_window, dimBlock>>>(d_P_Beams_Cor_real, d_diag_ptr, d_window, nFreq);
// SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed");
// gpu_diag_matrix_mult<<<dimGrid_window, dimBlock>>>(d_P_Beams_Cor_imag, d_diag_ptr, d_window, nFreq);
// SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed");
// //Copy back data from destination device meory
// SAFE_CALL(cudaMemcpy(P_Beams_Cor_F_real, d_P_Beams_Cor_real, P_Beams_Cor_Bytes,
// cudaMemcpyDeviceToHost),
// "CUDA Memcpy Failed");
// SAFE_CALL(cudaMemcpy(P_Beams_Cor_F_imag, d_P_Beams_Cor_imag, P_Beams_Cor_Bytes,
// cudaMemcpyDeviceToHost),
// "CUDA Memcpy Failed");
// SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed");
// Return
for (size_t beam = 0; beam < nBeams; beam ++)
for (size_t f = 0; f < nFreq; f++)
P_Beams_F[beam][f] =
Complex(P_Beams_Cor_real_tmp[f * nBeams + beam] / beamCorrectorSum,
P_Beams_Cor_imag_tmp[f * nBeams + beam] / beamCorrectorSum);
// Free memory
cudaFree(d_P_Beams_Cor_imag);
cudaFree(d_P_Beams_Cor_real);
cudaFree(d_P_Beams_Cor_F_imag);
cudaFree(d_P_Beams_Cor_F_real);
cudaFree(d_beamCorrector_lin);
// cudaFree(d_window);
// cudaFree(d_diag_ptr);
cudaFreeHost(P_Beams_Cor_real);
cudaFreeHost(P_Beams_Cor_imag);
// cudaFreeHost(P_Beams_Cor_F_real);
// cudaFreeHost(P_Beams_Cor_F_imag);
cudaFreeHost(P_Beams_Cor_real_tmp);
cudaFreeHost(P_Beams_Cor_imag_tmp);
cudaFreeHost(beamCorrector_lin);
// cudaFreeHost(window_diag);
// cudaFreeHost(diag_ptr);
// For calc time measure
if (debugFlag)
{
stop = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
printf("GPU Window & Correction %lld/100 [s]\n",
static_cast<long long int>(duration.count() / 10000));
start = std::chrono::high_resolution_clock::now();
}
//#################################################//
//################### FFT #####################//
//#################################################//
// SAFE_CALL(cudaDeviceSynchronize(), "Kernel Launch Failed");
const int DATASIZE = nFreq;
const int BATCH = nBeams;
// --- Host side input data allocation and initialization
cufftComplex *hostInputData = (cufftComplex *)malloc(
DATASIZE * BATCH * sizeof(cufftComplex));
for (int beam = 0; beam < BATCH; beam++)
{
for (int f = 0; f < DATASIZE; f++)
{
if (f < nFreq)
hostInputData[beam * DATASIZE + f] =
make_cuComplex(P_Beams_F[beam][f].real() * 1.0f,
P_Beams_F[beam][f].imag() * 1.0f);
else
hostInputData[beam * DATASIZE + f] =
(make_cuComplex(0.f, 0.f)); // zero padding
}
}
// --- Device side input data allocation and initialization
cufftComplex *deviceInputData;
SAFE_CALL(cudaMalloc((void **)&deviceInputData,
DATASIZE * BATCH * sizeof(cufftComplex)),
"FFT CUDA Malloc Failed");
SAFE_CALL(cudaMemcpy(deviceInputData, hostInputData,
DATASIZE * BATCH * sizeof(cufftComplex),
cudaMemcpyHostToDevice),
"FFT CUDA Memcopy Failed");
// --- Host side output data allocation
cufftComplex *hostOutputData =
(cufftComplex *)malloc(DATASIZE * BATCH * sizeof(cufftComplex));
// --- Device side output data allocation
cufftComplex *deviceOutputData;
cudaMalloc((void **)&deviceOutputData,
DATASIZE * BATCH * sizeof(cufftComplex));
// --- Batched 1D FFTs
cufftHandle handle;
int rank = 1; // --- 1D FFTs
int n[] = {DATASIZE}; // --- Size of the Fourier transform
// --- Distance between two successive input/output elements
int istride = 1, ostride = 1;
int idist = DATASIZE, odist = DATASIZE; // --- Distance between batches
// --- Input/Output size with pitch (ignored for 1D transforms)
int inembed[] = {0};
int onembed[] = {0};
int batch = BATCH; // --- Number of batched executions
cufftPlanMany(&handle, rank, n,
inembed, istride, idist,
onembed, ostride, odist, CUFFT_C2C, batch);
cufftExecC2C(handle, deviceInputData, deviceOutputData, CUFFT_FORWARD);
// --- Device->Host copy of the results
SAFE_CALL(cudaMemcpy(hostOutputData, deviceOutputData,
DATASIZE * BATCH * sizeof(cufftComplex),
cudaMemcpyDeviceToHost),
"FFT CUDA Memcopy Failed");
cufftDestroy(handle);
cudaFree(deviceOutputData);
cudaFree(deviceInputData);
free(hostInputData);
free(hostOutputData);
for (int beam = 0; beam < BATCH; beam++)
{
for (int f = 0; f < nFreq; f++)
{
P_Beams_F[beam][f] =
Complex(hostOutputData[beam * DATASIZE + f].x * delta_f,
hostOutputData[beam * DATASIZE + f].y * delta_f);
}
}
// For calc time measure
if (debugFlag)
{
stop = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
printf("GPU FFT Calc Time %lld/100 [s]\n",
static_cast<long long int>(duration.count() / 10000));
}
cuda_hello<<<1,1>>>();
return 0;
}
Works well as a standalone excutable with
FROM osrf/ros:melodic-desktop-full
# Required utilities
RUN apt update \
&& apt install -y --no-install-recommends \
build-essential \
cmake \
cppcheck \
curl \
git \
gnupg \
libeigen3-dev \
libgles2-mesa-dev \
libglu1-mesa-dev \
libgl1-mesa-glx \
libgl1-mesa-dri \
xutils-dev \
bison \
zlib1g-dev \
flex \
lsb-release \
pkg-config \
protobuf-compiler \
python3-dbg \
python3-pip \
python3-venv \
python3-catkin-tools \
qtbase5-dev \
ruby \
software-properties-common \
sudo \
wget \
gdb \
psmisc \
vim \
nano \
x11-apps \
&& apt clean
# --- Install GCUDA Libs --- #
# Download Cuda local installer
WORKDIR /tmp
RUN wget https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run
# Install CUDA Libs (without opengl libraries to keep GUI libraries)
RUN sh cuda_11.1.1_455.32.00_linux.run --silent --toolkit --no-opengl-libs
# --- Install GPGPU-SIM --- #
# Clone source code
WORKDIR /tmp/gpgpu-sim
ENV VERSION_GPGPU_SIM=4.0.3
RUN curl -fsSL https://github.com/woensug-choi/gpgpu-sim_distribution/archive/v$VERSION_GPGPU_SIM.tar.gz | tar xz
RUN mv gpgpu-sim_distribution-$VERSION_GPGPU_SIM gpgpu-sim_distribution
# Define environment variables
ENV CUDA_INSTALL_PATH="/usr/local/cuda"
ENV PATH="$CUDA_INSTALL_PATH/bin:${PATH}"
# Compile
WORKDIR /tmp/gpgpu-sim/gpgpu-sim_distribution
RUN /bin/bash -c "source setup_environment && make -j$(nproc)"
# Define runtime env. variables
ENV PTX_SIM_MODE_FUNC=1
ENV PTX_SIM_MODE_DETAIL=0
# Install dave dependencies
# Set Ubuntu release
ARG RELEASE=bionic
# Set ROS distribution
ARG DIST=melodic
# Set Gazebo verison
ARG GAZ=gazebo9
RUN apt update \
&& apt install -y --no-install-recommends \
${GAZ} \
lib${GAZ}-dev \
python-vcstool \
ros-${DIST}-gazebo-plugins \
ros-${DIST}-gazebo-ros \
ros-${DIST}-gazebo-ros-control \
ros-${DIST}-gazebo-ros-pkgs \
ros-${DIST}-effort-controllers \
ros-${DIST}-geographic-info \
ros-${DIST}-hector-gazebo-plugins \
ros-${DIST}-joint-state-controller \
ros-${DIST}-joint-state-publisher \
ros-${DIST}-joy \
ros-${DIST}-joy-teleop \
ros-${DIST}-kdl-parser-py \
ros-${DIST}-key-teleop \
ros-${DIST}-move-base \
ros-${DIST}-robot-localization \
ros-${DIST}-robot-state-publisher \
ros-${DIST}-ros-base \
ros-${DIST}-rqt \
ros-${DIST}-rqt-common-plugins \
ros-${DIST}-rqt-robot-plugins \
ros-${DIST}-rviz \
ros-${DIST}-teleop-tools \
ros-${DIST}-teleop-twist-joy \
ros-${DIST}-teleop-twist-keyboard \
ros-${DIST}-tf2-geometry-msgs \
ros-${DIST}-tf2-tools \
ros-${DIST}-urdfdom-py \
ros-${DIST}-velodyne-gazebo-plugins \
ros-${DIST}-velodyne-simulator \
ros-${DIST}-xacro \
&& apt clean
# Add user
ARG USER_ID=1000
ARG GROUP_ID=1000
RUN groupadd -g ${GROUP_ID} gpgpu-sim \
&& useradd -ms /bin/bash gpgpu-sim -g gpgpu-sim
# RUN COMMAND
# docker run -it --net=host \
# --env="DISPLAY" \
# --env="QT_X11_NO_MITSHM=1" \
# --volume="/tmp/.X11-unix:/tmp/.X11-unix:rw" \
# -v /home/${USER}:/home/gpgpu-sim \
# -u $(id -u ${USER}):$(id -g ${USER}) \
# gpgpu-sim-melodic \
# bash
# First things first
# source /tmp/gpgpu-sim/gpgpu-sim_distribution/setup_environment
The GPGPU-SIM repo https://github.com/gpgpu-sim/gpgpu-sim_distribution
This could benefit those who do not have a CUDA-capable NVIDIA card or any GPU card.
Testing on having it on our dockwater docker image