Creating multiple temporary variables inside __global__ and __device__ CUDA functions seems to result in poor performance
I am doing a project that requires implementing many small 3x3 matrix multiplications. To better organize my code, I wrote many __device__
functions to split such small operations. However, when I use those __device__
functions(calcNED
, Geoditic2ECEF
) in my main __global__
function cudaCalcDistance
, it performs insanely slow. calcNED
takes 500 ms to execute. Is it the fact that I used lots of temporary variables that caused such poor performance? If that is the case, is there a solution that avoids using many temporary variables while keeping the code clear?
// Geodetic position. Latitude/longitude are in degrees (they are converted
// with "/ 180 * M_PI" before use in Geoditic2ECEF); height is presumably
// meters above the ellipsoid -- TODO confirm with the data source.
// NOTE(review): "longtitude" is misspelled, but the name is part of the
// struct's interface and is referenced elsewhere, so it is kept as-is.
struct GPS {
float latitude;
float longtitude;
float height;
};
// A simple 3D Cartesian point/vector with single-precision components,
// usable from both host and device code.
struct Coordinate
{
    // All components default to zero so `Coordinate{}` is the origin.
    __host__ __device__ Coordinate(float x_ = 0, float y_ = 0, float z_ = 0)
        : x(x_), y(y_), z(z_)
    {
    }

    // Euclidean (L2) length of the vector.
    __host__ __device__ float norm()
    {
        const float squaredLength = x * x + y * y + z * z;
        return sqrtf(squaredLength);
    }

    float x;
    float y;
    float z;
};
// Converts a geodetic position (latitude/longitude in degrees, height in
// meters) to Earth-Centered Earth-Fixed (ECEF) coordinates in meters, using
// the WGS-84-style ellipsoid constants below.
//
// Fixes relative to the original:
//  * std::sqrt is a host-only function and must not be used in device code;
//    replaced with sqrtf.
//  * The bare double literals (6378137, 2, 1, 180) and double M_PI silently
//    promoted the whole computation to double precision, which is very slow
//    on most GPUs; everything now stays in float.
// NOTE(review): the original uses `lambda` for latitude and `phi` for
// longitude (the reverse of the usual convention); that behavior is kept.
__host__ __device__ Coordinate Geoditic2ECEF(GPS gps)
{
    const float a = 6378137.0f;          // semi-major axis (m)
    const float b = 6356752.0f;          // semi-minor axis (m)
    const float f = (a - b) / a;         // flattening
    const float e_sq = f * (2.0f - f);   // first eccentricity squared
    const float deg2rad = 3.14159265358979323846f / 180.0f;

    const float lambda = gps.latitude * deg2rad;
    const float phi = gps.longtitude * deg2rad;
    const float sinLambda = sinf(lambda);
    const float cosLambda = cosf(lambda);

    // Prime-vertical radius of curvature.
    const float N = a / sqrtf(1.0f - e_sq * sinLambda * sinLambda);

    Coordinate result;
    result.x = (gps.height + N) * cosLambda * cosf(phi);
    result.y = (gps.height + N) * cosLambda * sinf(phi);
    result.z = (gps.height + (1.0f - e_sq) * N) * sinLambda;
    return result;
}
// Computes the NED (presumably North-East-Down -- TODO confirm) line-of-sight
// vector for the sub-pixel at `imgPos`: the camera-frame direction is first
// normalized, then rotated through (Re2i_missile^T * Ri2b_missile_cur) *
// Rb2c_cur via the mul3x3* helpers.
//
// All pointer parameters are marked __restrict__ so the compiler knows they
// do not alias each other and can keep loaded matrix elements in registers
// instead of re-reading global memory. (They are left non-const only because
// the mul3x3* helper signatures are not visible here; if those accept
// const pointers, the three inputs should also be const.)
//
// NOTE(review): tmp1 = (Re2i_missile^T * Ri2b_missile_cur) * Rb2c_cur does
// not depend on imgPos, so it is identical for every thread in the launch.
// Computing this 3x3 product once (on the host or in a tiny setup kernel)
// and passing it in would eliminate two matrix multiplies per thread.
__device__ void calcNED(float * __restrict__ NED, float2 imgPos,
                        float * __restrict__ Rb2c_cur,
                        float * __restrict__ Ri2b_missile_cur,
                        float * __restrict__ Re2i_missile,
                        float fov_pixel)
{
    // Unit direction vector in the camera frame.
    float Ldonic[3];
    float normTmp = sqrtf(imgPos.x * imgPos.x + imgPos.y * imgPos.y + fov_pixel * fov_pixel);
    Ldonic[0] = imgPos.x / normTmp;
    Ldonic[1] = imgPos.y / normTmp;
    Ldonic[2] = fov_pixel / normTmp;

    // Compose the rotations, then apply them to the direction vector.
    float tmp[9];
    float tmp1[9];
    mul3x3TransposeFirst(&tmp[0], Re2i_missile, Ri2b_missile_cur);
    mul3x3(&tmp1[0], &tmp[0], Rb2c_cur);
    mul3x3ToVec3x1(&NED[0], &tmp1[0], &Ldonic[0]);
}
// Kernel: for every (sub-pixel, ship face) pair, computes the ray distance
// either to the ship face (if the sub-pixel projects inside it) or to the
// ocean, writing one RayInfo per pair into `distance`.
//
// Launch layout: 2D grid -- x indexes sub-pixels of the current batch,
// y indexes ship faces. Results are laid out face-major in `distance`.
// Assumes a width x height (e.g. 640x480) image subdivided into
// PIXEL_GRID_SIZE x PIXEL_GRID_SIZE sub-pixels per pixel, processed in
// batches of `batch_size` pixels starting at pixel `offset * batch_size`.
// NOTE(review): `ship_gps` is never read in this kernel body -- confirm it
// is needed, or drop the parameter.
__global__ void cudaCalcDistance(RayInfo *distance,
GPS *ship_gps,
GPS3 * ship_surface_gps,
float6 * ship_surface_imgPos,
ObjStatus* missile_cur,
float * Rb2c_cur, float *Ri2b_missile_cur,
float * Re2i_missile, float fov_pixel,
int num_faces, int num_partialPix, int offset,
int batch_size, int width, int height)
{
// Flat sub-pixel index (x) and face index (y) for this thread.
int partialPixIdx = threadIdx.x + IMUL(blockIdx.x, blockDim.x);
int faceIdx = threadIdx.y + IMUL(blockIdx.y, blockDim.y);
// index of result array to be calculated(float* distance)
int resultIdx = faceIdx * batch_size * PIXEL_GRID_SIZE * PIXEL_GRID_SIZE + partialPixIdx;
// pixel index in the output image(640 * 480)
int pixIdx = offset * batch_size + partialPixIdx / (PIXEL_GRID_SIZE * PIXEL_GRID_SIZE);
// row, col index of current thread in the output width * height (640 * 480) image
int row = pixIdx / width;
int col = pixIdx % width;
// index of subpixel inside the subgrid for each pixel
int partialPixIdx_X = partialPixIdx / (batch_size * PIXEL_GRID_SIZE);
int partialPixIdx_Y = partialPixIdx % (batch_size * PIXEL_GRID_SIZE);
// Bounds guard: the grid may overshoot num_faces / num_partialPix.
if(faceIdx < num_faces && partialPixIdx < num_partialPix)
{
Coordinate missile_pos = Geoditic2ECEF(missile_cur->gps);
// Sub-pixel image coordinates centered on the image midpoint.
// NOTE(review): `(float)(partialPixIdx_X - 0.5)` subtracts in double and
// then casts; presumably `((float)partialPixIdx_X - 0.5f)` was intended
// (same value here, but the double literal forces a double op) -- confirm.
float u = col + (float)(partialPixIdx_X - 0.5)/ PIXEL_GRID_SIZE - width / 2 - 0.5;
float w = row + (float)(partialPixIdx_Y - 0.5)/ PIXEL_GRID_SIZE - height / 2 - 0.5;
float2 imgPos;
imgPos.x = u;
imgPos.y = w;
// TODO: The following code performs a uncoalescing memory access
// It is not easy to avoid since it require random data access pattern
// A possible solution is to reorganize input data
// Load the three projected vertices and three GPS vertices of this face.
float2 surface_imgPos[3];
surface_imgPos[0] = ship_surface_imgPos[faceIdx].x;
surface_imgPos[1] = ship_surface_imgPos[faceIdx].y;
surface_imgPos[2] = ship_surface_imgPos[faceIdx].z;
GPS surface_gps[3];
surface_gps[0] = ship_surface_gps[faceIdx].x;
surface_gps[1] = ship_surface_gps[faceIdx].y;
surface_gps[2] = ship_surface_gps[faceIdx].z;
// Line-of-sight vector for this sub-pixel. NOTE(review): the rotation
// product inside calcNED is the same for every thread; hoisting it out
// of the kernel would remove most of its cost.
float NED[3];
calcNED(&NED[0], imgPos, Rb2c_cur, Ri2b_missile_cur, Re2i_missile, fov_pixel);
// Ray hits the ship face if the sub-pixel lies inside its projection;
// otherwise it hits the ocean.
if(isInsideSurface(&surface_imgPos[0], 3, imgPos))
{
distance[resultIdx] = calcDistanceToFace(&NED[0], &surface_imgPos[0], imgPos,
missile_pos, &surface_gps[0]);
}
else
{
distance[resultIdx] = calcDistanceToOcean(&NED[0], imgPos, missile_pos);
}
}
}
Is it the fact that I used lots of temporary variables that caused such poor performance?
No. If you use unnecessary temporaries, the compiler will optimize them away (unless their use introduces side-effects).
std::sqrt()
That shouldn't work in device-side code...
__host__ __device__
Functions are compiled differently for the host and the device - and different libraries/intrinsics are available to them. So your problem is about the __device__
version of this function.
float *NED, ... float * Rb2c_cur
etc.
You should mark all of your input-only buffers as const __restrict__
and your output or input-output as just __restrict__
. Not doing so forces the compiler to repeatedly re-read memory, since it cannot prove the buffers are unmodified, and prevents it from caching their values.
For some more discussion of restrict
here on SO, see:
-
What can human beings make out of the restrict qualifier?
-
Function call with pointer to non-const and pointer to const arguments of same address