Creating multiple temporary variable inside global and device CUDA function seem resulting in poor performance

I am doing a project that requires implementing many small 3x3 matrix multiplications. To better organize my code, I wrote many __device__ functions to split such small operations. However, when I use those __device__ functions(calcNED, Geoditic2ECEF) in my main __global__ function cudaCalcDistance, it performs insanely slow. calcNED takes 500ms to execute. Is the fact that I used lots of temporary variables that caused such performance. If it is the case, is there a solution to avoid using many temporary variables while the code remains clear?

struct GPS {
    float latitude;
    float longtitude;
    float height;
};

struct Coordinate
{
    __host__ __device__ Coordinate(float x_ = 0, float y_ = 0, float z_= 0)
    {
        x = x_;
        y = y_;
        z = z_;
    }
    __host__ __device__ float norm()
    {
        return sqrtf(x * x + y * y + z * z);
    }

    float x;
    float y;
    float z;
};

__host__ __device__ Coordinate Geoditic2ECEF(GPS gps)
{
    Coordinate result;

    float a = 6378137;
    float b = 6356752;
    float f = (a - b) / a;
    float e_sq = f * (2 - f);

    float lambda = gps.latitude / 180 * M_PI;
    float phi = gps.longtitude / 180 * M_PI;

    float N = a / std::sqrt(1 - e_sq * sinf(lambda) * sinf(lambda));
    result.x = (gps.height + N) * cosf(lambda) * cosf(phi);
    result.y = (gps.height + N) * cosf(lambda) * sinf(phi);
    result.z = (gps.height + (1 - e_sq) * N) * sinf(lambda);
    return result;
}

__device__ void calcNED(float *NED, float2 imgPos,
                        float * Rb2c_cur, float *Ri2b_missile_cur,
                        float * Re2i_missile, float fov_pixel)
{
    float Ldonic[3];
    float normTmp = sqrtf(imgPos.x * imgPos.x + imgPos.y * imgPos.y + fov_pixel * fov_pixel);
    Ldonic[0] = imgPos.x /normTmp;
    Ldonic[1] = imgPos.y /normTmp;
    Ldonic[2] = fov_pixel / normTmp;

    float tmp[9];
    float tmp1[9];
    mul3x3TransposeFirst(&tmp[0], Re2i_missile, Ri2b_missile_cur);
    mul3x3(&tmp1[0], &tmp[0], Rb2c_cur);
    mul3x3ToVec3x1(&NED[0], &tmp1[0], &Ldonic[0]);
}

__global__ void cudaCalcDistance(RayInfo *distance,
                                 GPS *ship_gps,
                                 GPS3 * ship_surface_gps,
                                 float6 * ship_surface_imgPos,
                                 ObjStatus* missile_cur,
                                 float * Rb2c_cur, float *Ri2b_missile_cur,
                                 float * Re2i_missile, float fov_pixel,
                                 int num_faces, int num_partialPix, int offset,
                                 int batch_size, int width, int height)
{
    int partialPixIdx = threadIdx.x + IMUL(blockIdx.x, blockDim.x);
    int faceIdx = threadIdx.y + IMUL(blockIdx.y, blockDim.y);

    // index of result array to be calculated(float* distance)
    int resultIdx = faceIdx * batch_size * PIXEL_GRID_SIZE * PIXEL_GRID_SIZE + partialPixIdx;

    // pixel index in the output image(640 * 480)
    int pixIdx = offset * batch_size + partialPixIdx / (PIXEL_GRID_SIZE * PIXEL_GRID_SIZE);

    // row, col index of current thread in the output width * height (640 * 480) image
    int row = pixIdx / width;
    int col = pixIdx % width;

    // index of subpixel inside the subgrid for each pixel
    int partialPixIdx_X = partialPixIdx / (batch_size * PIXEL_GRID_SIZE);
    int partialPixIdx_Y = partialPixIdx % (batch_size * PIXEL_GRID_SIZE);

    if(faceIdx < num_faces && partialPixIdx < num_partialPix)
    {
        Coordinate missile_pos = Geoditic2ECEF(missile_cur->gps);
        float u = col + (float)(partialPixIdx_X - 0.5)/ PIXEL_GRID_SIZE - width / 2 - 0.5;
        float w = row + (float)(partialPixIdx_Y - 0.5)/ PIXEL_GRID_SIZE - height / 2 - 0.5;
        float2 imgPos;
        imgPos.x = u;
        imgPos.y = w;

        // TODO: The following code performs a uncoalescing memory access
        // It is not easy to avoid since it require random data access pattern
        // A possible solution is to reorganize input data
        float2 surface_imgPos[3];
        surface_imgPos[0] = ship_surface_imgPos[faceIdx].x;
        surface_imgPos[1] = ship_surface_imgPos[faceIdx].y;
        surface_imgPos[2] = ship_surface_imgPos[faceIdx].z;

        GPS surface_gps[3];
        surface_gps[0] = ship_surface_gps[faceIdx].x;
        surface_gps[1] = ship_surface_gps[faceIdx].y;
        surface_gps[2] = ship_surface_gps[faceIdx].z;

        float NED[3];
        calcNED(&NED[0], imgPos, Rb2c_cur, Ri2b_missile_cur, Re2i_missile, fov_pixel);

        if(isInsideSurface(&surface_imgPos[0], 3, imgPos))
        {
            distance[resultIdx] = calcDistanceToFace(&NED[0], &surface_imgPos[0], imgPos,
                    missile_pos, &surface_gps[0]);
        }
        else
        {
            distance[resultIdx] = calcDistanceToOcean(&NED[0], imgPos, missile_pos);
        }
    }
}

Is the fact that I used lots of temporary variables that caused such performance.

No. If you use unnecessary temporaries, the compiler will optimize them away (unless their use introduces side-effects).

std::sqrt()

That shouldn't work in device-side code...

__host__ __device__

Functions are compiled differently for the host and the device - and different libraries/intrinsics are available to them. So your problem is about the __device__ version of this function.

float *NED, ... float * Rb2c_cur etc.

You should mark all of your input-only buffers as const __restrict__ and your output or input-output as just __restrict__. Not doing so forces the compiler to perform a bunch of re-reading and avoid caching.

For some more discussion of restrict here on SO, see:

What can human beings make out of the restrict qualifier?
Function call with pointer to non-const and pointer to const arguments of same address

Creating multiple temporary variable inside global and device CUDA function seem resulting in poor performance

Related

Recent Posts

Creating multiple temporary variable inside __global__ and __device__ CUDA function seem resulting in poor performance

Related

Recent Posts

Creating multiple temporary variable inside global and device CUDA function seem resulting in poor performance