Creating multiple temporary variables inside __global__ and __device__ CUDA functions seems to result in poor performance

I am working on a project that requires implementing many small 3x3 matrix multiplications. To better organize my code, I wrote many __device__ functions to split up these small operations. However, when I use those __device__ functions (calcNED, Geoditic2ECEF) in my main __global__ function cudaCalcDistance, it performs insanely slowly; calcNED alone takes 500 ms to execute. Is it the fact that I used lots of temporary variables that caused such poor performance? If so, is there a way to avoid using many temporary variables while keeping the code clear?

struct GPS {
    float latitude;
    float longtitude;
    float height;
};

struct Coordinate
{
    __host__ __device__ Coordinate(float x_ = 0, float y_ = 0, float z_ = 0)
    {
        x = x_;
        y = y_;
        z = z_;
    }
    __host__ __device__ float norm()
    {
        return sqrtf(x * x + y * y + z * z);
    }

    float x;
    float y;
    float z;
};

__host__ __device__ Coordinate Geoditic2ECEF(GPS gps)
{
    Coordinate result;

    float a = 6378137;
    float b = 6356752;
    float f = (a - b) / a;
    float e_sq = f * (2 - f);

    float lambda = gps.latitude / 180 * M_PI;
    float phi = gps.longtitude / 180 * M_PI;

    float N = a / std::sqrt(1 - e_sq * sinf(lambda) * sinf(lambda));
    result.x = (gps.height + N) * cosf(lambda) * cosf(phi);
    result.y = (gps.height + N) * cosf(lambda) * sinf(phi);
    result.z = (gps.height + (1 - e_sq) * N) * sinf(lambda);
    return result;
}

__device__ void calcNED(float *NED, float2 imgPos,
                        float *Rb2c_cur, float *Ri2b_missile_cur,
                        float *Re2i_missile, float fov_pixel)
{
    float Ldonic[3];
    float normTmp = sqrtf(imgPos.x * imgPos.x + imgPos.y * imgPos.y + fov_pixel * fov_pixel);
    Ldonic[0] = imgPos.x / normTmp;
    Ldonic[1] = imgPos.y / normTmp;
    Ldonic[2] = fov_pixel / normTmp;

    float tmp[9];
    float tmp1[9];
    mul3x3TransposeFirst(&tmp[0], Re2i_missile, Ri2b_missile_cur);
    mul3x3(&tmp1[0], &tmp[0], Rb2c_cur);
    mul3x3ToVec3x1(&NED[0], &tmp1[0], &Ldonic[0]);
}

__global__ void cudaCalcDistance(RayInfo *distance,
                                 GPS *ship_gps,
                                 GPS3 *ship_surface_gps,
                                 float6 *ship_surface_imgPos,
                                 ObjStatus *missile_cur,
                                 float *Rb2c_cur, float *Ri2b_missile_cur,
                                 float *Re2i_missile, float fov_pixel,
                                 int num_faces, int num_partialPix, int offset,
                                 int batch_size, int width, int height)
{
    int partialPixIdx = threadIdx.x + IMUL(blockIdx.x, blockDim.x);
    int faceIdx = threadIdx.y + IMUL(blockIdx.y, blockDim.y);

    // index into the result array to be calculated (float *distance)
    int resultIdx = faceIdx * batch_size * PIXEL_GRID_SIZE * PIXEL_GRID_SIZE + partialPixIdx;

    // pixel index in the output image (640 * 480)
    int pixIdx = offset * batch_size + partialPixIdx / (PIXEL_GRID_SIZE * PIXEL_GRID_SIZE);

    // row, col index of current thread in the output width * height (640 * 480) image
    int row = pixIdx / width;
    int col = pixIdx % width;

    // index of subpixel inside the subgrid for each pixel
    int partialPixIdx_X = partialPixIdx / (batch_size * PIXEL_GRID_SIZE);
    int partialPixIdx_Y = partialPixIdx % (batch_size * PIXEL_GRID_SIZE);

    if(faceIdx < num_faces && partialPixIdx < num_partialPix)
    {
        Coordinate missile_pos = Geoditic2ECEF(missile_cur->gps);
        float u = col + (float)(partialPixIdx_X - 0.5) / PIXEL_GRID_SIZE - width / 2 - 0.5;
        float w = row + (float)(partialPixIdx_Y - 0.5) / PIXEL_GRID_SIZE - height / 2 - 0.5;
        float2 imgPos;
        imgPos.x = u;
        imgPos.y = w;

        // TODO: the following code performs an uncoalesced memory access.
        // It is not easy to avoid, since it requires a random data access pattern.
        // A possible solution is to reorganize the input data.
        float2 surface_imgPos[3];
        surface_imgPos[0] = ship_surface_imgPos[faceIdx].x;
        surface_imgPos[1] = ship_surface_imgPos[faceIdx].y;
        surface_imgPos[2] = ship_surface_imgPos[faceIdx].z;

        GPS surface_gps[3];
        surface_gps[0] = ship_surface_gps[faceIdx].x;
        surface_gps[1] = ship_surface_gps[faceIdx].y;
        surface_gps[2] = ship_surface_gps[faceIdx].z;

        float NED[3];
        calcNED(&NED[0], imgPos, Rb2c_cur, Ri2b_missile_cur, Re2i_missile, fov_pixel);

        if(isInsideSurface(&surface_imgPos[0], 3, imgPos))
        {
            distance[resultIdx] = calcDistanceToFace(&NED[0], &surface_imgPos[0], imgPos,
                    missile_pos, &surface_gps[0]);
        }
        else
        {
            distance[resultIdx] = calcDistanceToOcean(&NED[0], imgPos, missile_pos);
        }
    }
}

Is it the fact that I used lots of temporary variables that caused such poor performance?

No. If you use unnecessary temporaries, the compiler will optimize them away (unless their use introduces side-effects).
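As a minimal illustration (both function names below are made up for this example), the verbose version compiles to essentially the same machine code as the terse one, because the temporaries live in registers and the redundant ones are eliminated:

__device__ float dot3_terse(const float *a, const float *b)
{
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}

__device__ float dot3_verbose(const float *a, const float *b)
{
    // all of these temporaries live in registers and get folded away
    float ax = a[0], ay = a[1], az = a[2];
    float bx = b[0], by = b[1], bz = b[2];
    float xx = ax * bx;
    float yy = ay * by;
    float zz = az * bz;
    return xx + yy + zz;
}

You can check this for your own functions by comparing the generated SASS of both variants with cuobjdump -sass.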

std::sqrt()

That shouldn't work in device-side code - the C++ standard library is generally not available there.
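In device code, use the single-precision CUDA math function instead. A one-line sketch of the fix, matching the sinf()/cosf() calls you already use in that function:

float N = a / sqrtf(1 - e_sq * sinf(lambda) * sinf(lambda));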

__host__ __device__

Functions are compiled differently for the host and the device - and different libraries/intrinsics are available to them. So your problem is about the __device__ version of this function.
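When the host and device paths genuinely need different code, the usual pattern is to branch on __CUDA_ARCH__, which is defined only during device compilation. A sketch (mySqrt is a made-up name):

#include <cmath>

__host__ __device__ inline float mySqrt(float x)
{
#ifdef __CUDA_ARCH__
    return sqrtf(x);      // device path: CUDA math function
#else
    return std::sqrt(x);  // host path: standard library
#endif
}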

float *NED, ... float *Rb2c_cur etc.

You should mark all of your input-only buffers as const __restrict__, and your output or input-output buffers as just __restrict__. Not doing so forces the compiler to re-read memory repeatedly and prevents it from caching loaded values, since it must assume the pointers may alias each other.
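As a sketch, the calcNED signature could look like this with the qualifiers applied (assuming NED is the only output and the three matrices are read-only, which appears to be the case):

__device__ void calcNED(float *__restrict__ NED, float2 imgPos,
                        const float *__restrict__ Rb2c_cur,
                        const float *__restrict__ Ri2b_missile_cur,
                        const float *__restrict__ Re2i_missile,
                        float fov_pixel);

With these qualifiers the compiler may load each matrix element once and keep it in a register, and const __restrict__ pointers can additionally be routed through the read-only data cache (the same effect as __ldg()).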

For some more discussion of restrict here on SO, see:

  • What can human beings make out of the restrict qualifier?

  • Function call with pointer to non-const and pointer to const arguments of same address