Loop unrolling to achieve maximum throughput with Ivy Bridge and Haswell

For Sandy/Ivy Bridge you need to unroll by 3:

  • Only FP Add has dependency on the previous iteration of the loop
  • FP Add can issue every cycle
  • FP Add takes three cycles to complete
  • Thus unrolling by 3/1 = 3 completely hides the latency
  • FP Mul and FP Load do not have a dependency on the previous iteration and you can rely on the OoO core to issue them in the near-optimal order. These instructions could affect the unroll factor only if they lowered the throughput of FP Add (not the case here, FP Load + FP Add + FP Mul can issue every cycle).

For Haswell you need to unroll by 10:

  • Only FMA has dependency on the previous iteration of the loop
  • FMA can double-issue every cycle (i.e. on average independent instructions take 0.5 cycles)
  • FMA has latency of 5
  • Thus unrolling by 5/0.5 = 10 completely hides FMA latency
  • The two FP Load microoperations do not have a dependency on the previous iteration, and can co-issue with 2x FMA, so they don't affect the unroll factor.

I'm only answering my own question here to add information.

I went ahead and profiled the Ivy Bridge code. When I first tested this in MSVC2012 unrolling by more than two did not help much. However, I suspected that MSVC did not implement the intrinsics optimally based on my observation at Difference in performance between MSVC and GCC for highly optimized matrix multplication code. So I compiled the kernel in GCC with g++ -c -mavx -O3 -mabi=ms, converted the object to COFF64 and dropped it into MSVC and I now get that unrolling by three gives the best result confirming Marat Dunkhan's answer.

Here are the times in seconds, Xeon E5 1620 @3.6GHz MSVC2012

unroll    time default            time with GCC kernel
     1    3.7                     3.2
     2    1.8 (2.0x faster)       1.6 (2.0x faster)
     3    1.6 (2.3x faster)       1.2 (2.7x faster)
     4    1.6 (2.3x faster)       1.2 (2.7x faster)

Here are the times on i5-4250U using fma with GCC in Linux (g++ -mavx -mfma -fopenmp -O3 main.cpp kernel_fma.cpp -o sum_fma)

unroll    time
     1    20.3
     2    10.2 (2.0x faster)
     3     6.7 (3.0x faster) 
     4     5.2 (4.0x faster)
     8     2.9 (7.0x faster)
    10     2.6 (7.8x faster)

The code below is for Sandy-Bridge/Ivy Bridge. For Haswell use e.g. tmp0 = _mm256_fmadd_ps(a8,b8_1,tmp0) instead.

kernel.cpp

#include <immintrin.h>

extern "C" void foo_unroll1(const int n, const float *b, float *c) {      
    __m256 tmp0 = _mm256_set1_ps(0.0f);
    __m256 a8 = _mm256_set1_ps(1.0f);
    for(int i=0; i<n; i+=8) {
        __m256 b8 = _mm256_loadu_ps(&b[i + 0]);
        tmp0 = _mm256_add_ps(_mm256_mul_ps(a8,b8), tmp0);
    }
    _mm256_storeu_ps(c, tmp0);
}

extern "C" void foo_unroll2(const int n, const float *b, float *c) {
    __m256 tmp0 = _mm256_set1_ps(0.0f);
    __m256 tmp1 = _mm256_set1_ps(0.0f);
    __m256 a8 = _mm256_set1_ps(1.0f);
    for(int i=0; i<n; i+=16) {
        __m256 b8_1 = _mm256_loadu_ps(&b[i + 0]);
        tmp0 = _mm256_add_ps(_mm256_mul_ps(a8,b8_1), tmp0);
        __m256 b8_2 = _mm256_loadu_ps(&b[i + 8]);
        tmp1 = _mm256_add_ps(_mm256_mul_ps(a8,b8_2), tmp1);
    }
    tmp0 = _mm256_add_ps(tmp0,tmp1);
    _mm256_storeu_ps(c, tmp0);
}

extern "C" void foo_unroll3(const int n, const float *b, float *c) { 
    __m256 tmp0 = _mm256_set1_ps(0.0f);
    __m256 tmp1 = _mm256_set1_ps(0.0f);
    __m256 tmp2 = _mm256_set1_ps(0.0f);
    __m256 a8 = _mm256_set1_ps(1.0f);
    for(int i=0; i<n; i+=24) {
        __m256 b8_1 = _mm256_loadu_ps(&b[i + 0]);
        tmp0 = _mm256_add_ps(_mm256_mul_ps(a8,b8_1), tmp0);
        __m256 b8_2 = _mm256_loadu_ps(&b[i + 8]);
        tmp1 = _mm256_add_ps(_mm256_mul_ps(a8,b8_2), tmp1);
        __m256 b8_3 = _mm256_loadu_ps(&b[i + 16]);
        tmp2 = _mm256_add_ps(_mm256_mul_ps(a8,b8_3), tmp2);
    }
    tmp0 = _mm256_add_ps(tmp0,_mm256_add_ps(tmp1,tmp2));
    _mm256_storeu_ps(c, tmp0);
}

extern "C" void foo_unroll4(const int n, const float *b, float *c) {      
    __m256 tmp0 = _mm256_set1_ps(0.0f);
    __m256 tmp1 = _mm256_set1_ps(0.0f);
    __m256 tmp2 = _mm256_set1_ps(0.0f);
    __m256 tmp3 = _mm256_set1_ps(0.0f);
    __m256 a8 = _mm256_set1_ps(1.0f);
    for(int i=0; i<n; i+=32) {
        __m256 b8_1 = _mm256_loadu_ps(&b[i + 0]);
        tmp0 = _mm256_add_ps(_mm256_mul_ps(a8,b8_1), tmp0);
        __m256 b8_2 = _mm256_loadu_ps(&b[i + 8]);
        tmp1 = _mm256_add_ps(_mm256_mul_ps(a8,b8_2), tmp1);
        __m256 b8_3 = _mm256_loadu_ps(&b[i + 16]);
        tmp2 = _mm256_add_ps(_mm256_mul_ps(a8,b8_3), tmp2);
        __m256 b8_4 = _mm256_loadu_ps(&b[i + 24]);
        tmp3 = _mm256_add_ps(_mm256_mul_ps(a8,b8_4), tmp3);
    }
    tmp0 = _mm256_add_ps(_mm256_add_ps(tmp0,tmp1),_mm256_add_ps(tmp2,tmp3));
    _mm256_storeu_ps(c, tmp0);
}

main.cpp

#include <stdio.h>
#include <omp.h>
#include <immintrin.h>

extern "C" void foo_unroll1(const int n, const float *b, float *c);
extern "C" void foo_unroll2(const int n, const float *b, float *c);
extern "C" void foo_unroll3(const int n, const float *b, float *c);
extern "C" void foo_unroll4(const int n, const float *b, float *c);

int main() {
    const int n = 3*1<<10;
    const int r = 10000000;
    double dtime;
    float *b = (float*)_mm_malloc(sizeof(float)*n, 64);
    float *c = (float*)_mm_malloc(8, 64);
    for(int i=0; i<n; i++) b[i] = 1.0f;

    __m256 out;
    dtime = omp_get_wtime();    
    for(int i=0; i<r; i++) foo_unroll1(n, b, c);
    dtime = omp_get_wtime() - dtime;
    printf("%f, ", dtime); for(int i=0; i<8; i++) printf("%f ", c[i]); printf("\n");

    dtime = omp_get_wtime();    
    for(int i=0; i<r; i++) foo_unroll2(n, b, c);
    dtime = omp_get_wtime() - dtime;
    printf("%f, ", dtime); for(int i=0; i<8; i++) printf("%f ", c[i]); printf("\n");

    dtime = omp_get_wtime();    
    for(int i=0; i<r; i++) foo_unroll3(n, b, c);
    dtime = omp_get_wtime() - dtime;
    printf("%f, ", dtime); for(int i=0; i<8; i++) printf("%f ", c[i]); printf("\n");

    dtime = omp_get_wtime();    
    for(int i=0; i<r; i++) foo_unroll4(n, b, c);
    dtime = omp_get_wtime() - dtime;
    printf("%f, ", dtime); for(int i=0; i<8; i++) printf("%f ", c[i]); printf("\n");
}