Stuck while optimizing a segmented prime sieve in C

Solution 1:

I added a simple benchmarking framework to test various approaches. It turns out the unrolling does not improve performance. The reason is probably that modern processors predict branches so a simple loop that runs many times can run at full speed, which is the case for small primes and the same simple loop that breaks immediately is also correctly predicted once primes become large enough.

Here is a simplified version with identical performance on my laptop:

static void sieve_bit_3(unsigned *f, unsigned st, unsigned *pp) {
    static const unsigned m[] = {
        ~(1u << 0),  ~(1u << 1),  ~(1u << 2),  ~(1u << 3),  ~(1u << 4),
        ~(1u << 5),  ~(1u << 6),  ~(1u << 7),  ~(1u << 8),  ~(1u << 9),
        ~(1u << 10), ~(1u << 11), ~(1u << 12), ~(1u << 13), ~(1u << 14),
        ~(1u << 15), ~(1u << 16), ~(1u << 17), ~(1u << 18), ~(1u << 19),
        ~(1u << 20), ~(1u << 21), ~(1u << 22), ~(1u << 23), ~(1u << 24),
        ~(1u << 25), ~(1u << 26), ~(1u << 27), ~(1u << 28), ~(1u << 29),
        ~(1u << 30), ~(1u << 31)
    };
    unsigned p = *pp;
    unsigned p2 = sq(p);
    do {
        unsigned n = (p2 > st + p * 2 ? p2 - st : p - st % p + (-(st % p & 1) & p)) / 2;
        for (; n < C * 8; n += p) {
            f[n / 32] &= m[n % 32];
        }
        p2 = sq(p = *++pp);
    } while (p2 < st + C * 16);
}

Here is a modified version with fewer tests, a little faster:

static void sieve_bit_6(unsigned *f, unsigned st, unsigned *pp) {
    static const unsigned m[] = {
        ~(1u << 0),  ~(1u << 1),  ~(1u << 2),  ~(1u << 3),  ~(1u << 4),
        ~(1u << 5),  ~(1u << 6),  ~(1u << 7),  ~(1u << 8),  ~(1u << 9),
        ~(1u << 10), ~(1u << 11), ~(1u << 12), ~(1u << 13), ~(1u << 14),
        ~(1u << 15), ~(1u << 16), ~(1u << 17), ~(1u << 18), ~(1u << 19),
        ~(1u << 20), ~(1u << 21), ~(1u << 22), ~(1u << 23), ~(1u << 24),
        ~(1u << 25), ~(1u << 26), ~(1u << 27), ~(1u << 28), ~(1u << 29),
        ~(1u << 30), ~(1u << 31)
    };
    unsigned p, p2;
    while (p = *pp++, (p2 = sq(p)) <= st + 2 * p) {
        unsigned mod = st % p;
        unsigned n = (p - mod + (-(mod & 1) & p)) / 2;
        for (; n < C * 8; n += p) {
            f[n / 32] &= m[n % 32];
        }
    }
    while (p2 < st + C * 16) {
        unsigned n = (p2 - st) / 2;
        for (; n < C * 8; n += p) {
            f[n / 32] &= m[n % 32];
        }
        p2 = sq(p = *pp++);
    }
}