Cache size estimation on your system?
-
you need direct access to memory
I am not meaning DMA transfer by this. Memory must be accessed by CPU of course (otherwise you are not measuring CACHEs) but as directly as it can be ... so measurements will probably not be very accurate on Windows/Linux because services and other processes can mess with caches during runtime. Measure many times and average for better results (or use the fastest time or filter it together). For best accuracy use DOS and asm for example
rep + movsb,movsw,movsd rep + stosb,stosw,stosd
so you measure the memory transfer and not something else like in your code !!!
-
measure the raw transfer times and plot a graph
-
x
axis is transfer block size -
y
axis is transfer speed
zones with the same transfer rate are consistent with appropriate CACHE layer
-
[Edit1] could not find my old source code for this so I busted something right now in C++ for windows:
Time measurement:
//---------------------------------------------------------------------------
double performance_Tms=-1.0, // perioda citaca [ms]
performance_tms= 0.0; // zmerany cas [ms]
//---------------------------------------------------------------------------
void tbeg()
{
LARGE_INTEGER i;
if (performance_Tms<=0.0) { QueryPerformanceFrequency(&i); performance_Tms=1000.0/double(i.QuadPart); }
QueryPerformanceCounter(&i); performance_tms=double(i.QuadPart);
}
//---------------------------------------------------------------------------
double tend()
{
LARGE_INTEGER i;
QueryPerformanceCounter(&i); performance_tms=double(i.QuadPart)-performance_tms; performance_tms*=performance_Tms;
return performance_tms;
}
//---------------------------------------------------------------------------
Benchmark (32bit app):
//---------------------------------------------------------------------------
DWORD sizes[]= // used transfer block sizes
{
1<<10, 2<<10, 3<<10, 4<<10, 5<<10, 6<<10, 7<<10, 8<<10, 9<<10,
10<<10, 11<<10, 12<<10, 13<<10, 14<<10, 15<<10, 16<<10, 17<<10, 18<<10,
19<<10, 20<<10, 21<<10, 22<<10, 23<<10, 24<<10, 25<<10, 26<<10, 27<<10,
28<<10, 29<<10, 30<<10, 31<<10, 32<<10, 48<<10, 64<<10, 80<<10, 96<<10,
112<<10,128<<10,192<<10,256<<10,320<<10,384<<10,448<<10,512<<10, 1<<20,
2<<20, 3<<20, 4<<20, 5<<20, 6<<20, 7<<20, 8<<20, 9<<20, 10<<20,
11<<20, 12<<20, 13<<20, 14<<20, 15<<20, 16<<20, 17<<20, 18<<20, 19<<20,
20<<20, 21<<20, 22<<20, 23<<20, 24<<20, 25<<20, 26<<20, 27<<20, 28<<20,
29<<20, 30<<20, 31<<20, 32<<20,
};
const int N=sizeof(sizes)>>2; // number of used sizes
double pmovsd[N]; // measured transfer rate rep MOVSD [MB/sec]
double pstosd[N]; // measured transfer rate rep STOSD [MB/sec]
//---------------------------------------------------------------------------
void measure()
{
int i;
BYTE *dat; // pointer to used memory
DWORD adr,siz,num; // local variables for asm
double t,t0;
HANDLE hnd; // process handle
// enable priority change (huge difference)
#define measure_priority
// enable critical sections (no difference)
// #define measure_lock
for (i=0;i<N;i++) pmovsd[i]=0.0;
for (i=0;i<N;i++) pstosd[i]=0.0;
dat=new BYTE[sizes[N-1]+4]; // last DWORD +4 Bytes (should be 3 but i like 4 more)
if (dat==NULL) return;
#ifdef measure_priority
hnd=GetCurrentProcess(); if (hnd!=NULL) { SetPriorityClass(hnd,REALTIME_PRIORITY_CLASS); CloseHandle(hnd); }
Sleep(200); // wait to change take effect
#endif
#ifdef measure_lock
CRITICAL_SECTION lock; // lock handle
InitializeCriticalSectionAndSpinCount(&lock,0x00000400);
EnterCriticalSection(&lock);
#endif
adr=(DWORD)(dat);
for (i=0;i<N;i++)
{
siz=sizes[i]; // siz = actual block size
num=(8<<20)/siz; // compute n (times to repeat the measurement)
if (num<4) num=4;
siz>>=2; // size / 4 because of 32bit transfer
// measure overhead
tbeg(); // start time meassurement
asm {
push esi
push edi
push ecx
push ebx
push eax
mov ebx,num
mov al,0
loop0: mov esi,adr
mov edi,adr
mov ecx,siz
// rep movsd // es,ds already set by C++
// rep stosd // es already set by C++
dec ebx
jnz loop0
pop eax
pop ebx
pop ecx
pop edi
pop esi
}
t0=tend(); // stop time meassurement
// measurement 1
tbeg(); // start time meassurement
asm {
push esi
push edi
push ecx
push ebx
push eax
mov ebx,num
mov al,0
loop1: mov esi,adr
mov edi,adr
mov ecx,siz
rep movsd // es,ds already set by C++
// rep stosd // es already set by C++
dec ebx
jnz loop1
pop eax
pop ebx
pop ecx
pop edi
pop esi
}
t=tend(); // stop time meassurement
t-=t0; if (t<1e-6) t=1e-6; // remove overhead and avoid division by zero
t=double(siz<<2)*double(num)/t; // Byte/ms
pmovsd[i]=t/(1.024*1024.0); // MByte/s
// measurement 2
tbeg(); // start time meassurement
asm {
push esi
push edi
push ecx
push ebx
push eax
mov ebx,num
mov al,0
loop2: mov esi,adr
mov edi,adr
mov ecx,siz
// rep movsd // es,ds already set by C++
rep stosd // es already set by C++
dec ebx
jnz loop2
pop eax
pop ebx
pop ecx
pop edi
pop esi
}
t=tend(); // stop time meassurement
t-=t0; if (t<1e-6) t=1e-6; // remove overhead and avoid division by zero
t=double(siz<<2)*double(num)/t; // Byte/ms
pstosd[i]=t/(1.024*1024.0); // MByte/s
}
#ifdef measure_lock
LeaveCriticalSection(&lock);
DeleteCriticalSection(&lock);
#endif
#ifdef measure_priority
hnd=GetCurrentProcess(); if (hnd!=NULL) { SetPriorityClass(hnd,NORMAL_PRIORITY_CLASS); CloseHandle(hnd); }
#endif
delete dat;
}
//---------------------------------------------------------------------------
Where arrays pmovsd[]
and pstosd[]
holds the measured 32bit
transfer rates [MByte/sec]
. You can configure the code by use/rem two defines at the start of measure function.
Graphical Output:
To maximize accuracy you can change process priority class to maximum. So create measure thread with max priority (I try it but it mess thing up actually) and add critical section to it so the test will not be uninterrupted by OS as often (no visible difference with and without threads). If you want to use Byte
transfers then take account that it uses only 16bit
registers so you need to add loop and address iterations.
PS.
If you try this on notebook then you should overheat the CPU to be sure that you measure on top CPU/Mem speed. So no Sleep
s. Some stupid loops before measurement will do it but they should run at least few seconds. Also you can synchronize this by CPU frequency measurement and loop while is rising. Stop after it saturates ...
asm instruction RDTSC
is best for this (but beware its meaning has slightly changed with new architectures).
If you are not under Windows then change functions tbeg,tend
to your OS equivalents
[edit2] further improvements of accuracy
Well after finally solving problem with VCL affecting measurement accuracy which I discover thanks to this question and more about it here, to improve accuracy you can prior to benchmark do this:
set process priority class to
realtime
-
set process affinity to single CPU
so you measure just single CPU on multi-core
flush DATA and Instruction CACHEs
For example:
// before mem benchmark
DWORD process_affinity_mask=0;
DWORD system_affinity_mask =0;
HANDLE hnd=GetCurrentProcess();
if (hnd!=NULL)
{
// priority
SetPriorityClass(hnd,REALTIME_PRIORITY_CLASS);
// affinity
GetProcessAffinityMask(hnd,&process_affinity_mask,&system_affinity_mask);
process_affinity_mask=1;
SetProcessAffinityMask(hnd,process_affinity_mask);
GetProcessAffinityMask(hnd,&process_affinity_mask,&system_affinity_mask);
}
// flush CACHEs
for (DWORD i=0;i<sizes[N-1];i+=7)
{
dat[i]+=i;
dat[i]*=i;
dat[i]&=i;
}
// after mem benchmark
if (hnd!=NULL)
{
SetPriorityClass(hnd,NORMAL_PRIORITY_CLASS);
SetProcessAffinityMask(hnd,system_affinity_mask);
}
So the more accurate measurement looks like this:
Your lengthMod
variable doesn't do what you think it does. You want it to limit the size of your data set, but you have 2 problems there -
- Doing a bitwise AND with a power of 2 would mask off all bits except the one that's on. If for e.g.
lengthMod
is 1k (0x400), then all indices lower than 0x400 (meaning i=1 to 63) would simply map to index 0, so you'll always hit the cache. That's probably why the results are so fast. Instead uselengthMod - 1
to create a correct mask (0x400 --> 0x3ff, which would mask just the upper bits and leave the lower ones intact). - Some of the values for
lengthMod
are not a power of 2, so doing thelengthMod-1
isn't going to work there as some of the mask bits would still be zeros. Either remove them from the list, or use a modulo operation instead oflengthMod-1
altogether. See also my answer here for a similar case.
Another issue is that 16B jumps are probably not enough to skip a cachline as most common CPUs work with 64 byte cachelines, so you get only one miss for every 4 iterations. Use (i*64)
instead.