c++ - openMP bad performance with false sharing -
I know there are already threads about OpenMP performance,
but my example here is very simple.
C code:
/*
 * Fill a freshly allocated array of szglobalworksize * 100 floats with 0.5f
 * using an OpenMP parallel loop, and print the elapsed wall-clock time of
 * the fill (measured with gettimeofday) as "<seconds> time omp".
 *
 * Returns 0 on success, -1 if the element count would overflow size_t or
 * the allocation fails.
 */
int mafunc(size_t szglobalworksize)
{
    /* Guard the "* 100" below; the original multiplied after casting to int. */
    if (szglobalworksize > ((size_t)-1) / 100) {
        return -1;
    }
    const size_t count = szglobalworksize * 100;

    /* The cast is kept so the snippet also builds as C++. */
    float *pfresult = (float *)calloc(count, sizeof *pfresult);
    if (pfresult == NULL && count != 0) {
        return -1;
    }
    const float fvalue = 0.5f;

    struct timeval tim;
    gettimeofday(&tim, NULL);
    double tlaunch1 = tim.tv_sec + (tim.tv_usec / 1000000.0);

    /*
     * "parallel for" (not just "parallel") is required so the iterations are
     * split between the threads instead of every thread running the whole
     * loop. An unsigned loop index needs OpenMP 3.0 or newer.
     */
#pragma omp parallel for
    for (size_t igid = 0; igid < count; igid++) {
        pfresult[igid] = fvalue;
    }

    gettimeofday(&tim, NULL);
    double tlaunch2 = tim.tv_sec + (tim.tv_usec / 1000000.0);
    printf("%.6lf time omp\n", tlaunch2 - tlaunch1);

    /*
     * Read one element through a volatile so the optimizer cannot treat the
     * stores above as dead now that the buffer is freed right after.
     */
    if (count != 0) {
        volatile float sink = pfresult[count - 1];
        (void)sink;
    }
    free(pfresult);
    return 0;
}
The run time of this example increases when I use OpenMP: 0.015 s without OpenMP versus 0.045 s with OpenMP (szglobalworksize = 131072).
I use this gcc command line: gcc -march=native -fopenmp -O3 mycode.c -lm
gcc (gcc) 4.8.2 20140120 (red hat 4.8.2-15)
edit1:
int myfunc2() { int igid = 0; int j = 0; //float *pfresult = (float *)calloc(szglobalworksize * 100, sizeof(float)); float *pfresult = (float *)valloc(szglobalworksize * 100* sizeof(float)); float fvalue = 0.5f; struct timeval tim; gettimeofday(&tim, null); double tlaunch1=tim.tv_sec+(tim.tv_usec/1000000.0); double time = omp_get_wtime(); int ichunk = getpagesize(); int isize = ((int)szglobalworksize * 100) / ichunk; // #pragma omp parallel #pragma omp parallel (igid = 0; igid < isize; igid++) { (j = 0; j < ichunk; j++) { pfresult[igid * ichunk + j] = fvalue; //pfresult[igid] = fvalue; } // printf("element %d traité par le thread %d \n",igid,omp_get_thread_num()); } time = omp_get_wtime() - time; gettimeofday(&tim, null); double tlaunch2=tim.tv_sec+(tim.tv_usec/1000000.0); printf("%.6lf time omp\n", tlaunch2-tlaunch1); printf("pagesize=%d\n", getpagesize()); printf("%.6lf time omp2\n", time); }
I also get the same time when each iteration fills a page-sized chunk, and also with memalign.
Edit 2: timing each thread
#pragma omp parallel private(dlocaltime) { pdtime[omp_get_thread_num()] = omp_get_wtime(); printf("thread begin %d time %f\n", omp_get_thread_num(), pdtime[omp_get_thread_num()] ); #pragma omp (igid = 0; igid < isize; igid++) { // (j = 0; j < ichunk; j++) { // pfresult[igid * ichunk + j] = fvalue; pfresult[igid] = fvalue; } } //dlocaltime = (omp_get_wtime() - dlocaltime); pdtime[omp_get_thread_num()] = (omp_get_wtime() - pdtime[omp_get_thread_num()]); printf("thread end %d time %f\n", omp_get_thread_num(), pdtime[omp_get_thread_num()]); // printf("end element %d traité par le thread %d \n",0,tid); }
Each thread takes 0.015 s, while the total is 0.045 s, so there seems to be a fixed OpenMP cost of about 0.03 s. This is strange: even for huge sizes I see this fixed part of OpenMP, and a thread that has less work takes the same time as filling the whole array (48 threads here).
thanks
OK, since you insist... :)
With the number of threads fixed and a thread warm-up:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <omp.h>
#include <unistd.h>

/*
 * Benchmark: fill a valloc()'ed array of 131072 * 100 floats with 0.5f in
 * chunks of getpagesize() elements per parallel iteration, and print the
 * elapsed time measured with gettimeofday() ("time1") and with
 * omp_get_wtime() ("time2").
 *
 * Build with -Dwarmup to run an empty parallel region first, so that the
 * thread team is created before the timed section starts.
 */
int main()
{
    int szglobalworksize = 131072;

    /* Disable dynamic adjustment of the number of threads. */
    omp_set_dynamic(0);

    /* warmup */
#if warmup
#pragma omp parallel
    {
#pragma omp master
        {
            printf("%d threads\n", omp_get_num_threads());
        }
    }
#endif

    printf("pagesize=%d\n", getpagesize());

    float *pfresult = (float *)valloc((size_t)szglobalworksize * 100 * sizeof(float));
    if (pfresult == NULL) {
        perror("valloc");
        return 1;
    }
    const float fvalue = 0.5f;

    struct timeval tim;
    gettimeofday(&tim, NULL);
    double tlaunch1 = tim.tv_sec + (tim.tv_usec / 1000000.0);
    double time = omp_get_wtime();

    /*
     * ichunk is the page size in bytes but is used as an element count, so
     * one chunk covers ichunk * sizeof(float) bytes.
     */
    int ichunk = getpagesize();
    int isize = ((int)szglobalworksize * 100) / ichunk;

    /*
     * The inner index is declared inside the loop so every thread has its
     * own copy; declared outside the parallel region it would be shared and
     * the threads would race on it.
     */
#pragma omp parallel for
    for (int igid = 0; igid < isize; igid++) {
        for (int j = 0; j < ichunk; j++) {
            pfresult[igid * ichunk + j] = fvalue;
        }
    }

    time = omp_get_wtime() - time;
    gettimeofday(&tim, NULL);
    double tlaunch2 = tim.tv_sec + (tim.tv_usec / 1000000.0);

    printf("%.6lf time1\n", tlaunch2 - tlaunch1);
    printf("%.6lf time2\n", time);

    /*
     * Read one element through a volatile so the optimizer cannot treat the
     * stores above as dead now that the buffer is freed right after.
     */
    if (isize > 0) {
        volatile float sink = pfresult[0];
        (void)sink;
    }
    free(pfresult);
    return 0;
}
I've got the following numbers on my machine:
$ g++ -O2 -fopenmp testomp.cpp && OMP_NUM_THREADS=1 ./a.out
pagesize=4096
0.036493 time1
0.036489 time2
$ g++ -O2 -fopenmp testomp.cpp && ./a.out
pagesize=4096
0.034721 time1
0.034718 time2
$ g++ -O2 -fopenmp testomp.cpp -Dwarmup && ./a.out
24 threads
pagesize=4096
0.026966 time1
0.026963 time2
As you can see, thread creation time contributes a lot to these numbers.
Why does it still not scale? Well, it is an extremely memory-bound workload. Actually, it fills the pages twice: once when the OS clears them on first touch, and once when the program fills them with the value. It seems there is not enough memory bandwidth in the system. I would not expect false sharing to play a significant role here, since `parallel for`
by default uses a static schedule that does not interleave iterations between threads, so false sharing is possible only once, at the chunk boundaries.
Comments
Post a Comment