c++ - openMP bad performance with false sharing -


I know there is already a thread about OpenMP performance,

but here my example is very simple.

C code:

/*
 * Benchmark: allocate szglobalworksize * 100 floats, fill them with a
 * constant inside an OpenMP work-shared loop, and print the elapsed
 * wall-clock time as "<seconds> time omp" on stdout.
 *
 * Returns 0 on success, or -1 when the element count does not fit in an
 * int or the allocation fails.
 */
int mafunc(size_t szglobalworksize)
{
    int igid = 0;
    int icount;
    float *pfresult;
    const float fvalue = 0.5f;
    struct timeval tim;
    double tlaunch1;
    double tlaunch2;

    /* The loop index is an int, so reject sizes whose element count would
     * overflow it instead of silently wrapping. */
    if (szglobalworksize > (size_t)INT_MAX / 100) {
        return -1;
    }
    icount = (int)(szglobalworksize * 100);

    /* Request at least one element so a NULL result always means failure.
     * The cast is kept so the snippet also builds as C++. */
    pfresult = (float *)calloc(icount > 0 ? (size_t)icount : 1, sizeof *pfresult);
    if (pfresult == NULL) {
        return -1;
    }

    gettimeofday(&tim, NULL);
    tlaunch1 = tim.tv_sec + (tim.tv_usec / 1000000.0);

    /* "parallel for" splits the iterations among the threads of the team;
     * the loop variable igid is made private to each thread by OpenMP. */
    #pragma omp parallel for
    for (igid = 0; igid < icount; igid++) {
        pfresult[igid] = fvalue;
        /* Debug aid: print igid and omp_get_thread_num() here to see which
         * thread handled which element. */
    }

    gettimeofday(&tim, NULL);
    tlaunch2 = tim.tv_sec + (tim.tv_usec / 1000000.0);
    printf("%.6lf time omp\n", tlaunch2 - tlaunch1);

    free(pfresult);
    return 0;
}

The timing of this example increases when I use OpenMP: 0.015 s without OpenMP against 0.045 s with OpenMP (szglobalworksize = 131072).

I use this gcc command line: gcc -march=native -fopenmp -O3 mycode.c -lm

gcc (GCC) 4.8.2 20140120 (Red Hat 4.8.2-15)

Edit 1:

int myfunc2() {         int igid = 0;         int j = 0;         //float *pfresult = (float *)calloc(szglobalworksize * 100, sizeof(float));         float *pfresult = (float *)valloc(szglobalworksize * 100* sizeof(float));         float fvalue = 0.5f;         struct timeval tim;         gettimeofday(&tim, null);          double tlaunch1=tim.tv_sec+(tim.tv_usec/1000000.0);         double time = omp_get_wtime();         int ichunk = getpagesize();         int isize = ((int)szglobalworksize * 100) / ichunk;          // #pragma omp parallel         #pragma omp parallel         (igid = 0; igid < isize; igid++)         {           (j = 0; j < ichunk; j++)           {               pfresult[igid * ichunk + j] = fvalue;          //pfresult[igid] = fvalue;       }          // printf("element %d traité par le thread %d \n",igid,omp_get_thread_num());         }         time = omp_get_wtime() - time;         gettimeofday(&tim, null);         double tlaunch2=tim.tv_sec+(tim.tv_usec/1000000.0);         printf("%.6lf time omp\n", tlaunch2-tlaunch1);         printf("pagesize=%d\n", getpagesize());         printf("%.6lf time omp2\n", time);      } 

I also get the same time with page-sized chunks and with memalign.

Edit 2: timing for each thread

#pragma omp parallel private(dlocaltime)     {            pdtime[omp_get_thread_num()] = omp_get_wtime();        printf("thread begin %d time %f\n", omp_get_thread_num(), pdtime[omp_get_thread_num()] );        #pragma omp            (igid = 0; igid < isize; igid++)            {     //   (j = 0; j < ichunk; j++)              {               //  pfresult[igid * ichunk + j] = fvalue;             pfresult[igid] = fvalue;          }             }        //dlocaltime = (omp_get_wtime() - dlocaltime);          pdtime[omp_get_thread_num()] = (omp_get_wtime() - pdtime[omp_get_thread_num()]);              printf("thread end %d time %f\n", omp_get_thread_num(), pdtime[omp_get_thread_num()]);        // printf("end element %d traité par le thread %d \n",0,tid);     } 

Each thread takes 0.015 s, for a total of 0.045 s, so there is a fixed part of about 0.03 s in OpenMP. It is strange that even for huge dimensions we see this fixed part of OpenMP, and that a thread which has less work takes the same time as for the whole size (48 threads here).

thanks

OK, since you insist... :)

with fixed threads warm-up:

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <omp.h>
#include <unistd.h>

/*
 * Stand-alone version of the fill benchmark.
 *
 * When the macro `warmup` is defined to a non-zero value at compile time,
 * an empty parallel region runs before the timed section and prints the
 * team size. The timed section then fills szglobalworksize * 100 floats
 * chunk by chunk inside an OpenMP work-shared loop and prints the elapsed
 * time measured with gettimeofday ("time1") and omp_get_wtime ("time2").
 */
int main()
{
    int szglobalworksize = 131072;
    int igid = 0;
    float *pfresult;
    const float fvalue = 0.5f;
    struct timeval tim;
    double tlaunch1;
    double tlaunch2;
    double time;
    int ichunk;
    int isize;

    omp_set_dynamic(0);

    /* Optional warm-up: run a parallel region before the timed section. */
#if warmup
    #pragma omp parallel
    {
        #pragma omp master
        {
            printf("%d threads\n", omp_get_num_threads());
        }
    }
#endif

    printf("pagesize=%d\n", getpagesize());

    /* The cast is required because the file is also built with g++. */
    pfresult = (float *)valloc(szglobalworksize * 100 * sizeof(float));
    if (pfresult == NULL) {
        perror("valloc");
        return 1;
    }

    gettimeofday(&tim, NULL);
    tlaunch1 = tim.tv_sec + (tim.tv_usec / 1000000.0);
    time = omp_get_wtime();

    /* NOTE: getpagesize() is a byte count, but it is used here as a number
     * of floats per chunk. */
    ichunk = getpagesize();
    isize = ((int)szglobalworksize * 100) / ichunk;

    #pragma omp parallel for
    for (igid = 0; igid < isize; igid++) {
        /* Declared inside the loop body so every thread gets its own j.
         * A j declared outside the parallel loop would be shared between
         * threads, which is a data race. */
        int j;

        for (j = 0; j < ichunk; j++) {
            pfresult[igid * ichunk + j] = fvalue;
        }
    }

    time = omp_get_wtime() - time;
    gettimeofday(&tim, NULL);
    tlaunch2 = tim.tv_sec + (tim.tv_usec / 1000000.0);

    printf("%.6lf time1\n", tlaunch2 - tlaunch1);
    printf("%.6lf time2\n", time);

    free(pfresult);
    return 0;
}

I've got the following numbers on my machine:

$ g++ -O2 -fopenmp testomp.cpp && OMP_NUM_THREADS=1 ./a.out
pagesize=4096
0.036493 time1
0.036489 time2
$ g++ -O2 -fopenmp testomp.cpp && ./a.out
pagesize=4096
0.034721 time1
0.034718 time2
$ g++ -O2 -fopenmp testomp.cpp -Dwarmup && ./a.out
24 threads
pagesize=4096
0.026966 time1
0.026963 time2

As you can see, thread creation time contributes a lot to these numbers.

Why does it still not scale? Well, this is an extremely memory-bound workload. Actually, it fills the pages twice: once when the OS clears them on first touch, and then when the program fills them with the value. It seems there is simply not enough memory bandwidth in the system. I would not expect false sharing to play a significant role here, since parallel for by default uses a static schedule, which does not interleave iterations between threads, so false sharing is possible only once, on the boundaries.


Comments

Popular posts from this blog

java - Oracle EBS .ClassNotFoundException: oracle.apps.fnd.formsClient.FormsLauncher.class ERROR -

c# - how to use buttonedit in devexpress gridcontrol -

How do you convert a timestamp into a datetime in python with the correct timezone? -