This is a version of the original BEM4I kernel code in which the work-sharing loop over all degrees of freedom of the global system uses an automatically computed chunk size, so that each OpenMP thread processes only one chunk.
...
#pragma omp parallel
{
    // apply K, K', V and D
    { ... }

    // Size the chunks so that the loop below splits into at most one chunk
    // per thread of the current team.
    int nts = omp_get_num_threads( );

    // Round the division UP: with truncating division the remainder
    // iterations (nDOFs % nts) would form an extra chunk, so one thread
    // would end up processing two chunks.
    int CHUNKSIZE = (int) ( nDOFs / nts + ( nDOFs % nts != 0 ) );

    // OpenMP requires a positive chunk size; guard the nDOFs == 0 case.
    if ( CHUNKSIZE < 1 )
    {
        CHUNKSIZE = 1;
    }

    // loop over all degrees of freedom
    #pragma omp for schedule(dynamic, CHUNKSIZE)
    for(int j = 0; j < nDOFs; j++)
    { ... }
} // end of parallel region

// Reduction across MPI processes, issued after the threaded region has ended
// (arguments elided in this excerpt).
MPI_Allreduce(...);