[petsc-users] Explicit linking to OpenMP results in performance drop and wrong results

Roland Richter roland.richter at ntnu.no
Wed Feb 17 10:23:48 CST 2021


Hei,

I replaced the linking line with

/usr/lib64/mpi/gcc/openmpi3/bin/mpicxx -march=native -fopenmp-simd
-DMKL_LP64 -m64
CMakeFiles/armadillo_with_PETSc.dir/Unity/unity_0_cxx.cxx.o -o
bin/armadillo_with_PETSc
-Wl,-rpath,/opt/boost/lib:/opt/fftw3/lib64:/opt/petsc_release/lib
/usr/lib64/libgsl.so /usr/lib64/libgslcblas.so -lgfortran
-L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64
-lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl
/opt/boost/lib/libboost_filesystem.so.1.72.0
/opt/boost/lib/libboost_mpi.so.1.72.0
/opt/boost/lib/libboost_program_options.so.1.72.0
/opt/boost/lib/libboost_serialization.so.1.72.0
/opt/fftw3/lib64/libfftw3.so /opt/fftw3/lib64/libfftw3_mpi.so
/opt/petsc_release/lib/libpetsc.so
/usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so

and now the results are correct. Nevertheless, when comparing the loop
in lines 26-28 of test_scaling.cpp

#pragma omp parallel for
    for(int i = 0; i < r_0 * r_1; ++i)
        *(out_mat_ptr + i) = (*(in_mat_ptr + i) * scaling_factor);

the version without #pragma omp parallel for is significantly faster
(18 s vs. 28 s) than the version with it. Why is there still such a big
difference?
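
(For reference, here is a SIMD-only variant of that loop; it is only a
sketch, not what I benchmarked, but as far as I understand it should
keep the vectorization hint while needing no OpenMP runtime at link
time when compiled with -fopenmp-simd:

#pragma omp simd
    for(int i = 0; i < r_0 * r_1; ++i)
        *(out_mat_ptr + i) = (*(in_mat_ptr + i) * scaling_factor);
)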

Thanks!

On 17.02.21 at 17:10, Jed Brown wrote:
> You're using an MKL linked to Intel's OpenMP. I could imagine there being symbol conflicts causing MKL to compute wrong results if libgomp symbols are picked up.
>
> Note that -fopenmp-simd does not require linking -- it just gives the compiler hints about how to vectorize. So you can probably keep using it and just stop passing libgomp.so. Alternatively, you can link MKL to work with libgomp (see the MKL link advisor).
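> (For the GCC/libgomp case, the link advisor typically suggests something along the lines of: -L${MKLROOT}/lib/intel64 -Wl,--no-as-needed -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread -lm -ldl.)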
>
> Roland Richter <roland.richter at ntnu.no> writes:
>
>> Hei,
>>
>> when compiling the attached files using the following compilation line
>>
>> /usr/lib64/mpi/gcc/openmpi3/bin/mpicxx -DBOOST_ALL_NO_LIB
>> -DBOOST_FILESYSTEM_DYN_LINK -DBOOST_MPI_DYN_LINK
>> -DBOOST_PROGRAM_OPTIONS_DYN_LINK -DBOOST_SERIALIZATION_DYN_LINK
>> -DUSE_CUDA
>> -I/home/roland/Dokumente/C++-Projekte/armadillo_with_PETSc/include
>> -I/opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/include
>> -I/opt/armadillo/include -isystem /opt/petsc_release/include -isystem
>> /opt/fftw3/include -isystem /opt/boost/include -march=native
>> -fopenmp-simd -DMKL_LP64 -m64 -Wall -Wextra -pedantic -fPIC -flto -O2
>> -funroll-loops -funroll-all-loops -fstrict-aliasing -mavx -march=native
>> -fopenmp -std=gnu++17 -c <source_files> -o <target_files>
>>
>> and linking them with 
>>
>> /usr/lib64/mpi/gcc/openmpi3/bin/mpicxx -march=native -fopenmp-simd
>> -DMKL_LP64 -m64 <target_files> -o bin/armadillo_with_PETSc
>> -Wl,-rpath,/opt/boost/lib:/opt/fftw3/lib64:/opt/petsc_release/lib
>> /usr/lib64/libgsl.so /usr/lib64/libgslcblas.so -lgfortran
>> /opt/intel/mkl/lib/intel64/libmkl_rt.so
>> /opt/boost/lib/libboost_filesystem.so.1.72.0
>> /opt/boost/lib/libboost_mpi.so.1.72.0
>> /opt/boost/lib/libboost_program_options.so.1.72.0
>> /opt/boost/lib/libboost_serialization.so.1.72.0
>> /opt/fftw3/lib64/libfftw3.so /opt/fftw3/lib64/libfftw3_mpi.so
>> /opt/petsc_release/lib/libpetsc.so
>> /usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so
>>
>> my output is 
>>
>> Arma and PETSc/MatScale are equal:                          0
>> Arma-time for a matrix size of [1024, 8192]:                24955
>> PETSc-time, pointer for a matrix size of [1024, 8192]:      28283
>> PETSc-time, MatScale for a matrix size of [1024, 8192]:     23138
>>
>> but when removing the explicit use of OpenMP (i.e. removing -fopenmp
>> and /usr/lib64/gcc/x86_64-suse-linux/9/libgomp.so), my result is
>>
>> Arma and PETSc/MatScale are equal:                          1
>> Arma-time for a matrix size of [1024, 8192]:                24878
>> PETSc-time, pointer for a matrix size of [1024, 8192]:      18942
>> PETSc-time, MatScale for a matrix size of [1024, 8192]:     23350
>>
>> even though both times the executable is linked to
>>
>>         libmkl_intel_lp64.so => /opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_intel_lp64.so (0x00007f9eebd70000)
>>         libmkl_core.so => /opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_core.so (0x00007f9ee77aa000)
>>         libmkl_intel_thread.so => /opt/intel/compilers_and_libraries_2020.2.254/linux/mkl/lib/intel64_lin/libmkl_intel_thread.so (0x00007f9ee42c3000)
>>         libiomp5.so => /opt/intel/compilers_and_libraries_2020.2.254/linux/compiler/lib/intel64_lin/libiomp5.so (0x00007f9ee3ebd000)
>>         libgomp.so.1 => /usr/lib64/libgomp.so.1 (0x00007f9ea98bd000)
>>
>> via the PETSc library. Why does the execution time vary by so much, and
>> why does MatScale return wrong results when I explicitly link against
>> OpenMP?
>>
>> Thanks!
>>
>> Regards,
>>
>> Roland
>>
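>> // ---- main driver (filename not given) ----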
>> #include <test_scaling.hpp>
>>
>> int main(int argc, char **args) {
>> 	PetscMPIInt rank, size;
>> 	PetscInitialize(&argc, &args, (char*) 0, NULL);
>>
>> 	MPI_Comm_size(PETSC_COMM_WORLD, &size);
>> 	MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
>>
>> 	test_scaling (1024, 8192, false);
>> 	PetscFinalize();
>> 	return 0;
>> }
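>>
>> // ---- test_scaling.cpp (the omp loop discussed above is at lines 26-28 of this file) ----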
>> #include <test_scaling.hpp>
>>
>> void test_scaling_arma(const arma::cx_mat & in_mat,
>> 					   arma::cx_mat &out_mat,
>> 					   const arma::cx_double &scaling_factor) {
>> 	out_mat = in_mat;
>> 	out_mat *= scaling_factor;
>> }
>>
>> void test_scaling_petsc(const Mat &in_mat,
>> 						Mat &out_mat,
>> 						const PetscScalar &scaling_factor) {
>> 	MatZeroEntries(out_mat);
>> 	MatAXPY(out_mat, scaling_factor, in_mat, SAME_NONZERO_PATTERN);
>> }
>>
>> void test_scaling_petsc_pointer(const Mat &in_mat,
>> 								Mat &out_mat,
>> 								const PetscScalar &scaling_factor) {
>> 	const PetscScalar *in_mat_ptr;
>> 	PetscScalar *out_mat_ptr;
>> 	MatDenseGetArrayRead (in_mat, &in_mat_ptr);
>> 	MatDenseGetArrayWrite (out_mat, &out_mat_ptr);
>> 	PetscInt r_0, r_1;
>> 	MatGetLocalSize (out_mat, &r_0, &r_1);
>> #pragma omp parallel for
>> 	for(int i = 0; i < r_0 * r_1; ++i)
>> 		*(out_mat_ptr + i) = (*(in_mat_ptr + i) * scaling_factor);
>>
>> 	MatDenseRestoreArrayRead (in_mat, &in_mat_ptr);
>> 	MatDenseRestoreArrayWrite(out_mat, &out_mat_ptr);
>> }
>>
>> void test_scaling(const size_t matrix_size_rows, const size_t matrix_size_cols, const bool print_matrices) {
>> 	PetscMPIInt rank, size;
>>
>> 	MPI_Comm_size(PETSC_COMM_WORLD, &size);
>> 	MPI_Comm_rank(PETSC_COMM_WORLD, &rank);
>>
>> 	arma::cx_mat in_mat = arma::zeros<arma::cx_mat>(matrix_size_rows, matrix_size_cols),
>> 			out_mat = arma::zeros<arma::cx_mat>(matrix_size_rows, matrix_size_cols);
>> 	arma::cx_rowvec matrix_vec = arma::conv_to<arma::cx_rowvec>::from(arma::linspace<arma::rowvec>(0, matrix_size_cols - 1, matrix_size_cols));
>> 	in_mat.each_row([&](arma::cx_rowvec &a){
>> 		a = matrix_vec;
>> 	});
>>
>> 	Mat petsc_in_mat, petsc_out_mat;
>> 	arma::cx_mat petsc_out_comparison_mat = arma::zeros<arma::cx_mat>(matrix_size_rows, matrix_size_cols);
>> 	MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, matrix_size_rows, matrix_size_cols, NULL, &petsc_in_mat);
>> 	MatCreateDense(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, matrix_size_rows, matrix_size_cols, NULL, &petsc_out_mat);
>>
>> 	create_data_in_PETSc_from_scratch (in_mat, petsc_in_mat);
>> 	MatZeroEntries(petsc_out_mat);
>>
>> 	const std::complex<double> scaling_factor{1. / matrix_size_cols, 0.};
>>
>> 	test_scaling_arma (in_mat, out_mat, scaling_factor);
>> 	test_scaling_petsc (petsc_in_mat, petsc_out_mat, scaling_factor);
>>
>> 	//Benchmark
>> 	auto t1 = std::chrono::high_resolution_clock::now();
>> 	for(size_t i = 0; i < bench_rounds; ++i) {
>> 		test_scaling_arma (in_mat, out_mat, scaling_factor);
>> 	}
>> 	auto t2 = std::chrono::high_resolution_clock::now();
>>
>> 	auto t3 = std::chrono::high_resolution_clock::now();
>> 	for(size_t i = 0; i < bench_rounds; ++i) {
>> 		test_scaling_petsc_pointer (petsc_in_mat, petsc_out_mat, scaling_factor);
>> 	}
>> 	auto t4 = std::chrono::high_resolution_clock::now();
>>
>> 	auto t5 = std::chrono::high_resolution_clock::now();
>> 	for(size_t i = 0; i < bench_rounds; ++i) {
>> 		test_scaling_petsc (petsc_in_mat, petsc_out_mat, scaling_factor);
>> 	}
>> 	auto t6 = std::chrono::high_resolution_clock::now();
>>
>>
>> 	retrieve_data_from_PETSc (petsc_out_mat, petsc_out_comparison_mat, matrix_size_cols, matrix_size_rows);
>>
>> 	if(print_matrices && rank == 0) {
>> 		std::cout << "In-matrix, ARMA:\n" << in_mat
>> 				  << "\n\nOut-matrix, ARMA:\n" << out_mat
>> 				  << "\n\nComparison-out-matrix, ARMA:\n" << petsc_out_comparison_mat
>> 				  << "\n\nDifference: \n" << arma::abs(petsc_out_comparison_mat - out_mat)
>> 				  <<'\n';
>> 	}
>> 	if(rank == 0) {
>> 		std::cout << "Arma and PETSc/MatScale are equal:\t\t\t\t\t" << arma::approx_equal(out_mat, petsc_out_comparison_mat, "reldiff", 1e-8) << '\n';
>> 		std::cout << "Arma-time for a matrix size of ["
>> << matrix_size_rows << ", "
>> << matrix_size_cols << "]:\t\t\t\t"
>> << std::chrono::duration_cast<std::chrono::milliseconds>( t2 - t1 ).count() << '\n';
>> 		std::cout << "PETSc-time, pointer for a matrix size of ["
>> << matrix_size_rows << ", "
>> << matrix_size_cols << "]:\t\t\t"
>> << std::chrono::duration_cast<std::chrono::milliseconds>( t4 - t3 ).count() << '\n';
>> 		std::cout << "PETSc-time, MatScale for a matrix size of ["
>> << matrix_size_rows << ", "
>> << matrix_size_cols << "]:\t\t\t"
>> << std::chrono::duration_cast<std::chrono::milliseconds>( t6 - t5 ).count() << '\n';
>> 	}
>> 	MatDestroy (&petsc_in_mat);
>> 	MatDestroy (&petsc_out_mat);
>> }
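>>
>> // ---- helper functions (presumably helper_functions.cpp) ----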
>> #include <helper_functions.hpp>
>>
>> void retrieve_data_from_PETSc(const Mat petsc_mat, arma::cx_mat &out_data,
>> 							  const arma::uword Ntime, const arma::uword Nradius) {
>> 	PetscMPIInt size;
>> 	MPI_Comm_size(PETSC_COMM_WORLD, &size);
>> 	if(out_data.n_rows != Ntime && out_data.n_cols != Nradius) {
>> 		out_data = arma::zeros<arma::cx_mat>(Ntime, Nradius);
>> 	}
>> 	Mat local_mat;
>> 	arma::Col<int> vector_indices_radius = arma::linspace<arma::Col<int>>(0, Nradius - 1, Nradius);
>> 	arma::Col<int> vector_indices_time = arma::linspace<arma::Col<int>>(0, Ntime - 1, Ntime);
>> 	//MatCreateRedundantMatrix(petsc_mat, Ntime * Nradius, MPI_COMM_NULL, MAT_INITIAL_MATRIX, &local_mat);
>> 	MatCreateRedundantMatrix(petsc_mat, size, MPI_COMM_NULL, MAT_INITIAL_MATRIX, &local_mat);
>> 	MatAssemblyBegin(local_mat, MAT_FINAL_ASSEMBLY);
>> 	MatAssemblyEnd(local_mat, MAT_FINAL_ASSEMBLY);
>> 	MatGetValues(local_mat, Nradius, vector_indices_radius.memptr(), Ntime, vector_indices_time.memptr(), out_data.memptr());
>> 	MatDestroy(&local_mat);
>> 	out_data = out_data.st();
>> }
>>
>> void store_data_in_PETSc(const arma::cx_mat &in_data, Mat &petsc_mat) {
>> 	const arma::uword Ntime = in_data.n_cols;
>> 	const arma::uword Nradius = in_data.n_rows;
>> 	arma::Col<int> vector_indices_radius = arma::linspace<arma::Col<int>>(0, Nradius - 1, Nradius);
>> 	arma::Col<int> vector_indices_time = arma::linspace<arma::Col<int>>(0, Ntime - 1, Ntime);
>> 	MatZeroEntries(petsc_mat);
>> 	MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
>> 	MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
>> 	arma::cx_mat local_mat = in_data.st();
>> 	MatSetValues(petsc_mat, Nradius, vector_indices_radius.memptr(), Ntime, vector_indices_time.memptr(), local_mat.memptr(), INSERT_VALUES);
>> 	MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
>> 	MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
>> }
>>
>> void create_data_in_PETSc_from_scratch(const arma::cx_mat &in_data, Mat &petsc_mat) {
>> 	const arma::uword Ntime = in_data.n_cols;
>> 	const arma::uword Nradius = in_data.n_rows;
>> 	MatZeroEntries(petsc_mat);
>> 	MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
>> 	MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
>> 	for(int i = 0; i < (int)Ntime; ++i){
>> 		for(int j = 0; j < (int)Nradius; ++j) {
>> 			MatSetValue(petsc_mat, j, i, i, INSERT_VALUES);
>> 		}
>> 	}
>> 	MatAssemblyBegin(petsc_mat, MAT_FINAL_ASSEMBLY);
>> 	MatAssemblyEnd(petsc_mat, MAT_FINAL_ASSEMBLY);
>> }
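>>
>> // ---- test_scaling.hpp ----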
>> #ifndef TEST_SCALING_HPP
>> #define TEST_SCALING_HPP
>>
>> #include <helper_functions.hpp>
>>
>> void test_scaling_arma(const arma::cx_mat & in_mat,
>> 					   arma::cx_mat &out_mat,
>> 					   const arma::cx_double &scaling_factor);
>>
>> void test_scaling_petsc(const Mat &in_mat,
>> 						Mat &out_mat,
>> 						const PetscScalar &scaling_factor);
>>
>> void test_scaling_petsc_pointer(const Mat &in_mat,
>> 								Mat &out_mat,
>> 								const PetscScalar &scaling_factor);
>>
>> void test_scaling(const size_t matrix_size_rows, const size_t matrix_size_cols, const bool print_matrices);
>>
>> #endif // TEST_SCALING_HPP
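>>
>> // ---- helper_functions.hpp ----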
>> #ifndef HELPER_FUNCTIONS_HPP
>> #define HELPER_FUNCTIONS_HPP
>>
>> #include <iostream>
>>
>> #include <armadillo>
>> #include <petscts.h>
>> #include <petscmat.h>
>> #include <petscvec.h>
>> #include <fftw3.h>
>> #include <fftw3-mpi.h>
>> #include <metis.h>
>> #include <chrono>
>>
>> #include <boost/numeric/odeint.hpp>
>>
>> constexpr int bench_rounds = 1000;
>>
>> void retrieve_data_from_PETSc(const Mat petsc_mat, arma::cx_mat &out_data,
>> 							  const arma::uword Ntime, const arma::uword Nradius);
>>
>> void store_data_in_PETSc(const arma::cx_mat &in_data, Mat &petsc_mat);
>>
>> void create_data_in_PETSc_from_scratch(const arma::cx_mat &in_data, Mat &petsc_mat);
>>
>> #endif // HELPER_FUNCTIONS_HPP