Message Passing (MPI)¶

MPI is a standardized and portable message-passing library for distributed processes. Both point-to-point and collective communication are supported.

There is a wide variety of training material available on using MPI on CPUs, so this documentation focuses on the basics and on the combination of MPI libraries with GPU programming models.

Compilers¶

The following table lists the MPI compiler commands available on most of the systems run by NHR@KIT and the underlying compilers, compiler families, languages, and application binary interfaces (ABIs) that they support.

MPI Compiler Command	Default Compiler	Supported Language(s)	Supported ABIs
mpicc	gcc, cc	C	32/64 bit
mpicxx	g++	C/C++	32/64 bit
mpifc	gfortran	Fortran77/Fortran 95	32/64 bit
GNU mpigcc	gcc	C	32/64 bit
GNU mpigxx	g++	C/C++	32/64 bit
GNU mpif77	g77	Fortran 77	32/64 bit
GNU mpif90	gfortran	Fortran 95	32/64 bit
Intel mpiicc	icc	C	32/64 bit
Intel mpiicpc	icpc	C++	32/64 bit
Intel mpiifort	ifort	Fortran77/Fortran 95	32/64 bit

GPU Offloading and MPI¶

All MPI communication takes place between the private memory spaces of the distributed processes. Therefore, data residing on a GPU must normally be copied to host memory before it can be communicated. With so-called CUDA-aware MPI implementations, this staging copy is not required and transfers can be performed directly from GPU memory.

See:

The following simple MPI program demonstrates host-to-host, host-to-device, and device-to-device MPI point-to-point communication:

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include "cuda.h"
#include "cuda_runtime.h"

/*
 * Passes a buffer of doubles around between ranks 0 and 1, alternating
 * between host (malloc) and device (cudaMalloc) memory:
 *
 *   [0],CPU -> [1],GPU -> [0],GPU -> [1],CPU -> [0],CPU
 *
 * Device pointers are handed directly to MPI_Send/MPI_Recv, so a CUDA-aware
 * MPI implementation is required. Only ranks 0 and 1 take part in the
 * exchange; any additional ranks just allocate and free their buffers.
 *
 * Returns EXIT_SUCCESS on success. Exits with status 1 on allocation failure
 * and with status 2 if a transfer reports an error or the final check fails.
 * Returns EXIT_FAILURE if started with fewer than two processes.
 */
int main(int argc, char* argv[]) {
    // Initialize the MPI execution environment
    MPI_Init(&argc, &argv);

    // Get the size of the group associated with communicator MPI_COMM_WORLD
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // Get the rank of the calling process in the communicator MPI_COMM_WORLD
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    // The exchange below is hard-wired to ranks 0 and 1. Every rank sees the
    // same world_size, so all of them leave together and finalize cleanly.
    if (world_size < 2) {
        fprintf(stderr, "This example needs at least 2 MPI processes (got %i)\n",
                world_size);
        MPI_Finalize();
        return EXIT_FAILURE;
    }

    printf("* Allocate memory on [%1i],CPU\n", world_rank);
    int size = 1000;
    double *a = (double *) malloc(size * sizeof(double));
    if (a == NULL) {
        errx(1, "malloc a[] failed");
    }
    printf("* Allocate memory [%1i],GPU\n", world_rank);
    double *d_a;
    if (cudaMalloc((void **) &d_a, size * sizeof(double)) != cudaSuccess) {
        errx(1, "cudaMalloc d_a[] failed");
    }

    // Each rank fills its host buffer with its own rank number, so data that
    // originates from rank 0 is recognizable as all zeros.
    printf("* Initalize memory on [%1i],CPU\n", world_rank);
    for (int i = 0; i < size; i++) {
        a[i] = (double) world_rank;
    }

    MPI_Status status;
    // Start from MPI_SUCCESS: ranks >= 2 never assign err below, and reading
    // it uninitialized in the checks would be undefined behavior.
    int err = MPI_SUCCESS;

    // NOTE(review): with MPI's default error handler a failing call normally
    // aborts the job instead of returning, so the err checks below presumably
    // only trigger when an error-returning handler is installed -- confirm.

    // From [0],CPU to [1],GPU
    if      (world_rank == 0) {
        printf("* Send from [%1i],CPU\n", world_rank);
        err = MPI_Send(  a, size, MPI_DOUBLE, 1, 1, MPI_COMM_WORLD);
    }
    else if (world_rank == 1) {
        printf("* Receive to [%1i],GPU\n", world_rank);
        err = MPI_Recv(d_a, size, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &status);
    }
    if (err != MPI_SUCCESS) {
        errx(2, "MPI transport from [0],CPU to [1],GPU failed");
    }

    // From [1],GPU to [0],GPU
    if      (world_rank == 1) {
        printf("* Send from [%1i],GPU\n", world_rank);
        err = MPI_Send(d_a, size, MPI_DOUBLE, 0, 2, MPI_COMM_WORLD);
    }
    else if (world_rank == 0) {
        printf("* Receive to [%1i],GPU\n", world_rank);
        err = MPI_Recv(d_a, size, MPI_DOUBLE, 1, 2, MPI_COMM_WORLD, &status);
    }
    if (err != MPI_SUCCESS) {
        errx(2, "MPI transport from [1],GPU to [0],GPU failed");
    }

    // From [0],GPU to [1],CPU
    if      (world_rank == 0) {
        printf("* Send from [%1i],GPU\n", world_rank);
        err = MPI_Send(d_a, size, MPI_DOUBLE, 1, 3, MPI_COMM_WORLD);
    }
    else if (world_rank == 1) {
        printf("* Receive to [%1i],CPU \n", world_rank);
        err = MPI_Recv(  a, size, MPI_DOUBLE, 0, 3, MPI_COMM_WORLD, &status);
    }
    if (err != MPI_SUCCESS) {
        errx(2, "MPI transport from [0],GPU to [1],CPU failed");
    }

    // From [1],CPU to [0],CPU
    if      (world_rank == 1) {
        printf("* Send from [%1i],CPU\n", world_rank);
        err = MPI_Send(  a, size, MPI_DOUBLE, 0, 4, MPI_COMM_WORLD);
    }
    else if (world_rank == 0) {
        printf("* Receive to [%1i],CPU\n", world_rank);
        err = MPI_Recv(  a, size, MPI_DOUBLE, 1, 4, MPI_COMM_WORLD, &status);
    }
    if (err != MPI_SUCCESS) {
        errx(2, "MPI transport from [1],CPU to [0],CPU failed");
    }

    // Check host memory. After the round trip the host buffers of ranks 0
    // and 1 must both hold rank 0's original data (all zeros). Ranks >= 2 did
    // not take part and still hold their own rank value, so they are skipped.
    if (world_rank < 2) {
        for (int i = 0; i < size; i++) {
            if (a[i] != 0.) {
                errx(2, "MPI transport failed");
            }
        }
    }

    printf("* Free memory on [%1i],GPU\n", world_rank);
    cudaFree(d_a);

    printf("* Free memory on [%1i],CPU\n", world_rank);
    free(a);

    // Terminates MPI execution environment
    MPI_Finalize();
    return EXIT_SUCCESS;
}

MPI programs can be compiled with different compilers. The procedure for this is as follows:

GNU Compiler Collection, OpenMPI

## Load GNU compiler, OpenMPI and CUDA environment
$ module add \
    compiler/gnu \
    mpi/openmpi \
    devel/cuda

## Compile C or C++ source code with MPI and CUDA support
$ mpicc  ... ${C_SOURCE} -o ${EXECUTABLE} -lcudart
$ mpicxx ... ${C_SOURCE} -o ${EXECUTABLE} -lcudart

LLVM Compiler, OpenMPI

## Load LLVM compiler, OpenMPI and CUDA environment
$ module add \
    compiler/llvm  \
    mpi/openmpi \
    devel/cuda

## Compile C or C++ source code with MPI and CUDA support
$ mpicc  ... ${C_SOURCE} -o ${EXECUTABLE} -lcudart
$ mpicxx ... ${C_SOURCE} -o ${EXECUTABLE} -lcudart

NVIDIA High Performance Computing (HPC) SDK

## Load NVIDIA HPC SDK environment
$ module add \
    toolkit/nvidia-hpc-sdk \
    devel/cuda


## Compile C or C++ source code with MPI and CUDA support
$ mpicc  ... ${C_SOURCE} -o ${EXECUTABLE} -lcudart
$ mpicxx ... ${C_SOURCE} -o ${EXECUTABLE} -lcudart

An MPI program with accelerator support is launched as usual with mpirun. In the accelerated partition of the HoreKa cluster, each node provides 2 CPUs with 38 cores each and 4 GPUs. To distribute the CPU cores evenly across the GPUs (two MPI processes per socket, each bound to 19 cores and assigned one GPU), you can proceed as follows:

$ mpirun \
    --display-map \
    --display-allocation \
    --map-by ppr:2:socket:pe=19 \
    --bind-to core \
    bash -c \
        'export CUDA_VISIBLE_DEVICES=${OMPI_COMM_WORLD_LOCAL_RANK};
        ${EXECUTABLE}'