
MPI Code for Ghost Data Exchange in 3D Domain Decomposition with Multi-GPUs

Below is an MPI example that demonstrates ghost (halo) data exchange for a 3D domain decomposition across multiple GPUs. The implementation assumes a CUDA-aware MPI library, so device pointers can be passed directly to MPI calls for efficient GPU-to-GPU communication.

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <cuda_runtime.h>

// Define the dimensions of the global domain
#define NX_GLOBAL 256
#define NY_GLOBAL 256
#define NZ_GLOBAL 256

// Define the ghost layer width
#define GHOST_WIDTH 2

// Macro for checking CUDA errors
#define CUDA_CHECK(cmd) do {                                          \
    cudaError_t error = (cmd);                                        \
    if (error != cudaSuccess) {                                       \
        printf("<%s>:%i [CUDA] Error: %s\n", __FILE__, __LINE__,      \
               cudaGetErrorString(error));                            \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while (0)

// Structure to hold domain decomposition information
typedef struct {
    int rank;            // MPI rank
    int size;            // Number of MPI processes
    int dims[3];         // Process grid dimensions
    int coords[3];       // Process coordinates in grid
    MPI_Comm cart_comm;  // Cartesian communicator

    // Local domain sizes (excluding ghost layers)
    int nx, ny, nz;

    // Starting indices in global domain
    int start_x, start_y, start_z;

    // Device pointers for data and ghost buffers
    double *d_data;             // Main data array on device
    double *d_send_buffers[6];  // Send buffers for each face
    double *d_recv_buffers[6];  // Receive buffers for each face
} DomainInfo;

// Initialize domain decomposition
void init_domain_decomposition(DomainInfo *domain, int argc, char **argv) {
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &domain->rank);
    MPI_Comm_size(MPI_COMM_WORLD, &domain->size);

    // Bind each rank to a GPU using its node-local rank (assumes one rank per GPU)
    MPI_Comm node_comm;
    int local_rank, device_count;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &node_comm);
    MPI_Comm_rank(node_comm, &local_rank);
    MPI_Comm_free(&node_comm);
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    CUDA_CHECK(cudaSetDevice(local_rank % device_count));

    // Create a 3D Cartesian topology
    int periods[3] = {1, 1, 1}; // Periodic boundaries
    domain->dims[0] = domain->dims[1] = domain->dims[2] = 0;
    MPI_Dims_create(domain->size, 3, domain->dims);
    MPI_Cart_create(MPI_COMM_WORLD, 3, domain->dims, periods, 1, &domain->cart_comm);
    MPI_Comm_rank(domain->cart_comm, &domain->rank); // rank may change when reorder = 1
    MPI_Cart_coords(domain->cart_comm, domain->rank, 3, domain->coords);

    // Calculate local domain sizes (excluding ghost layers)
    domain->nx = NX_GLOBAL / domain->dims[0];
    domain->ny = NY_GLOBAL / domain->dims[1];
    domain->nz = NZ_GLOBAL / domain->dims[2];

    // Calculate starting indices in global domain
    domain->start_x = domain->coords[0] * domain->nx;
    domain->start_y = domain->coords[1] * domain->ny;
    domain->start_z = domain->coords[2] * domain->nz;

    // Adjust for non-divisible cases: the last rank in each direction takes the remainder
    if (domain->coords[0] == domain->dims[0]-1) domain->nx = NX_GLOBAL - domain->start_x;
    if (domain->coords[1] == domain->dims[1]-1) domain->ny = NY_GLOBAL - domain->start_y;
    if (domain->coords[2] == domain->dims[2]-1) domain->nz = NZ_GLOBAL - domain->start_z;

    // Allocate device memory for main data array (including ghost layers)
    size_t data_size = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) *
                       (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
    CUDA_CHECK(cudaMalloc(&domain->d_data, data_size));
    CUDA_CHECK(cudaMemset(domain->d_data, 0, data_size));

    // Allocate send and receive buffers for each face
    for (int i = 0; i < 6; i++) {
        size_t buffer_size = 0;
        if (i < 2) {        // X faces
            buffer_size = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
        } else if (i < 4) { // Y faces
            buffer_size = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH) * sizeof(double);
        } else {            // Z faces
            buffer_size = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH * sizeof(double);
        }
        CUDA_CHECK(cudaMalloc(&domain->d_send_buffers[i], buffer_size));
        CUDA_CHECK(cudaMalloc(&domain->d_recv_buffers[i], buffer_size));
    }
}

// Pack data into send buffers: copy the GHOST_WIDTH interior x-planes next to a
// face into a contiguous buffer (face == 0: -x boundary, face == 1: +x boundary).
__global__ void pack_x_face(double *data, double *buffer, int nx, int ny, int nz,
                            int ghost_width, int face) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int ny_tot = ny + 2*ghost_width;
    int nz_tot = nz + 2*ghost_width;

    if (j < ny_tot && k < nz_tot) {
        for (int g = 0; g < ghost_width; g++) {
            // Interior planes: x = ghost_width + g (left face) or x = nx + g (right face)
            int x = (face == 0) ? (ghost_width + g) : (nx + g);
            int src_idx = x * ny_tot * nz_tot + j * nz_tot + k;
            int buf_idx = g * ny_tot * nz_tot + j * nz_tot + k;
            buffer[buf_idx] = data[src_idx];
        }
    }
}

// Similar pack kernels for y and z faces would be defined here...

// Unpack data from receive buffers: copy a received buffer into the GHOST_WIDTH
// ghost x-planes of a face (face == 0: -x boundary, face == 1: +x boundary).
__global__ void unpack_x_face(double *data, double *buffer, int nx, int ny, int nz,
                              int ghost_width, int face) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int ny_tot = ny + 2*ghost_width;
    int nz_tot = nz + 2*ghost_width;

    if (j < ny_tot && k < nz_tot) {
        for (int g = 0; g < ghost_width; g++) {
            // Ghost planes: x = g (left face) or x = nx + ghost_width + g (right face)
            int x = (face == 0) ? g : (nx + ghost_width + g);
            int dest_idx = x * ny_tot * nz_tot + j * nz_tot + k;
            int buf_idx = g * ny_tot * nz_tot + j * nz_tot + k;
            data[dest_idx] = buffer[buf_idx];
        }
    }
}

// Similar unpack kernels for y and z faces would be defined here...

// Exchange ghost data with neighbors
void exchange_ghost_data(DomainInfo *domain) {
    MPI_Request requests[12]; // up to 6 sends + 6 receives
    MPI_Status statuses[12];
    int request_count = 0;

    // Determine neighbor ranks
    int neighbors[6];
    MPI_Cart_shift(domain->cart_comm, 0, 1, &neighbors[0], &neighbors[1]); // X direction
    MPI_Cart_shift(domain->cart_comm, 1, 1, &neighbors[2], &neighbors[3]); // Y direction
    MPI_Cart_shift(domain->cart_comm, 2, 1, &neighbors[4], &neighbors[5]); // Z direction

    // Post receives first.
    // Tag note: data arriving at face f was sent from the neighbor's opposite face,
    // so the receive tag must be f ^ 1 to match the sender's tag.
    for (int face = 0; face < 6; face++) {
        if (neighbors[face] != MPI_PROC_NULL) {
            size_t recv_size = 0;
            if (face < 2) {        // X faces
                recv_size = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH);
            } else if (face < 4) { // Y faces
                recv_size = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH);
            } else {               // Z faces
                recv_size = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH;
            }
            MPI_Irecv(domain->d_recv_buffers[face], recv_size, MPI_DOUBLE,
                      neighbors[face], face ^ 1, domain->cart_comm, &requests[request_count++]);
        }
    }

    // Pack and send data
    for (int face = 0; face < 6; face++) {
        if (neighbors[face] != MPI_PROC_NULL) {
            // Launch the appropriate pack kernel for this face
            dim3 block(16, 16);
            dim3 grid;
            if (face < 2) { // X faces
                grid.x = (domain->ny + 2*GHOST_WIDTH + block.x - 1) / block.x;
                grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
                pack_x_face<<<grid, block>>>(domain->d_data, domain->d_send_buffers[face],
                                             domain->nx, domain->ny, domain->nz, GHOST_WIDTH, face);
            } // Similar for y and z faces would be here...
            CUDA_CHECK(cudaDeviceSynchronize());

            size_t send_size = 0;
            if (face < 2) {        // X faces
                send_size = GHOST_WIDTH * (domain->ny + 2*GHOST_WIDTH) * (domain->nz + 2*GHOST_WIDTH);
            } else if (face < 4) { // Y faces
                send_size = (domain->nx + 2*GHOST_WIDTH) * GHOST_WIDTH * (domain->nz + 2*GHOST_WIDTH);
            } else {               // Z faces
                send_size = (domain->nx + 2*GHOST_WIDTH) * (domain->ny + 2*GHOST_WIDTH) * GHOST_WIDTH;
            }
            MPI_Isend(domain->d_send_buffers[face], send_size, MPI_DOUBLE,
                      neighbors[face], face, domain->cart_comm, &requests[request_count++]);
        }
    }

    // Wait for all communications to complete
    MPI_Waitall(request_count, requests, statuses);

    // Unpack received data
    for (int face = 0; face < 6; face++) {
        if (neighbors[face] != MPI_PROC_NULL) {
            // Launch the appropriate unpack kernel for this face
            dim3 block(16, 16);
            dim3 grid;
            if (face < 2) { // X faces
                grid.x = (domain->ny + 2*GHOST_WIDTH + block.x - 1) / block.x;
                grid.y = (domain->nz + 2*GHOST_WIDTH + block.y - 1) / block.y;
                unpack_x_face<<<grid, block>>>(domain->d_data, domain->d_recv_buffers[face],
                                               domain->nx, domain->ny, domain->nz, GHOST_WIDTH, face);
            } // Similar for y and z faces would be here...
            CUDA_CHECK(cudaDeviceSynchronize());
        }
    }
}

// Clean up resources
void cleanup(DomainInfo *domain) {
    CUDA_CHECK(cudaFree(domain->d_data));
    for (int i = 0; i < 6; i++) {
        CUDA_CHECK(cudaFree(domain->d_send_buffers[i]));
        CUDA_CHECK(cudaFree(domain->d_recv_buffers[i]));
    }
    MPI_Finalize();
}

int main(int argc, char **argv) {
    DomainInfo domain;
    init_domain_decomposition(&domain, argc, argv);

    // Main simulation loop would go here
    for (int step = 0; step < 10; step++) {
        // Perform computation on local domain

        // Exchange ghost data
        exchange_ghost_data(&domain);

        // Continue computation
    }

    cleanup(&domain);
    return 0;
}

Key Features of This Implementation:

  1. 3D Domain Decomposition:

    • Uses MPI Cartesian topology for logical process arrangement
    • Handles both divisible and non-divisible domain sizes
  2. GPU-Aware MPI:

    • Directly passes GPU pointers to MPI calls (requires CUDA-aware MPI)
    • Avoids host staging for better performance (a minimal runtime capability check is sketched after this list)
  3. Efficient Communication:

    • Non-blocking sends and receives for overlap opportunities
    • Separate buffers for each face to prevent contention
  4. Kernel-Based Packing/Unpacking:

    • CUDA kernels for efficient data movement between main array and buffers
    • Parallel packing/unpacking operations
  5. Flexible Ghost Layer Width:

    • Configurable ghost layer size via GHOST_WIDTH macro
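
As a sanity check for the GPU-aware MPI point above, Open MPI exposes a runtime query for CUDA-aware support through its mpi-ext.h extension header. The sketch below is Open MPI specific (other MPI implementations document their own switches) and can be called right after MPI_Init; if it returns 0, fall back to staging buffers through host memory instead of passing device pointers to MPI.

// Sketch: runtime check for CUDA-aware support (Open MPI extension API).
// MPIX_CUDA_AWARE_SUPPORT and MPIX_Query_cuda_support() come from <mpi-ext.h>
// and are Open MPI specific.
#include <mpi.h>
#if defined(OPEN_MPI) && OPEN_MPI
#include <mpi-ext.h>
#endif

static int cuda_aware_mpi_available(void) {
#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    return MPIX_Query_cuda_support(); // 1 if CUDA-aware transfers work at run time
#else
    return 0;                         // not compiled in, or unknown
#endif
}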

Requirements:

  1. MPI implementation with CUDA-aware support (OpenMPI, MVAPICH2, etc.)
  2. CUDA toolkit
  3. Compilation with nvcc and MPI compiler wrappers (an example build-and-run invocation is sketched below)
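
As a rough sketch of the build and launch (the file name ghost_exchange.cu, the GPU architecture sm_80, and the rank count are placeholders; adjust them for your system), using the MPI C++ wrapper as nvcc's host compiler so the MPI include and library paths are picked up automatically:

# compile the single translation unit with nvcc, using mpicxx as host compiler
nvcc -O3 -arch=sm_80 -ccbin mpicxx ghost_exchange.cu -o ghost_exchange
# run with one MPI rank per GPU
mpirun -np 8 ./ghost_exchange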

Usage Notes:

  1. You’ll need to complete the pack/unpack kernels for Y and Z faces (omitted for brevity; a sketch of the Y-face pair follows this list)
  2. The code assumes periodic boundaries (adjust periods array if needed)
  3. For optimal performance, tune the block/grid dimensions in the kernels
  4. Add error checking for MPI operations in production code
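
For the first note, here is a minimal sketch of the Y-face pair, following the indexing convention of pack_x_face/unpack_x_face above and the Y-face buffer size allocated in init_domain_decomposition (the Z-face kernels are analogous, with threads covering the (i, j) plane). The corresponding launches in exchange_ghost_data would size the grid over (nx + 2*GHOST_WIDTH) x (nz + 2*GHOST_WIDTH) threads.

// Sketch: pack/unpack kernels for the y faces, mirroring the x-face kernels.
// face == 2: -y boundary, face == 3: +y boundary. Threads cover the (i, k) plane.
__global__ void pack_y_face(double *data, double *buffer, int nx, int ny, int nz,
                            int ghost_width, int face) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int ny_tot = ny + 2*ghost_width;
    int nz_tot = nz + 2*ghost_width;
    if (i < nx + 2*ghost_width && k < nz_tot) {
        for (int g = 0; g < ghost_width; g++) {
            int y = (face == 2) ? (ghost_width + g) : (ny + g); // interior y-planes
            int src_idx = i * ny_tot * nz_tot + y * nz_tot + k;
            int buf_idx = (i * ghost_width + g) * nz_tot + k;   // buffer laid out as (i, g, k)
            buffer[buf_idx] = data[src_idx];
        }
    }
}

__global__ void unpack_y_face(double *data, double *buffer, int nx, int ny, int nz,
                              int ghost_width, int face) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int k = blockIdx.y * blockDim.y + threadIdx.y;
    int ny_tot = ny + 2*ghost_width;
    int nz_tot = nz + 2*ghost_width;
    if (i < nx + 2*ghost_width && k < nz_tot) {
        for (int g = 0; g < ghost_width; g++) {
            int y = (face == 2) ? g : (ny + ghost_width + g);   // ghost y-planes
            int dest_idx = i * ny_tot * nz_tot + y * nz_tot + k;
            int buf_idx = (i * ghost_width + g) * nz_tot + k;
            data[dest_idx] = buffer[buf_idx];
        }
    }
}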

This implementation provides a solid foundation for multi-GPU simulations with 3D domain decomposition and ghost data exchange.
