计算pi - hip+mpi+fortran
实现一个计算pi的小程序,探索hip+mpi+fortran的编译链接方式
工程包含三类源文件:Fortran文件、HIP的.cpp文件,以及嵌入MPI的主程序文件(main.cpp)。其中,主程序调用Fortran文件中的函数和HIP .cpp文件中的函数。每个文件单独编译,最终用mpif90链接在一起:main.cpp用mpicc编译,.f90文件用mpif90编译,HIP文件用hipcc编译(方法2中再进一步编译成动态库.so)。
文件结构
mpi-pi.f90    用mpif90编译
pi_hip.cpp    用hipcc编译
main.cpp      用mpicc编译
    
用mpif90链接
Makefile
编译链接
方法1:
分别将三个文件编译成对应的.o文件,直接链接在一起:
# Attempt 1: build every source into its own object file and link the
# objects directly with mpif90. No HIP runtime library is on the link line;
# the article reports that this link step fails with an undefined reference
# to hip_impl::kernargs(bool).
# NOTE(review): the define is spelled _HIP_PLATFORM_HCC__ (one leading
# underscore); confirm whether __HIP_PLATFORM_HCC__ was intended.
make ex1:
# Fortran/MPI source -> mpi-pi.o
        mpif90 -c  mpi-pi.f90
# HIP source -> pi_hip.o
        hipcc  -O3 -std=c++11 -D_HIP_PLATFORM_HCC__ -c pi_hip.cpp
# C++ MPI driver -> main.o
        mpicc -c  -std=c++11 main.cpp
# Final link through the Fortran MPI wrapper; -lstdc++ adds the C++ runtime.
        mpif90 mpi-pi.o pi_hip.o main.o -lm -lstdc++ -o out-pi
# Launch the program right after the build.
        make run
# Start out-pi on 8 MPI processes, passing "-mca pml ucx" to mpirun.
run:
        mpirun -mca pml ucx -np 8 ./out-pi
# Remove object files, *.err / *.out files, the executable and shared libraries.
clean:
        rm -fr *.o *.err *.out out-pi *.so
链接报错:
(.text+0x18835): undefined reference to `hip_impl::kernargs(bool)'
pi_hip.o: In function `std::vector<unsigned char, std::allocator<unsigned char> > hip_impl::make_kernarg<float*, int, float, float, int, int, float*, int, float, float, int, int>(void (*)(float*, int, float, float, int, int), std::tuple<float*, int, float, float, int, int>)':
解决:链接时加入hip的库,改Makefile如下:
# Attempt 1, second try: same object-file build, but the HIP runtime library
# (libhip_hcc under /opt/rocm/hip/lib) is added to the link line.
# The article reports that this links, but the program aborts at run time
# with "Missing metadata for __global__ function".
HIPLIB=-L/opt/rocm/hip/lib -lhip_hcc
make ex1:
# Fortran/MPI source -> mpi-pi.o
        mpif90 -c  mpi-pi.f90
# HIP source -> pi_hip.o
        hipcc  -O3 -std=c++11 -D_HIP_PLATFORM_HCC__ -c pi_hip.cpp
# C++ MPI driver -> main.o
        mpicc -c  -std=c++11 main.cpp
# Final link through the Fortran MPI wrapper, now including $(HIPLIB).
        mpif90 mpi-pi.o pi_hip.o main.o $(HIPLIB) -lm -lstdc++ -o out-pi
# Launch the program right after the build.
        make run
# Start out-pi on 8 MPI processes, passing "-mca pml ucx" to mpirun.
run:
        mpirun -mca pml ucx -np 8 ./out-pi
# Remove object files, *.err / *.out files, the executable and shared libraries.
clean:
        rm -fr *.o *.err *.out out-pi *.so
链接通过,但运行时报错:
terminate called after throwing an instance of 'std::runtime_error'
  what():  Missing metadata for __global__ function: _Z6cal_piPfiffii
[h16r4n18:86129] *** Process received signal ***
[h16r4n18:86129] Signal: Aborted (6)
[h16r4n18:86129] Signal code:  (-6)
[h16r4n18:86129] [ 0] /lib64/libpthread.so.0(+0xf5e0)[0x2b1da8eb95e0]
[h16r4n18:86129] [ 1] /lib64/libc.so.6(gsignal+0x37)[0x2b1da90fb1f7]
[h16r4n18:86129] [ 2] /lib64/libc.so.6(abort+0x148)[0x2b1da90fc8e8]
[h16r4n18:86129] [ 3] /lib64/libstdc++.so.6(_ZN9__gnu_cxx27__verbose_terminate_handlerEv+0x165)[0x2b1da8064ac5]
[h16r4n18:86129] [ 4] /lib64/libstdc++.so.6(+0x5ea36)[0x2b1da8062a36]
[h16r4n18:86129] [ 5] /lib64/libstdc++.so.6(+0x5ea63)[0x2b1da8062a63]
[h16r4n18:86129] [ 6] /lib64/libstdc++.so.6(+0x5ec83)[0x2b1da8062c83]
[h16r4n18:86129] [ 7] ./out-pi[0x4239fd]
[h16r4n18:86129] [ 8] ./out-pi[0x40ba79]
[h16r4n18:86129] [ 9] ./out-pi[0x4243ea]
[h16r4n18:86129] [10] /lib64/libc.so.6(__libc_start_main+0xf5)[0x2b1da90e7c05]
[h16r4n18:86129] [11] ./out-pi[0x40a348]
[h16r4n18:86129] *** End of error message ***
方法2:
先将hip文件(pi_hip.cpp)编译成对应的.o文件,再编译成动态库libpi_hip.so,链接时通过 -L./ -lpi_hip 链接该动态库。
# Attempt 2: build the HIP code as a shared library (libpi_hip.so) and link
# the Fortran and C++ objects against it with mpif90.
# NOTE(review): no rpath is set on the link line, so the loader must be able
# to find libpi_hip.so at run time (e.g. through LD_LIBRARY_PATH) - confirm
# for the target environment.
make ex2:
# $(MAKE) instead of a literal "make" so flags and the jobserver are inherited.
        $(MAKE) clean
# Fortran/MPI source -> mpi-pi.o
        mpif90 -c  mpi-pi.f90
# Position-independent HIP object, then the shared library built from it.
        hipcc -c -fpic -O3 -std=c++11 -D_HIP_PLATFORM_HCC__ -o pi_hip.o pi_hip.cpp
        hipcc -shared pi_hip.o -o libpi_hip.so
# C++ MPI driver -> main.o
        mpicc -c -std=c++11 main.cpp
# Libraries follow the objects that reference them: with --as-needed (or a
# static archive) a library named before its users is dropped by the linker.
        mpif90 mpi-pi.o main.o -L./ -lpi_hip -lm  -lstdc++  -o out-pi
        $(MAKE) run
# Start out-pi on 8 MPI processes, passing "-mca pml ucx" to mpirun.
run:
        mpirun -mca pml ucx -np 8 ./out-pi
# Remove object files, *.err / *.out files, the executable and shared libraries.
clean:
        rm -fr *.o *.err *.out out-pi *.so
运行结果正确,如下
  Process    1 says "Hello, world!".
  Process    5 says "Hello, world!".
COMMUNICATOR_MPI - Master process:
  FORTRAN90/MPI version
  An MPI example program.
 
  The number of processes is    8
 
  Process    0 says "Hello, world!".
  Process    4 says "Hello, world!".
  Process    7 says "Hello, world!".
  Process    3 says "Hello, world!".
  Process    6 says "Hello, world!".
  Process    2 says "Hello, world!".
  Number of processes in even communicator =    4
  Sum of global ID's in even communicator =   12
 
COMMUNICATOR_MPI:
  Normal end of execution.
 
  Number of processes in odd communicator  =    4
  Sum of global ID's in odd communicator  =   16
step =   0.0000001
myid = 5: partial pi =   0.3396083
step =   0.0000001
myid = 1: partial pi =   0.4824923
step =   0.0000001
myid = 4: partial pi =   0.3798115
step =   0.0000001
myid = 6: partial pi =   0.3013121
step =   0.0000001
myid = 2: partial pi =   0.4551676
step =   0.0000001
myid = 3: partial pi =   0.4195001
step =   0.0000001
myid = 7: partial pi =   0.2662676
step =   0.0000001
myid = 0: partial pi =   0.4974218
PI =   3.1415813
源文件代码如下:
mpi-pi.f90
subroutine hello( nnn )
!*****************************************************************************80
!
!  HELLO splits MPI_COMM_WORLD into an "even rank" and an "odd rank"
!  communicator and prints, for each, the number of members and the sum of
!  the members' world ranks.
!
!  Build (object only):  mpif90 -c  mpi-pi.f90
!
!  The routine does not call MPI_Init or MPI_Finalize; the caller must have
!  initialized MPI before calling it. The C++ driver in this project calls
!  it as hello_ between MPI_Init and MPI_Finalize.
!
!  Parameters:
!
!    Input, integer NNN: not referenced by this routine.
!
!  All communicators and groups created here are released before returning,
!  so repeated calls do not leak MPI handles.
!
  implicit none
  include "mpif.h"
  integer ( kind = 4 ) even_comm_id
  integer ( kind = 4 ) even_group_id
  integer ( kind = 4 ) even_id
  integer ( kind = 4 ) even_id_sum
  integer ( kind = 4 ) even_p
  integer ( kind = 4 ), allocatable :: even_rank(:)
  integer ( kind = 4 ) i
  integer ( kind = 4 ) id
  integer ( kind = 4 ) ierr
  integer ( kind = 4 ) j
  integer ( kind = 4 ) odd_comm_id
  integer ( kind = 4 ) odd_group_id
  integer ( kind = 4 ) odd_id
  integer ( kind = 4 ) odd_id_sum
  integer ( kind = 4 ) odd_p
  integer ( kind = 4 ) p
  integer ( kind = 4 ), allocatable :: odd_rank(:)
  integer  nnn
  integer ( kind = 4 ) world_group_id
!
!  MPI is initialized by the caller.
!
!  call MPI_Init ( ierr )
!
!  Get the number of processes.
!
  call MPI_Comm_size ( MPI_COMM_WORLD, p, ierr )
!
!  Get the individual process ID.
!
  call MPI_Comm_rank ( MPI_COMM_WORLD, id, ierr )
!
!  Process 0 prints an introductory message.
!
  if ( id == 0 ) then
    write ( *, '(a)' ) 'COMMUNICATOR_MPI - Master process:'
    write ( *, '(a)' ) '  FORTRAN90/MPI version'
    write ( *, '(a)' ) '  An MPI example program.'
    write ( *, '(a)' ) ' '
    write ( *, '(a,i4)' ) '  The number of processes is ', p
    write ( *, '(a)' ) ' '
  end if
!
!  Every process prints a hello (currently disabled).
!
!  write ( *, '(a,i4,a)' ) '  Process ', id, ' says "Hello, world!".'
!
!  Get a group identifier for MPI_COMM_WORLD.
!
  call MPI_Comm_group ( MPI_COMM_WORLD, world_group_id, ierr )
!
!  List the even processes, and create their group.
!  Both MPI_Comm_create calls below are made by every process of
!  MPI_COMM_WORLD, whether or not it belongs to the group.
!
  even_p = ( p + 1 ) / 2
  allocate ( even_rank(1:even_p) )
  j = 0
  do i = 0, p - 1, 2
    j = j + 1
    even_rank(j) = i
  end do
  call MPI_Group_incl ( world_group_id, even_p, even_rank, even_group_id, ierr )
  call MPI_Comm_create ( MPI_COMM_WORLD, even_group_id, even_comm_id, ierr )
!
!  List the odd processes, and create their group.
!
  odd_p = p / 2
  allocate ( odd_rank(1:odd_p) )
  j = 0
  do i = 1, p - 1, 2
    j = j + 1
    odd_rank(j) = i
  end do
  call MPI_Group_incl ( world_group_id, odd_p, odd_rank, odd_group_id, ierr )
  call MPI_Comm_create ( MPI_COMM_WORLD, odd_group_id, odd_comm_id, ierr )
!
!  Try to get ID of each process in both groups.
!  If a process is not in a communicator, set its ID to -1.
!
  if ( mod ( id, 2 ) == 0 ) then
    call MPI_Comm_rank ( even_comm_id, even_id, ierr )
    odd_id = -1
  else
    call MPI_Comm_rank ( odd_comm_id,  odd_id, ierr )
    even_id = -1
  end if
!
!  Use MPI_Reduce to sum the global ID of each process in the even
!  group.
!  Assuming 4 processes: EVEN_SUM = 0 + 2 = 2
!
  if ( even_id /= -1 ) then
    call MPI_Reduce ( id, even_id_sum, 1, MPI_INTEGER, MPI_SUM, 0, &
      even_comm_id, ierr )
  end if
  if ( even_id == 0 ) then
    write ( *, '(a,i4)' ) &
      '  Number of processes in even communicator = ', even_p
    write ( *, '(a,i4)' ) &
      '  Sum of global ID''s in even communicator = ', even_id_sum
  end if
!
!  Use MPI_Reduce to sum the global ID of each process in the odd group.
!  Assuming 4 processes: ODD_SUM = 1 + 3 = 4
!
  if ( odd_id /= -1 ) then
    call MPI_Reduce ( id, odd_id_sum,  1, MPI_INTEGER, MPI_SUM, 0, &
      odd_comm_id, ierr )
  end if
  if ( odd_id == 0 ) then
    write ( *, '(a,i4)' ) &
      '  Number of processes in odd communicator  = ', odd_p
    write ( *, '(a,i4)' ) &
      '  Sum of global ID''s in odd communicator  = ', odd_id_sum
  end if
!
!  Release the communicators created above.
!  MPI_Comm_create returns MPI_COMM_NULL on processes outside the group,
!  and a null communicator must not be passed to MPI_Comm_free.
!
  if ( even_comm_id /= MPI_COMM_NULL ) then
    call MPI_Comm_free ( even_comm_id, ierr )
  end if
  if ( odd_comm_id /= MPI_COMM_NULL ) then
    call MPI_Comm_free ( odd_comm_id, ierr )
  end if
!
!  Release the groups.
!  A group built from zero ranks (the odd group in a single-process run)
!  is the predefined MPI_GROUP_EMPTY handle, which is left alone.
!
  if ( even_group_id /= MPI_GROUP_EMPTY ) then
    call MPI_Group_free ( even_group_id, ierr )
  end if
  if ( odd_group_id /= MPI_GROUP_EMPTY ) then
    call MPI_Group_free ( odd_group_id, ierr )
  end if
  call MPI_Group_free ( world_group_id, ierr )
!
!  MPI is finalized by the caller.
!
!  call MPI_Finalize ( ierr )
!
!  Free memory.
!
  deallocate ( even_rank )
  deallocate ( odd_rank )
!
!  Terminate
!
  if ( id == 0 ) then
    write ( *, '(a)' ) ' '
    write ( *, '(a)' ) 'COMMUNICATOR_MPI:'
    write ( *, '(a)' ) '  Normal end of execution.'
    write ( *, '(a)' ) ' '
  end if
end subroutine hello
pi_hip.cpp
#include<stdio.h>
#include<stdlib.h>
#include <hip/hip_runtime.h>
#define NBIN 10000000 // Number of bins
#define NUM_BLOCK 13  // Number of thread blocks
#define NUM_THREAD 192 // Number of threads per block
// HIP kernel: every thread adds 4/(1+x^2), evaluated at the midpoints of its
// share of the bins, onto its own slot of `sum`.
//
//   sum      device array with one float per launched thread; slot `idx` is
//            read once and written once by the thread that owns it
//   nbin     number of bins handled by this launch
//   step     bin width
//   offset   x coordinate at which bin 0 starts
//   nthreads, nblocks
//            their product is the distance between consecutive bins visited
//            by one thread
//
// Threads whose index is >= nbin have no bins and do not touch `sum`.
__global__ void cal_pi(float *sum,int nbin,float step,float offset,int nthreads,int nblocks)
 {
   const int idx = blockIdx.x*blockDim.x+threadIdx.x; // Thread index across all blocks
   if (idx >= nbin) return;                           // Nothing to accumulate for this thread

   const int stride = nthreads*nblocks;               // Loop-invariant, computed once
   // Accumulate in a local variable instead of updating device memory on
   // every iteration; the arithmetic per step is the same as `sum[idx] += ...`.
   float acc = sum[idx];
   for (int i=idx; i< nbin; i+=stride) {              // Interleaved bin assignment to threads
     const float x = offset+(i+0.5)*step;             // Midpoint of bin i
     acc += 4.0/(1.0+x*x);
   }
   sum[idx] = acc;
 }
void computePI(int nproc,int myid, float *sumHost,float step)
{
  int nbin;
  float offset;
  float *sumDev; // Pointers to device arrays
  dim3 dimGrid(NUM_BLOCK,1,1); // Grid dimensions (only use 1D)
  dim3 dimBlock(NUM_THREAD,1,1); // Block dimensions (only use 1D)
 
  nbin = NBIN/nproc; // Number of bins per MPI process
  offset = myid*step*nbin; // Quadrature-point offset
  size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float); //Array memory size
  hipMalloc((void **) &sumDev,size); // Allocate array on device
  hipMemset(sumDev,0,size); // Reset array in device to 0
  // // Calculate on device (call CUDA kernel)
  hipLaunchKernelGGL(cal_pi,dimGrid,dimBlock,0,0,sumDev,nbin,step,offset,NUM_THREAD,NUM_BLOCK);
  // // Retrieve result from device and store it in host array
  hipMemcpy(sumHost,sumDev,size,hipMemcpyDeviceToHost);
  hipFree(sumDev);
}
main.cpp
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define NBIN 10000000 // Number of bins
#define NUM_BLOCK 13 // Number of thread blocks
#define NUM_THREAD 192 // Number of threads per block
// Kernel that executes on the CUDA device
void computePI(int nproc,int myid,float *sumHost,float step);
extern "C" { void  hello_(int *nnn ); }
#define hello_f hello_
 int main(int argc,char **argv) {
 int myid,nproc,tid, nbin, nnn;
 float pi=0.0, pig, step;
 float *sumHost; // Pointers to host arrays
 MPI_Init(&argc,&argv);
 
 nnn = 1;//add by wangwu
 hello_f(&nnn);//calling fortran mpi hello, just a test
 MPI_Comm_rank(MPI_COMM_WORLD,&myid); // My MPI rank
 MPI_Comm_size(MPI_COMM_WORLD,&nproc); // Number of MPI processes
 size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float); //Array memory size
 sumHost = (float *)malloc(size); // Allocate array on host
 nbin = NBIN/nproc; // Number of bins per MPI process
 step = 1.0/(float)(nbin*nproc); // Step size with redefined number of bins
 computePI(nproc,myid,sumHost,step);
// // Reduction over CUDA threads
 for(tid=0; tid<NUM_THREAD*NUM_BLOCK; tid++) pi += sumHost[tid];
if (myid==0) printf("on DCUs: HIP CPP for pi\n");
// printf("step = %11.7f\n", step);
 pi *=step;
 free(sumHost);
 printf("myid = %d: partial pi = %11.7f\n",myid, pi);
// // Reduction over MPI processes
 MPI_Allreduce(&pi,&pig,1,MPI_FLOAT,MPI_SUM,MPI_COMM_WORLD);
 if (myid==0) printf("PI = %11.7f\n",pig);
 MPI_Finalize();
 return 0;
 }
 
                    
                     
                    
                 
                    
                
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号