MPI之矩阵乘法Cannon算法
导入库函数
import math
import os
import subprocess

import numpy as np
from mpi4py import MPI
参数设置
# Global MPI handles and problem size.
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
n = 100  # edge length of the square operand matrices

# zong.npy is the checkpoint file: rows [0, n) are sliced out as A,
# rows [n, 2n) as B, and the first two entries of the last row carry the
# (total iterations, current iteration) flag pair.
array = np.load('zong.npy')
arrayA = array[0:n, :]
arrayB = array[n:2 * n, :]
flag = array[-1][0:2]
sp = int(flag[0])  # total number of iterations
kk = int(flag[1])  # current iteration stored in the checkpoint
p = sp ** 2  # number of processes - 1, i.e. the rank of the master process
num = int(n / sp)  # edge length of one block after partitioning

# sp x sp Cartesian topology used for the cyclic block exchange.
dims = [sp, sp]
periods = [True, True]  # both dimensions are periodic
cart_comm = comm.Create_cart(dims, periods)

# arrayA / arrayB intentionally keep the checkpoint contents; they are not
# replaced by placeholder all-ones matrices.
检查是否计算完毕
# When restarting after a crash, first check whether the computation had
# already finished: the stored iteration counter has reached the last step.
# NOTE(review): with sp == 1 a checkpoint whose counter is still 0 also
# satisfies this test -- confirm that configuration is never used.
if not kk < sp - 1:
    print('success!')
    print('array1******')
    # Reload the checkpoint and cut it into the A, B and result row bands.
    array = np.load('zong.npy')
    arrayA, arrayB, resc = (
        array[start:start + n, :] for start in (0, n, 2 * n)
    )
    print(arrayA)
    for banner, block in (
        ('array2**********', arrayB),
        ('answer**********', resc),
    ):
        print(banner)
        print(block)
整体计算过程
else: # 还需要继续运算 if rank == p: # 主进程给其他进程分发分块 array=np.load('zong.npy') arrayA=array[0:n,:] arrayB = array[n:2*n,:] print('array1:') print(arrayA) print('*********************') print('array2:') print(arrayB) datac = array[2 * n:3 * n, :] # 保存计算结果 k = 0 # 从0进程开始分发 for i in range(int(sp)): for j in range(int(sp)): dataa = arrayA[i * num:(i + 1) * num, j * num:(j + 1) * num] datab = arrayB[i * num:(i + 1) * num, j * num:(j + 1) * num] comm.send(dataa, dest=k) comm.send(datab, dest=k) rrr=array[2*n:3*n,:] # 结果 resc = rrr[i * num:(i + 1) * num, j * num:(j + 1) * num] # 最终结果 comm.send(resc, dest=k) k += 1 else: # 如果不是主进程就接收矩阵块 dataa = comm.recv(source=p) datab = comm.recv(source=p) resc=comm.recv(source=p) midA=np.zeros((n, n)) midB=np.zeros((n, n)) if rank==p: if datac.any()!=0 : # 判断是否宕机后继续运算 如果是 从下一次循环开始 kk=kk+1 else: # 主进程外其他进程同上 if resc.any()!=0 : kk=kk+1 for k in range(kk,sp): # 循环操作 if k == 0: # 第一次循环 if rank != p: # 非主进程 # 当前进程的笛卡尔坐标 横纵取值0 1 2 3 不包括主进程 是个二维矩阵 pos = cart_comm.coords if pos[0] != 0: # 首次循环不是第一行移动 既要发也要收 left = pos[1] - pos[0] # 移动后进程的纵坐标=当前纵-当前横 fa = cart_comm.Get_cart_rank([pos[0], left]) # 发送 comm.send(dataa, dest=fa) shou = cart_comm.Get_cart_rank([pos[0], pos[1] + pos[0]]) # 接收 dataa = comm.recv(source=shou) if pos[1] != 0: # 不是第一列 uupp = pos[0] - pos[1] fa = cart_comm.Get_cart_rank([uupp, pos[1]]) comm.send(datab, dest=fa) shou = cart_comm.Get_cart_rank([pos[0] + pos[1], pos[1]]) datab = comm.recv(source=shou) # 非主进程把移动后的矩阵分块发送给主进程 comm.send(dataa, dest=p) comm.send(datab, dest=p) # 分块计算 res = dataa.dot(datab) resc += res comm.send(resc, dest=p) else: # 主进程 numrank=0 # 接收移动后 for i in range(int(sp)): for j in range(int(sp)): midA[i * num:(i + 1) * num, j * num:(j + 1) * num] = comm.recv(source=numrank) midB[i * num:(i + 1) * num, j * num:(j + 1) * num] = comm.recv(source=numrank) datac[i * num:(i + 1) * num, j * num:(j + 1) * num] = comm.recv(source=numrank) numrank += 1 flag = [sp, k] # 更新循环次数 flag_v = np.tile(flag, int(n/2)).reshape(1, n) AAA 
= np.vstack((midA, midB,datac, flag_v)) np.save('zong.npy', AAA) IP = [] # cpu_ip rmip = [] # 要删除的IP for line in open("cpu_ip.txt"): line = line.replace("\n", '') IP.append(line) for ip in IP: if (os.system('timeout 5s ping ' + ip + ' -c3')): # 执行成功返回0 ping不通的执行 rmip.append(ip) else: os.system('scp zong.npy mpiuser@'+str(ip)+':/home/mpiuser/mpii/zong.npy') continue else: # 除了第一次循环的剩下循环 if rank != p: pos = cart_comm.coords left = pos[1] - 1 fa = cart_comm.Get_cart_rank([pos[0], left]) comm.send(dataa, dest=fa) shou = cart_comm.Get_cart_rank([pos[0], pos[1] + 1]) dataa = comm.recv(source=shou) uupp = pos[0] - 1 fa = cart_comm.Get_cart_rank([uupp, pos[1]]) comm.send(datab, dest=fa) shou = cart_comm.Get_cart_rank([pos[0] + 1, pos[1]]) datab = comm.recv(source=shou) comm.send(dataa, dest=p) comm.send(datab, dest=p) res = dataa.dot(datab) resc=resc+res comm.send(resc, dest=p) else: numrank = 0 for i in range(int(sp)): for j in range(int(sp)): midA[i * num:(i + 1) * num, j * num:(j + 1) * num] = comm.recv(source=numrank) midB[i * num:(i + 1) * num, j * num:(j + 1) * num] = comm.recv(source=numrank) datac[i * num:(i + 1) * num, j * num:(j + 1) * num] = comm.recv(source=numrank) numrank += 1 flag = [sp, k] flag_v = np.tile(flag, int(n/2)).reshape(1, n) AAA = np.vstack((midA, midB, datac, flag_v)) np.save('zong.npy', AAA) IP = [] # cpu_ip rmip = [] # 要删除的IP for line in open("cpu_ip.txt"): line = line.replace("\n", '') IP.append(line) for ip in IP: if (os.system('timeout 5s ping ' + ip + ' -c3')): # 执行成功返回0 ping不通的执行 rmip.append(ip) else: os.system('scp zong.npy mpiuser@' + str(ip) + ':/home/mpiuser/mpii/zong.npy') continue if rank==p: print('array1*********************') print(midA) print('array2*********************') print(midB) print('answer*********************') print(datac)