4 5
1.000000 4.000000 -2.000000 3.000000 6.000000
2.000000 2.000000 0.000000 4.000000 2.000000
3.000000 0.000000 -1.000000 2.000000 1.000000
1.000000 2.000000 2.000000 -3.000000 8.000000
一.基本实现
#include <cstdio>
#include <cstdlib>
#include <cstring>
using namespace std;
void environment_init();
typedef double data_t;
void cblas_factor(data_t *vec,int size,data_t factor);
void cblas_sub(data_t *minuend,data_t *subtrahend,int size);
data_t * cblas_max(data_t *vec,int size);
//N行,M列,M = N+1
int N,M;
//X的系数矩阵
data_t * matrix_A;
//y向量
data_t * B;
//x的解向量
data_t * X;
//通过A(i,j)得到,x系数矩阵中[i,j]元素的地址
data_t * A(int i,int j){
return matrix_A+i*N+j;
}
void print_matrix(){
printf("matrix:\n");
for(int i = 0 ; i < N ; i++){
for(int j = 0 ; j < N ; j++){
printf("%lf ",*A(i,j));
}
printf("%lf\n",*(B+i));
}
}
int main()
{
FILE * f = fopen("data.txt","r");
fscanf(f,"%d%d",&N,&M);
matrix_A = (data_t *)malloc(sizeof(data_t)*N*N);
B = (data_t *)malloc(sizeof(data_t)*N);
X = (data_t *)malloc(sizeof(data_t)*N);
int *shift_table = (int*)malloc(sizeof(int)*N);
//get matrix from file
for(int i = 0 ; i < N ; i++){
for(int j = 0 ; j < N ; j++){
fscanf(f,"%lf",A(i,j));
printf("%lf ",*A(i,j));
}
fscanf(f,"%lf",B+i);
printf("%lf\n",*(B+i));
}
//init shift table
for(int i = 0 ; i < N ; i++){
shift_table[i] = i;
}
//computer
for(int i = 0 ; i < N-1 ; i++){
//选主元
int index = cblas_max(A(i,i),N-i)-A(i,0);
//列交换
if(index != i){
//错误!!!
shift_table[i] = index;
shift_table[index] = i;
for(int row = 0 ; row < N ; row++){
data_t temp = *A(row,i);
*(A(row,i)) = *A(row,index);
*A(row,index) = temp;
}
}
//主元归一化
// 消元
//错误!!! *A(row,i)!= 0会让消元提前结束
for(int row = i+1 ; *A(row,i)!= 0 && row < N ; row++){
data_t factor = *A(i,i)/ *A(row,i);
cblas_factor(A(row,i),N-i,factor);
cblas_sub(A(row,i),A(i,i),N-i);
*(B+row) *= factor;
*(B+row) = *(B+row)-*(B+i);
}
print_matrix();
}
//回代,i代表了Xi,col代表了列号,用来计算前面已经计算的xi值的sum
for(int i = N-1 ; i >= 0 ; i–){
data_t sum = 0;
for(int col = i+1 ; col < N ; col++){
sum += *A(i,col)*X[col];
}
X[i] = (*(B+i) – sum) / *A(i,i);
}
//恢复,其中shift table保存了原来的列号
for(int i = 0 ; i < N ; i++){
printf("x%d:%lf ",shift_table[i],X[i]);
}
}
//向量操作,对所有的数*factor
void cblas_factor(data_t *vec,int size,data_t factor){
for(int i = 0 ; i < size ; i++){
*(vec+i) = *(vec+i)*factor;
}
}
//两个向量的减
void cblas_sub(data_t *minuend,data_t *subtrahend,int size){
for(int i = 0 ; i < size ; i++){
*(minuend+i) = *(minuend+i) – *(subtrahend+i);
}
}
//寻找向量中的最大值
data_t * cblas_max(data_t *vec,int size){
data_t * index = vec;
data_t max = *vec;
for(int i = 0 ; i < size ; i++){
if(*(vec+i) > max){
max = *(vec+i);
index = vec+i;
}
}
return index;
}
二.使用blas上的cell实现
blas参考文档:http://www.netlib.org/blas/blast-forum/blas-report.pdf
基本blas实现:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cblas.h>
#include "malloc_align.h"
void environment_init();
typedef double data_t;
int N,M;//N行,M列,M = N+1
data_t * matrix_A;//X的系数矩阵
data_t * B;//y向量
data_t * X;//x的解向量
int *shift_table;
//通过A(i,j)得到,x系数矩阵中[i,j]元素的地址
data_t * A(int i,int j){
return matrix_A+i*N+j;
}
void print_matrix();
int main()
{
FILE * f = fopen("data.txt","r");
int i = 0,j = 0,index = 0,row = 0,temp = 0;
fscanf(f,"%d%d",&N,&M);
matrix_A = (data_t *)_malloc_align(sizeof(data_t)*N*N,7);
B = (data_t *)_malloc_align(sizeof(data_t)*N,7);
X = (data_t *)_malloc_align(sizeof(data_t)*N,7);
shift_table = (int*)_malloc_align(sizeof(int)*N,7);
//get matrix from file
for(i = 0 ; i < N ; i++){
for(j = 0 ; j < N ; j++){
fscanf(f,"%lf",A(i,j));
printf("%lf ",*A(i,j));
}
fscanf(f,"%lf",B+i);
printf("%lf\n",*(B+i));
}
//init shift table
for(i = 0 ; i < N ; i++){
shift_table[i] = i;
}
//computer
for(i = 0 ; i < N-1 ; i++){
//选主元
index = cblas_idamax(N-i,A(i,i),1)+i;//cblas_idamax 选择最大绝对值的blas库
printf("index:%d\n",index);
//列交换
if(index != i){
temp = shift_table[i];
shift_table[i] = shift_table[index];
shift_table[index] = temp;
for(row = 0 ; row < N ; row++){
data_t temp = *A(row,i);
*(A(row,i)) = *A(row,index);
*A(row,index) = temp;
}
}
for(row = 0 ; row < N ; row++)
printf("%d ",shift_table[row]);
printf("\n");
//主元归一化
// 消元
for(row = i+1 ; row < N ; row++){
if(*A(row,i) == 0) continue;
data_t factor = -1*(*A(row,i)/ *A(i,i));
cblas_daxpy(N-i,factor,A(i,i),1,A(row,i),1);//cblas_daxpy( N, alpha,X,incX, Y, incY); Y=Y+alpha*X
*(B+row) = *(B+row) + *(B+i)*factor;
}
print_matrix();
}
//回代,i代表了Xi,col代表了列号,用来计算前面已经计算的xi值的sum
for(i = N-1 ; i >= 0 ; i–){
data_t sum = 0;
for( j = i+1 ; j < N ; j++){
sum += *A(i,j)*X[j];
}
X[i] = (*(B+i) – sum) / *A(i,i);
}
//恢复,其中shift table保存了原来的列号
for(i = 0 ; i < N ; i++){
printf("x%d:%lf ",shift_table[i],X[i]);
}
}
void print_matrix(){
int i,j;
printf("matrix:\n");
for(i = 0 ; i < N ; i++){
for(j = 0 ; j < N ; j++){
printf("%lf ",*A(i,j));
}
printf("%lf\n",*(B+i));
}
}
三.使用mpi的基本实现
四.使用mpi的cell实现
五.使用mpi的异构实现
六.使用mpi的异构适用性实现
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cblas.h>
#include "mpi.h"
#include "malloc_align.h"
void environment_init();
typedef double data_t;
int N,M;//N行,M列,M = N+1
data_t * matrix_A;//X的系数矩阵
data_t * B;//y向量
data_t * X;//x的解向量
int *shift_table;
MPI_Status status;
//通过A(i,j)得到,x系数矩阵中[i,j]元素的地址
data_t * A(int i,int j){
return matrix_A+i*M+j;
}
void print_matrix();
int main(int argc,char **argv)
{
FILE * f;
int i = 0,j = 0,index = 0,row = 0;
int group_size;
int *row_rank_table;
int process_rownum;
int myrank;
data_t * mainrow_space;
data_t ** row_address_table;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&group_size);
MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
//printf("mpi:%d %d\n",group_size,myrank);
if(myrank == 0){
f= fopen("/home/duanple/CBLAS/data.txt","r");
fscanf(f,"%d%d",&N,&M);
matrix_A = (data_t *)_malloc_align(sizeof(data_t)*N*M,7);
B = (data_t *)_malloc_align(sizeof(data_t)*N,7);
X = (data_t *)_malloc_align(sizeof(data_t)*N,7);
//get matrix from file
for(i = 0 ; i < N ; i++){
for(j = 0 ; j < N ; j++){
fscanf(f,"%lf",A(i,j));
}
fscanf(f,"%lf",A(i,j));
*(B+i) = *A(i,j);
}
//print_matrix();
}
//broadcast the process row to be in,and rows num in every process
MPI_Bcast(&N,1,MPI_INT,0,MPI_COMM_WORLD);
M = N+1;
//printf("rank:%d m:%d\n",myrank,M);
//malloc the memory for the row of matrix
shift_table = (int*)_malloc_align(sizeof(int)*N,7);
row_rank_table = (int *)_malloc_align(sizeof(int)*N,7);
row_address_table = (data_t**)_malloc_align(sizeof(data_t*)*N,7);
//init shift table
for(i = 0 ; i < N ; i++){
shift_table[i] = i;
}
//divide matrix
if(myrank == 0){
for(i = 0,j = 0 ; i < N ; i++,j++){
row_rank_table[i] = j%group_size;
}
}
MPI_Bcast(row_rank_table,N,MPI_INT,0,MPI_COMM_WORLD);
process_rownum = 0;
for(i = 0 ; i < N ; i++){
//printf("rank:%d row%d\n",myrank,row_rank_table[i]);
if(row_rank_table[i] == myrank){
row_address_table[i] = (data_t *)_malloc_align(sizeof(data_t)*M,7);
process_rownum++;
}
}
//
//printf("rank:%d rows:%d\n",myrank,process_rownum);
//root send the data to the processes.
if(myrank == 0){//send
for(i = 0 ; i < N ; i++){
if(row_rank_table[i] == 0){
row_address_table[i] = A(i,0);
}else{
MPI_Send(A(i,0),N,MPI_DOUBLE,row_rank_table[i],i,MPI_COMM_WORLD);
MPI_Send(B+i,1,MPI_DOUBLE,row_rank_table[i],i,MPI_COMM_WORLD);
}
}
}else{//receive
for(i = 0 ; i < N ; i++){
if(myrank !=0 && myrank == row_rank_table[i]){
MPI_Recv(row_address_table[i],N,MPI_DOUBLE,0,i,MPI_COMM_WORLD,&status);
MPI_Recv(row_address_table[i]+N,1,MPI_DOUBLE,0,i,MPI_COMM_WORLD,&status);
}
}
}
//printf the revecive result.
/*
for(i = 0 ; i < N ; i++){
if(myrank != row_rank_table[i]) continue;
printf("rank%d-row%d:\n",myrank,row_rank_table[i]);
for(j = 0 ; j < M ; j++){
printf("%lf ",*(row_address_table[i]+j));
}
printf("\n");
} */
//computer
//消元
mainrow_space = (data_t *)_malloc_align(sizeof(data_t)*M,7);//存放主行的临时空间
for(i = 0 ; i < N-1 ; i++){
if(myrank == row_rank_table[i]){
//上面的条件,可能造成同一个进程上的其他行不被处理。
//选主元
//printf("myrank:%d\n",myrank);for(j = 0 ; j < M ; j++){printf("%lf ",*(row_address_table[i]+j));}printf("\n");
index = cblas_idamax(N-i,row_address_table[i]+i,1)+i;
//列交换
if(index != i){
int temp = shift_table[i];
shift_table[i] = shift_table[index];
shift_table[index] = temp;
data_t data_temp = *(row_address_table[i]+i);
*(row_address_table[i]+i) = *(row_address_table[i]+index);
*(row_address_table[i]+index) = data_temp;
}
//拷贝主行元素到临时空间
//printf("myrank:%d\n",myrank);for(j = 0 ; j < M ; j++){printf("%lf ",*(row_address_table[i]+j));}printf("\n");
memcpy(mainrow_space,row_address_table[i],sizeof(data_t)*M);
//printf("myrank:%d\n",myrank);for(j = 0 ; j < M ; j++){printf("%lf ",*(row_address_table[i]+j));}printf("\n");
}
//其他行的列交换
MPI_Bcast(&index,1,MPI_INT,row_rank_table[i],MPI_COMM_WORLD);
if(myrank != row_rank_table[i]){
//上面的条件,可能造成同一个进程上的其他行不被处理。
if(index != i){
int temp = shift_table[i];
shift_table[i] = shift_table[index];
shift_table[index] = temp;
//swap the other rows
for(j = 0 ; j < N ; j++){
if(myrank == row_rank_table[j]){
data_t data_temp = *(row_address_table[j]+i);
*(row_address_table[j]+i) = *(row_address_table[j]+index);
*(row_address_table[j]+index) = data_temp;
}
}//end of for
}
}
//发送主行元素
MPI_Bcast(mainrow_space,M,MPI_DOUBLE,row_rank_table[i],MPI_COMM_WORLD);
//主元归一化
if(myrank == row_rank_table[i]){
//注意,上面这个条件是错的,因为当前rank上还有其他行,这样会导致其他行不被处理。
//if(*(row_address_table[i]+i) == 0){continue;}
//printf("myrank:%d\n",myrank);for(j = 0 ; j < M ; j++){printf("%lf ",mainrow_space[j]);}printf("\n");
}else{
for(j = 0 ; j < N ; j++){
//处理该进程所分配的行。
if(j != i && myrank == row_rank_table[j]){
if(*(row_address_table[j]+i) == 0){continue;}
// 消元
data_t factor = -1*(*(row_address_table[j]+i)/mainrow_space[i]);
cblas_daxpy(N,factor,mainrow_space,1,row_address_table[j],1);
*(row_address_table[j]+N) = *(row_address_table[j]+N) +mainrow_space[N]*factor;//elem of B
}
}
}
}
//
//printf the revecive result.
/*
for(i = 0 ; i < N ; i++){
if(myrank != row_rank_table[i]) continue;
printf("rank%d-row%d:\n",myrank,row_rank_table[i]);
for(j = 0 ; j < M ; j++){
printf("%lf ",*(row_address_table[i]+j));
}
printf("\n");
} */
//回代
data_t sum = 0;
//注意一个进程不只一个行,因此只安排一个sum也是不够的。
for(i = N-1 ; i>= 0 ; i–){
if(myrank == row_rank_table[i]){
mainrow_space[i] = (*(row_address_table[i]+N)-sum)/(*(row_address_table[i]+i));//(*(B+i) – sum) / *A(i,i);
}
MPI_Bcast(mainrow_space+i,1,MPI_DOUBLE,row_rank_table[i],MPI_COMM_WORLD);
for(j = 0 ; j < i ; j++){
//处理该进程所分配的行。
if(myrank == row_rank_table[j]){
sum += *(row_address_table[j]+i)*mainrow_space[i];
}
}
}
//printf res
for(i = 0 ; myrank == 0 && i < N ; i++){
printf("x%d:%lf ",shift_table[i],mainrow_space[i]);
}
/*
//computer
for(i = 0 ; i < N-1 ; i++){
//选主元
index = cblas_idamax(N-i,A(i,i),1)+i;//cblas_idamax 选择最大绝对值的blas库
//printf("index:%d\n",index);
//列交换
if(index != i){
temp = shift_table[i];
shift_table[i] = shift_table[index];
shift_table[index] = temp;
for(row = 0 ; row < N ; row++){
data_t temp = *A(row,i);
*(A(row,i)) = *A(row,index);
*A(row,index) = temp;
}
}
for(row = 0 ; row < N ; row++)
printf("%d ",shift_table[row]);
printf("\n");
//主元归一化
// 消元
for(row = i+1 ; row < N ; row++){
if(*A(row,i) == 0) continue;
data_t factor = -1*(*A(row,i)/ *A(i,i));
cblas_daxpy(N-i,factor,A(i,i),1,A(row,i),1);//cblas_daxpy( N, alpha,X,incX, Y, incY); Y=Y+alpha*X
*(B+row) = *(B+row) + *(B+i)*factor;
}
//print_matrix();
}
//回代,i代表了Xi,col代表了列号,用来计算前面已经计算的xi值的sum
for(i = N-1 ; i >= 0 ; i–){
data_t sum = 0;
for( j = i+1 ; j < N ; j++){
sum += *A(i,j)*X[j];
}
X[i] = (*(B+i) – sum) / *A(i,i);
}
//恢复,其中shift table保存了原来的列号
for(i = 0 ; i < N ; i++){
printf("x%d:%lf ",shift_table[i],X[i]);
}
*/
MPI_Finalize();
}
void print_matrix(){
int i,j;
printf("matrix:\n");
for(i = 0 ; i < N ; i++){
for(j = 0 ; j < N ; j++){
printf("%lf ",*A(i,j));
}
printf("%lf\n",*(B+i));
}
}
修改后的实现:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cblas.h>
#include "mpi.h"
#include "malloc_align.h"
void environment_init();
typedef double data_t;
int N,M;//N行,M列,M = N+1
data_t * matrix_A;//X的系数矩阵
data_t * B;//y向量
data_t * X;//x的解向量
int *shift_table;
MPI_Status status;
//通过A(i,j)得到,x系数矩阵中[i,j]元素的地址
data_t * A(int i,int j){
return matrix_A+i*M+j;
}
void print_matrix();
int main(int argc,char **argv)
{
FILE * f;
int i = 0,j = 0,index = 0,row = 0;
int group_size;
int *row_rank_table;
int process_rownum;
int myrank;
data_t * mainrow_space;
data_t ** row_address_table;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&group_size);
MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
//printf("mpi:%d %d\n",group_size,myrank);
if(myrank == 0){
f= fopen("/home/duanple/CBLAS/data.txt","r");
fscanf(f,"%d%d",&N,&M);
matrix_A = (data_t *)_malloc_align(sizeof(data_t)*N*M,7);
B = (data_t *)_malloc_align(sizeof(data_t)*N,7);
X = (data_t *)_malloc_align(sizeof(data_t)*N,7);
//get matrix from file
for(i = 0 ; i < N ; i++){
for(j = 0 ; j < N ; j++){
fscanf(f,"%lf",A(i,j));
}
fscanf(f,"%lf",A(i,j));//A(i,N) also stores B[i]
*(B+i) = *A(i,j);
}
//print_matrix();
}
//broadcast the process row to be in,and rows num in every process
MPI_Bcast(&N,1,MPI_INT,0,MPI_COMM_WORLD);
M = N+1;
//printf("rank:%d m:%d\n",myrank,M);
//malloc the memory for the row of matrix
shift_table = (int*)_malloc_align(sizeof(int)*N,7);
row_rank_table = (int *)_malloc_align(sizeof(int)*N,7);
row_address_table = (data_t**)_malloc_align(sizeof(data_t*)*N,7);
//init shift table
for(i = 0 ; i < N ; i++){
shift_table[i] = i;
}
//divide matrix
if(myrank == 0){
for(i = 0,j = 0 ; i < N ; i++,j++){
row_rank_table[i] = j%group_size;
}
}
MPI_Bcast(row_rank_table,N,MPI_INT,0,MPI_COMM_WORLD);
process_rownum = 0;
for(i = 0 ; i < N ; i++){
//printf("rank:%d row%d\n",myrank,row_rank_table[i]);
if(row_rank_table[i] == myrank){
row_address_table[i] = (data_t *)_malloc_align(sizeof(data_t)*M,7);
process_rownum++;
}
}
//
//printf("rank:%d rows:%d\n",myrank,process_rownum);
//root send the data to the processes.
if(myrank == 0){//send
for(i = 0 ; i < N ; i++){
if(row_rank_table[i] == 0){
row_address_table[i] = A(i,0);
}else{
MPI_Send(A(i,0),N,MPI_DOUBLE,row_rank_table[i],i,MPI_COMM_WORLD);
MPI_Send(B+i,1,MPI_DOUBLE,row_rank_table[i],i,MPI_COMM_WORLD);
}
}
}else{//receive
for(i = 0 ; i < N ; i++){
if(myrank !=0 && myrank == row_rank_table[i]){
MPI_Recv(row_address_table[i],N,MPI_DOUBLE,0,i,MPI_COMM_WORLD,&status);
MPI_Recv(row_address_table[i]+N,1,MPI_DOUBLE,0,i,MPI_COMM_WORLD,&status);
}
}
}
//printf the revecive result.
/*
for(i = 0 ; i < N ; i++){
if(myrank != row_rank_table[i]) continue;
printf("rank%d-row%d:\n",myrank,row_rank_table[i]);
for(j = 0 ; j < M ; j++){
printf("%lf ",*(row_address_table[i]+j));
}
printf("\n");
} */
//computer
//消元
mainrow_space = (data_t *)_malloc_align(sizeof(data_t)*M,7);//存放主行的临时空间
for(i = 0 ; i < N-1 ; i++){
if(myrank == row_rank_table[i]){
//选主元
//printf("myrank:%d\n",myrank);for(j = 0 ; j < M ; j++){printf("%lf ",*(row_address_table[i]+j));}printf("\n");
index = cblas_idamax(N-i,row_address_table[i]+i,1)+i;
//列交换
if(index != i){
int temp = shift_table[i];
shift_table[i] = shift_table[index];
shift_table[index] = temp;
data_t data_temp = *(row_address_table[i]+i);
*(row_address_table[i]+i) = *(row_address_table[i]+index);
*(row_address_table[i]+index) = data_temp;
}
//拷贝主行元素到临时空间
//printf("myrank:%d\n",myrank);for(j = 0 ; j < M ; j++){printf("%lf ",*(row_address_table[i]+j));}printf("\n");
memcpy(mainrow_space,row_address_table[i],sizeof(data_t)*M);
//printf("myrank:%d\n",myrank);for(j = 0 ; j < M ; j++){printf("%lf ",*(row_address_table[i]+j));}printf("\n");
}
//其他行的列交换
MPI_Bcast(&index,1,MPI_INT,row_rank_table[i],MPI_COMM_WORLD);
//修改列交换标志,以进程为单位,不是以行
if(myrank != row_rank_table[i]){
int temp = shift_table[i];
shift_table[i] = shift_table[index];
shift_table[index] = temp;
}
for(j = 0 ; j < N ; j++){
if(j != i && myrank == row_rank_table[j]){
data_t data_temp = *(row_address_table[j]+i);
*(row_address_table[j]+i) = *(row_address_table[j]+index);
*(row_address_table[j]+index) = data_temp;
}
}
//发送主行元素
MPI_Bcast(mainrow_space,M,MPI_DOUBLE,row_rank_table[i],MPI_COMM_WORLD);
//printf("myrank:%d\n",myrank);for(j = 0 ; j < M ; j++){printf("%lf ",mainrow_space[j]);}printf("\n");
//主元归一化
for(j = i+1 ; j < N ; j++){
//处理该进程所分配的行。
if(myrank == row_rank_table[j]){
if(*(row_address_table[j]+i) == 0){continue;}
// 消元
data_t factor = -1*(*(row_address_table[j]+i)/mainrow_space[i]);
cblas_daxpy(N,factor,mainrow_space,1,row_address_table[j],1);
*(row_address_table[j]+N) = *(row_address_table[j]+N) +mainrow_space[N]*factor;//elem of B
}
}
}
//
//printf the result.
/*
for(i = 0 ; i < N ; i++){
if(myrank != row_rank_table[i]) continue;
printf("rank%d-row%d:\n",myrank,row_rank_table[i]);
for(j = 0 ; j < M ; j++){
printf("%lf ",*(row_address_table[i]+j));
}
printf("\n");
}
*/
//回代
for(i = N-1 ; i>= 0 ; i–){
if(myrank == row_rank_table[i]){
mainrow_space[i] = *(row_address_table[i]+N)/(*(row_address_table[i]+i));//(*(B+i) – sum) / *A(i,i);
}
MPI_Bcast(mainrow_space+i,1,MPI_DOUBLE,row_rank_table[i],MPI_COMM_WORLD);
for(j = 0 ; j < i ; j++){
//处理该进程所分配的行。
if(myrank == row_rank_table[j]){
*(row_address_table[j]+N) -= *(row_address_table[j]+i)*mainrow_space[i];
}
}
}
//printf res
for(i = 0 ; myrank == 0 && i < N ; i++){
printf("x%d:%lf ",shift_table[i],mainrow_space[i]);
}
MPI_Finalize();
}
void print_matrix(){
int i,j;
printf("matrix:\n");
for(i = 0 ; i < N ; i++){
for(j = 0 ; j < N ; j++){
printf("%lf ",*A(i,j));
}
printf("%lf\n",*(B+i));
}
}
#############################################################################
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cblas.h>
#include "mpi.h"
#include "malloc_align.h"
void environment_init();
typedef double data_t;
int N,M;//N行,M列,M = N+1
data_t * matrix_A;//X的系数矩阵
data_t * B;//y向量
data_t * X;//x的解向量
int *shift_table;
MPI_Status status;
//通过A(i,j)得到,x系数矩阵中[i,j]元素的地址
data_t * A(int i,int j){
return matrix_A+i*M+j;
}
void print_matrix();
double * processor_test(MPI_Comm comm,void test(),long loops,double comp_size,int master_node);
void data_malloc(double * processor_descrip,int * rank_table,int matrix_size);
void linear_solver(MPI_Comm comm);
void test(){
double a = 1.0;
double b = 2.44;
double c = a*b;
}
int main(int argc,char **argv)
{
MPI_Init(&argc,&argv);
processor_test(MPI_COMM_WORLD,test,1000,1.0,0);
//linear_solver(MPI_COMM_WORLD);
MPI_Finalize();
}
//test the computer ability of processors
double * processor_test(MPI_Comm comm,void test(),long loops,double comp_size,int master_node){
int myrank;
int group_size,i;
double start_time,end_time,time;
double * processor_able;
MPI_Comm_size(comm,&group_size);
MPI_Comm_rank(comm,&myrank);
if(myrank == master_node){
processor_able = (double *)_malloc_align(sizeof(double)*group_size+2,7);
processor_able[0] = group_size;
processor_able[1] = comp_size*loops;
} else
processor_able = NULL;
start_time = MPI_Wtime();
for(i = 0; i < loops ; i++)
test();
end_time = MPI_Wtime();
time = end_time-start_time;
printf("rank%d %lf\n",myrank,time);
MPI_Gather(&time,1,MPI_DOUBLE,processor_able+2,1,MPI_DOUBLE,master_node,comm);
//print the test result.
if(myrank == master_node){
for(i = 0 ; i < group_size ; i++){
printf("%d %lf ",i,processor_able[i+2]);
}
}
return processor_able;
}
void data_malloc(double * processor_descrip,int * rank_table,int matrix_size){
int p_num =(int)processor_descrip[0];
}
void linear_solver(MPI_Comm comm){
FILE * f;
int i = 0,j = 0,index = 0,row = 0;
int group_size;
int *row_rank_table;
int process_rownum;
int myrank;
data_t * mainrow_space;
data_t ** row_address_table;
MPI_Comm_size(comm,&group_size);
MPI_Comm_rank(comm,&myrank);
//printf("mpi:%d %d\n",group_size,myrank);
if(myrank == 0){
f= fopen("/home/duanple/CBLAS/data.txt","r");
fscanf(f,"%d%d",&N,&M);
matrix_A = (data_t *)_malloc_align(sizeof(data_t)*N*M,7);
B = (data_t *)_malloc_align(sizeof(data_t)*N,7);
X = (data_t *)_malloc_align(sizeof(data_t)*N,7);
//get matrix from file
for(i = 0 ; i < N ; i++){
for(j = 0 ; j < N ; j++){
fscanf(f,"%lf",A(i,j));
}
fscanf(f,"%lf",A(i,j));//A(i,N) also stores B[i]
*(B+i) = *A(i,j);
}
//print_matrix();
}
//broadcast the process row to be in,and rows num in every process
MPI_Bcast(&N,1,MPI_INT,0,comm);
M = N+1;
//printf("rank:%d m:%d\n",myrank,M);
//malloc the memory for the row of matrix
shift_table = (int*)_malloc_align(sizeof(int)*N,7);
row_rank_table = (int *)_malloc_align(sizeof(int)*N,7);
row_address_table = (data_t**)_malloc_align(sizeof(data_t*)*N,7);
//init shift table
for(i = 0 ; i < N ; i++){
shift_table[i] = i;
}
//divide matrix
if(myrank == 0){
for(i = 0,j = 0 ; i < N ; i++,j++){
row_rank_table[i] = j%group_size;
}
}
MPI_Bcast(row_rank_table,N,MPI_INT,0,comm);
process_rownum = 0;
for(i = 0 ; i < N ; i++){
//printf("rank:%d row%d\n",myrank,row_rank_table[i]);
if(row_rank_table[i] == myrank){
row_address_table[i] = (data_t *)_malloc_align(sizeof(data_t)*M,7);
process_rownum++;
}
}
//
//printf("rank:%d rows:%d\n",myrank,process_rownum);
//root send the data to the processes.
if(myrank == 0){//send
for(i = 0 ; i < N ; i++){
if(row_rank_table[i] == 0){
row_address_table[i] = A(i,0);
}else{
MPI_Send(A(i,0),N,MPI_DOUBLE,row_rank_table[i],i,comm);
MPI_Send(B+i,1,MPI_DOUBLE,row_rank_table[i],i,comm);
}
}
}else{//receive
for(i = 0 ; i < N ; i++){
if(myrank !=0 && myrank == row_rank_table[i]){
MPI_Recv(row_address_table[i],N,MPI_DOUBLE,0,i,comm,&status);
MPI_Recv(row_address_table[i]+N,1,MPI_DOUBLE,0,i,comm,&status);
}
}
}
//printf the revecive result.
/*
for(i = 0 ; i < N ; i++){
if(myrank != row_rank_table[i]) continue;
printf("rank%d-row%d:\n",myrank,row_rank_table[i]);
for(j = 0 ; j < M ; j++){
printf("%lf ",*(row_address_table[i]+j));
}
printf("\n");
} */
//computer