static double pWrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE]; static long pSync[_SHMEM_REDUCE_SYNC_SIZE]; static double vsm[2]; static double ss2[2*LOC]; static double rr2[2*LOC]; /********************** * FUNCTION: run_m * **********************/ void run_m (solvec, xm, ym, num, nproc, maxiter, eps, pe, Afile, xfile, bfile) int xm, ym, num, nproc, maxiter, pe; double *solvec, eps; char *Afile, *xfile, *bfile; { int loc, glob, nslaves, blks_per_proc, pass_vec_size, pass_mat_size, pass_all_size, bppp1, bppm1, bppm2, bppm3, tp1, tm1, i, im1, j, l, conv, iter, ctr; double reltol, *rho, oldrho, alpha, omega, beta, secs, stime, ftime; struct collvec U, K, R, R0, V, T, P, S, Y, Z; struct first_proc_collmat M; struct timeval t_s, t_f; struct kvec2 kv21, kv22; FILE *fp; /********************* * define loc, glob * *********************/ if (num == 0) { loc = xm; glob = ym; } else { loc = ym; glob = xm; } /********************** * define constants * **********************/ nslaves = nproc - 1; blks_per_proc = glob / nproc; pass_vec_size = 4 * loc*blks_per_proc; pass_mat_size = 32 * (2*loc-1) * blks_per_proc; pass_all_size = 2*pass_vec_size + pass_mat_size; bppp1 = blks_per_proc + 1; bppm1 = blks_per_proc - 1; bppm2 = blks_per_proc - 2; bppm3 = blks_per_proc - 3; /****************************************************** * get info from files, load into proper structures * ******************************************************/ dcollvec(&U, loc, bppp1); dcollvec(&K, loc, blks_per_proc); d_first_proc_collmat(&M, loc, bppm1); file2Mcollvec(&U, xfile, nproc, blks_per_proc, xm, ym, num, loc); file2Mcollvec(&K, bfile, nproc, blks_per_proc, xm, ym, num, loc); file2Mcollmat(&M, Afile, nproc, blks_per_proc, xm, ym, num, loc, glob); /*************** * neighbors * ***************/ tp1 = 1; tm1 = nslaves; /************************** * circular shift for U * **************************/ cshift_S2_R4(&U.L[0], tm1, &U.M[bppm1], tp1, loc); /***************** * compute M*u * *****************/ dcollvec(&R, loc, blks_per_proc); dkvec2(&kv21, loc); dkvec2(&kv22, loc); mult_M24(&kv21, &M.C[bppm2], &U.M[bppm2], loc); ctr = -1; for (i = 0; i < loc; i++) for (j = 0; j < 2; j++) ss2[++ctr] = kv21.el[i].el[j]; /*mult_CU_send(&M.C[bppm2], &U.M[bppm2], loc, tp1);*/ for (i = bppm2; i >= 1; i--) { im1 = i-1; mult_CU_AU_BU(&R.M[i], &M.A[i], &M.B[i], &M.C[im1], &U.M[im1], &U.M[i], &U.M[i+1], loc); } mult_CFUF_AU_BU(&R.M[0], &M.A[0], &M.B[0], &M.CF[0], &U.F[0], &U.M[0], &U.M[1], loc); mult_AFUF_BFU0(&R.F[0], &M.AF[0], &M.BF[0], &U.F[0], &U.M[0], loc); mult_tri2xu(&kv21, &M.AL[0], &U.L[0], loc); barrier(); shmem_get(rr2, ss2, 2*loc, tm1); ctr = -1; for (i = 0; i < loc; i++) for (j = 0; j < 2; j++) kv22.el[i].el[j] = rr2[++ctr]; addkv2(&R.L[0], &kv21, &kv22, loc); /*mult_ALUL_recv(&R.L[0], &M.AL[0], &U.L[0], loc, tm1);*/ /************************* * compute r = k - M*u * *************************/ subkv2(&R.F[0], &K.F[0], &R.F[0], loc); subkv2(&R.L[0], &K.L[0], &R.L[0], loc); for (i = 0; i <= bppm2; i++) subkv4(&R.M[i], &K.M[i], &R.M[i], loc); /**************** * set r0 = r * ****************/ dcollvec(&R0, loc, blks_per_proc); copy_kvec2(&R0.F[0], &R.F[0], loc); copy_kvec2(&R0.L[0], &R.L[0], loc); for (l = 0; l <= bppm2; l++) copy_kvec4(&R0.M[l], &R.M[l], loc); /******************** * get 2norm of r * ********************/ vsm[0] = dot_collvec(&R, &R, loc, blks_per_proc); for(i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; i++) pSync[i]=_SHMEM_SYNC_VALUE; barrier(); shmem_double_sum_to_all(&vsm[0], &vsm[0], 1, 0, 0, nproc, pWrk, pSync); /***************************************** * define reltol and initial constants * *****************************************/ reltol = (*vsm) * eps*eps; printf("\nreltol = %e\n\n", reltol); rho = dvector(1); alpha = 1; omega = 1; oldrho = 1; *rho = *vsm; dcollvec(&V, loc, blks_per_proc); dcollvec(&T, loc, blks_per_proc); dcollvec(&P, loc, blks_per_proc); dcollvec(&S, loc, blks_per_proc); dcollvec(&Y, loc, bppp1); dcollvec(&Z, loc, bppp1); /*************** * MAIN LOOP * ***************/ conv = 0; iter = 0; gettimeofday(&t_s, (struct timezone*)0); while ((iter < maxiter) && (!conv)) { iter++; beta = (*rho/oldrho) * (alpha/omega); saxpy_collvec(&P, -omega, &V, &P, loc, blks_per_proc); saxpy_collvec(&P, beta, &P, &R, loc, blks_per_proc); psolve_M(&Y, &M, &P, blks_per_proc, loc, tm1, tp1); mult_collmat_collvec_trick_M(&V, &M, &Y, &P, blks_per_proc, loc, tm1, tp1); *vsm = dot_collvec(&R0, &V, loc, blks_per_proc); barrier(); shmem_double_sum_to_all(&vsm[0], &vsm[0], 1, 0, 0, nproc, pWrk, pSync); alpha = (*rho) / (*vsm); saxpy_collvec(&S, -alpha, &V, &R, loc, blks_per_proc); psolve_M(&Z, &M, &S, blks_per_proc, loc, tm1, tp1); mult_collmat_collvec_trick_M(&T, &M, &Z, &S, blks_per_proc, loc, tm1, tp1); vsm[0] = dot_collvec(&T, &T, loc, blks_per_proc); vsm[1] = dot_collvec(&T, &S, loc, blks_per_proc); barrier(); shmem_double_sum_to_all(vsm, vsm, 2, 0, 0, nproc, pWrk, pSync); omega = vsm[1] / vsm[0]; saxpy_collvec(&U, alpha, &Y, &U, loc, blks_per_proc); saxpy_collvec(&U, omega, &Z, &U, loc, blks_per_proc); saxpy_collvec(&R, -omega, &T, &S, loc, blks_per_proc); vsm[0] = dot_collvec(&R0, &R, loc, blks_per_proc); vsm[1] = dot_collvec( &R, &R, loc, blks_per_proc); barrier(); shmem_double_sum_to_all(vsm, vsm, 2, 0, 0, nproc, pWrk, pSync); oldrho = *rho; *rho = vsm[0]; printf("%d %e\n", iter, vsm[1]); if (vsm[1] <= reltol) conv = 1; } gettimeofday(&t_f, (struct timezone*)0); secs = ((t_f.tv_sec-t_s.tv_sec)*1000000 + t_f.tv_usec - t_s.tv_usec)/1000000.; stime = (t_s.tv_sec*1000000 + t_s.tv_usec) / 1000000.; ftime = (t_f.tv_sec*1000000 + t_f.tv_usec) / 1000000.; barrier(); shmem_double_max_to_all(&ftime, &ftime, 1, 0, 0, nproc, pWrk, pSync); barrier(); shmem_double_min_to_all(&stime, &stime, 1, 0, 0, nproc, pWrk, pSync); secs = ftime - stime; printf("max time for main loop = %lf seconds\n", secs); fp = fopen("out.dat", "a"); fprintf(fp, "%3d %3d %2d %lf ", xm, ym, nproc, secs); fclose(fp); /* secs = ((t_f.tv_sec-t_s.tv_sec)*1000000 + t_f.tv_usec - t_s.tv_usec)/1000000.; printf("time for main loop = %lf seconds for pe = %d\n", secs, pe); */ /*************************************************************** * single-node gather to obtain all parts of solution vector * ***************************************************************/ pvm_gather(solvec, solvec, pass_vec_size, PVM_DOUBLE, 37, "", 0); /******************* * de-allocation * *******************/ free_collvec(&U, bppp1); free_collvec(&K, blks_per_proc); free_collvec(&R, blks_per_proc); free_collvec(&R0, blks_per_proc); free_collvec(&V, blks_per_proc); free_collvec(&T, blks_per_proc); free_collvec(&P, blks_per_proc); free_collvec(&S, blks_per_proc); free_collvec(&Y, bppp1); free_collvec(&Z, bppp1); free_first_proc_collmat(&M, bppm1); free(rho); free_kvec2(&kv21); free_kvec2(&kv22); } /* end of function run_m */ /********************** * FUNCTION: run_s * **********************/ void run_s (xm, ym, num, nproc, maxiter, eps, pe, Afile, xfile, bfile) int xm, ym, num, nproc, maxiter, pe; double eps; char *Afile, *xfile, *bfile; { int loc, glob, nslaves, blks_per_proc, pass_vec_size, pass_mat_size, pass_all_size, bppp1, bppm1, bppm2, bppm3, tp1, tm1, tz, i, im1, j, l, conv, iter, ctr; double reltol, *solution, secs, stime, ftime, *rho, oldrho, alpha, omega, beta; struct mat_kvec4 U_S, K_S, R_S, R0_S, V_S, T_S, P_S, S_S, Y_S, Z_S; struct proc_collmat M_S; struct timeval t_s, t_f; struct kvec2 kv21, kv22; struct kvec4 kv41, kv42; /********************* * define loc, glob * *********************/ if (num == 0) { loc = xm; glob = ym; } else { loc = ym; glob = xm; } /********************** * define constants * **********************/ nslaves = nproc - 1; blks_per_proc = glob / nproc; pass_vec_size = 4 * loc*blks_per_proc; pass_mat_size = 32 * (2*loc-1) * blks_per_proc; pass_all_size = 2*pass_vec_size + pass_mat_size; bppp1 = blks_per_proc + 1; bppm1 = blks_per_proc - 1; bppm2 = blks_per_proc - 2; bppm3 = blks_per_proc - 3; /****************************************************** * get info from files, load into proper structures * ******************************************************/ dmat_kvec4(&U_S, bppp1, loc); dmat_kvec4(&K_S, blks_per_proc, loc); d_proc_collmat(&M_S, loc, blks_per_proc); file2Scollvec(&U_S, xfile, nproc, blks_per_proc, xm, ym, num, loc, pe); file2Scollvec(&K_S, bfile, nproc, blks_per_proc, xm, ym, num, loc, pe); file2Scollmat(&M_S, Afile, blks_per_proc, xm, ym, num, loc, glob, pe); /*************** * neighbors * ***************/ tp1 = pe+1; tm1 = pe-1; tz = 0; /************************** * circular shift for U * **************************/ cshift_S4_R4(&U_S.el[0], tm1, &U_S.el[bppm1], tp1, loc); /***************** * compute M*u * *****************/ dmat_kvec4(&R_S, blks_per_proc, loc); dkvec2(&kv21, loc); dkvec2(&kv22, loc); dkvec4(&kv41, loc); dkvec4(&kv42, loc); mult_M24(&kv21, &M_S.C[bppm2], &U_S.el[bppm1], loc); ctr = -1; for (i = 0; i < loc; i++) for (j = 0; j < 2; j++) ss2[++ctr] = kv21.el[i].el[j]; /*mult_CU_send(&M_S.C[bppm1], &U_S.el[bppm1], loc, tp1);*/ for (i = bppm1; i >= 1; i--) { im1 = i-1; mult_CU_AU_BU(&R_S.el[i], &M_S.A[i], &M_S.B[i], &M_S.C[im1], &U_S.el[im1], &U_S.el[i], &U_S.el[i+1], loc); } mult_M44(&kv42, &M_S.A[0], &U_S.el[0], loc); mult_M24(&kv22, &M_S.B[0], &U_S.el[1], loc); barrier(); shmem_get(rr2, ss2, 2*loc, tm1); ctr = -1; for (i = 0; i < loc; i++) for (j = 0; j < 2; j++) kv21.el[i].el[j] = rr2[++ctr]; seam(&kv41, &kv21, &kv22, loc); addkv4(&R_S.el[0], &kv41, &kv42, loc); /*mult_AU_BU_recv(&R_S.el[0], &M_S.A[0], &M_S.B[0], &U_S.el[0], &U_S.el[1], loc, tm1);*/ /************************* * compute R = k - M*u * *************************/ for (i = 0; i <= bppm1; i++) subkv4(&R_S.el[i], &K_S.el[i], &R_S.el[i], loc); /**************** * set R0 = R * ****************/ dmat_kvec4(&R0_S, blks_per_proc, loc); for (l = 0; l < blks_per_proc; l++) copy_kvec4(&R0_S.el[l], &R_S.el[l], loc); /******************** * get 2norm of R * ********************/ vsm[0] = dot_mat_kvec4(&R_S, &R_S, loc, blks_per_proc); for(i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; i++) pSync[i]=_SHMEM_SYNC_VALUE; barrier(); shmem_double_sum_to_all(&vsm[0], &vsm[0], 1, 0, 0, nproc, pWrk, pSync); /***************************************** * define reltol and initial constants * *****************************************/ reltol = (*vsm) * eps*eps; rho = dvector(1); alpha = 1; omega = 1; oldrho = 1; *rho = *vsm; dmat_kvec4(&V_S, blks_per_proc, loc); dmat_kvec4(&T_S, blks_per_proc, loc); dmat_kvec4(&P_S, blks_per_proc, loc); dmat_kvec4(&S_S, blks_per_proc, loc); dmat_kvec4(&Y_S, bppp1, loc); dmat_kvec4(&Z_S, bppp1, loc); /*************** * MAIN LOOP * ***************/ conv = 0; iter = 0; gettimeofday(&t_s, (struct timezone*)0); while ((iter < maxiter) && (!conv)) { iter++; beta = (*rho/oldrho) * (alpha/omega); saxpy_mat_kvec4(&P_S, -omega, &V_S, &P_S, loc, blks_per_proc); saxpy_mat_kvec4(&P_S, beta, &P_S, &R_S, loc, blks_per_proc); psolve_S(&Y_S, &M_S, &P_S, blks_per_proc, loc, tm1, tp1); mult_collmat_collvec_trick_S(&V_S, &M_S, &Y_S, &P_S, blks_per_proc, loc, tm1, tp1); *vsm = dot_mat_kvec4(&R0_S, &V_S, loc, blks_per_proc); barrier(); shmem_double_sum_to_all(&vsm[0], &vsm[0], 1, 0, 0, nproc, pWrk, pSync); alpha = (*rho) / (*vsm); saxpy_mat_kvec4(&S_S, -alpha, &V_S, &R_S, loc, blks_per_proc); psolve_S(&Z_S, &M_S, &S_S, blks_per_proc, loc, tm1, tp1); mult_collmat_collvec_trick_S(&T_S, &M_S, &Z_S, &S_S, blks_per_proc, loc, tm1, tp1); vsm[0] = dot_mat_kvec4(&T_S, &T_S, loc, blks_per_proc); vsm[1] = dot_mat_kvec4(&T_S, &S_S, loc, blks_per_proc); barrier(); shmem_double_sum_to_all(vsm, vsm, 2, 0, 0, nproc, pWrk, pSync); omega = vsm[1] / vsm[0]; saxpy_mat_kvec4(&U_S, alpha, &Y_S, &U_S, loc, blks_per_proc); saxpy_mat_kvec4(&U_S, omega, &Z_S, &U_S, loc, blks_per_proc); saxpy_mat_kvec4(&R_S, -omega, &T_S, &S_S, loc, blks_per_proc); vsm[0] = dot_mat_kvec4(&R0_S, &R_S, loc, blks_per_proc); vsm[1] = dot_mat_kvec4( &R_S, &R_S, loc, blks_per_proc); barrier(); shmem_double_sum_to_all(vsm, vsm, 2, 0, 0, nproc, pWrk, pSync); oldrho = *rho; *rho = vsm[0]; if (vsm[1] <= reltol) conv = 1; } gettimeofday(&t_f, (struct timezone*)0); stime = (t_s.tv_sec*1000000 + t_s.tv_usec) / 1000000.; ftime = (t_f.tv_sec*1000000 + t_f.tv_usec) / 1000000.; barrier(); shmem_double_max_to_all(&ftime, &ftime, 1, 0, 0, nproc, pWrk, pSync); barrier(); shmem_double_min_to_all(&stime, &stime, 1, 0, 0, nproc, pWrk, pSync); /* secs = ((t_f.tv_sec-t_s.tv_sec)*1000000 + t_f.tv_usec - t_s.tv_usec)/1000000.; printf("time for main loop = %lf seconds for pe = %d\n", secs, pe); */ /*************************************************************** * single-node gather to obtain all parts of solution vector * ***************************************************************/ solution = dvector(pass_vec_size); unload_collvec_S(solution, &U_S, blks_per_proc, xm, ym, num, loc, 0); pvm_gather(solution, solution, pass_vec_size, PVM_DOUBLE, 37, "", 0); /******************* * de-allocation * *******************/ free_mat_kvec4(&U_S, bppp1); free_mat_kvec4(&K_S, blks_per_proc); free_mat_kvec4(&R_S, blks_per_proc); free_mat_kvec4(&R0_S, blks_per_proc); free_mat_kvec4(&V_S, blks_per_proc); free_mat_kvec4(&T_S, blks_per_proc); free_mat_kvec4(&P_S, blks_per_proc); free_mat_kvec4(&S_S, blks_per_proc); free_mat_kvec4(&Y_S, bppp1); free_mat_kvec4(&Z_S, bppp1); free_proc_collmat(&M_S, blks_per_proc); free(solution); free(rho); free_kvec2(&kv21); free_kvec2(&kv22); free_kvec4(&kv41); free_kvec4(&kv42); } /* end of function run_s */ /********************** * FUNCTION: run_l * **********************/ void run_l (xm, ym, num, nproc, maxiter, eps, pe, Afile, xfile, bfile) int xm, ym, num, nproc, maxiter, pe; double eps; char *Afile, *xfile, *bfile; { int loc, glob, nslaves, blks_per_proc, pass_vec_size, pass_mat_size, pass_all_size, bppp1, bppm1, bppm2, bppm3, tp1, tm1, tz, i, im1, j, l, conv, iter, ctr; double reltol, *solution, secs, stime, ftime, *rho, oldrho, alpha, omega, beta; struct mat_kvec4 U_L, K_L, R_L, R0_L, V_L, T_L, P_L, S_L, Y_L, Z_L; struct kvec2 UL, YL, ZL, kv21, kv22; struct kvec4 kv41, kv42; struct last_proc_collmat LM; struct timeval t_s, t_f; /********************* * define loc, glob * *********************/ if (num == 0) { loc = xm; glob = ym; } else { loc = ym; glob = xm; } /********************** * define constants * **********************/ nslaves = nproc - 1; blks_per_proc = glob / nproc; pass_vec_size = 4 * loc*blks_per_proc; pass_mat_size = 32 * (2*loc-1) * blks_per_proc; pass_all_size = 2*pass_vec_size + pass_mat_size; bppp1 = blks_per_proc + 1; bppm1 = blks_per_proc - 1; bppm2 = blks_per_proc - 2; bppm3 = blks_per_proc - 3; /****************************************************** * get info from files, load into proper structures * ******************************************************/ dmat_kvec4(&U_L, blks_per_proc, loc); dmat_kvec4(&K_L, blks_per_proc, loc); d_last_proc_collmat(&LM, loc, blks_per_proc); dkvec2(&UL, loc); file2Scollvec(&U_L, xfile, nproc, blks_per_proc, xm, ym, num, loc, pe); file2Scollvec(&K_L, bfile, nproc, blks_per_proc, xm, ym, num, loc, pe); file2Lcollmat(&LM, Afile, blks_per_proc, xm, ym, num, loc, glob, pe); /*************** * neighbors * ***************/ tp1 = 0; tm1 = pe-1; tz = tp1; /************************* * circular shift for U * *************************/ cshift_S4_R2(&U_L.el[0], tm1, &UL, tp1, loc); /***************** * compute M*u * *****************/ dmat_kvec4(&R_L, blks_per_proc, loc); dkvec2(&kv21, loc); dkvec2(&kv22, loc); dkvec4(&kv41, loc); dkvec4(&kv42, loc); mult_M24(&kv21, &LM.CL[0], &U_L.el[bppm1], loc); ctr = -1; for (i = 0; i < loc; i++) for (j = 0; j < 2; j++) ss2[++ctr] = kv21.el[i].el[j]; /*mult_CU_send(&LM.CL[0], &U_L.el[bppm1], loc, tp1);*/ mult_CU_AU_BLUL(&R_L.el[bppm1], &LM.A[bppm1], &LM.BL[0], &LM.C[bppm2], &U_L.el[bppm1], &U_L.el[bppm2], &UL, loc); for (i = bppm2; i >= 1; i--) { im1 = i-1; mult_CU_AU_BU(&R_L.el[i], &LM.A[i], &LM.B[i], &LM.C[im1], &U_L.el[im1], &U_L.el[i], &U_L.el[i+1], loc); } mult_M44(&kv42, &LM.A[0], &U_L.el[0], loc); mult_M24(&kv22, &LM.B[0], &U_L.el[1], loc); barrier(); shmem_get(rr2, ss2, 2*loc, tm1); ctr = -1; for (i = 0; i < loc; i++) for (j = 0; j < 2; j++) kv21.el[i].el[j] = rr2[++ctr]; seam(&kv41, &kv21, &kv22, loc); addkv4(&R_L.el[0], &kv41, &kv42, loc); /*mult_AU_BU_recv(&R_L.el[0], &LM.A[0], &LM.B[0], &U_L.el[0], &U_L.el[1], loc, tm1);*/ /************************* * compute R = k - M*u * *************************/ for (i = 0; i <= bppm1; i++) subkv4(&R_L.el[i], &K_L.el[i], &R_L.el[i], loc); /**************** * set R0 = R * ****************/ dmat_kvec4(&R0_L, blks_per_proc, loc); for (l = 0; l < blks_per_proc; l++) copy_kvec4(&R0_L.el[l], &R_L.el[l], loc); /******************** * get 2norm of R * ********************/ vsm[0] = dot_mat_kvec4(&R_L, &R_L, loc, blks_per_proc); for(i = 0; i < _SHMEM_REDUCE_SYNC_SIZE; i++) pSync[i]=_SHMEM_SYNC_VALUE; barrier(); shmem_double_sum_to_all(&vsm[0], &vsm[0], 1, 0, 0, nproc, pWrk, pSync); /***************************************** * define reltol and initial constants * *****************************************/ reltol = (*vsm) * eps*eps; rho = dvector(1); alpha = 1; omega = 1; oldrho = 1; *rho = *vsm; dmat_kvec4(&V_L, blks_per_proc, loc); dmat_kvec4(&T_L, blks_per_proc, loc); dmat_kvec4(&P_L, blks_per_proc, loc); dmat_kvec4(&S_L, blks_per_proc, loc); dmat_kvec4(&Y_L, blks_per_proc, loc); dmat_kvec4(&Z_L, blks_per_proc, loc); dkvec2(&YL, loc); dkvec2(&ZL, loc); /*************** * MAIN LOOP * ***************/ conv = 0; iter = 0; gettimeofday(&t_s, (struct timezone*)0); while ((iter < maxiter) && (!conv)) { iter++; beta = (*rho/oldrho) * (alpha/omega); saxpy_mat_kvec4(&P_L, -omega, &V_L, &P_L, loc, blks_per_proc); saxpy_mat_kvec4(&P_L, beta, &P_L, &R_L, loc, blks_per_proc); psolve_LS(&Y_L, &YL, &LM, &P_L, blks_per_proc, loc, tm1, tp1); mult_collmat_collvec_trick_LS (&V_L, &LM, &Y_L, &P_L, blks_per_proc, loc, tm1, tp1); *vsm = dot_mat_kvec4(&R0_L, &V_L, loc, blks_per_proc); barrier(); shmem_double_sum_to_all(&vsm[0], &vsm[0], 1, 0, 0, nproc, pWrk, pSync); alpha = (*rho) / (*vsm); saxpy_mat_kvec4(&S_L, -alpha, &V_L, &R_L, loc, blks_per_proc); psolve_LS(&Z_L, &ZL, &LM, &S_L, blks_per_proc, loc, tm1, tp1); mult_collmat_collvec_trick_LS (&T_L, &LM, &Z_L, &S_L, blks_per_proc, loc, tm1, tp1); vsm[0] = dot_mat_kvec4(&T_L, &T_L, loc, blks_per_proc); vsm[1] = dot_mat_kvec4(&T_L, &S_L, loc, blks_per_proc); barrier(); shmem_double_sum_to_all(vsm, vsm, 2, 0, 0, nproc, pWrk, pSync); omega = vsm[1] / vsm[0]; saxpy_mat_kvec4(&U_L, alpha, &Y_L, &U_L, loc, blks_per_proc); saxpy_mat_kvec4(&U_L, omega, &Z_L, &U_L, loc, blks_per_proc); saxpy_mat_kvec4(&R_L, -omega, &T_L, &S_L, loc, blks_per_proc); vsm[0] = dot_mat_kvec4(&R0_L, &R_L, loc, blks_per_proc); vsm[1] = dot_mat_kvec4( &R_L, &R_L, loc, blks_per_proc); barrier(); shmem_double_sum_to_all(vsm, vsm, 2, 0, 0, nproc, pWrk, pSync); oldrho = *rho; *rho = vsm[0]; if (vsm[1] <= reltol) conv = 1; } gettimeofday(&t_f, (struct timezone*)0); stime = (t_s.tv_sec*1000000 + t_s.tv_usec) / 1000000.; ftime = (t_f.tv_sec*1000000 + t_f.tv_usec) / 1000000.; barrier(); shmem_double_max_to_all(&ftime, &ftime, 1, 0, 0, nproc, pWrk, pSync); barrier(); shmem_double_min_to_all(&stime, &stime, 1, 0, 0, nproc, pWrk, pSync); /* secs = ((t_f.tv_sec-t_s.tv_sec)*1000000 + t_f.tv_usec - t_s.tv_usec)/1000000.; printf("time for main loop = %lf seconds for pe = %d\n", secs, pe); */ /*************************************************************** * single-node gather to obtain all parts of solution vector * ***************************************************************/ solution = dvector(pass_vec_size); unload_collvec_S(solution, &U_L, blks_per_proc, xm, ym, num, loc, 0); pvm_gather(solution, solution, pass_vec_size, PVM_DOUBLE, 37, "", 0); /******************* * de-allocation * *******************/ free_mat_kvec4(&K_L, blks_per_proc); free_mat_kvec4(&U_L, blks_per_proc); free_kvec2(&UL); free_kvec2(&YL); free_kvec2(&ZL); free_mat_kvec4(&R_L, blks_per_proc); free_mat_kvec4(&R0_L, blks_per_proc); free_mat_kvec4(&V_L, blks_per_proc); free_mat_kvec4(&T_L, blks_per_proc); free_mat_kvec4(&P_L, blks_per_proc); free_mat_kvec4(&S_L, blks_per_proc); free_mat_kvec4(&Y_L, blks_per_proc); free_mat_kvec4(&Z_L, blks_per_proc); free_last_proc_collmat(&LM, blks_per_proc); free(solution); free(rho); free_kvec2(&kv21); free_kvec2(&kv22); free_kvec4(&kv41); free_kvec4(&kv42); } /* end of function run_l */