pnslac.c


/*
 Use PARPACK + SuperLU_DIST to solve generalized eigenvalue
 arising from Accelerator Design (SLAC problem)
 This version uses the nonsymmetric solver of ARPACK: pdnaupd.

 NOTE: Aztec is used for sparse matrix-vector multiplication.
*/

#include <math.h>
#include "superlu_ddefs.h"

#define workd(i)  workd[(i)-1]
#define z(i,j)    z[nloc*( (j) -1 )+ (i) - 1]
#define zglb(i,j) zglb[na*( (j) -1 )+ (i) - 1]
#define asub(i)   asub[(i) - 1]
#define amtx(i)   amtx[(i) - 1]

#define USECRS
#define USEAZTEC

#ifdef USEAZTEC
#include "az_aztec.h"
#endif
#ifdef USECRS


/* Convert csr format to msr format */

void csr2msr(int n,int nnz,int *row_start, int *col_index, double *val,
             int *bindex,double *newval,int *update)
{
  int row,j,count,offdiags;
  /* Copy val and col_index to second part of bindex and newval */
  bindex[0]=n+1;
  count = n+1;
  offdiags=0;
  for(row=0; row < n;row++) {
    if(row > 0) {
      bindex[row]=bindex[row-1]+offdiags;
      offdiags=0;
    }
    for(j=row_start[row];j < row_start[row+1];j++) {
      if(col_index[j] == update[row]) {
        newval[row]=val[j];  /* Diagonal Element */
      } else {
        newval[count]=val[j];
        bindex[count]=col_index[j];
        count++;
        offdiags++;
      }
    }
  }

  bindex[n]=count;
}

/* 
   Convert a global compressed column matrix to
   local compressed row matrix 
   Input matrix is in ncols, col_start, row_index, col_val
   Local row range is given by my_starting_row <= row < my_finishing_row 
   Matrix is output in p_row_start, p_col_index, and p_val
   Returns number of local entries
*/

int gccs2lcrs(int ncols,int *col_start, int *row_index, double *col_val,
	       int my_starting_row, int my_finishing_row,
	       int **p_row_start, int **p_col_index, double **p_val)
{
  int *row_start;
  int *col_index;
  double *val;
  int n_local_rows = my_finishing_row - my_starting_row;
  int *row_length;
  int *seen_in_row;
  int total_entries, gcol,i,current_row,lrow;

  total_entries = 0;
  row_length = (int *) calloc(n_local_rows,sizeof(int));
  for(gcol=0;gcol < ncols;gcol++) {
    for(i=col_start[gcol];i < col_start[gcol+1];i++) {
      current_row = row_index[i];
      if((current_row >= my_starting_row) && 
	 (current_row < my_finishing_row)) {
	row_length[current_row-my_starting_row]++;
	total_entries++;
      }
    }
  }

  printf("ncols: %d\n",ncols);
  printf("n_local_rows: %d\n",n_local_rows);
  printf("Total entries: %d\n",total_entries);
  *p_row_start = row_start = (int *) calloc(n_local_rows+1,sizeof(int));
  *p_col_index = col_index = (int *) calloc(total_entries,sizeof(int));
  *p_val = val = (double *) calloc(total_entries,sizeof(double));

  row_start[0]=0;
  for(lrow=1;lrow <= n_local_rows;lrow++) {
    row_start[lrow]=row_start[lrow-1]+row_length[lrow-1];
  }

  seen_in_row = (int *) calloc(n_local_rows,sizeof(int));

  for(lrow=0;lrow < n_local_rows;lrow++) {
    seen_in_row[lrow]=row_start[lrow];
  }

  for(gcol=0;gcol < ncols;gcol++) {
    for(i=col_start[gcol];i < col_start[gcol+1];i++) {
      current_row = row_index[i];
      if((current_row >= my_starting_row) && 
	 (current_row < my_finishing_row)) {
	lrow = current_row - my_starting_row;
	col_index[seen_in_row[lrow]]=gcol;
	val[seen_in_row[lrow]]=col_val[i];
	seen_in_row[lrow]++;
      }
    }
  }

  free(row_length);
  free(seen_in_row);
  return(total_entries);
}

void crsmvm(int n,int *row_start, int *col_index, double *val,
	    double *x,double *y)
{
  int i,j;
  for(i=0;i < n;i++) {
    y[i]=0.0;
    for(j=row_start[i];j < row_start[i+1];j++) {
      y[i]+=val[j]*x[col_index[j]];
    }
  }
}
#endif

main(int argc, char *argv[])
/*
 * Purpose
 * =======
 *
 * The driver program PEIGEN
 *
 * This example illustrates how to use SuperLU_DIST and PARPACK
 * to solve a large-scale sparse standard eigenvalue problem
 * in shift-invert mode.
 * In this case, we factorize A only once in the first call to
 * pdgssvx_ABglobal, and reuse the following data structures
 * in the subsequent call to pdgssvx_ABglobal:
 *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
 *        LUstruct         : Glu_persist, Llu
 * 
 * On the Cray T3E, the program may be run by typing:
 *    mpprun -n <procs> pddrive1 -r <proc rows> -c <proc columns> <input_matrix>
 *
 */
{
    superlu_options_t options;
    SuperLUStat_t stat;
    SuperMatrix A;
    ScalePermstruct_t ScalePermstruct;
    LUstruct_t LUstruct;
    gridinfo_t grid;
    double   *berr;
    double   *amtx, *bmtx, *b, *b1, *b2;
    int_t    *asub, *xa, *bsub, *xb;
    int_t    i, j, ma, na, nnza, mb, nb, nnzb;
    int_t    nprow, npcol;
    int      iam, info, ldb, ldx, nrhs;
    char     trans[1];
    char     **cpp, c;
    FILE *fp1, *fp2, *fopen();
    double   t0, t1;

    /* ----------------
       ARPACK variables 
       ----------------*/
    MPI_Comm arcomm;
    MPI_Status mpistat;
    int      nprocs, ibeg;
    int      ido, nloc, nrem, nev , ncv   , lworkl, mypid, 
             sid, arflg, pin, pout, maxitr, rvec  , nconv,
             iopt;
    double   tol, sigmar =2000.0, sigmai=0.0;
    char     bmat[1] = "G", which[2]="LR", howmny[1]="A";
    int      ipntr[14], iparam[11], *select, *psize, *nbase;
    double   *v , *z , *workd, *workl, *workev, *resid, *recvbuf,
             *dr, *di, *zglb , *res;
    int      ione = 1, stiff = 1, irow, ibegin, iend;
    double   done = 1.0, dzero = 0.0; 
    /* make a copy of the matrix for residual checking */
    int      *colptr, *rowind;
    double   *nzvals;

#ifdef USECRS
    int *row_start;
    int *col_index;
    double *val;
#ifdef USEAZTEC
    int proc_config[AZ_PROC_SIZE];
    AZ_MATRIX *Bmat;
    double *dsrc;
    double *ddest,*ddest2;
    int N_update;
    int *external, *update_index, *update, *extern_index, *data_org;
    int aztotal;
    int *bindx;
    double *msrval;
    int local_nnz;
#endif
#endif
    nprow = 1;  /* Default process rows.      */
    npcol = 1;  /* Default process columns.   */
    nrhs  = 1;   /* Number of right-hand side. */

    /* Parse command line argv[]. */
    if (argc != 7) ABORT("Number of arguements incorrect");
    for (cpp = argv+1; *cpp; ++cpp) {
	if ( **cpp == '-' ) {
	    c = *(*cpp+1);
	    ++cpp;
	    switch (c) {
	      case 'h':
		  printf("Options:\n");
		  printf("\t-r <int>: process rows    (default %d)\n", nprow);
		  printf("\t-c <int>: process columns (default %d)\n", npcol);
		  exit(0);
		  break;
	      case 'r': nprow = atoi(*cpp);
		        break;
	      case 'c': npcol = atoi(*cpp);
		        break;
	    }
	} else { /* Last two args are considered a filename */
            if (stiff) { 
               /* open the stiffness matrix */ 
	       if ( !(fp1 = fopen(*cpp, "r")) ) {
                   ABORT("File does not exist");
               }
               stiff = 0;  
            }
            else {
	       if ( !(fp2 = fopen(*cpp, "r")) ) {
                   ABORT("File does not exist");
               }
	       break;
            }
	}
    }

    /* ------------------------------------------------------------
       INITIALIZE MPI ENVIRONMENT. 
       ------------------------------------------------------------*/
    MPI_Init( &argc, &argv );
    t0 = MPI_Wtime();
#ifdef USEAZTEC
    AZ_set_proc_config(proc_config,MPI_COMM_WORLD);
#endif
    /* ------------------------------------------------------------
       INITIALIZE THE SUPERLU PROCESS GRID. 
       ------------------------------------------------------------*/
    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);

    /* Bail out if I do not belong in the grid. */
    iam = grid.iam;
    if ( iam >= nprow * npcol )
	goto out;
    
#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Enter main()");
#endif
    
    /* ------------------------------------------------------------
       PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
       THE OTHER PROCESSES.
       ------------------------------------------------------------*/
    if ( !iam ) {
	/* Print the CPP definitions. */
	cpp_defs();
	
	/* Read the stiffness matrix stored on disk in binary format. */
        fread(&ma  , 4, 1, fp1);
        na = ma;
        fread(&nnza, 4, 1, fp1);

        xa   = (int*)calloc(ma+1, sizeof(int));
        asub = (int*)calloc(nnza, sizeof(int));
        amtx = (double*)calloc(nnza, sizeof(double));

        fread((int*)xa     , 4, na+1, fp1);
        fread((int*)asub   , 4, nnza, fp1);
        fread((double*)amtx, 8, nnza, fp1);
        fclose(fp1);
	printf("\tA: Dimension\t%dx%d\t # nonzeros %d\n", ma, na, nnza);
        /* Change to 0-based index */
        for (i=0;i<na+1;i++) xa[i]--;
        for (i=0;i<nnza;i++) asub[i]--;

        /* Read the mass matrix */
        fread(&mb  , 4, 1, fp2);
        nb = mb;
        fread(&nnzb, 4, 1, fp2);

        xb   = (int*)calloc(mb+1, sizeof(int));
        bsub = (int*)calloc(nnzb, sizeof(int));
        bmtx = (double*)calloc(nnzb, sizeof(double));

        fread((int*)xb     , 4, nb+1, fp2);
        fread((int*)bsub   , 4, nnzb, fp2);
        fread((double*)bmtx, 8, nnzb, fp2);
        fclose(fp2);
        /* Change to 0-based index */
        for (i=0;i<nb+1;i++) xb[i]--;
        for (i=0;i<nnzb;i++) bsub[i]--;

	printf("\tB: Dimension\t%dx%d\t # nonzeros %d\n", mb, nb, nnzb);
	printf("\tProcess grid\t%d X %d\n", grid.nprow, grid.npcol);

	/* Broadcast matrix A to the other PEs. */
	MPI_Bcast( &ma,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &na,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &nnza, 1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( amtx, nnza, MPI_DOUBLE, 0, grid.comm );
	MPI_Bcast( asub, nnza, mpi_int_t,  0, grid.comm );
	MPI_Bcast( xa,   na+1, mpi_int_t,  0, grid.comm );

	/* Broadcast matrix B to the other PEs. */
	MPI_Bcast( &mb,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &nb,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &nnzb, 1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( bmtx, nnzb, MPI_DOUBLE, 0, grid.comm );
	MPI_Bcast( bsub, nnzb, mpi_int_t,  0, grid.comm );
	MPI_Bcast( xb,   nb+1, mpi_int_t,  0, grid.comm );

    } else {
	/* Receive matrix A from PE 0. */
	MPI_Bcast( &ma,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &na,   1,   mpi_int_t,  0, grid.comm );
	MPI_Bcast( &nnza, 1,   mpi_int_t,  0, grid.comm );

	/* Allocate storage for compressed column representation. */
	dallocateA_dist(na, nnza, &amtx, &asub, &xa);

	MPI_Bcast( amtx, nnza, MPI_DOUBLE, 0, grid.comm );
	MPI_Bcast( asub, nnza, mpi_int_t,  0, grid.comm );
	MPI_Bcast( xa,   na+1, mpi_int_t,  0, grid.comm );

        /* Receive matrix B from PE 0. */
        MPI_Bcast( &mb,   1,   mpi_int_t,  0, grid.comm );
        MPI_Bcast( &nb,   1,   mpi_int_t,  0, grid.comm );
        MPI_Bcast( &nnzb, 1,   mpi_int_t,  0, grid.comm );

        /* Allocate storage for compressed column representation. */
        dallocateA_dist(nb, nnzb, &bmtx, &bsub, &xb);

        MPI_Bcast( bmtx, nnzb, MPI_DOUBLE, 0, grid.comm );
        MPI_Bcast( bsub, nnzb, mpi_int_t,  0, grid.comm );
        MPI_Bcast( xb,   nb+1, mpi_int_t,  0, grid.comm );
    }

    /* Make a copy of the stiffness matrix A (amtx will be corrupted by fact)*/ 
    colptr = (int*)calloc((na+1),sizeof(int));
    rowind = (int*)calloc(nnza,sizeof(int));
    nzvals = (double*)calloc(nnza,sizeof(double));
    for (i=0;i<=na;i++)  colptr[i]=xa[i]+1;
    for (i=0;i<nnza;i++) {
       rowind[i]=asub[i]+1;
       nzvals[i]=amtx[i];
    }

    /* A <--- A - sigmar B */
    for (i=0;i<nnza;i++) {
       amtx[i] = amtx[i] - sigmar*bmtx[i];
    }

    /* Create compressed column matrix for A. */
    dCreate_CompCol_Matrix_dist(&A, ma, na, nnza, amtx, asub, xa, NC, D, GE);

    /* Generate the exact solution and compute the right-hand side. */
    if ( !(b = doubleMalloc_dist(ma * nrhs)) ) ABORT("Malloc fails for b[]");
    if ( !(b1 = doubleMalloc_dist(ma * nrhs)) ) ABORT("Malloc fails for b1[]");
    if ( !(b2 = doubleMalloc_dist(ma * nrhs)) ) ABORT("Malloc fails for b2[]");
    *trans = 'N';
    ldx = na;
    ldb = ma;

    if ( !(berr = doubleMalloc_dist(nrhs)) )
	ABORT("Malloc fails for berr[].");

    /* ------------------------------------------------------------
       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
       ------------------------------------------------------------*/

    /* Set the default input options. */
    set_default_options(&options);
    options.RowPerm = NO; /* XSL 4-11-02 */

    /* Initialize ScalePermstruct and LUstruct. */
    ScalePermstructInit(ma, na, &ScalePermstruct);
    LUstructInit(ma, na, &LUstruct);

    /* Initialize the statistics variables. */
    PStatInit(&stat);

    /* Call the linear equation solver: just to factorize when nrhs = 0. */
    nrhs = 0;
    pdgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
		     &LUstruct, berr, &stat, &info);

    PStatPrint(&stat, &grid);        /* Print the statistics. */
    PStatFree(&stat);

    /* ------------------------------------------------------------
       NOW WE CALL pdnaupd to solve the eigenvalue problem
       WE WILL USE THE EXISTING L AND U FACTORS IN
       LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION.
       ------------------------------------------------------------*/
    options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */
    options.IterRefine = NO; /* XSL 4-9-02 */
    PStatInit(&stat); /* Initialize the statistics variables. */

    if (mypid == 0) {
          printf(" nnza = %d\n", nnza);
          printf(" xa[na] = %d\n", xa[na]);
          printf(" asub[0] = %d\n", asub[0]);
          printf(" asub[nnza-1] = %d\n", asub[nnza-1]);
    }

    arcomm = MPI_COMM_WORLD;
    MPI_Comm_rank(arcomm, &mypid);
    nev    = 20;
    ncv    = 40;
    lworkl = 3*ncv*ncv+6*ncv;

    /* distribute ARPACK work space as uniformly as possible */
    nprocs = nprow*npcol;
    psize  = (int*)malloc(nprocs*sizeof(int));
    nbase  = (int*)malloc(nprocs*sizeof(int));
    if (mypid == 0) {
       nloc   = na/nprocs;
       nrem   = na - nloc*nprocs;
       for (i = 0; i<nprocs; i++) {
          psize[i] = nloc;
          if (i<nrem) psize[i]++; 
       }
       nbase[0] = 0;
       for (i = 1; i<nprocs; i++)
          nbase[i] = nbase[i-1]+psize[i-1];

       recvbuf = (double*)malloc(nloc*sizeof(double));
    }
    if (mypid  == 0) printf(" nprocs = %d\n", nprocs);

    MPI_Bcast( psize, nprocs, MPI_INT, 0, arcomm );
    MPI_Bcast( nbase, nprocs, MPI_INT, 0, arcomm );

    nloc = psize[mypid]; 

#ifdef USECRS
#ifdef USEAZTEC
    local_nnz = gccs2lcrs(nb,xb,bsub,bmtx,nbase[mypid],nbase[mypid]+nloc,
			  &row_start,&col_index,&val);
#else
    gccs2lcrs(nb,xb,bsub,bmtx,0,nb,&row_start,&col_index,&val);
#endif
#endif
    
    /* Change the indices of the mass matrix to fortran 1-based indices */
    for (i=0;i<=nb;i++)  xb[i]++;
    for (i=0;i<nnzb;i++) bsub[i]++;
    
    v      = (double*) malloc( nloc*ncv*sizeof(double)  );
    z      = (double*) malloc( nloc*ncv*sizeof(double)  );
    workl  = (double*) malloc( lworkl*sizeof(double) );
    workev = (double*) malloc( 3*nloc*sizeof(double) ); 
    resid  = (double*) malloc( nloc*sizeof(double) );
    dr     = (double*) malloc( ncv*sizeof(double) );
    di     = (double*) malloc( ncv*sizeof(double) );
    select = (int*)    malloc( ncv*sizeof(int));
    for (i=0;i<ncv;i++) {
      dr[i] = 0.0;
      di[i] = 0.0;
    }

    workd  = (double*) malloc( 3*nloc*sizeof(double) );
    tol    = 1.0e-6;
    maxitr = 100;

    iparam[0] = 1;
    iparam[2] = maxitr; 
    iparam[6] = 3;

    printf(" mypid = %d, nloc  = %d\n", mypid, nloc);
    printf(" mypid = %d, nbase = %d\n", mypid, nbase[mypid]);

#ifdef USEAZTEC
    update = calloc(nloc,sizeof(int));
    N_update = nloc;
		    
    for(i=0;i < nloc;i++) {
      update[i]=nbase[mypid]+i;
    }
    bindx = calloc(local_nnz+N_update+1,sizeof(int));
    msrval = calloc(local_nnz+N_update+1,sizeof(double));
    csr2msr(N_update,local_nnz,row_start,col_index,val,bindx,msrval,update);
    free(row_start);
    free(col_index);
    free(val);
    AZ_transform(proc_config, &external, bindx, msrval, update, &update_index,
		 &extern_index, &data_org, N_update, NULL, NULL, NULL, NULL,
		 AZ_MSR_MATRIX);
    Bmat = AZ_matrix_create(data_org[AZ_N_internal]+data_org[AZ_N_border]);
    AZ_set_MSR(Bmat,bindx,msrval,data_org,0,NULL,AZ_LOCAL);
    aztotal= data_org[AZ_N_internal]+data_org[AZ_N_border]+
      data_org[AZ_N_external] + 1;
    dsrc = calloc(aztotal,sizeof(double));
    ddest = calloc(aztotal,sizeof(double));
    ddest2 = calloc(aztotal,sizeof(double));
#endif

    ido   = 0;
    arflg = 0;
   
    fprintf(stderr, " calling pdnaupd ...\n");

    while (ido != 99) {
      pdnaupd(&arcomm, &ido   , bmat,  &nloc, which, &nev , &tol , 
	      resid , &ncv   , v    , &nloc, iparam, ipntr, workd,
	      workl , &lworkl, &arflg);
      if (ido == -1) {
	/* collect the local pieces from all processors 
	   and put it in b1 */
	pin  = ipntr[0];
	pout = ipntr[1];
	MPI_Allgatherv(&workd(pin),nloc,MPI_DOUBLE,b1,psize,nbase,
		       MPI_DOUBLE,arcomm);
	/* do b2 <--- B*b1 */
#ifdef USECRS
#ifdef USEAZTEC
	/* Copy source */
	bzero(dsrc,aztotal*sizeof(double)); 
	dcopy(&nloc, &workd(pin), &ione, dsrc, &ione);
	AZ_reorder_vec(dsrc,data_org,update_index,NULL);
	Bmat->matvec(dsrc,ddest,Bmat,proc_config);
	AZ_invorder_vec(ddest,data_org,update_index,NULL,ddest2);
	MPI_Allgatherv(ddest2,nloc,MPI_DOUBLE,b2,psize,nbase,
		       MPI_DOUBLE,arcomm);
#else
	   crsmvm(nb,row_start,col_index,val,b1,b2);
#endif
#else
           iopt = 1;  
           dmvm( &nb, bmtx, bsub, xb, b1, b2, &iopt );
#endif
           nrhs = 1;
           pdgssvx_ABglobal(&options, &A       , &ScalePermstruct, 
                            b2      , ldb      , nrhs            ,
                            &grid   , &LUstruct, berr            ,
                            &stat   , &info);
	   
           /* put the solution into workd */
           ibeg = nbase[mypid];
           dcopy(&nloc, &b2[ibeg], &ione, &workd(pout), &ione);
       }
       else if (ido == 1) {

           /* collect the local pieces from all processors
              and put it in b1 */
           pin  = ipntr[2];
           pout = ipntr[1];
           MPI_Allgatherv(&workd(pin),nloc,MPI_DOUBLE,b1,psize,nbase,
				MPI_DOUBLE,arcomm);
           nrhs = 1;
           pdgssvx_ABglobal(&options, &A       , &ScalePermstruct,
                            b1      , ldb      , nrhs            ,
                            &grid   , &LUstruct, berr            ,
                            &stat   , &info);

           /* put the solution into workd */
           ibeg = nbase[mypid];
           dcopy(&nloc, &b1[ibeg], &ione, &workd(pout), &ione);

       }
       else if (ido == 2) {

           /* collect the local pieces from all processors
              and put it in b1 */
           
           pin  = ipntr[0];
           pout = ipntr[1];

           MPI_Allgatherv(&workd(pin),nloc,MPI_DOUBLE,b1,psize,nbase,
				MPI_DOUBLE,arcomm);

#ifdef USECRS
#ifdef USEAZTEC
	bzero(dsrc,aztotal*sizeof(double)); 
	dcopy(&nloc, &workd(pin), &ione, dsrc, &ione);
	AZ_reorder_vec(dsrc,data_org,update_index,NULL);
	Bmat->matvec(dsrc,ddest,Bmat,proc_config);
	AZ_invorder_vec(ddest,data_org,update_index,NULL,ddest2);
	MPI_Allgatherv(ddest2,nloc,MPI_DOUBLE,b2,psize,nbase,
		       MPI_DOUBLE,arcomm);
#else
	   crsmvm(nb,row_start,col_index,val,b1,b2);
#endif
#else
           iopt = 1;  
           dmvm( &nb, bmtx, bsub, xb, b1, b2, &iopt);
#endif
           /* put the product into workd */
           ibeg = nbase[mypid];
           dcopy(&nloc, &b2[ibeg], &ione, &workd(pout), &ione);
       }
    }

    /* DEALLOCATE SOME STORAGE */
 
    Destroy_CompCol_Matrix_dist(&A);
    Destroy_LU(na, &grid, &LUstruct);
    ScalePermstructFree(&ScalePermstruct);
    LUstructFree(&LUstruct);

    if (arflg == 0) {
       rvec  = 1;
       nconv = iparam[4];
       pdneupd(&arcomm , &rvec  , howmny , select , dr    , di   ,  
               z      , &nloc  , &sigmar, &sigmai, workev, bmat ,
               &nloc  , which  , &nev   , &tol   , resid , &ncv ,
               v      , &nloc  , iparam , ipntr  , workd , workl,
               &lworkl, &arflg);
       if (arflg==0) {
          /* compute residual */ 
          if (!iam ) printf(" nconv = %d\n", nconv);
    
          /* Check the accuracy of the solution. */
          if (mypid == 0) {
             zglb = (double*) malloc(na*nconv*sizeof(double));
             res  = (double*) malloc(nconv*sizeof(double));
          }

          for (i=0;i<nconv;i++) {
             if (mypid == 0) {
                dcopy(&nloc, &z(1,i+1), &ione, &zglb(1,i+1), &ione);
                for (j=1;j<nprocs;j++) {
                   MPI_Recv(recvbuf, psize[j]   , MPI_DOUBLE,
                            j      , MPI_ANY_TAG, arcomm    ,
                            &mpistat);
                   sid  = mpistat.MPI_SOURCE;
                   ibeg = nbase[sid];
                   dcopy(&psize[j], recvbuf, &ione, &zglb(ibeg+1,i+1), &ione);
                }
             }
             else {
                MPI_Send(&z(1,i+1), nloc  , MPI_DOUBLE, 0,
                         mypid    , arcomm); 
             }
          } /* end for */

          if (mypid == 0) {     
             dngeres(&na , &nconv, colptr, rowind, nzvals, xb,  
                     bsub, bmtx  , dr    , di    , zglb  , &na,
                     res);
             for (i = 0; i<nconv; i++)  
                printf( "dr = %12.5e,  di = %12.5e,  res = %11.4e\n", 
                         dr[i], di[i], res[i]); 
             free(zglb);
             free(res);
          }

       } /* end if (arflg) */
    }

    /* Print the statistics. */
    PStatPrint(&stat, &grid);
    t1 = MPI_Wtime();
    t1 = t1 - t0;
    if (!iam) printf("Total run time = %11.3e\n", t1);

    /* ------------------------------------------------------------
       DEALLOCATE STORAGE.
       ------------------------------------------------------------*/
    PStatFree(&stat);
    SUPERLU_FREE(b);
    SUPERLU_FREE(b1);
    SUPERLU_FREE(b2);
    SUPERLU_FREE(berr);

    free(v);
    free(z);
    free(workl);
    free(workev);
    free(workd);
    free(resid);
    free(psize);
    free(nbase);
    free(dr);
    free(di);

    free(external);
    free(update);
    free(update_index);
    free(extern_index);
    free(data_org);
    free(dsrc);
    free(ddest);
    free(ddest2);
    free(bindx);
    free(msrval);
    AZ_free(Bmat);

    if (mypid == 0) free(recvbuf);

    /* ------------------------------------------------------------
       RELEASE THE SUPERLU PROCESS GRID.
       ------------------------------------------------------------*/
out:
    superlu_gridexit(&grid);

    /* ------------------------------------------------------------
       TERMINATES THE MPI EXECUTION ENVIRONMENT.
       ------------------------------------------------------------*/
    MPI_Finalize();

#if ( DEBUGlevel>=1 )
    CHECK_MALLOC(iam, "Exit main()");
#endif

}


int cpp_defs()
{
    printf(".. CPP definitions:\n");
#if ( PRNTlevel>=1 )
    printf("\tPRNTlevel = %d\n", PRNTlevel);
#endif
#if ( DEBUGlevel>=1 )
    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
#endif
#if ( PROFlevel>=1 )
    printf("\tPROFlevel = %d\n", PROFlevel);
#endif
#if ( StaticPivot>=1 )
    printf("\tStaticPivot = %d\n", StaticPivot);
#endif
#ifdef USECRS
#ifdef USEAZTEC
    printf("\tAztec Matrix Multiplication\n");
#else
    printf("\tCRS Matrix Multiplication\n");
#endif
#endif
    printf("....\n");
}


syntax highlighted by Code2HTML, v. 0.9.1