/**************************************************************************
*                                                                         *
*  Author      : Dr. Thomas Brandes, GMD, I1.HR                           *
*  Copyright   : GMD St. Augustin, Germany                                *
*  Date        : Apr 92                                                   *
*  Last Update : May 92                                                   *
*                                                                         *
*  This Module is part of the DALIB                                       *
*                                                                         *
*  Module      : schedule.c                                               *
*                                                                         *
*  Function    : Operations for indirect addressing of distributed arrays *
*                                                                         *
*  Export : FORTRAN Interface                                             *
*                                                                         *
*  - A, P, Q, ... are conform and are distributed                         *
*  - B is indirect accessed, rank determines number of P, Q, ...          *
*  - A and B have same type                                               *
*                                                                         *
*  A = B [P]  : A[j] = B[P[j]]    for all j                               *
*                                                                         *
*  void dalib_global_getm1_ (a, a_size, a_elems,                          *
*                            b, b_N1, p, mask   )                         *
*                                                                         *
*  B [P] = A  : B[P[j]] = A[j]    for all j                               *
*                                                                         *
*  void dalib_global_setm1_ (op, b, b_N1, p,                              *
*                            a, a_size, a_elems, mask)                    *
*                                                                         *
*  A = B [P,Q]  : A[j] = B[P[j],Q[j]]    for all j                        *
*                                                                         *
*  void dalib_global_getm2_ (a, a_size, a_elems,                          *
*                            b, b_N1, b_N2, p, q, mask   )                *
*                                                                         *
*  B [P,Q] = A  : B[P[j],Q[j]] = A[j]    for all j                        *
*                                                                         *
*  void dalib_global_setm2_ (op, b, b_N1, b_N2, p, q,                     *
*                            a, a_size, a_elems, mask)                    *
*                                                                         *
**************************************************************************/

#undef DEBUG
#include "system.h"
#include <stdio.h>

/*********************************************************
*                                                        *
*  Global data for a schedule                            *
*                                                        *
*********************************************************/

  /******************************************************************
  *                                                                 *
  *  send_schedule                                                  *
  *  -------------------------------------------------------------  *
  *  | s_no[0]      | s_no[1]     |                  | s_no[k-1] |  *
  *  -------------------------------------------------------------  *
  * /\             /\            /\                 /\              *
  *  | s_ptr[0]     | s_ptr[1]       .......         | s_ptr[k-1]   *
  *                                                                 *
  *                                                                 *
  *  s_no[p-1] : number of indexes that point to process p          *
  *                                                                 *
  *  send_schedule [s_ptr[p-1]]                 index values        *
  *  send_schedule [s_ptr[p-1]+1]               that point to       *
  *  send_schedule [s_ptr[p-1]+2]               process p           *
  *  ............................                                   *
  *  send_schedule [s_ptr[p-1]+s_no[p-1]-1]                         *
  *                                                                 *
  *  send_values : size of array send_schedule                      *
  *                                                                 *
  *  index[i] is in send_schedule[local_index[i]]                    *
  *                                                                 *
  ******************************************************************/

  int index_values;    /* number of elements in index, local part */
  int send_values;     /* number of true index elements, mask!    */
  int *send_schedule;  /* size is send_values */
  int *s_ptr;          /* size is nproc()     */
  int *s_no;           /* size is nproc()     */

  int *local_index;    /* size is send_values */

  /******************************************************************
  *                                                                 *
  *  recv_schedule                                                  *
  *  -------------------------------------------------------------  *
  *  | r_no[0]      | r_no[1]     |                  | r_no[k-1] |  *
  *  -------------------------------------------------------------  *
  * /\             /\            /\                 /\              *
  *  | r_ptr[0]     | r_ptr[1]       .......         | r_ptr[k-1]   *
  *                                                                 *
  *                                                                 *
  *  r_no[p-1] : number of indexes of process p that point to me    *
  *                                                                 *
  *  recv_schedule [r_ptr[p-1]]                 index values        *
  *  recv_schedule [r_ptr[p-1]+1]               of process p that   *
  *  recv_schedule [r_ptr[p-1]+2]               point to values     *
  *  ............................               belonging to        *
  *  recv_schedule [r_ptr[p-1]+r_no[p-1]-1]     this process        *
  *                                                                 *
  *  recv_values : size of array recv_schedule                      *
  *                                                                 *
  ******************************************************************/

  int recv_values;
  int *recv_schedule;
  int *r_ptr;
  int *r_no;

  int operation;   /* operation for global set, 0 - 18 */

/*********************************************************
*                                                        *
*  ERROR - Handling                                      *
*                                                        *
*********************************************************/

void dalib_schedule_error (s)
char s[];
{ printf ("FATAL ERROR : dalib_schedule_...\n");
  printf ("Message : %s\n", s);
  exit (-1);
}

#ifdef DEBUG
/*********************************************************
*                                                        *
*  P R I N T I N G    T E S T    O U T P U T             *
*                                                        *
*********************************************************/

void dalib_print_send_schedule ()

{ int k;

  /* lock (&sm->global_lock); */

  /* print the send_schedule */

  printf ("Process %d has %d of %d pointers to other values\n",
           pcb.i, send_values, index_values);

  for (k=0; k<pcb.p; k++)
      printf ("Process %d needs %d values of process %d\n",
              pcb.i, s_no[k], k+1);

  /* unlock (&sm->global_lock); */
}

void dalib_print_recv_schedule ()

{ int k;

  /* lock (&sm->global_lock); */

  /* print the recv_schedule */

  printf ("Process %d has %d pointers from other ones to its values\n",
           pcb.i, recv_values);

  for (k=0; k<pcb.p; k++)
      printf ("Process %d has %d indexes of process %d\n",
              pcb.i, r_no[k], k+1);

  /* unlock (&sm->global_lock); */
}

void dalib_print_send_sched_values ()

{ /* print the send_schedule */
 
  int j;

  /* lock (&sm->global_lock); */

  printf ("send_schedule of process %d \n", pcb.i);
  printf ("=========================== \n");

  printf ("\n");
  printf ("send values = %d\n", send_values);
  printf ("\n");
  for (j=0;j<pcb.p;j++)
    printf ("proc %d: ptr = %d, no = %d \n", j+1, s_ptr[j], s_no[j]);
   
  printf ("\n");
  for (j=0;j<send_values;j++)
    printf ("send_schedule (%d) = %d\n", j, send_schedule[j]);
  printf ("\n");

  /* unlock (&sm->global_lock); */

} /* dalib_print_send_sched_values */

 
void dalib_print_recv_sched_values ()

{ /* print the recv schedule */

  int j;

  /* lock (&sm->global_lock); */

  printf ("recv_schedule of process %d \n", pcb.i);
  printf ("=========================== \n");

  printf ("\n");
  for (j=0;j<pcb.p;j++)
    printf ("proc %d: ptr = %d , no = %d \n", j+1, r_ptr[j], r_no[j]);
   
  printf ("\n");
  printf ("recv_values = %d\n", recv_values);
  printf ("\n");

  for (j=0;j<recv_values;j++)
    printf ("recv_schedule(%d,%d) = %d\n", pcb.i, j, recv_schedule[j]);
  printf ("\n");

  /* unlock (&sm->global_lock); */

} /* dalib_print_recv_sched_values */

#endif
 
/*********************************************************
*                                                        *
*  Help function to determine fictive zero of array      *
*                                                        *
*********************************************************/

unsigned char * dalib_array_zero1 (a, a_size, a_N1)
unsigned char *a;
int a_size, a_N1;

{ int lb, ub, offset;

  /* determination of offset */

  dalib_local_extensions (pcb.i, a_N1, &lb, &ub);

  offset = (lb - 1) * a_size;
  return (a - offset);
}


unsigned char *dalib_array_zero2 (a, a_size, a_N1, a_N2)
unsigned char *a;
int a_size, a_N1, a_N2;

{ int lb, ub, offset;

  /* determination of offset */

  dalib_local_extensions (pcb.i, a_N2, &lb, &ub);

  offset = a_N1 * (lb - 1);
  offset = offset * a_size;
  return (a - offset);
}

/*********************************************************
*                                                        *
*  PROCEDURES for global SET                             *
*                                                        *
*********************************************************/

          /*************************************
          *                                    *
          *  C O P Y                           *
          *                                    *
          *************************************/

void dalib_set_copy (a_zero, values, size)
unsigned char *a_zero, *values;
int size;

{ int i, j;
  unsigned char *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = values[j]  */
      ptr = a_zero + recv_schedule[j] * size;
      for (i=0;i<size;i++)
        ptr[i] = values[j*size + i];
    }
} /* dalib_set_copy */

          /*************************************
          *                                    *
          *  M I N V A L                       *
          *                                    *
          *************************************/

void dalib_set_min_ints (a_zero, values)
int *a_zero, *values;

{ int j;
  int *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = min (a[recv-schedule[j], values[j])  */
      ptr = a_zero + recv_schedule[j];
      if (values[j] < *ptr) *ptr = values[j];
    }
} /* dalib_set_min_ints */

void dalib_set_min_reals (a_zero, values)
float *a_zero, *values;

{ int j;
  float *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = min (a[recv-schedule[j], values[j])  */
      ptr = a_zero + recv_schedule[j];
      if (values[j] < *ptr) *ptr = values[j];
    }
} /* dalib_set_min_ints */

void dalib_set_min_doubles (a_zero, values)
double *a_zero, *values;

{ int j;
  double *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = min (a[recv-schedule[j], values[j])  */
      ptr = a_zero + recv_schedule[j];
      if (values[j] < *ptr) *ptr = values[j];
    }
} /* dalib_set_min_doubles */

          /*************************************
          *                                    *
          *  M A X V A L                       *
          *                                    *
          *************************************/

void dalib_set_max_ints (a_zero, values)
int *a_zero, *values;

{ int j;
  int *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = max (a[recv-schedule[j], values[j])  */
      ptr = a_zero + recv_schedule[j];
      if (values[j] > *ptr) *ptr = values[j];
    }
} /* dalib_set_max_ints */

void dalib_set_max_reals (a_zero, values)
float *a_zero, *values;

{ int j;
  float *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = max (a[recv-schedule[j], values[j])  */
      ptr = a_zero + recv_schedule[j];
      if (values[j] > *ptr) *ptr = values[j];
    }
} /* dalib_set_max_ints */

void dalib_set_max_doubles (a_zero, values)
double *a_zero, *values;

{ int j;
  double *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = max (a[recv-schedule[j], values[j])  */
      ptr = a_zero + recv_schedule[j];
      if (values[j] > *ptr) *ptr = values[j];
    }
} /* dalib_set_max_doubles */

          /*************************************
          *                                    *
          *  S U M                             *
          *                                    *
          *************************************/

void dalib_set_add_ints (a_zero, values)
int *a_zero, *values;

{ int j;
  int *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = a[recv-schedule[j] + values[j]  */
      ptr = a_zero + recv_schedule[j];
      *ptr += values[j];
    }
} /* dalib_set_add_ints */

void dalib_set_add_reals (a_zero, values)
float *a_zero, *values;

{ int j;
  float *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = a[recv-schedule[j] + values[j]  */
      ptr = a_zero + recv_schedule[j];
      *ptr += values[j];
    }
} /* dalib_set_add_reals */

void dalib_set_add_doubles (a_zero, values)
double *a_zero, *values;

{ int j;
  double *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = a[recv-schedule[j] + values[j]  */
      ptr = a_zero + recv_schedule[j];
      *ptr += values[j];
    }
} /* dalib_set_add_doubles */

          /*************************************
          *                                    *
          *  P R O D U C T                     *
          *                                    *
          *************************************/

void dalib_set_mult_ints (a_zero, values)
int *a_zero, *values;

{ int j;
  int *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = a[recv-schedule[j] * values[j]  */
      ptr = a_zero + recv_schedule[j];
      *ptr *= values[j];
    }
} /* dalib_set_mult_ints */

void dalib_set_mult_reals (a_zero, values)
float *a_zero, *values;

{ int j;
  float *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = a[recv-schedule[j] * values[j]  */
      ptr = a_zero + recv_schedule[j];
      *ptr *= values[j];
    }
} /* dalib_set_mult_reals */

void dalib_set_mult_doubles (a_zero, values)
double *a_zero, *values;

{ int j;
  double *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = a[recv-schedule[j] * values[j]  */
      ptr = a_zero + recv_schedule[j];
      *ptr *= values[j];
    }
} /* dalib_set_mult_doubles */

          /*************************************
          *                                    *
          *  AND, OR, EOR                      *
          *                                    *
          *************************************/

void dalib_set_and_ints (a_zero, values)
int *a_zero, *values;

{ int j;
  int *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = a[recv-schedule[j] & values[j]  */
      ptr = a_zero + recv_schedule[j];
      *ptr = *ptr | values[j];
    }
} /* dalib_set_and_ints */

void dalib_set_or_ints (a_zero, values)
int *a_zero, *values;

{ int j;
  int *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = a[recv-schedule[j] | values[j]  */
      ptr = a_zero + recv_schedule[j];
      *ptr = *ptr | values[j];
    }
} /* dalib_set_or_ints */

void dalib_set_eor_ints (a_zero, values)
int *a_zero, *values;

{ int j;
  int *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = a[recv-schedule[j] ^ values[j]  */
      ptr = a_zero + recv_schedule[j];
      *ptr = *ptr ^ values[j];
    }
} /* dalib_set_eor_ints */

          /*************************************
          *                                    *
          *  ALL, ANY, PARITY                  *
          *                                    *
          *************************************/

void dalib_set_or_bools (a_zero, values)
int *a_zero, *values;

{ int j;
  int *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = a[recv-schedule[j] || values[j]  */
      ptr = a_zero + recv_schedule[j];
      *ptr = *ptr || values[j];
    }
} /* dalib_set_or_bools */

void dalib_set_and_bools (a_zero, values)
int *a_zero, *values;

{ int j;
  int *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = a[recv-schedule[j] || values[j]  */
      ptr = a_zero + recv_schedule[j];
      *ptr = *ptr && values[j];
    }
} /* dalib_set_and_bools */

void dalib_set_neq_bools (a_zero, values)
int *a_zero, *values;

{ int j;
  int *ptr;

  for (j=0; j<recv_values; j++)
    { /* a[recv_schedule[j]] = a[recv-schedule[j] || values[j]  */
      ptr = a_zero + recv_schedule[j];
      if (values[j]) *ptr = !(*ptr);
    }
} /* dalib_set_neq_bools */

/*******************************************************************
*                                                                  *
*  dalib_schedule_init_ (b_N, index, index_size, mask)             *
*                                                                  *
*  Initialization of a schedule                                    *
*                                                                  *
*  in:  b_N (number of elements in the distributed dimension of b) *
*       index (index values for array b)                           *
*       index_size (number of elements on this processor)          *
*       mask (mask for values to be indexed, mask == index pos)    *
*                                                                  *
*  out: index_values, send_values, recv_values                     *
*       s_no, r_no                                                 *
*                                                                  *
*******************************************************************/

void dalib_schedule_init_ (b_N, index, index_elems, mask)

/* index must be pointer to distributed integer*4 array
   index_elems is number of elements of index locally
   b_N is number of elements in the distributed dimension
   mask  must be pointer to distributed logical*4 array 
   index == mask means there is no mask                  */

int *b_N, *index, *index_elems, *mask;

{ int b_elems;
  int id, nproc, p;    /* processor numbers */
  int j, k;

  id = pcb.i;
  nproc = pcb.p;

  index_values = *index_elems;

#ifdef DEBUG
  printf ("Process %d starts schedule_init, b_N = %d, indexes = %d\n", 
           id, *b_N, index_values);
#endif

  /* intialize s_no */

  s_no = (int *) malloc (4 * nproc);
#ifdef MEIKO_CS2
  ew_touchBuf(s_no, sizeof(int)*nproc);
#endif
  for (k=0; k<nproc; k++)
      s_no [k] = 0;

  local_index = (int *) malloc (4 * index_values);

  /* saves the processor ids */

  /* count number of sending values */

  b_elems  = *b_N;

  if (index == mask)       /* no mask */
    { for (j=0; j < index_values;j++)
       { /* which processor has b[index[j]] */
         p = (index[j] * nproc - 1) / b_elems + 1;
         local_index[j] = p;
         /* make sure that p is between 1 and nproc */
         if ((p<=0) || (p>nproc))
          { printf ("Index value %d out of range\n", index[j]);
            printf ("Fatal ERROR, will exit\n");
            exit(-1);
          }
         s_no [p-1] += 1;
       }
      send_values = index_values;
    }
   else /* there is a mask */
    { send_values = 0;
      for (j=0; j< index_values;j++)
       { if (mask[j])
         { /* which processor has b[index[j]] */
           p = (index[j] * nproc - 1) / b_elems + 1;
           local_index[j] = p;
           /* make sure that p is between 1 and nproc */
           if ((p<=0) || (p>nproc))
            { printf ("Index value %d out of range\n", index[j]);
              printf ("Fatal ERROR, will exit\n");
              exit(-1);
            }
           s_no [p-1] += 1;
           send_values += 1;
         }
         else local_index[j] = -1;   /* not existing processor */
       } /* for */
    } /* with mask */

  /* now it is known how many elements of index have to be send */

  for (k=0; k<nproc; k++)
      asend (id, k+1, s_no + k, 4);
 
# ifdef DEBUG
  dalib_print_send_schedule ();
# endif

  /* now it will be known how many elements other processors need */

  r_no = (int*) malloc (4 * nproc);
#ifdef MEIKO_CS2
  ew_touchBuf(r_no, nproc*sizeof(int));
#endif
  recv_values = 0;

  for (k=0; k<nproc; k++)
    { areceive (id, k+1, r_no + k, 4);
      recv_values += r_no[k];
    }

# ifdef DEBUG
  dalib_print_recv_schedule ();
# endif

  /* now every process knows

      - how many of my indexes point to the other processors
      - how many indexes of other processors point to me 
  */

} /* dalib_schedule_init */

/*********************************************************
*                                                        *
*  dalib_schedule_send_ (b_N, index)                     *
*                                                        *
*  Definition of send_schedule, sending it               *
*                                                        *
*  in:  b_N  (number of elements of b in distrib. dim)   *
*       index (index values for array b)                 *
*                                                        *
*  out: s_ptr, send_schedule                             *
*       indexes will be send                             *
*                                                        *
*********************************************************/

void dalib_schedule_send_ (b_N, index)

int *b_N, *index;

{ int id, nproc, p;    /* processor numbers */
  int j, k;

  id = pcb.i;
  nproc = pcb.p;

  /* make the s_ptr array */

  s_ptr = (int *) malloc (4 * nproc);
  s_ptr[0] = 0;
  for (k=1; k<nproc; k++)
    { s_ptr [k] = s_ptr[k-1] + s_no [k-1]; 
    }

  /* make the send_schedule and local_index array */

  send_schedule = (int *) malloc (4 * send_values);
#ifdef MEIKO_CS2
  ew_touchBuf(send_schedule, sizeof(int)*send_values);
#endif

  for (j=0; j<index_values;j++)
    { /* collect indexes in send_schedule, local_index */
      p = local_index[j];
      if (p >= 0)                   /* mask is hidden in local_index */
        { local_index[j] = s_ptr[p-1];
          send_schedule [s_ptr[p-1]] = index[j] - 1;
          s_ptr[p-1] += 1;
        }
    }

  /* reset the s_ptr array */

  s_ptr[0] = 0;
  for (k=1; k<nproc; k++)
     s_ptr [k] = s_ptr[k-1] + s_no [k-1]; 

  /* send the index values that point to other processors  */

  for (k=0; k<nproc; k++)
    if (s_no[k] > 0)
      asend (id, k+1, send_schedule + s_ptr[k], s_no[k] * 4);

# ifdef DEBUG
  dalib_print_send_sched_values ();
# endif
 
  free (send_schedule);

} /* dalib_schedule_send */
 
/*********************************************************
*                                                        *
*  dalib_schedule_send2_ (b_N1, b_N2, index1, index2 )   *
*                                                        *
*  Definition of send_schedule, sending it               *
*                                                        *
*  in:  b_N (description of indexed array, rank = 2)     *
*       index1 (index values for array b)                *
*       index2 (index values for array b)                *
*                                                        *
*  out: s_ptr, send_schedule                             *
*       indexes will be send                             *
*                                                        *
*********************************************************/

void dalib_schedule_send2_ (b_N1, b_N2, index1, index2)

int *b_N1, *b_N2, *index1, *index2;

{ int id, nproc, p;    /* processor numbers */
  int j, k;

  id = pcb.i;
  nproc = pcb.p;

  /* make the s_ptr array */

  s_ptr = (int *) malloc (4 * nproc);
  s_ptr[0] = 0;
  for (k=1; k<nproc; k++)
    { s_ptr [k] = s_ptr[k-1] + s_no [k-1]; 
    }

  /* make the send_schedule and local_index array */

  send_schedule = (int *) malloc (4 * send_values);
#ifdef MEIKO_CS2
  ew_touchBuf(send_schedule, sizeof(int)*send_values);
#endif

  for (j=0; j<index_values;j++)
    { /* collect indexes in send_schedule, local_index */
      p = local_index[j];
      if (p >= 0)                  /* mask is hidden in local_index */
        { local_index[j] = s_ptr[p-1];
          send_schedule [s_ptr[p-1]] =   (index2[j] - 1) * *b_N1
                                       + (index1[j] - 1) ;
          s_ptr[p-1] += 1;
        }
    }

  /* reset the s_ptr array */

  s_ptr[0] = 0;
  for (k=1; k<nproc; k++)
     s_ptr [k] = s_ptr[k-1] + s_no [k-1]; 

  /* send the index values that point to other processors  */

  for (k=0; k<nproc; k++)
    if (s_no[k] > 0)
      asend (id, k+1, send_schedule + s_ptr[k], s_no[k] * 4);

# ifdef DEBUG
  dalib_print_send_sched_values ();
# endif
 
  free (send_schedule);

} /* dalib_schedule_send2 */
 
/*********************************************************
*                                                        *
*  dalib_schedule_recv_ ()                               *
*                                                        *
*  Definition of recv_schedule, receiving it             *
*                                                        *
*  out: s_ptr, send_schedule                             *
*       indexes will be send                             *
*                                                        *
*********************************************************/

void dalib_schedule_recv_ ()

{ int id, nproc;
  int j, k;

  id = pcb.i;
  nproc = pcb.p;

  /* make the r_ptr array */

  r_ptr = (int *) malloc (4 * nproc);

  /* set r_ptr by using numbers of r_no */

  r_ptr[0] = 0;
  for (k=1; k<nproc; k++)
    r_ptr [k] = r_ptr[k-1] + r_no [k-1]; 

  /* make the recv_schedule array */

  recv_schedule = (int *) malloc (recv_values * 4);
#ifdef MEIKO_CS2
  ew_touchBuf(recv_schedule, sizeof(int)*recv_values);
#endif

  /* receive the indexes */

  for (k=0; k<nproc; k++)
    if (r_no[k] > 0)
       areceive (id, k+1, recv_schedule + r_ptr[k], r_no[k] * 4);

# ifdef DEBUG
  dalib_print_recv_sched_values ();
# endif
 
} /* dalib_schedule_recv */

/*********************************************************
*                                                        *
*  dalib_switching_init_ (b_N, index, index_elems)       *
*                                                        *
*  Initialization of a schedule of switching             *
*                                                        *
*  in:  b_N (siye of b in distributed dimension)         *
*       index (index values for array b)                 *
*       index_elems (number of elems of A, index)        *
*                                                        *
*  out: send_values, recv_values                         *
*       s_no, r_no                                       *
*                                                        *
*********************************************************/

void dalib_switching_init_ (b_N, index, index_elems)
int *b_N;
int *index;
int *index_elems;

/* index must be pointer to distributed integer*4 array */

{ int id, nproc, p;    /* processor numbers */
  int j, k;
  int b_elems;         /* is equal *b_N     */

  id = pcb.i;
  nproc = pcb.p;

  index_values = *index_elems;
  send_values = index_values;          /* no mask in this case */
  
  /* intialize s_no */

  s_no = (int *) malloc (4 * nproc);
  for (k=0; k<nproc; k++)
      s_no [k] = 0;

  local_index   = (int *) malloc (4 * index_values);

  /* saves the processor ids */

  /* count number of sending values */

  b_elems  = *b_N;

  for (j=0; j<index_values;j++)
    { /* which processor has b[index[j]] */
      p = (index[j] * nproc - 1) / b_elems + 1;
      local_index[j] = p;
      /* make sure that p is between 1 and nproc */
      if ((p<=0) || (p>nproc))
       { printf ("Index value %d out of range\n", index[j]);
         printf ("Fatal ERROR, will exit\n");
         exit(-1);
       }
      s_no [p-1] += 1;
    }

  /* now it is known how many elements of index have to be send and recvd */

  /* now it will be known how many elements other processors need */

  r_no = (int*) malloc (4 * nproc);
  recv_values = send_values ;

  for (k=0; k<nproc; k++)
    r_no[k] = s_no[k];

} /* dalib_switching_init */

/*********************************************************
*                                                        *
*  dalib_schedule_get_send (b_zero, b_size)              *
*                                                        *
*  - send the requrested values of b to other processes  *
*                                                        *
*  in :   - b_zero : my fiktive zero address of b        *
*         - b_size : number of bytes of one elem in b    *
*                                                        *
*  - uses recv_schedule that tells which elements        *
*    are needed by the other processors                  *
*                                                        *
*  - done by building array of values and sending it     *
*                                                        *
*********************************************************/

void dalib_schedule_get_send_ (b_zero, b_size)

unsigned char *b_zero;  /* fiktive zero address of b            */
int *b_size;            /* number of bytes for one element of b */

{ int id, nproc;
  int j, k, i;
  int size;
 
  unsigned char *values;     /* arrays with the values */
  unsigned char *ptr;        /* pointer for copy values */

  id = pcb.i;
  nproc = pcb.p;

  /* Step 1: use recv_schedudule to get values needed by other processes */

  /* size of one element of array b */

  size = *b_size;

  values = (unsigned char *) malloc (recv_values * size);

  if (size == 4)
    dalib_memget4 (values, b_zero, recv_schedule, recv_values);
  else if (size == 8)
    dalib_memget8 (values, b_zero, recv_schedule, recv_values);
  else
    { for (j=0;j<recv_values;j++)
        { /* values [j] = b [recv_schedule[j]] */
          ptr = b_zero + recv_schedule[j] * size;
          for (i=0;i<size;i++)
             values [j*size+i] = ptr[i];
        }
    }

  /* Step 2: send these values to other processors */

  for (k=0; k<nproc; k++)
    if (r_no[k] > 0)
      asend (id, k+1, values + r_ptr[k]*size, r_no[k] * size);
        
  free (values);

} /* dalib_schedule_get_send_ */

/*********************************************************
*                                                        *
*  dalib_schedule_get_recv (a, a_size)                   *
*                                                        *
*  - get the requested values and put in in my a         *
*                                                        *
*  in :   - a      : my address of array a               *
*         - a_size : number of bytes of one elem in a    *
*                                                        *
*  - uses send_schedule that tells which elements        *
*    processor got by the other processors               *
*                                                        *
*  - use also local_index that tells where to put        *
*    the received values in my array a                   *
*                                                        *
*********************************************************/

void dalib_schedule_get_recv_ (a, a_size)

int *a_size;         /* number of bytes for one element */
unsigned char *a;    /* pointer to first element of a   */

{ int id, nproc;
  int i, j, k;
  int size;
 
  unsigned char *values;     /* arrays with the received values */
  unsigned char *ptr;        /* pointer for copy values */

  id = pcb.i;
  nproc = pcb.p;

  /* size of one element of array a */

  size = *a_size;

  /* Step 1: recv my values from other processors */

  values = (unsigned char *) malloc (send_values * size);
#ifdef MEIKO_CS2
  ew_touchBuf(values, send_values*size);
#endif

  for (k=0; k<nproc; k++)
    if (s_no[k] > 0)
      areceive (id, k+1, values + s_ptr[k]*size, s_no[k] * size);

  /* Step 2: put the values to the right place in array a */

  if (size == 4)
    dalib_memgetm4 (a, values, local_index, index_values);
  else if (size == 8)
    dalib_memgetm8 (a, values, local_index, index_values);
  else
    { for (j=0; j<index_values; j++)
        { /* a[j] = b [index[j]]  */
          if (local_index[j] >= 0)               /* only if mask */
            { ptr = values + local_index[j] * size;
              for (i=0;i<size;i++)
                a[j*size+i] = ptr[i];
            }
        }
    }

  free (values);
}

/*********************************************************
*                                                        *
*  dalib_schedule_set_send (a, a_size)                   *
*                                                        *
*  - send the values of a to the processors that get it  *
*                                                        *
*  in :   - a      : my address of array a               *
*         - a_size : number of bytes of one elem in a    *
*                                                        *
*  - uses local_index that tells how to pack my          *
*    elements to be sent to other processors             *
*                                                        *
*********************************************************/

void dalib_schedule_set_send_ (a, a_size)
int *a_size;
unsigned char *a;

{ int id, nproc;
  int j, k, i;
  int size;
 
  unsigned char *values;     /* arrays with the values */
  unsigned char *ptr;        /* pointer for copy values */

  id = pcb.i;
  nproc = pcb.p;

  /* Step 1: use recv_schedudule to get values needed by other processes */

  /* size of one element of array a */

  size = *a_size;

  values = (unsigned char *) malloc (send_values * size);
#ifdef MEIKO_CS2
  ew_touchBuf(values, send_values*size);
#endif

  for (j=0;j<index_values;j++)
    { /* values [local_index[j]] = a [j] */
      if (local_index[j] >= 0)
        { ptr = values + local_index[j]*size;
          for (i=0;i<size;i++)
             ptr[i] = a[j*size+i];
        }
    }

  /* Step 2: send these values to other processors */

  for (k=0; k<nproc; k++)
    if (s_no[k] > 0)
      asend (id, k+1, values + s_ptr[k]*size, s_no[k] * size);
        
  free (values);

} /* dalib_schedule_set_send_ */

/*********************************************************
*                                                        *
*  dalib_schedule_set_recv_ (b_zero, b_size)             *
*                                                        *
*     receive values from other processors and are       *
*     stored/scattered in local part of array b          *
*                                                        *
*  in : - b_zero is fiktive zero address of my b         *
*       - b_size is number of bytes for one elem of b    *
*                                                        *
*  -     uses recv_schedule that tells where I have      *
*        to put the received values                      *
*                                                        *
*********************************************************/

void dalib_schedule_set_recv_ (b_zero, b_size)

int *b_size;
unsigned char *b_zero;

{ int id, nproc;
  int i, j, k;
  int size;
 
  unsigned char *values;     /* arrays with the received values */
  unsigned char *ptr;        /* pointer for copy values */

  id = pcb.i;
  nproc = pcb.p;

  /* size of one element of array b */

  size = *b_size;

  /* Step 1: recv my values from other processors */

  values = (unsigned char *) malloc (recv_values * size);
#ifdef MEIKO_CS2
  ew_touchBuf(values, recv_values*size);
#endif

  for (k=0; k<nproc; k++)
    if (r_no[k] > 0)
      areceive (id, k+1, values + r_ptr[k]*size, r_no[k] * size);

  /* Step 2: put the values to the right place in local array b */

  switch (operation) {
  case  0 : dalib_set_copy (b_zero, values, size); break;
  case  1 : dalib_set_min_ints (b_zero, values); break;
  case  2 : dalib_set_min_reals (b_zero, values); break;
  case  3 : dalib_set_min_doubles (b_zero, values); break;
  case  4 : dalib_set_max_ints (b_zero, values); break;
  case  5 : dalib_set_max_reals (b_zero, values); break;
  case  6 : dalib_set_max_doubles (b_zero, values); break;
  case  7 : dalib_set_add_ints (b_zero, values); break;
  case  8 : dalib_set_add_reals (b_zero, values); break;
  case  9 : dalib_set_add_doubles (b_zero, values); break;
  case 10 : dalib_set_mult_ints (b_zero, values); break;
  case 11 : dalib_set_mult_reals (b_zero, values); break;
  case 12 : dalib_set_mult_doubles (b_zero, values); break;
  case 13 : dalib_set_and_ints (b_zero, values); break;
  case 14 : dalib_set_or_ints (b_zero, values); break;
  case 15 : dalib_set_eor_ints (b_zero, values); break;
  case 16 : dalib_set_and_bools (b_zero, values); break;
  case 17 : dalib_set_or_bools (b_zero, values); break;
  case 18 : dalib_set_neq_bools (b_zero, values); break;
  default : dalib_schedule_error ("unknown op for global set"); break;
  }

  free (values);
}

/*********************************************************
*                                                        *
*  dalib_schedule_exit_ ()                               *
*                                                        *
*  Reset of a schedule, frees the memory                 *
*                                                        *
*********************************************************/

void dalib_schedule_exit_ ()

{
  free (recv_schedule);
  free (r_ptr);
  free (r_no);
  free (local_index);
  /* free (send_schedule) has already been done */
  free (s_ptr);
  free (s_no);
} /* dalib_schedule_exit */

/***********************************************************************
*                                                                      *
*  Check Routines                                                      *
*                                                                      *
*  check1 : A , B [P], MASK                                            *
*  check2 : A , B [P,Q], MASK                                          *
*                                                                      *
*    - P, Q must be integer*4                                          *
*    - MASK must be logical*4                                          *
*    - A, B must be of same type                                       *
*    - A, P, Q, MASK must be of same shape                             *
*    - rank of B must be 1 (check1) or 2 (check2)                      *
*                                                                      *
***********************************************************************/

/*******************************************************************
*                                                                  *
*  MASK: A = B [P]  : A[j] = B[P[j]]    for all j with MASK(j)     *
*                                                                  *
*  void dalib_global_getm1_ (a, a_size, a_elems,                   *
*                            b, b_N1, p, mask   )                  *
*                                                                  *
*******************************************************************/

void dalib_global_getm1_ (a, a_size, a_elems, b, b_N1, p, mask)

int *a_size;    /* number of local elements of a and p */
int *a_elems;   /* number of elemens in a, p           */
int *b_N1;      /* number of elements of b             */
int *p, *mask;  /* p == mask if mask is not available  */
unsigned char *a, *b;

{ unsigned char *b_zero;

  /* Step 1: init schedule */

  /* a_elems is also number of local elements for index array p */
  dalib_schedule_init_ (b_N1, p, a_elems, mask);

  /* Step 2: define schedule */

  dalib_schedule_send_ (b_N1, p);
  dalib_schedule_recv_ ();

  /* Step 3: make get with schedule */

  b_zero = dalib_array_zero1 (b, *a_size, *b_N1);
  dalib_schedule_get_send_ (b_zero, a_size);   /* send requested values of b */

  dalib_schedule_get_recv_ (a, a_size);

  /* Step 4: release schedule */

  dalib_schedule_exit_ ();

} /* dalib_global_getm1 */

/*******************************************************************
*                                                                  *
*  MASK: B [P] = A  : B[P[j]] = A[j]    for all j with MASK(j)     *
*                                                                  *
*  void dalib_global_setm1_ (op, b, b_N1, p,                       *
*                            a, a_size, a_elems, mask)             *
*                                                                  *
*******************************************************************/

void dalib_global_setm1_ (op, b, b_N1, p, a, a_size, a_elems, mask)

unsigned char *a, *b;
int *op;
int *a_size, *a_elems;
int *p, *mask;
int *b_N1;

{ unsigned char *b_zero;

  /* make assertions  */

  operation = *op;      /* global set for operation */

  /* Step 1: init schedule */

  /* a_elems is also number of local elements for index array p */

  dalib_schedule_init_ (b_N1, p, a_elems, mask);

  /* Step 2: define schedule */

  dalib_schedule_send_ (b_N1, p);
  dalib_schedule_recv_ ();

  /* Step 3: make set with schedule */

  dalib_schedule_set_send_ (a, a_size);
  b_zero = dalib_array_zero1 (b, *a_size, *b_N1);
  dalib_schedule_set_recv_ (b_zero, a_size);

  /* Step 4: release schedule */

  dalib_schedule_exit_ ();

} /* dalib_global_setm1 */

/***********************************************************************
*                                                                      *
*  MASK: A = B [P,Q]  : A[j] = B[P[j],Q[j]]    for all j with MASK(j)  *
*                                                                      *
*  void dalib_global_getm2_ (a, a_size, a_elems,                       *
*                            b, b_N1, b_N2, p, q, mask   )             *
*                                                                      *
***********************************************************************/

void dalib_global_getm2_ (a, a_size, a_elems, b, b_N1, b_N2, p, q, mask)

int *a_size;        /* number of bytes for elem size of a and b */
int *a_elems;       /* number of local elements in a, p, q */
int *b_N1, *b_N2;    /* global dimensions of b */
int *p, *q, *mask;  /* q == mask if mask is not available */
unsigned char *a, *b;

{ unsigned char *b_zero;

  /* make assertions */

#ifdef DEBUG
  printf ("Process %d starts getm2\n", pcb.i);
#endif

  /* Step 1: init schedule */

  /* a_elems is also number of local elements for index array q */
  dalib_schedule_init_ (b_N2, q, a_elems, mask);

  /* Step 2: define schedule */

  dalib_schedule_send2_ (b_N1, b_N2, p, q);
  dalib_schedule_recv_ ();

  /* Step 3: make get with schedule */

  b_zero = dalib_array_zero2 (b, *a_size, *b_N1, *b_N2);
  dalib_schedule_get_send_ (b_zero, a_size);

  dalib_schedule_get_recv_ (a, a_size);

  /* Step 4: release schedule */

  dalib_schedule_exit_ ();

} /* dalib_global_getm2 */

/***********************************************************************
*                                                                      *
*  MASK: B [P,Q] = A  : B[P[j],Q[j]] = A[j]    for all j with MASK(j)  *
*                                                                      *
*  void dalib_global_setm2_ (op, b, b_N1, b-N2, p, q                   *
*                            a, a_size, a_elems, mask)                 *
*                                                                      *
***********************************************************************/

void dalib_global_setm2_ (op, b, b_N1, b_N2, p, q,
                              a, a_size, a_elems,   mask)
int *op;
int *a_size;        /* number of bytes for elem size of a and b */
int *a_elems;       /* number of local elements in a, p, q */
int *b_N1, *b_N2;   /* global dimensions of b */
int *p, *q, *mask;  /* q == mask if mask is not available */
unsigned char *a, *b;

{ unsigned char *b_zero;

  operation = *op;    /* global set of scatter operation */

  /* Step 1: init schedule */

  /* a_elems is also number of local elements for index array q */
  dalib_schedule_init_ (b_N2, q, a_elems, mask);

  /* Step 2: define schedule */

  dalib_schedule_send2_ (b_N1, b_N2, p, q);
  dalib_schedule_recv_ ();

  /* Step 3: make set with schedule */

  dalib_schedule_set_send_ (a, a_size);   /* values of b will be send */

  b_zero = dalib_array_zero2 (b, *a_size, *b_N1, *b_N2);
  dalib_schedule_set_recv_ (b_zero, a_size);      /* recv values in b */

  /* Step 4: release schedule */

  dalib_schedule_exit_ ();

} /* dalib_global_setm2 */

/*******************************************************************
*                                                                  *
*  FORTRAN INTERFACE                                               *
*                                                                  *
*  void dalib_global_switch_ (a, a_size, a_elems, b, b_N1, p)      *
*                                                                  *
*******************************************************************/

void dalib_global_switch_ (a, a_size, a_elems, b, b_N1, p)
unsigned char *a, *b;
int *a_size, *a_elems, *b_N1;
int *p;

{ unsigned char *b_zero;

  /* Step 1: init schedule */

  dalib_switching_init_ (b_N1, p, a_elems);

  /* Step 2: define schedule */

  dalib_schedule_send_ (b_N1, p);
  dalib_schedule_recv_ ();

  /* Step 3: make get with schedule */

  b_zero = dalib_array_zero1 (b, *a_size, *b_N1);
  dalib_schedule_get_send_ (b_zero, a_size);
  dalib_schedule_get_recv_ (a, a_size);

  /* Step 4: release schedule */

  dalib_schedule_exit_ ();

} /* dalib_global_switch */

/*******************************************************************
*                                                                  *
*  FORTRAN INTERFACE                                               *
*                                                                  *
*    - subroutines without a mask parameter                        *
*                                                                  *
*******************************************************************/

void dalib_global_get1_ (a, a_size, a_elems, b, b_N1, p)

int *a_size;    /* size of elements of a and b in Bytes */
int *a_elems;   /* number of elements in a and p        */
int *b_N1;      /* number of elements of b */
int *p;
unsigned char *a, *b;

{ /* mask = p means no mask */

   dalib_global_getm1_ (a, a_size, a_elems, b, b_N1, p, p);
}

void dalib_global_get2_ (a, a_size, a_elems, b, b_N1, b_N2, p, q)
int *a_size;        /* number of bytes for elem size of a and b */
int *a_elems;       /* number of local elements in a, p, q */
int *b_N1, b_N2;    /* global dimensions of b */
int *p, *q;
unsigned char *a, *b;

{ /* mask = p means no mask */
   dalib_global_getm2_ (a, a_size, a_elems, b, b_N1, b_N2, p, q, q);
}

/*******************************************************************
*                                                                  *
*  FORTRAN INTERFACE                                               *
*                                                                  *
*    - splitted subroutines to reuse a fixed schedule              *
*                                                                  *
*******************************************************************/

void dalib_schedule_fix1__ (b_N1, p, p_elems, mask)
int *p_elems;   /* number of elemens in p           */
int *b_N1;      /* number of elements of b           */
int *p, *mask;  /* p == mask if mask is not available  */

{ dalib_schedule_init_ (b_N1, p, p_elems, mask);
  dalib_schedule_send_ (b_N1, p);
  dalib_schedule_recv_ ();
}

void dalib_schedule_fix2__ (b_N1, b_N2, p, q, q_elems, mask)
int *q_elems;   /* number of elemens in q           */
int *b_N1;      /* number of elements of b           */
int *b_N2;      /* number of elements of b           */
int *p, *q, *mask;  /* q == mask if mask is not available  */

{ dalib_schedule_init_ (b_N1, q, q_elems, mask);
  dalib_schedule_send2_ (b_N1, b_N2, p, q);
  dalib_schedule_recv_ ();
}

void dalib_schedule_get1__ (b, b_N1, a, size)

unsigned char *b, *a;
int *b_N1, *size;

{ unsigned char *b_zero;
  b_zero = dalib_array_zero1 (b, *size, *b_N1);
  dalib_schedule_get_send_ (b_zero, size);
  dalib_schedule_get_recv_ (a, size);
}

void dalib_schedule_get2__ (b, b_N1, b_N2, a, size)

unsigned char *b, *a;
int *b_N1, *b_N2, *size;

{ unsigned char *b_zero;
  b_zero = dalib_array_zero2 (b, *size, *b_N1, *b_N2);
  dalib_schedule_get_send_ (b_zero, size);
  dalib_schedule_get_recv_ (a, size);
}

void dalib_schedule_set1__ (op, b, b_N1, a, a_size)

unsigned char *a, *b;
int *op;
int *a_size, *b_N1;

{ unsigned char *b_zero;
  operation = *op;      /* global set for operation */
  dalib_schedule_set_send_ (a, a_size);
  b_zero = dalib_array_zero1 (b, *a_size, *b_N1);
  dalib_schedule_set_recv_ (b_zero, a_size);

} /* dalib_schedule_set1 */

void dalib_schedule_set2__ (op, b, b_N1, b_N2, a, a_size)

unsigned char *a, *b;
int *op;
int *a_size, *b_N1, *b_N2;

{ unsigned char *b_zero;
  operation = *op;      /* global set for operation */
  dalib_schedule_set_send_ (a, a_size);
  b_zero = dalib_array_zero1 (b, *a_size, *b_N1, *b_N2);
  dalib_schedule_set_recv_ (b_zero, a_size);

} /* dalib_schedule_set2 */


