Actual source code: gs.c
2: /***********************************gs.c***************************************
4: Author: Henry M. Tufo III
6: e-mail: hmt@cs.brown.edu
8: snail-mail:
9: Division of Applied Mathematics
10: Brown University
11: Providence, RI 02912
13: Last Modification:
14: 6.21.97
15: ************************************gs.c**************************************/
17: /***********************************gs.c***************************************
18: File Description:
19: -----------------
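   Gather-scatter ("gs") communication kernels used by the PCTFS solvers:
   local, pairwise (point-to-point) and tree (fan-in/fan-out) combines of
   values that share global ids across processors.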
21: ************************************gs.c**************************************/
23: #include <../src/ksp/pc/impls/tfs/tfs.h>
25: /* default length of number of items via tree - doubles if exceeded */
26: #define TREE_BUF_SZ 2048
27: #define GS_VEC_SZ 1
29: /***********************************gs.c***************************************
30: Type: struct gather_scatter_id
31: ------------------------------
33: ************************************gs.c**************************************/
34: typedef struct gather_scatter_id {
35: PetscInt id;
36: PetscInt nel_min;
37: PetscInt nel_max;
38: PetscInt nel_sum;
39: PetscInt negl;
40: PetscInt gl_max;
41: PetscInt gl_min;
42: PetscInt repeats;
43: PetscInt ordered;
44: PetscInt positive;
45: PetscScalar *vals;
47: /* bit mask info */
48: PetscInt *my_proc_mask;
49: PetscInt mask_sz;
50: PetscInt *ngh_buf;
51: PetscInt ngh_buf_sz;
52: PetscInt *nghs;
53: PetscInt num_nghs;
54: PetscInt max_nghs;
55: PetscInt *pw_nghs;
56: PetscInt num_pw_nghs;
57: PetscInt *tree_nghs;
58: PetscInt num_tree_nghs;
60: PetscInt num_loads;
62: /* repeats == true -> local info */
63: PetscInt nel; /* number of unique elements */
64: PetscInt *elms; /* of size nel */
65: PetscInt nel_total;
66: PetscInt *local_elms; /* of size nel_total */
67: PetscInt *companion; /* of size nel_total */
69: /* local info */
70: PetscInt num_local_total;
71: PetscInt local_strength;
72: PetscInt num_local;
73: PetscInt *num_local_reduce;
74: PetscInt **local_reduce;
75: PetscInt num_local_gop;
76: PetscInt *num_gop_local_reduce;
77: PetscInt **gop_local_reduce;
79: /* pairwise info */
80: PetscInt level;
81: PetscInt num_pairs;
82: PetscInt max_pairs;
83: PetscInt loc_node_pairs;
84: PetscInt max_node_pairs;
85: PetscInt min_node_pairs;
86: PetscInt avg_node_pairs;
87: PetscInt *pair_list;
88: PetscInt *msg_sizes;
89: PetscInt **node_list;
90: PetscInt len_pw_list;
91: PetscInt *pw_elm_list;
92: PetscScalar *pw_vals;
94: MPI_Request *msg_ids_in;
95: MPI_Request *msg_ids_out;
97: PetscScalar *out;
98: PetscScalar *in;
99: PetscInt msg_total;
101: /* tree - crystal accumulator info */
102: PetscInt max_left_over;
103: PetscInt *pre;
104: PetscInt *in_num;
105: PetscInt *out_num;
106: PetscInt **in_list;
107: PetscInt **out_list;
109: /* new tree work*/
110: PetscInt tree_nel;
111: PetscInt *tree_elms;
112: PetscScalar *tree_buf;
113: PetscScalar *tree_work;
115: PetscInt tree_map_sz;
116: PetscInt *tree_map_in;
117: PetscInt *tree_map_out;
119: /* current memory status */
120: PetscInt gl_bss_min;
121: PetscInt gl_perm_min;
123: /* max segment size for PCTFS_gs_gop_vec() */
124: PetscInt vec_sz;
126: /* hack to make paul happy */
127: MPI_Comm PCTFS_gs_comm;
129: } PCTFS_gs_id;
131: static PCTFS_gs_id *gsi_check_args(PetscInt *elms, PetscInt nel, PetscInt level);
132: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs);
133: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs);
134: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs);
135: static PCTFS_gs_id *gsi_new(void);
136: static PetscErrorCode set_tree(PCTFS_gs_id *gs);
138: /* same for all but vector flavor */
139: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals);
140: /* vector flavor */
141: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
143: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
144: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step);
145: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
146: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
147: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step);
149: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals);
150: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals);
152: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
153: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim);
154: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim);
156: /* global vars */
157: /* from comm.c module */
159: static PetscInt num_gs_ids = 0;
161: /* should make this dynamic ... later */
162: static PetscInt msg_buf =MAX_MSG_BUF;
163: static PetscInt vec_sz =GS_VEC_SZ;
164: static PetscInt *tree_buf =NULL;
165: static PetscInt tree_buf_sz=0;
166: static PetscInt ntree =0;
168: /***************************************************************************/
169: PetscErrorCode PCTFS_gs_init_vec_sz(PetscInt size)
170: {
171: vec_sz = size;
172: return 0;
173: }
175: /******************************************************************************/
176: PetscErrorCode PCTFS_gs_init_msg_buf_sz(PetscInt buf_size)
177: {
178: msg_buf = buf_size;
179: return 0;
180: }
182: /******************************************************************************/
183: PCTFS_gs_id *PCTFS_gs_init(PetscInt *elms, PetscInt nel, PetscInt level)
184: {
185: PCTFS_gs_id *gs;
186: MPI_Group PCTFS_gs_group;
187: MPI_Comm PCTFS_gs_comm;
189: /* ensure that communication package has been initialized */
190: PCTFS_comm_init();
192: /* determines if we have enough dynamic/semi-static memory */
193: /* checks input, allocs and sets gd_id template */
194: gs = gsi_check_args(elms,nel,level);
196: /* only bit mask version up and working for the moment */
197: /* LATER :: get int list version working for sparse pblms */
198: PetscCallAbort(PETSC_COMM_WORLD,gsi_via_bit_mask(gs));
200: PetscCallMPIAbort(PETSC_COMM_WORLD,MPI_Comm_group(MPI_COMM_WORLD,&PCTFS_gs_group));
201: PetscCallMPIAbort(PETSC_COMM_WORLD,MPI_Comm_create(MPI_COMM_WORLD,PCTFS_gs_group,&PCTFS_gs_comm));
202: PetscCallMPIAbort(PETSC_COMM_WORLD,MPI_Group_free(&PCTFS_gs_group));
204: gs->PCTFS_gs_comm=PCTFS_gs_comm;
206: return(gs);
207: }
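/* Illustrative usage sketch; glo_num (local list of global ids, 0 = does not
   participate), u and n are hypothetical caller-side names, and passing
   PCTFS_num_nodes as the level is only one possible choice:

     PCTFS_gs_id *gsh = PCTFS_gs_init(glo_num, n, PCTFS_num_nodes);
     PCTFS_gs_gop_vec(gsh, u, "+", 1);      sums u over all ranks sharing each id
     PCTFS_gs_free(gsh);
*/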
209: /******************************************************************************/
210: static PCTFS_gs_id *gsi_new(void)
211: {
212: PCTFS_gs_id *gs;
213: gs = (PCTFS_gs_id*) malloc(sizeof(PCTFS_gs_id));
214: PetscCallAbort(PETSC_COMM_WORLD,PetscMemzero(gs,sizeof(PCTFS_gs_id)));
215: return(gs);
216: }
218: /******************************************************************************/
219: static PCTFS_gs_id *gsi_check_args(PetscInt *in_elms, PetscInt nel, PetscInt level)
220: {
221: PetscInt i, j, k, t2;
222: PetscInt *companion, *elms, *unique, *iptr;
223: PetscInt num_local=0, *num_to_reduce, **local_reduce;
224: PetscInt oprs[] = {NON_UNIFORM,GL_MIN,GL_MAX,GL_ADD,GL_MIN,GL_MAX,GL_MIN,GL_B_AND};
225: PetscInt vals[sizeof(oprs)/sizeof(oprs[0])-1];
226: PetscInt work[sizeof(oprs)/sizeof(oprs[0])-1];
227: PCTFS_gs_id *gs;
229: if (!in_elms) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"elms point to nothing!!!\n");
230: if (nel<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"can't have fewer than 0 elms!!!\n");
232: if (nel==0) PetscCallAbort(PETSC_COMM_WORLD,PetscInfo(0,"I don't have any elements!!!\n"));
234: /* get space for gs template */
235: gs = gsi_new();
236: gs->id = ++num_gs_ids;
238: /* hmt 6.4.99 */
239: /* caller can set global ids that don't participate to 0 */
240: /* PCTFS_gs_init ignores all zeros in elm list */
241: /* negative global ids are still invalid */
242: for (i=j=0; i<nel; i++) {
243: if (in_elms[i]!=0) j++;
244: }
246: k=nel; nel=j;
248: /* copy over in_elms list and create inverse map */
249: elms = (PetscInt*) malloc((nel+1)*sizeof(PetscInt));
250: companion = (PetscInt*) malloc(nel*sizeof(PetscInt));
252: for (i=j=0; i<k; i++) {
253: if (in_elms[i]!=0) { elms[j] = in_elms[i]; companion[j++] = i; }
254: }
256: if (j!=nel) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"nel j mismatch!\n");
258: /* pre-pass ... check to see if sorted */
259: elms[nel] = INT_MAX;
260: iptr = elms;
261: unique = elms+1;
262: j =0;
263: while (*iptr!=INT_MAX) {
264: if (*iptr++>*unique++) { j=1; break; }
265: }
267: /* set up inverse map */
268: if (j) {
269: PetscCallAbort(PETSC_COMM_WORLD,PetscInfo(0,"gsi_check_args() :: elm list *not* sorted!\n"));
270: PetscCallAbort(PETSC_COMM_WORLD,PCTFS_SMI_sort((void*)elms, (void*)companion, nel, SORT_INTEGER));
271: } else PetscCallAbort(PETSC_COMM_WORLD,PetscInfo(0,"gsi_check_args() :: elm list sorted!\n"));
272: elms[nel] = INT_MIN;
274: /* first pass */
275: /* determine number of unique elements, check pd */
276: for (i=k=0; i<nel; i+=j) {
277: t2 = elms[i];
278: j = ++i;
280: /* clump 'em for now */
281: while (elms[j]==t2) j++;
283: /* how many together and num local */
284: if (j-=i) { num_local++; k+=j; }
285: }
287: /* how many unique elements? */
288: gs->repeats = k;
289: gs->nel = nel-k;
291: /* number of elements that repeat locally (local reduction groups) */
292: gs->num_local = num_local;
293: num_local += 2;
294: gs->local_reduce = local_reduce=(PetscInt**)malloc(num_local*sizeof(PetscInt*));
295: gs->num_local_reduce = num_to_reduce=(PetscInt*) malloc(num_local*sizeof(PetscInt));
297: unique = (PetscInt*) malloc((gs->nel+1)*sizeof(PetscInt));
298: gs->elms = unique;
299: gs->nel_total = nel;
300: gs->local_elms = elms;
301: gs->companion = companion;
303: /* compress map as well as keep track of local ops */
304: for (num_local=i=j=0; i<gs->nel; i++) {
305: k = j;
306: t2 = unique[i] = elms[j];
307: companion[i] = companion[j];
309: while (elms[j]==t2) j++;
311: if ((t2=(j-k))>1) {
312: /* number together */
313: num_to_reduce[num_local] = t2++;
315: iptr = local_reduce[num_local++] = (PetscInt*)malloc(t2*sizeof(PetscInt));
317: /* to use binary searching don't remap until we check intersection */
318: *iptr++ = i;
320: /* note that we're skipping the first one */
321: while (++k<j) *(iptr++) = companion[k];
322: *iptr = -1;
323: }
324: }
326: /* sentinel for ngh_buf */
327: unique[gs->nel]=INT_MAX;
329: /* for two partition sort hack */
330: num_to_reduce[num_local] = 0;
331: local_reduce[num_local] = NULL;
332: num_to_reduce[++num_local] = 0;
333: local_reduce[num_local] = NULL;
335: /* load 'em up */
336: /* note one extra to hold NON_UNIFORM flag!!! */
337: vals[2] = vals[1] = vals[0] = nel;
338: if (gs->nel>0) {
339: vals[3] = unique[0];
340: vals[4] = unique[gs->nel-1];
341: } else {
342: vals[3] = INT_MAX;
343: vals[4] = INT_MIN;
344: }
345: vals[5] = level;
346: vals[6] = num_gs_ids;
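/* note: oprs[0]==NON_UNIFORM requests an element-wise reduction, pairing vals[i] with
   oprs[i+1]: nel min/max/sum, global id min/max, min level, and a bitwise AND of the
   gs handle count */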
348: /* GLOBAL: send 'em out */
349: PetscCallAbort(PETSC_COMM_WORLD,PCTFS_giop(vals,work,sizeof(oprs)/sizeof(oprs[0])-1,oprs));
351: /* must be semi-pos def - only pairwise depends on this */
352: /* LATER - remove this restriction */
353: if (vals[3]<0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system not semi-pos def \n");
354: if (vals[4]==INT_MAX) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system ub too large !\n");
356: gs->nel_min = vals[0];
357: gs->nel_max = vals[1];
358: gs->nel_sum = vals[2];
359: gs->gl_min = vals[3];
360: gs->gl_max = vals[4];
361: gs->negl = vals[4]-vals[3]+1;
363: if (gs->negl<=0) SETERRABORT(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"gsi_check_args() :: system empty or neg :: %d\n",(int)gs->negl);
365: /* LATER :: add level == -1 -> program selects level */
366: if (vals[5]<0) vals[5]=0;
367: else if (vals[5]>PCTFS_num_nodes) vals[5]=PCTFS_num_nodes;
368: gs->level = vals[5];
370: return(gs);
371: }
373: /******************************************************************************/
374: static PetscErrorCode gsi_via_bit_mask(PCTFS_gs_id *gs)
375: {
376: PetscInt i, nel, *elms;
377: PetscInt t1;
378: PetscInt **reduce;
379: PetscInt *map;
381: /* totally local removes ... PCTFS_ct_bits == 0 */
382: get_ngh_buf(gs);
384: if (gs->level) set_pairwise(gs);
385: if (gs->max_left_over) set_tree(gs);
387: /* intersection local and pairwise/tree? */
388: gs->num_local_total = gs->num_local;
389: gs->gop_local_reduce = gs->local_reduce;
390: gs->num_gop_local_reduce = gs->num_local_reduce;
392: map = gs->companion;
394: /* is there any local compression */
395: if (!gs->num_local) {
396: gs->local_strength = NONE;
397: gs->num_local_gop = 0;
398: } else {
399: /* ok find intersection */
400: map = gs->companion;
401: reduce = gs->local_reduce;
402: for (i=0, t1=0; i<gs->num_local; i++, reduce++) {
403: if ((PCTFS_ivec_binary_search(**reduce,gs->pw_elm_list,gs->len_pw_list)>=0) || PCTFS_ivec_binary_search(**reduce,gs->tree_map_in,gs->tree_map_sz)>=0) {
404: t1++;
406: gs->num_local_reduce[i] *= -1;
407: }
408: **reduce=map[**reduce];
409: }
411: /* intersection is empty */
412: if (!t1) {
413: gs->local_strength = FULL;
414: gs->num_local_gop = 0;
415: } else { /* intersection not empty */
416: gs->local_strength = PARTIAL;
418: PCTFS_SMI_sort((void*)gs->num_local_reduce, (void*)gs->local_reduce, gs->num_local + 1, SORT_INT_PTR);
420: gs->num_local_gop = t1;
421: gs->num_local_total = gs->num_local;
422: gs->num_local -= t1;
423: gs->gop_local_reduce = gs->local_reduce;
424: gs->num_gop_local_reduce = gs->num_local_reduce;
426: for (i=0; i<t1; i++) {
428: gs->num_gop_local_reduce[i] *= -1;
429: gs->local_reduce++;
430: gs->num_local_reduce++;
431: }
432: gs->local_reduce++;
433: gs->num_local_reduce++;
434: }
435: }
437: elms = gs->pw_elm_list;
438: nel = gs->len_pw_list;
439: for (i=0; i<nel; i++) elms[i] = map[elms[i]];
441: elms = gs->tree_map_in;
442: nel = gs->tree_map_sz;
443: for (i=0; i<nel; i++) elms[i] = map[elms[i]];
445: /* clean up */
446: free((void*) gs->local_elms);
447: free((void*) gs->companion);
448: free((void*) gs->elms);
449: free((void*) gs->ngh_buf);
450: gs->local_elms = gs->companion = gs->elms = gs->ngh_buf = NULL;
451: return 0;
452: }
454: /******************************************************************************/
455: static PetscErrorCode place_in_tree(PetscInt elm)
456: {
457: PetscInt *tp, n;
459: if (ntree==tree_buf_sz) {
460: if (tree_buf_sz) {
461: tp = tree_buf;
462: n = tree_buf_sz;
463: tree_buf_sz<<=1;
464: tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
465: PCTFS_ivec_copy(tree_buf,tp,n);
466: free(tp);
467: } else {
468: tree_buf_sz = TREE_BUF_SZ;
469: tree_buf = (PetscInt*)malloc(tree_buf_sz*sizeof(PetscInt));
470: }
471: }
473: tree_buf[ntree++] = elm;
474: return 0;
475: }
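/* note: doubling the buffer on overflow keeps the total copying linear in the number
   of entries, i.e. amortized constant cost per place_in_tree() call */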
477: /******************************************************************************/
478: static PetscErrorCode get_ngh_buf(PCTFS_gs_id *gs)
479: {
480: PetscInt i, j, npw=0, ntree_map=0;
481: PetscInt p_mask_size, ngh_buf_size, buf_size;
482: PetscInt *p_mask, *sh_proc_mask, *pw_sh_proc_mask;
483: PetscInt *ngh_buf, *buf1, *buf2;
484: PetscInt offset, per_load, num_loads, or_ct, start, end;
485: PetscInt *ptr1, *ptr2, i_start, negl, nel, *elms;
486: PetscInt oper=GL_B_OR;
487: PetscInt *ptr3, *t_mask, level, ct1, ct2;
489: /* to make life easier */
490: nel = gs->nel;
491: elms = gs->elms;
492: level = gs->level;
494: /* det #bytes needed for processor bit masks and init w/mask cor. to PCTFS_my_id */
495: p_mask = (PetscInt*) malloc(p_mask_size=PCTFS_len_bit_mask(PCTFS_num_nodes));
496: PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);
498: /* allocate space for masks and info bufs */
499: gs->nghs = sh_proc_mask = (PetscInt*) malloc(p_mask_size);
500: gs->pw_nghs = pw_sh_proc_mask = (PetscInt*) malloc(p_mask_size);
501: gs->ngh_buf_sz = ngh_buf_size = p_mask_size*nel;
502: t_mask = (PetscInt*) malloc(p_mask_size);
503: gs->ngh_buf = ngh_buf = (PetscInt*) malloc(ngh_buf_size);
505: /* comm buffer size ... memory usage bounded by ~2*msg_buf */
506: /* had thought I could exploit rendezvous threshold */
508: /* default is one pass */
509: per_load = negl = gs->negl;
510: gs->num_loads = num_loads = 1;
511: i = p_mask_size*negl;
513: /* possible overflow on buffer size */
514: /* overflow hack */
515: if (i<0) i=INT_MAX;
517: buf_size = PetscMin(msg_buf,i);
519: /* can we do it? */
522: /* get PCTFS_giop buf space ... make *only* one malloc */
523: buf1 = (PetscInt*) malloc(buf_size<<1);
525: /* more than one gior exchange needed? */
526: if (buf_size!=i) {
527: per_load = buf_size/p_mask_size;
528: buf_size = per_load*p_mask_size;
529: gs->num_loads = num_loads = negl/per_load + (negl%per_load>0);
530: }
532: /* convert buf sizes from #bytes to #ints - 32 bit only! */
533: p_mask_size/=sizeof(PetscInt); ngh_buf_size/=sizeof(PetscInt); buf_size/=sizeof(PetscInt);
535: /* find PCTFS_giop work space */
536: buf2 = buf1+buf_size;
538: /* hold #ints needed for processor masks */
539: gs->mask_sz=p_mask_size;
541: /* init buffers */
542: PCTFS_ivec_zero(sh_proc_mask,p_mask_size);
543: PCTFS_ivec_zero(pw_sh_proc_mask,p_mask_size);
544: PCTFS_ivec_zero(ngh_buf,ngh_buf_size);
546: /* HACK reset tree info */
547: tree_buf = NULL;
548: tree_buf_sz = ntree = 0;
550: /* ok do it */
551: for (ptr1=ngh_buf,ptr2=elms,end=gs->gl_min,or_ct=i=0; or_ct<num_loads; or_ct++) {
552: /* identity for bitwise or is 000...000 */
553: PCTFS_ivec_zero(buf1,buf_size);
555: /* load msg buffer */
556: for (start=end,end+=per_load,i_start=i; (offset=*ptr2)<end; i++, ptr2++) {
557: offset = (offset-start)*p_mask_size;
558: PCTFS_ivec_copy(buf1+offset,p_mask,p_mask_size);
559: }
561: /* GLOBAL: pass buffer */
562: PCTFS_giop(buf1,buf2,buf_size,&oper);
564: /* unload buffer into ngh_buf */
565: ptr2=(elms+i_start);
566: for (ptr3=buf1,j=start; j<end; ptr3+=p_mask_size,j++) {
567: /* I own it ... may have to pairwise it */
568: if (j==*ptr2) {
569: /* do i share it w/anyone? */
570: ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
571: /* guess not */
572: if (ct1<2) { ptr2++; ptr1+=p_mask_size; continue; }
574: /* i do ... so keep info and turn off my bit */
575: PCTFS_ivec_copy(ptr1,ptr3,p_mask_size);
576: PCTFS_ivec_xor(ptr1,p_mask,p_mask_size);
577: PCTFS_ivec_or(sh_proc_mask,ptr1,p_mask_size);
579: /* is it to be done pairwise? */
580: if (--ct1<=level) {
581: npw++;
583: /* turn on high bit to indicate pw need to process */
584: *ptr2++ |= TOP_BIT;
585: PCTFS_ivec_or(pw_sh_proc_mask,ptr1,p_mask_size);
586: ptr1 += p_mask_size;
587: continue;
588: }
590: /* get set for next and note that I have a tree contribution */
591: /* could save exact elm index for tree here -> save a search */
592: ptr2++; ptr1+=p_mask_size; ntree_map++;
593: } else { /* i don't but still might be involved in tree */
595: /* shared by how many? */
596: ct1 = PCTFS_ct_bits((char*)ptr3,p_mask_size*sizeof(PetscInt));
598: /* none! */
599: if (ct1<2) continue;
601: /* is it going to be done pairwise? but not by me of course!*/
602: if (--ct1<=level) continue;
603: }
604: /* LATER we're going to have to process it NOW */
605: /* nope ... tree it */
606: place_in_tree(j);
607: }
608: }
610: free((void*)t_mask);
611: free((void*)buf1);
613: gs->len_pw_list = npw;
614: gs->num_nghs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));
616: /* expand from bit mask list to int list and save ngh list */
617: gs->nghs = (PetscInt*) malloc(gs->num_nghs * sizeof(PetscInt));
618: PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),gs->nghs);
620: gs->num_pw_nghs = PCTFS_ct_bits((char*)pw_sh_proc_mask,p_mask_size*sizeof(PetscInt));
622: oper = GL_MAX;
623: ct1 = gs->num_nghs;
624: PCTFS_giop(&ct1,&ct2,1,&oper);
625: gs->max_nghs = ct1;
627: gs->tree_map_sz = ntree_map;
628: gs->max_left_over=ntree;
630: free((void*)p_mask);
631: free((void*)sh_proc_mask);
632: return 0;
633: }
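/* worked example of the masking above (assuming 4 ranks and level=1): an id I own that is
   also held by rank 2 yields the OR-reduced mask 0101b, PCTFS_ct_bits() = 2, and after
   removing my own bit ct1 = 1 <= level, so the id is tagged with TOP_BIT for pairwise
   exchange; an id shared by all four ranks gives ct1 = 3 > level and goes to the tree */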
635: /******************************************************************************/
636: static PetscErrorCode set_pairwise(PCTFS_gs_id *gs)
637: {
638: PetscInt i, j;
639: PetscInt p_mask_size;
640: PetscInt *p_mask, *sh_proc_mask, *tmp_proc_mask;
641: PetscInt *ngh_buf, *buf2;
642: PetscInt offset;
643: PetscInt *msg_list, *msg_size, **msg_nodes, nprs;
644: PetscInt *pairwise_elm_list, len_pair_list=0;
645: PetscInt *iptr, t1, i_start, nel, *elms;
646: PetscInt ct;
648: /* to make life easier */
649: nel = gs->nel;
650: elms = gs->elms;
651: ngh_buf = gs->ngh_buf;
652: sh_proc_mask = gs->pw_nghs;
654: /* need a few temp masks */
655: p_mask_size = PCTFS_len_bit_mask(PCTFS_num_nodes);
656: p_mask = (PetscInt*) malloc(p_mask_size);
657: tmp_proc_mask = (PetscInt*) malloc(p_mask_size);
659: /* set mask to my PCTFS_my_id's bit mask */
660: PCTFS_set_bit_mask(p_mask,p_mask_size,PCTFS_my_id);
662: p_mask_size /= sizeof(PetscInt);
664: len_pair_list = gs->len_pw_list;
665: gs->pw_elm_list = pairwise_elm_list=(PetscInt*)malloc((len_pair_list+1)*sizeof(PetscInt));
667: /* how many processors (nghs) do we have to exchange with? */
668: nprs = gs->num_pairs = PCTFS_ct_bits((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt));
670: /* allocate space for PCTFS_gs_gop() info */
671: gs->pair_list = msg_list = (PetscInt*) malloc(sizeof(PetscInt)*nprs);
672: gs->msg_sizes = msg_size = (PetscInt*) malloc(sizeof(PetscInt)*nprs);
673: gs->node_list = msg_nodes = (PetscInt**) malloc(sizeof(PetscInt*)*(nprs+1));
675: /* init msg_size list */
676: PCTFS_ivec_zero(msg_size,nprs);
678: /* expand from bit mask list to int list */
679: PCTFS_bm_to_proc((char*)sh_proc_mask,p_mask_size*sizeof(PetscInt),msg_list);
681: /* keep list of elements being handled pairwise */
682: for (i=j=0; i<nel; i++) {
683: if (elms[i] & TOP_BIT) { elms[i] ^= TOP_BIT; pairwise_elm_list[j++] = i; }
684: }
685: pairwise_elm_list[j] = -1;
687: gs->msg_ids_out = (MPI_Request*) malloc(sizeof(MPI_Request)*(nprs+1));
688: gs->msg_ids_out[nprs] = MPI_REQUEST_NULL;
689: gs->msg_ids_in = (MPI_Request*) malloc(sizeof(MPI_Request)*(nprs+1));
690: gs->msg_ids_in[nprs] = MPI_REQUEST_NULL;
691: gs->pw_vals = (PetscScalar*) malloc(sizeof(PetscScalar)*len_pair_list*vec_sz);
693: /* find who goes to each processor */
694: for (i_start=i=0; i<nprs; i++) {
695: /* processor i's mask */
696: PCTFS_set_bit_mask(p_mask,p_mask_size*sizeof(PetscInt),msg_list[i]);
698: /* det # going to processor i */
699: for (ct=j=0; j<len_pair_list; j++) {
700: buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
701: PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
702: if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) ct++;
703: }
704: msg_size[i] = ct;
705: i_start = PetscMax(i_start,ct);
707: /* space to hold nodes in message to neighbor i */
708: msg_nodes[i] = iptr = (PetscInt*) malloc(sizeof(PetscInt)*(ct+1));
710: for (j=0;j<len_pair_list;j++) {
711: buf2 = ngh_buf+(pairwise_elm_list[j]*p_mask_size);
712: PCTFS_ivec_and3(tmp_proc_mask,p_mask,buf2,p_mask_size);
713: if (PCTFS_ct_bits((char*)tmp_proc_mask,p_mask_size*sizeof(PetscInt))) *iptr++ = j;
714: }
715: *iptr = -1;
716: }
717: msg_nodes[nprs] = NULL;
719: j = gs->loc_node_pairs=i_start;
720: t1 = GL_MAX;
721: PCTFS_giop(&i_start,&offset,1,&t1);
722: gs->max_node_pairs = i_start;
724: i_start = j;
725: t1 = GL_MIN;
726: PCTFS_giop(&i_start,&offset,1,&t1);
727: gs->min_node_pairs = i_start;
729: i_start = j;
730: t1 = GL_ADD;
731: PCTFS_giop(&i_start,&offset,1,&t1);
732: gs->avg_node_pairs = i_start/PCTFS_num_nodes + 1;
734: i_start = nprs;
735: t1 = GL_MAX;
736: PCTFS_giop(&i_start,&offset,1,&t1);
737: gs->max_pairs = i_start;
739: /* remap pairwise in tail of gsi_via_bit_mask() */
740: gs->msg_total = PCTFS_ivec_sum(gs->msg_sizes,nprs);
741: gs->out = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
742: gs->in = (PetscScalar*) malloc(sizeof(PetscScalar)*gs->msg_total*vec_sz);
744: /* reset malloc pool */
745: free((void*)p_mask);
746: free((void*)tmp_proc_mask);
747: return 0;
748: }
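/* summary of the structures built above: pair_list[i] is the rank of neighbor i,
   msg_sizes[i] the number of my pairwise elements shared with it, and node_list[i] a
   -1 terminated list of indices into pw_elm_list/pw_vals giving the packing order used
   by PCTFS_gs_gop_vec_pairwise_plus() */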
750: /* to do pruned tree just save ngh buf copy for each one and decode here!
751: ******************************************************************************/
752: static PetscErrorCode set_tree(PCTFS_gs_id *gs)
753: {
754: PetscInt i, j, n, nel;
755: PetscInt *iptr_in, *iptr_out, *tree_elms, *elms;
757: /* local work ptrs */
758: elms = gs->elms;
759: nel = gs->nel;
761: /* how many via tree */
762: gs->tree_nel = n = ntree;
763: gs->tree_elms = tree_elms = iptr_in = tree_buf;
764: gs->tree_buf = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
765: gs->tree_work = (PetscScalar*) malloc(sizeof(PetscScalar)*n*vec_sz);
766: j = gs->tree_map_sz;
767: gs->tree_map_in = iptr_in = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
768: gs->tree_map_out = iptr_out = (PetscInt*) malloc(sizeof(PetscInt)*(j+1));
770: /* search the longer of the two lists */
771: /* note ... could save this info in get_ngh_buf and save searches */
772: if (n<=nel) {
773: /* bijective fct w/remap - search elm list */
774: for (i=0; i<n; i++) {
775: if ((j=PCTFS_ivec_binary_search(*tree_elms++,elms,nel))>=0) {*iptr_in++ = j; *iptr_out++ = i;}
776: }
777: } else {
778: for (i=0; i<nel; i++) {
779: if ((j=PCTFS_ivec_binary_search(*elms++,tree_elms,n))>=0) {*iptr_in++ = i; *iptr_out++ = j;}
780: }
781: }
783: /* sentinel */
784: *iptr_in = *iptr_out = -1;
785: return 0;
786: }
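/* note: tree_map_in/tree_map_out are parallel -1 terminated lists; local element
   tree_map_in[k] corresponds to slot tree_map_out[k] of the dense buffer tree_buf,
   whose layout (tree_elms) is identical on every rank so PCTFS_grop() can reduce it */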
788: /******************************************************************************/
789: static PetscErrorCode PCTFS_gs_gop_local_out(PCTFS_gs_id *gs, PetscScalar *vals)
790: {
791: PetscInt *num, *map, **reduce;
792: PetscScalar tmp;
794: num = gs->num_gop_local_reduce;
795: reduce = gs->gop_local_reduce;
796: while ((map = *reduce++)) {
797: /* wall */
798: if (*num == 2) {
799: num++;
800: vals[map[1]] = vals[map[0]];
801: } else if (*num == 3) { /* corner shared by three elements */
802: num++;
803: vals[map[2]] = vals[map[1]] = vals[map[0]];
804: } else if (*num == 4) { /* corner shared by four elements */
805: num++;
806: vals[map[3]] = vals[map[2]] = vals[map[1]] = vals[map[0]];
807: } else { /* general case ... odd geoms ... 3D*/
808: num++;
809: tmp = *(vals + *map++);
810: while (*map >= 0) *(vals + *map++) = tmp;
811: }
812: }
813: return 0;
814: }
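/* example: a map {3, 7, 9, -1} with *num==3 says vals[3], vals[7] and vals[9] carry the
   same global id; the routine above copies vals[3] (which already holds the reduced
   value) into the other two slots */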
816: /******************************************************************************/
817: static PetscErrorCode PCTFS_gs_gop_local_plus(PCTFS_gs_id *gs, PetscScalar *vals)
818: {
819: PetscInt *num, *map, **reduce;
820: PetscScalar tmp;
822: num = gs->num_local_reduce;
823: reduce = gs->local_reduce;
824: while ((map = *reduce)) {
825: /* wall */
826: if (*num == 2) {
827: num++; reduce++;
828: vals[map[1]] = vals[map[0]] += vals[map[1]];
829: } else if (*num == 3) { /* corner shared by three elements */
830: num++; reduce++;
831: vals[map[2]]=vals[map[1]]=vals[map[0]]+=(vals[map[1]]+vals[map[2]]);
832: } else if (*num == 4) { /* corner shared by four elements */
833: num++; reduce++;
834: vals[map[1]]=vals[map[2]]=vals[map[3]]=vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
835: } else { /* general case ... odd geoms ... 3D*/
836: num++;
837: tmp = 0.0;
838: while (*map >= 0) tmp += *(vals + *map++);
840: map = *reduce++;
841: while (*map >= 0) *(vals + *map++) = tmp;
842: }
843: }
844: return 0;
845: }
847: /******************************************************************************/
848: static PetscErrorCode PCTFS_gs_gop_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals)
849: {
850: PetscInt *num, *map, **reduce;
851: PetscScalar *base;
853: num = gs->num_gop_local_reduce;
854: reduce = gs->gop_local_reduce;
855: while ((map = *reduce++)) {
856: /* wall */
857: if (*num == 2) {
858: num++;
859: vals[map[0]] += vals[map[1]];
860: } else if (*num == 3) { /* corner shared by three elements */
861: num++;
862: vals[map[0]] += (vals[map[1]] + vals[map[2]]);
863: } else if (*num == 4) { /* corner shared by four elements */
864: num++;
865: vals[map[0]] += (vals[map[1]] + vals[map[2]] + vals[map[3]]);
866: } else { /* general case ... odd geoms ... 3D*/
867: num++;
868: base = vals + *map++;
869: while (*map >= 0) *base += *(vals + *map++);
870: }
871: }
872: return 0;
873: }
875: /******************************************************************************/
876: PetscErrorCode PCTFS_gs_free(PCTFS_gs_id *gs)
877: {
878: PetscInt i;
880: MPI_Comm_free(&gs->PCTFS_gs_comm);
881: if (gs->nghs) free((void*) gs->nghs);
882: if (gs->pw_nghs) free((void*) gs->pw_nghs);
884: /* tree */
885: if (gs->max_left_over) {
886: if (gs->tree_elms) free((void*) gs->tree_elms);
887: if (gs->tree_buf) free((void*) gs->tree_buf);
888: if (gs->tree_work) free((void*) gs->tree_work);
889: if (gs->tree_map_in) free((void*) gs->tree_map_in);
890: if (gs->tree_map_out) free((void*) gs->tree_map_out);
891: }
893: /* pairwise info */
894: if (gs->num_pairs) {
895: /* should be NULL already */
896: if (gs->ngh_buf) free((void*) gs->ngh_buf);
897: if (gs->elms) free((void*) gs->elms);
898: if (gs->local_elms) free((void*) gs->local_elms);
899: if (gs->companion) free((void*) gs->companion);
901: /* only set if pairwise */
902: if (gs->vals) free((void*) gs->vals);
903: if (gs->in) free((void*) gs->in);
904: if (gs->out) free((void*) gs->out);
905: if (gs->msg_ids_in) free((void*) gs->msg_ids_in);
906: if (gs->msg_ids_out) free((void*) gs->msg_ids_out);
907: if (gs->pw_vals) free((void*) gs->pw_vals);
908: if (gs->pw_elm_list) free((void*) gs->pw_elm_list);
909: if (gs->node_list) {
910: for (i=0;i<gs->num_pairs;i++) {
911: if (gs->node_list[i]) {
912: free((void*) gs->node_list[i]);
913: }
914: }
915: free((void*) gs->node_list);
916: }
917: if (gs->msg_sizes) free((void*) gs->msg_sizes);
918: if (gs->pair_list) free((void*) gs->pair_list);
919: }
921: /* local info */
922: if (gs->num_local_total>=0) {
923: for (i=0;i<gs->num_local_total+1;i++) {
924: if (gs->num_gop_local_reduce[i]) free((void*) gs->gop_local_reduce[i]);
925: }
926: }
928: /* if intersection tree/pairwise and local isn't empty */
929: if (gs->gop_local_reduce) free((void*) gs->gop_local_reduce);
930: if (gs->num_gop_local_reduce) free((void*) gs->num_gop_local_reduce);
932: free((void*) gs);
933: return 0;
934: }
936: /******************************************************************************/
937: PetscErrorCode PCTFS_gs_gop_vec(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt step)
938: {
939: switch (*op) {
940: case '+':
941: PCTFS_gs_gop_vec_plus(gs,vals,step);
942: break;
943: default:
944: PetscInfo(0,"PCTFS_gs_gop_vec() :: %c is not a valid op\n",op[0]);
945: PetscInfo(0,"PCTFS_gs_gop_vec() :: default :: plus\n");
946: PCTFS_gs_gop_vec_plus(gs,vals,step);
947: break;
948: }
949: return 0;
950: }
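/* note: vals is interpreted as blocks of length step, one block per local element, so
   the values for element i live in vals[i*step .. i*step+step-1]; e.g. for 3 dof per
   node call PCTFS_gs_gop_vec(gsh, u, "+", 3) with u of length 3*n (gsh, u, n are
   hypothetical caller names) */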
952: /******************************************************************************/
953: static PetscErrorCode PCTFS_gs_gop_vec_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
954: {
957: /* local only operations!!! */
958: if (gs->num_local) PCTFS_gs_gop_vec_local_plus(gs,vals,step);
960: /* if intersection tree/pairwise and local isn't empty */
961: if (gs->num_local_gop) {
962: PCTFS_gs_gop_vec_local_in_plus(gs,vals,step);
964: /* pairwise */
965: if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);
967: /* tree */
968: else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
970: PCTFS_gs_gop_vec_local_out(gs,vals,step);
971: } else { /* if intersection tree/pairwise and local is empty */
972: /* pairwise */
973: if (gs->num_pairs) PCTFS_gs_gop_vec_pairwise_plus(gs,vals,step);
975: /* tree */
976: else if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,vals,step);
977: }
978: return 0;
979: }
981: /******************************************************************************/
982: static PetscErrorCode PCTFS_gs_gop_vec_local_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
983: {
984: PetscInt *num, *map, **reduce;
985: PetscScalar *base;
987: num = gs->num_local_reduce;
988: reduce = gs->local_reduce;
989: while ((map = *reduce)) {
990: base = vals + map[0] * step;
992: /* wall */
993: if (*num == 2) {
994: num++; reduce++;
995: PCTFS_rvec_add (base,vals+map[1]*step,step);
996: PCTFS_rvec_copy(vals+map[1]*step,base,step);
997: } else if (*num == 3) { /* corner shared by three elements */
998: num++; reduce++;
999: PCTFS_rvec_add (base,vals+map[1]*step,step);
1000: PCTFS_rvec_add (base,vals+map[2]*step,step);
1001: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1002: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1003: } else if (*num == 4) { /* corner shared by four elements */
1004: num++; reduce++;
1005: PCTFS_rvec_add (base,vals+map[1]*step,step);
1006: PCTFS_rvec_add (base,vals+map[2]*step,step);
1007: PCTFS_rvec_add (base,vals+map[3]*step,step);
1008: PCTFS_rvec_copy(vals+map[3]*step,base,step);
1009: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1010: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1011: } else { /* general case ... odd geoms ... 3D */
1012: num++;
1013: while (*++map >= 0) PCTFS_rvec_add (base,vals+*map*step,step);
1015: map = *reduce;
1016: while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1018: reduce++;
1019: }
1020: }
1021: return 0;
1022: }
1024: /******************************************************************************/
1025: static PetscErrorCode PCTFS_gs_gop_vec_local_in_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1026: {
1027: PetscInt *num, *map, **reduce;
1028: PetscScalar *base;
1030: num = gs->num_gop_local_reduce;
1031: reduce = gs->gop_local_reduce;
1032: while ((map = *reduce++)) {
1033: base = vals + map[0] * step;
1035: /* wall */
1036: if (*num == 2) {
1037: num++;
1038: PCTFS_rvec_add(base,vals+map[1]*step,step);
1039: } else if (*num == 3) { /* corner shared by three elements */
1040: num++;
1041: PCTFS_rvec_add(base,vals+map[1]*step,step);
1042: PCTFS_rvec_add(base,vals+map[2]*step,step);
1043: } else if (*num == 4) { /* corner shared by four elements */
1044: num++;
1045: PCTFS_rvec_add(base,vals+map[1]*step,step);
1046: PCTFS_rvec_add(base,vals+map[2]*step,step);
1047: PCTFS_rvec_add(base,vals+map[3]*step,step);
1048: } else { /* general case ... odd geoms ... 3D*/
1049: num++;
1050: while (*++map >= 0) PCTFS_rvec_add(base,vals+*map*step,step);
1051: }
1052: }
1053: return 0;
1054: }
1056: /******************************************************************************/
1057: static PetscErrorCode PCTFS_gs_gop_vec_local_out(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1058: {
1059: PetscInt *num, *map, **reduce;
1060: PetscScalar *base;
1062: num = gs->num_gop_local_reduce;
1063: reduce = gs->gop_local_reduce;
1064: while ((map = *reduce++)) {
1065: base = vals + map[0] * step;
1067: /* wall */
1068: if (*num == 2) {
1069: num++;
1070: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1071: } else if (*num == 3) { /* corner shared by three elements */
1072: num++;
1073: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1074: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1075: } else if (*num == 4) { /* corner shared by four elements */
1076: num++;
1077: PCTFS_rvec_copy(vals+map[1]*step,base,step);
1078: PCTFS_rvec_copy(vals+map[2]*step,base,step);
1079: PCTFS_rvec_copy(vals+map[3]*step,base,step);
1080: } else { /* general case ... odd geoms ... 3D*/
1081: num++;
1082: while (*++map >= 0) PCTFS_rvec_copy(vals+*map*step,base,step);
1083: }
1084: }
1085: return 0;
1086: }
1088: /******************************************************************************/
1089: static PetscErrorCode PCTFS_gs_gop_vec_pairwise_plus(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt step)
1090: {
1091: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1092: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1093: PetscInt *pw, *list, *size, **nodes;
1094: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1095: MPI_Status status;
1096: PetscBLASInt i1 = 1,dstep;
1098: /* strip and load s */
1099: msg_list = list = gs->pair_list;
1100: msg_size = size = gs->msg_sizes;
1101: msg_nodes = nodes = gs->node_list;
1102: iptr = pw = gs->pw_elm_list;
1103: dptr1 = dptr3 = gs->pw_vals;
1104: msg_ids_in = ids_in = gs->msg_ids_in;
1105: msg_ids_out = ids_out = gs->msg_ids_out;
1106: dptr2 = gs->out;
1107: in1=in2 = gs->in;
1109: /* post the receives */
1110: /* msg_nodes=nodes; */
1111: do {
1112: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1113: second one *list and do list++ afterwards */
1114: MPI_Irecv(in1, *size *step, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1115: list++;msg_ids_in++;
1116: in1 += *size++ *step;
1117: } while (*++msg_nodes);
1118: msg_nodes=nodes;
1120: /* load gs values into in out gs buffers */
1121: while (*iptr >= 0) {
1122: PCTFS_rvec_copy(dptr3,in_vals + *iptr*step,step);
1123: dptr3+=step;
1124: iptr++;
1125: }
1127: /* load out buffers and post the sends */
1128: while ((iptr = *msg_nodes++)) {
1129: dptr3 = dptr2;
1130: while (*iptr >= 0) {
1131: PCTFS_rvec_copy(dptr2,dptr1 + *iptr*step,step);
1132: dptr2+=step;
1133: iptr++;
1134: }
1135: MPI_Isend(dptr3, *msg_size *step, MPIU_SCALAR, *msg_list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1136: msg_size++; msg_list++;msg_ids_out++;
1137: }
1139: /* tree */
1140: if (gs->max_left_over) PCTFS_gs_gop_vec_tree_plus(gs,in_vals,step);
1142: /* process the received data */
1143: msg_nodes=nodes;
1144: while ((iptr = *nodes++)) {
1145: PetscScalar d1 = 1.0;
1147: /* Should I check the return value of MPI_Wait() or status? */
1148: /* Can this loop be replaced by a call to MPI_Waitall()? */
1149: MPI_Wait(ids_in, &status);
1150: ids_in++;
1151: while (*iptr >= 0) {
1152: PetscBLASIntCast(step,&dstep);
1153: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&dstep,&d1,in2,&i1,dptr1 + *iptr*step,&i1));
1154: in2+=step;
1155: iptr++;
1156: }
1157: }
1159: /* replace vals */
1160: while (*pw >= 0) {
1161: PCTFS_rvec_copy(in_vals + *pw*step,dptr1,step);
1162: dptr1+=step;
1163: pw++;
1164: }
1166: /* clear isend message handles */
1167: /* This changed for clarity though it could be the same */
1169: /* Should I check the return value of MPI_Wait() or status? */
1170: /* Can this loop be replaced by a call to MPI_Waitall()? */
1171: while (*msg_nodes++) {
1172: MPI_Wait(ids_out, &status);
1173: ids_out++;
1174: }
1175: return 0;
1176: }
1178: /******************************************************************************/
1179: static PetscErrorCode PCTFS_gs_gop_vec_tree_plus(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt step)
1180: {
1181: PetscInt size, *in, *out;
1182: PetscScalar *buf, *work;
1183: PetscInt op[] = {GL_ADD,0};
1184: PetscBLASInt i1 = 1;
1185: PetscBLASInt dstep;
1187: /* copy over to local variables */
1188: in = gs->tree_map_in;
1189: out = gs->tree_map_out;
1190: buf = gs->tree_buf;
1191: work = gs->tree_work;
1192: size = gs->tree_nel*step;
1194: /* zero out collection buffer */
1195: PCTFS_rvec_zero(buf,size);
1197: /* copy over my contributions */
1198: while (*in >= 0) {
1199: PetscBLASIntCast(step,&dstep);
1200: PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,vals + *in++ * step,&i1,buf + *out++ * step,&i1));
1201: }
1203: /* perform fan in/out on full buffer */
1204: /* must change PCTFS_grop to handle the blas */
1205: PCTFS_grop(buf,work,size,op);
1207: /* reset */
1208: in = gs->tree_map_in;
1209: out = gs->tree_map_out;
1211: /* get the portion of the results I need */
1212: while (*in >= 0) {
1213: PetscBLASIntCast(step,&dstep);
1214: PetscStackCallBLAS("BLAScopy",BLAScopy_(&dstep,buf + *out++ * step,&i1,vals + *in++ * step,&i1));
1215: }
1216: return 0;
1217: }
1219: /******************************************************************************/
1220: PetscErrorCode PCTFS_gs_gop_hc(PCTFS_gs_id *gs, PetscScalar *vals, const char *op, PetscInt dim)
1221: {
1222: switch (*op) {
1223: case '+':
1224: PCTFS_gs_gop_plus_hc(gs,vals,dim);
1225: break;
1226: default:
1227: PetscInfo(0,"PCTFS_gs_gop_hc() :: %c is not a valid op\n",op[0]);
1228: PetscInfo(0,"PCTFS_gs_gop_hc() :: default :: plus\n");
1229: PCTFS_gs_gop_plus_hc(gs,vals,dim);
1230: break;
1231: }
1232: return 0;
1233: }
1235: /******************************************************************************/
1236: static PetscErrorCode PCTFS_gs_gop_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1237: {
1238: /* if there's nothing to do return */
1239: if (dim<=0) return 0;
1241: /* can't do more dimensions then exist */
1242: dim = PetscMin(dim,PCTFS_i_log2_num_nodes);
1244: /* local only operations!!! */
1245: if (gs->num_local) PCTFS_gs_gop_local_plus(gs,vals);
1247: /* if intersection tree/pairwise and local isn't empty */
1248: if (gs->num_local_gop) {
1249: PCTFS_gs_gop_local_in_plus(gs,vals);
1251: /* pairwise will do tree inside ... */
1252: if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree only */
1253: else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1255: PCTFS_gs_gop_local_out(gs,vals);
1256: } else { /* if intersection tree/pairwise and local is empty */
1257: /* pairwise will do tree inside */
1258: if (gs->num_pairs) PCTFS_gs_gop_pairwise_plus_hc(gs,vals,dim); /* tree */
1259: else if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,vals,dim);
1260: }
1261: return 0;
1262: }
1264: /******************************************************************************/
1265: static PetscErrorCode PCTFS_gs_gop_pairwise_plus_hc(PCTFS_gs_id *gs, PetscScalar *in_vals, PetscInt dim)
1266: {
1267: PetscScalar *dptr1, *dptr2, *dptr3, *in1, *in2;
1268: PetscInt *iptr, *msg_list, *msg_size, **msg_nodes;
1269: PetscInt *pw, *list, *size, **nodes;
1270: MPI_Request *msg_ids_in, *msg_ids_out, *ids_in, *ids_out;
1271: MPI_Status status;
1272: PetscInt i, mask=1;
1274: for (i=1; i<dim; i++) { mask<<=1; mask++; }
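  /* mask now has the low dim bits set, so (PCTFS_my_id|mask)==(*list|mask) selects only
     neighbors that agree with this rank on every bit above dim, i.e. ranks in the same
     dim-dimensional subcube */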
1276: /* strip and load s */
1277: msg_list = list = gs->pair_list;
1278: msg_size = size = gs->msg_sizes;
1279: msg_nodes = nodes = gs->node_list;
1280: iptr = pw = gs->pw_elm_list;
1281: dptr1 = dptr3 = gs->pw_vals;
1282: msg_ids_in = ids_in = gs->msg_ids_in;
1283: msg_ids_out = ids_out = gs->msg_ids_out;
1284: dptr2 = gs->out;
1285: in1 = in2 = gs->in;
1287: /* post the receives */
1288: /* msg_nodes=nodes; */
1289: do {
1290: /* Should MPI_ANY_SOURCE be replaced by *list ? In that case do the
1291: second one *list and do list++ afterwards */
1292: if ((PCTFS_my_id|mask)==(*list|mask)) {
1293: MPI_Irecv(in1, *size, MPIU_SCALAR, MPI_ANY_SOURCE, MSGTAG1 + *list, gs->PCTFS_gs_comm, msg_ids_in);
1294: list++; msg_ids_in++;in1 += *size++;
1295: } else { list++; size++; }
1296: } while (*++msg_nodes);
1298: /* load gs values into in out gs buffers */
1299: while (*iptr >= 0) *dptr3++ = *(in_vals + *iptr++);
1301: /* load out buffers and post the sends */
1302: msg_nodes=nodes;
1303: list = msg_list;
1304: while ((iptr = *msg_nodes++)) {
1305: if ((PCTFS_my_id|mask)==(*list|mask)) {
1306: dptr3 = dptr2;
1307: while (*iptr >= 0) *dptr2++ = *(dptr1 + *iptr++);
1308: /* CHECK PERSISTENT COMMS MODE FOR ALL THIS STUFF */
1309: /* is msg_ids_out++ correct? */
1310: MPI_Isend(dptr3, *msg_size, MPIU_SCALAR, *list, MSGTAG1+PCTFS_my_id, gs->PCTFS_gs_comm, msg_ids_out);
1311: msg_size++;list++;msg_ids_out++;
1312: } else {list++; msg_size++;}
1313: }
1315: /* do the tree while we're waiting */
1316: if (gs->max_left_over) PCTFS_gs_gop_tree_plus_hc(gs,in_vals,dim);
1318: /* process the received data */
1319: msg_nodes=nodes;
1320: list = msg_list;
1321: while ((iptr = *nodes++)) {
1322: if ((PCTFS_my_id|mask)==(*list|mask)) {
1323: /* Should I check the return value of MPI_Wait() or status? */
1324: /* Can this loop be replaced by a call to MPI_Waitall()? */
1325: MPI_Wait(ids_in, &status);
1326: ids_in++;
1327: while (*iptr >= 0) *(dptr1 + *iptr++) += *in2++;
1328: }
1329: list++;
1330: }
1332: /* replace vals */
1333: while (*pw >= 0) *(in_vals + *pw++) = *dptr1++;
1335: /* clear isend message handles */
1336: /* This changed for clarity though it could be the same */
1337: while (*msg_nodes++) {
1338: if ((PCTFS_my_id|mask)==(*msg_list|mask)) {
1339: /* Should I check the return value of MPI_Wait() or status? */
1340: /* Can this loop be replaced by a call to MPI_Waitall()? */
1341: MPI_Wait(ids_out, &status);
1342: ids_out++;
1343: }
1344: msg_list++;
1345: }
1346: return 0;
1347: }
1349: /******************************************************************************/
1350: static PetscErrorCode PCTFS_gs_gop_tree_plus_hc(PCTFS_gs_id *gs, PetscScalar *vals, PetscInt dim)
1351: {
1352: PetscInt size;
1353: PetscInt *in, *out;
1354: PetscScalar *buf, *work;
1355: PetscInt op[] = {GL_ADD,0};
1357: in = gs->tree_map_in;
1358: out = gs->tree_map_out;
1359: buf = gs->tree_buf;
1360: work = gs->tree_work;
1361: size = gs->tree_nel;
1363: PCTFS_rvec_zero(buf,size);
1365: while (*in >= 0) *(buf + *out++) = *(vals + *in++);
1367: in = gs->tree_map_in;
1368: out = gs->tree_map_out;
1370: PCTFS_grop_hc(buf,work,size,op,dim);
1372: while (*in >= 0) *(vals + *in++) = *(buf + *out++);
1373: return 0;
1374: }