Actual source code: sfpack.c
1: #include "petsc/private/sfimpl.h"
2: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5: /* This is a C file that contains packing facilities, with dispatches to device if enabled. */
7: /*
8:  * MPI_Reduce_local is not really useful here because it cannot handle sparse data and it vectorizes "in the wrong direction";
9:  * therefore we pack data types manually. This file defines packing routines for the standard data types.
10: */
12: #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d
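/* For example, CPPJoin4(Pack,PetscInt,4,1) pastes its arguments into the single identifier
   Pack_PetscInt_4_1. Every Pack/Unpack/Scatter/Fetch routine below is named this way, one
   instantiation per (routine, type, block size BS, EQ) tuple. */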
14: /* Operations working like s += t */
15: #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while (0) /* binary ops in the middle such as +, *, && etc. */
16: #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while (0) /* ops like a function, such as PetscMax, PetscMin */
17: #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while (0) /* logical exclusive OR */
18: #define OP_ASSIGN(op,s,t) do {(s) = (t);} while (0)
19: /* Modeled on MPI's MAXLOC/MINLOC semantics */
20: #define OP_XLOC(op,s,t) \
21: do { \
22: if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
23: else if (!((s).u op (t).u)) s = t; \
24: } while (0)
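/* Worked example of OP_XLOC with op '>' (MAXLOC semantics on (u,i) pairs, matching MPI_MAXLOC):
     s=(5,3), t=(7,1): values differ and !(5 > 7) holds, so s becomes (7,1);
     s=(5,3), t=(5,1): values tie, so s.i = PetscMin(3,1) = 1, i.e., the smaller index wins. */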
26: /* DEF_PackFunc - macro defining a Pack routine
28: Arguments of the macro:
29: +Type Type of the basic data in an entry, i.e., int, PetscInt, PetscReal etc. It is not the type of an entry.
30: .BS Block size for vectorization. It is a factor of bs.
31: -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
33: Arguments of the Pack routine:
34: +count Number of indices in idx[].
35: .start When opt and idx are NULL, indices are contiguous and start is the first index; otherwise start is not used.
36: .opt Per-pack optimization plan. NULL means no such plan.
37: .idx Indices of the entries to be packed.
38: .link Provides a context for the current call, such as link->bs, the number of basic types in an entry. For example, if unit is MPI_2INT, then bs=2 and the basic type is int.
39: .unpacked Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
40: -packed Address of the packed data.
41: */
42: #define DEF_PackFunc(Type,BS,EQ) \
43: static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
44: { \
45: const Type *u = (const Type*)unpacked,*u2; \
46: Type *p = (Type*)packed,*p2; \
47: PetscInt i,j,k,X,Y,r,bs=link->bs; \
48: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
49: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
50: if (!idx) PetscArraycpy(p,u+start*MBS,MBS*count);/* idx[] are contiguous */ \
51: else if (opt) { /* has optimizations available */ \
52: p2 = p; \
53: for (r=0; r<opt->n; r++) { \
54: u2 = u + opt->start[r]*MBS; \
55: X = opt->X[r]; \
56: Y = opt->Y[r]; \
57: for (k=0; k<opt->dz[r]; k++) \
58: for (j=0; j<opt->dy[r]; j++) { \
59: PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS); \
60: p2 += opt->dx[r]*MBS; \
61: } \
62: } \
63: } else { \
64: for (i=0; i<count; i++) \
65: for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \
66: for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
67: p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
68: } \
69: return 0; \
70: }
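/* As an illustration (not extra generated code), DEF_PackFunc(PetscInt,2,1) defines
   Pack_PetscInt_2_1(), whose indexed branch reduces after const-propagation (M=1, MBS=BS=2) to

     for (i=0; i<count; i++)
       for (k=0; k<2; k++) p[i*2+k] = u[idx[i]*2+k];

   so the inner trip count is a compile-time constant that the compiler can unroll or vectorize. */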
72: /* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
73:    and inserts it into a sparse array.
75: Arguments:
76: .Type Type of the data
77: .BS Block size for vectorization
78: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
80: Notes:
81: This macro is not combined with DEF_UnpackAndOp because we want to use memcpy (PetscArraycpy) in this macro.
82: */
83: #define DEF_UnpackFunc(Type,BS,EQ) \
84: static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
85: { \
86: Type *u = (Type*)unpacked,*u2; \
87: const Type *p = (const Type*)packed; \
88: PetscInt i,j,k,X,Y,r,bs=link->bs; \
89: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
90: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
91: if (!idx) { \
92: u += start*MBS; \
93: if (u != p) PetscArraycpy(u,p,count*MBS); \
94: } else if (opt) { /* has optimizations available */ \
95: for (r=0; r<opt->n; r++) { \
96: u2 = u + opt->start[r]*MBS; \
97: X = opt->X[r]; \
98: Y = opt->Y[r]; \
99: for (k=0; k<opt->dz[r]; k++) \
100: for (j=0; j<opt->dy[r]; j++) { \
101: PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS); \
102: p += opt->dx[r]*MBS; \
103: } \
104: } \
105: } else { \
106: for (i=0; i<count; i++) \
107: for (j=0; j<M; j++) \
108: for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
109: } \
110: return 0; \
111: }
113: /* DEF_UnpackAndOp - macro defining an UnpackAndOp routine where Op should not be Insert
115: Arguments:
116: +Opname Name of the Op, such as Add, Mult, LAND, etc.
117: .Type Type of the data
118: .BS Block size for vectorization
119: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
120: .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
121: .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
122: */
123: #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
124: static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
125: { \
126: Type *u = (Type*)unpacked,*u2; \
127: const Type *p = (const Type*)packed; \
128: PetscInt i,j,k,X,Y,r,bs=link->bs; \
129: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
130: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
131: if (!idx) { \
132: u += start*MBS; \
133: for (i=0; i<count; i++) \
134: for (j=0; j<M; j++) \
135: for (k=0; k<BS; k++) \
136: OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
137: } else if (opt) { /* idx[] has patterns */ \
138: for (r=0; r<opt->n; r++) { \
139: u2 = u + opt->start[r]*MBS; \
140: X = opt->X[r]; \
141: Y = opt->Y[r]; \
142: for (k=0; k<opt->dz[r]; k++) \
143: for (j=0; j<opt->dy[r]; j++) { \
144: for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
145: p += opt->dx[r]*MBS; \
146: } \
147: } \
148: } else { \
149: for (i=0; i<count; i++) \
150: for (j=0; j<M; j++) \
151: for (k=0; k<BS; k++) \
152: OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
153: } \
154: return 0; \
155: }
157: #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
158: static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
159: { \
160: Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
161: PetscInt i,j,k,r,l,bs=link->bs; \
162: const PetscInt M = (EQ) ? 1 : bs/BS; \
163: const PetscInt MBS = M*BS; \
164: for (i=0; i<count; i++) { \
165: r = (!idx ? start+i : idx[i])*MBS; \
166: l = i*MBS; \
167: for (j=0; j<M; j++) \
168: for (k=0; k<BS; k++) { \
169: tmp = u[r+j*BS+k]; \
170: OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
171: p[l+j*BS+k] = tmp; \
172: } \
173: } \
174: return 0; \
175: }
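/* A FetchAndOp routine reduces the packed buffer into u while returning u's old values in the
   buffer. For example, with Op '+', BS=1 and u[r]=10, p[l]=3 on entry, it leaves u[r]=13 and
   p[l]=10 (the pre-reduction value), which is what fetch-and-op callers eventually receive. */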
177: #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
178: static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
179: { \
180: const Type *u = (const Type*)src; \
181: Type *v = (Type*)dst; \
182: PetscInt i,j,k,s,t,X,Y,bs = link->bs; \
183: const PetscInt M = (EQ) ? 1 : bs/BS; \
184: const PetscInt MBS = M*BS; \
185: if (!srcIdx) { /* src is contiguous */ \
186: u += srcStart*MBS; \
187: CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u); \
188: } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
189: u += srcOpt->start[0]*MBS; \
190: v += dstStart*MBS; \
191: X = srcOpt->X[0]; Y = srcOpt->Y[0]; \
192: for (k=0; k<srcOpt->dz[0]; k++) \
193: for (j=0; j<srcOpt->dy[0]; j++) { \
194: for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
195: v += srcOpt->dx[0]*MBS; \
196: } \
197: } else { /* all other cases */ \
198: for (i=0; i<count; i++) { \
199: s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
200: t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
201: for (j=0; j<M; j++) \
202: for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
203: } \
204: } \
205: return 0; \
206: }
208: #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
209: static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
210: { \
211: Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
212: const Type *ldata = (const Type*)leafdata; \
213: PetscInt i,j,k,r,l,bs = link->bs; \
214: const PetscInt M = (EQ) ? 1 : bs/BS; \
215: const PetscInt MBS = M*BS; \
216: for (i=0; i<count; i++) { \
217: r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
218: l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
219: for (j=0; j<M; j++) \
220: for (k=0; k<BS; k++) { \
221: lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
222: OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
223: } \
224: } \
225: return 0; \
226: }
228: /* Pack, Unpack/Fetch ops */
229: #define DEF_Pack(Type,BS,EQ) \
230: DEF_PackFunc(Type,BS,EQ) \
231: DEF_UnpackFunc(Type,BS,EQ) \
232: DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
233: static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
234: link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \
235: link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
236: link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
237: }
239: /* Add, Mult ops */
240: #define DEF_Add(Type,BS,EQ) \
241: DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
242: DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
243: DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
244: DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
245: DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
246: DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
247: static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
248: link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \
249: link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \
250: link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \
251: link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \
252: link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \
253: link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
254: }
256: /* Max, Min ops */
257: #define DEF_Cmp(Type,BS,EQ) \
258: DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
259: DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
260: DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
261: DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
262: static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
263: link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
264: link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
265: link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \
266: link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \
267: }
269: /* Logical ops.
270:    The Op argument of OP_LXOR is not actually used; it should be empty, but we pass || to avoid
271:    the compilation warning "empty macro arguments are undefined in ISO C90"
272: */
273: #define DEF_Log(Type,BS,EQ) \
274: DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
275: DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
276: DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
277: DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
278: DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
279: DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
280: static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
281: link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
282: link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \
283: link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
284: link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
285: link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
286: link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
287: }
289: /* Bitwise ops */
290: #define DEF_Bit(Type,BS,EQ) \
291: DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
292: DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
293: DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
294: DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
295: DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
296: DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
297: static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
298: link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
299: link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \
300: link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
301: link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
302: link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
303: link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
304: }
306: /* Maxloc, Minloc ops */
307: #define DEF_Xloc(Type,BS,EQ) \
308: DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
309: DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
310: DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
311: DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
312: static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
313: link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
314: link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
315: link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
316: link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
317: }
319: #define DEF_IntegerType(Type,BS,EQ) \
320: DEF_Pack(Type,BS,EQ) \
321: DEF_Add(Type,BS,EQ) \
322: DEF_Cmp(Type,BS,EQ) \
323: DEF_Log(Type,BS,EQ) \
324: DEF_Bit(Type,BS,EQ) \
325: static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
326: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
327: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
328: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
329: CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
330: CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
331: }
333: #define DEF_RealType(Type,BS,EQ) \
334: DEF_Pack(Type,BS,EQ) \
335: DEF_Add(Type,BS,EQ) \
336: DEF_Cmp(Type,BS,EQ) \
337: static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
338: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
339: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
340: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
341: }
343: #if defined(PETSC_HAVE_COMPLEX)
344: #define DEF_ComplexType(Type,BS,EQ) \
345: DEF_Pack(Type,BS,EQ) \
346: DEF_Add(Type,BS,EQ) \
347: static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
348: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
349: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
350: }
351: #endif
353: #define DEF_DumbType(Type,BS,EQ) \
354: DEF_Pack(Type,BS,EQ) \
355: static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
356: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
357: }
359: /* Maxloc, Minloc */
360: #define DEF_PairType(Type,BS,EQ) \
361: DEF_Pack(Type,BS,EQ) \
362: DEF_Xloc(Type,BS,EQ) \
363: static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
364: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
365: CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
366: }
368: DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */
369: DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
370: DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
371: DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
372: DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
373: DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
374: DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
375: DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
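/* Dispatch example: a unit of 12 MPIU_INTs (bs=12) fails the nPetscInt==8 and nPetscInt%8==0
   tests in PetscSFLinkSetUp_Host() below but passes nPetscInt%4==0, so it is served by
   PackInit_IntegerType_PetscInt_4_0(): BS=4 is the vector width, and M=bs/BS=3 stays a
   run-time value since EQ=0. */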
377: #if defined(PETSC_USE_64BIT_INDICES) /* No need (though it would be harmless) to generate redundant functions when PetscInt is just int */
378: DEF_IntegerType(int,1,1)
379: DEF_IntegerType(int,2,1)
380: DEF_IntegerType(int,4,1)
381: DEF_IntegerType(int,8,1)
382: DEF_IntegerType(int,1,0)
383: DEF_IntegerType(int,2,0)
384: DEF_IntegerType(int,4,0)
385: DEF_IntegerType(int,8,0)
386: #endif
388: /* The typedefs are used to get typenames without spaces that CPPJoin4 can handle */
389: typedef signed char SignedChar;
390: DEF_IntegerType(SignedChar,1,1)
391: DEF_IntegerType(SignedChar,2,1)
392: DEF_IntegerType(SignedChar,4,1)
393: DEF_IntegerType(SignedChar,8,1)
394: DEF_IntegerType(SignedChar,1,0)
395: DEF_IntegerType(SignedChar,2,0)
396: DEF_IntegerType(SignedChar,4,0)
397: DEF_IntegerType(SignedChar,8,0)
399: typedef unsigned char UnsignedChar;
400: DEF_IntegerType(UnsignedChar,1,1)
401: DEF_IntegerType(UnsignedChar,2,1)
402: DEF_IntegerType(UnsignedChar,4,1)
403: DEF_IntegerType(UnsignedChar,8,1)
404: DEF_IntegerType(UnsignedChar,1,0)
405: DEF_IntegerType(UnsignedChar,2,0)
406: DEF_IntegerType(UnsignedChar,4,0)
407: DEF_IntegerType(UnsignedChar,8,0)
409: DEF_RealType(PetscReal,1,1)
410: DEF_RealType(PetscReal,2,1)
411: DEF_RealType(PetscReal,4,1)
412: DEF_RealType(PetscReal,8,1)
413: DEF_RealType(PetscReal,1,0)
414: DEF_RealType(PetscReal,2,0)
415: DEF_RealType(PetscReal,4,0)
416: DEF_RealType(PetscReal,8,0)
418: #if defined(PETSC_HAVE_COMPLEX)
419: DEF_ComplexType(PetscComplex,1,1)
420: DEF_ComplexType(PetscComplex,2,1)
421: DEF_ComplexType(PetscComplex,4,1)
422: DEF_ComplexType(PetscComplex,8,1)
423: DEF_ComplexType(PetscComplex,1,0)
424: DEF_ComplexType(PetscComplex,2,0)
425: DEF_ComplexType(PetscComplex,4,0)
426: DEF_ComplexType(PetscComplex,8,0)
427: #endif
429: #define PairType(Type1,Type2) Type1##_##Type2
430: typedef struct {int u; int i;} PairType(int,int);
431: typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
432: DEF_PairType(PairType(int,int),1,1)
433: DEF_PairType(PairType(PetscInt,PetscInt),1,1)
435: /* If we don't know the basic type, we treat it as a stream of chars or ints */
436: DEF_DumbType(char,1,1)
437: DEF_DumbType(char,2,1)
438: DEF_DumbType(char,4,1)
439: DEF_DumbType(char,1,0)
440: DEF_DumbType(char,2,0)
441: DEF_DumbType(char,4,0)
443: typedef int DumbInt; /* A name distinct from the 'int' used above; the typename is used to build routine names. */
444: DEF_DumbType(DumbInt,1,1)
445: DEF_DumbType(DumbInt,2,1)
446: DEF_DumbType(DumbInt,4,1)
447: DEF_DumbType(DumbInt,8,1)
448: DEF_DumbType(DumbInt,1,0)
449: DEF_DumbType(DumbInt,2,0)
450: DEF_DumbType(DumbInt,4,0)
451: DEF_DumbType(DumbInt,8,0)
453: PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink link)
454: {
455: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
456: PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8; /* 8 = 2 directions x 2 mpi-memtypes x 2 (data used directly as buffer or not) */
458: /* Destroy device-specific fields */
459: if (link->deviceinited) (*link->Destroy)(sf,link);
461: /* Destroy host related fields */
462: if (!link->isbuiltin) MPI_Type_free(&link->unit);
463: if (!link->use_nvshmem) {
464: for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
465: if (link->reqs[i] != MPI_REQUEST_NULL) MPI_Request_free(&link->reqs[i]);
466: }
467: PetscFree(link->reqs);
468: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
469: PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);
470: PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);
471: }
472: }
473: PetscFree(link);
474: return 0;
475: }
477: PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
478: {
479: PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);
480: #if defined(PETSC_HAVE_NVSHMEM)
481: {
482: PetscBool use_nvshmem;
483: PetscSFLinkNvshmemCheck(sf,rootmtype,rootdata,leafmtype,leafdata,&use_nvshmem);
484: if (use_nvshmem) {
485: PetscSFLinkCreate_NVSHMEM(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);
486: return 0;
487: }
488: }
489: #endif
490: PetscSFLinkCreate_MPI(sf,unit,rootmtype,rootdata,leafmtype,leafdata,op,sfop,mylink);
491: return 0;
492: }
494: /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
495: If the sf uses persistent requests and the requests have not been initialized, then initialize them.
496: */
497: PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
498: {
499: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
500: PetscInt i,j,cnt,nrootranks,ndrootranks,nleafranks,ndleafranks;
501: const PetscInt *rootoffset,*leafoffset;
502: MPI_Aint disp;
503: MPI_Comm comm = PetscObjectComm((PetscObject)sf);
504: MPI_Datatype unit = link->unit;
505: const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
506: const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
508: /* Initialize persistent MPI requests if not done yet. Currently only SFBasic uses persistent MPI requests. */
509: if (sf->persistent) {
510: if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
511: PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);
512: if (direction == PETSCSF_LEAF2ROOT) {
513: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
514: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
515: cnt = rootoffset[i+1]-rootoffset[i];
516: MPIU_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
517: }
518: } else { /* PETSCSF_ROOT2LEAF */
519: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
520: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
521: cnt = rootoffset[i+1]-rootoffset[i];
522: MPIU_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,cnt,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
523: }
524: }
525: link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
526: }
528: if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
529: PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);
530: if (direction == PETSCSF_LEAF2ROOT) {
531: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
532: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
533: cnt = leafoffset[i+1]-leafoffset[i];
534: MPIU_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
535: }
536: } else { /* PETSCSF_ROOT2LEAF */
537: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
538: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
539: cnt = leafoffset[i+1]-leafoffset[i];
540: MPIU_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,cnt,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
541: }
542: }
543: link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
544: }
545: }
546: if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
547: if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
548: if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
549: if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
550: return 0;
551: }
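/* A minimal usage sketch (hypothetical caller, not verbatim PETSc code): a bcast-style routine
   using persistent requests would do roughly

     void        *rootbuf,*leafbuf;
     MPI_Request *rootreqs,*leafreqs;
     PetscSFLinkGetMPIBuffersAndRequests(sf,link,PETSCSF_ROOT2LEAF,&rootbuf,&leafbuf,&rootreqs,&leafreqs);
     PetscSFLinkPackRootData(sf,link,PETSCSF_REMOTE,rootdata);
     MPI_Startall(nleafreqs,leafreqs);   /* leaves start their persistent receives */
     MPI_Startall(nrootreqs,rootreqs);   /* roots start their persistent sends */

   where nrootreqs/nleafreqs stand for the per-side request counts held by the SF. */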
553: PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
554: {
555: PetscSFLink link,*p;
556: PetscSF_Basic *bas=(PetscSF_Basic*)sf->data;
558: /* Look for a matching link (same unit and data pointers) in the in-use list */
559: for (p=&bas->inuse; (link=*p); p=&link->next) {
560: PetscBool match;
561: MPIPetsc_Type_compare(unit,link->unit,&match);
562: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
563: switch (cmode) {
564: case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
565: case PETSC_USE_POINTER: break;
566: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
567: }
568: *mylink = link;
569: return 0;
570: }
571: }
572: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
573: }
575: PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *mylink)
576: {
577: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
578: PetscSFLink link = *mylink;
580: link->rootdata = NULL;
581: link->leafdata = NULL;
582: link->next = bas->avail;
583: bas->avail = link;
584: *mylink = NULL;
585: return 0;
586: }
588: /* Error out on unsupported overlapped communications */
589: PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
590: {
591: PetscSFLink link,*p;
592: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
593: PetscBool match;
595: if (PetscDefined(USE_DEBUG)) {
596: /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
597: the potential overlap since this process does not participate in the communication; such overlap is harmless.
598: */
599: if (rootdata || leafdata) {
600: for (p=&bas->inuse; (link=*p); p=&link->next) {
601: MPIPetsc_Type_compare(unit,link->unit,&match);
602: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type is not supported",rootdata,leafdata);
603: }
604: }
605: }
606: return 0;
607: }
609: static PetscErrorCode PetscSFLinkMemcpy_Host(PetscSFLink link,PetscMemType dstmtype,void* dst,PetscMemType srcmtype,const void*src,size_t n)
610: {
611: if (n) PetscMemcpy(dst,src,n);
612: return 0;
613: }
615: PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
616: {
617: PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
618: PetscBool is2Int,is2PetscInt;
619: PetscMPIInt ni,na,nd,combiner;
620: #if defined(PETSC_HAVE_COMPLEX)
621: PetscInt nPetscComplex=0;
622: #endif
624: MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);
625: MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
626: /* MPI_CHAR is treated below as a dumb type that does not support reduction, per the MPI standard */
627: MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);
628: MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
629: MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
630: #if defined(PETSC_HAVE_COMPLEX)
631: MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
632: #endif
633: MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
634: MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
635: /* TODO: shall we also handle Fortran MPI_2REAL? */
636: MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);
637: link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
638: link->bs = 1; /* default */
640: if (is2Int) {
641: PackInit_PairType_int_int_1_1(link);
642: link->bs = 1;
643: link->unitbytes = 2*sizeof(int);
644: link->isbuiltin = PETSC_TRUE; /* unit is MPI builtin */
645: link->basicunit = MPI_2INT;
646: link->unit = MPI_2INT;
647: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
648: PackInit_PairType_PetscInt_PetscInt_1_1(link);
649: link->bs = 1;
650: link->unitbytes = 2*sizeof(PetscInt);
651: link->basicunit = MPIU_2INT;
652: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
653: link->unit = MPIU_2INT;
654: } else if (nPetscReal) {
655: if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
656: else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
657: else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
658: else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
659: link->bs = nPetscReal;
660: link->unitbytes = nPetscReal*sizeof(PetscReal);
661: link->basicunit = MPIU_REAL;
662: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
663: } else if (nPetscInt) {
664: if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
665: else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
666: else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
667: else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
668: link->bs = nPetscInt;
669: link->unitbytes = nPetscInt*sizeof(PetscInt);
670: link->basicunit = MPIU_INT;
671: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
672: #if defined(PETSC_USE_64BIT_INDICES)
673: } else if (nInt) {
674: if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
675: else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
676: else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
677: else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
678: link->bs = nInt;
679: link->unitbytes = nInt*sizeof(int);
680: link->basicunit = MPI_INT;
681: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
682: #endif
683: } else if (nSignedChar) {
684: if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
685: else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
686: else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
687: else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
688: link->bs = nSignedChar;
689: link->unitbytes = nSignedChar*sizeof(SignedChar);
690: link->basicunit = MPI_SIGNED_CHAR;
691: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
692: } else if (nUnsignedChar) {
693: if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
694: else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
695: else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
696: else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
697: link->bs = nUnsignedChar;
698: link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
699: link->basicunit = MPI_UNSIGNED_CHAR;
700: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
701: #if defined(PETSC_HAVE_COMPLEX)
702: } else if (nPetscComplex) {
703: if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
704: else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
705: else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
706: else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
707: link->bs = nPetscComplex;
708: link->unitbytes = nPetscComplex*sizeof(PetscComplex);
709: link->basicunit = MPIU_COMPLEX;
710: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
711: #endif
712: } else {
713: MPI_Aint lb,nbyte;
714: MPI_Type_get_extent(unit,&lb,&nbyte);
716: if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
717: if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
718: else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
719: else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
720: link->bs = nbyte;
721: link->unitbytes = nbyte;
722: link->basicunit = MPI_BYTE;
723: } else {
724: nInt = nbyte / sizeof(int);
725: if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
726: else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
727: else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
728: else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
729: link->bs = nInt;
730: link->unitbytes = nbyte;
731: link->basicunit = MPI_INT;
732: }
733: if (link->isbuiltin) link->unit = unit;
734: }
736: if (!link->isbuiltin) MPI_Type_dup(unit,&link->unit);
738: link->Memcpy = PetscSFLinkMemcpy_Host;
739: return 0;
740: }
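/* Dispatch example for the setup above (a sketch, assuming a user-built contiguous type):

     MPI_Datatype unit;
     MPI_Type_contiguous(6,MPIU_REAL,&unit);
     MPI_Type_commit(&unit);

   yields nPetscReal=6, so PackInit_RealType_PetscReal_2_0() is selected (6 is neither 8 nor 4
   nor a multiple of either, but 6%2==0), giving bs=6, unitbytes=6*sizeof(PetscReal) and
   basicunit=MPIU_REAL; since the combiner is not MPI_COMBINER_NAMED, the unit is duplicated
   with MPI_Type_dup() at the end. */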
742: PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
743: {
744: *UnpackAndOp = NULL;
745: if (PetscMemTypeHost(mtype)) {
746: if (op == MPI_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
747: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
748: else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
749: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
750: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
751: else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
752: else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
753: else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
754: else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
755: else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
756: else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
757: else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
758: else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
759: }
760: #if defined(PETSC_HAVE_DEVICE)
761: else if (PetscMemTypeDevice(mtype) && !atomic) {
762: if (op == MPI_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
763: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
764: else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
765: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
766: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
767: else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
768: else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
769: else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
770: else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
771: else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
772: else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
773: else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
774: else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
775: } else if (PetscMemTypeDevice(mtype) && atomic) {
776: if (op == MPI_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
777: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
778: else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
779: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
780: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
781: else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
782: else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
783: else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
784: else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
785: else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
786: else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
787: else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
788: else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
789: }
790: #endif
791: return 0;
792: }
794: PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
795: {
796: *ScatterAndOp = NULL;
797: if (PetscMemTypeHost(mtype)) {
798: if (op == MPI_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
799: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
800: else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
801: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
802: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
803: else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
804: else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
805: else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
806: else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
807: else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
808: else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
809: else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
810: else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
811: }
812: #if defined(PETSC_HAVE_DEVICE)
813: else if (PetscMemTypeDevice(mtype) && !atomic) {
814: if (op == MPI_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
815: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
816: else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
817: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
818: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
819: else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
820: else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
821: else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
822: else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
823: else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
824: else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
825: else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
826: else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
827: } else if (PetscMemTypeDevice(mtype) && atomic) {
828: if (op == MPI_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
829: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
830: else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
831: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
832: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
833: else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
834: else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
835: else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
836: else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
837: else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
838: else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
839: else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
840: else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
841: }
842: #endif
843: return 0;
844: }
846: PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
847: {
848: *FetchAndOp = NULL;
850: if (PetscMemTypeHost(mtype)) *FetchAndOp = link->h_FetchAndAdd;
851: #if defined(PETSC_HAVE_DEVICE)
852: else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOp = link->d_FetchAndAdd;
853: else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOp = link->da_FetchAndAdd;
854: #endif
855: return 0;
856: }
858: PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
859: {
860: *FetchAndOpLocal = NULL;
862: if (PetscMemTypeHost(mtype)) *FetchAndOpLocal = link->h_FetchAndAddLocal;
863: #if defined(PETSC_HAVE_DEVICE)
864: else if (PetscMemTypeDevice(mtype) && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
865: else if (PetscMemTypeDevice(mtype) && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
866: #endif
867: return 0;
868: }
870: static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
871: {
872: PetscLogDouble flops;
873: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
875: if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
876: flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
877: #if defined(PETSC_HAVE_DEVICE)
878: if (PetscMemTypeDevice(link->rootmtype)) PetscLogGpuFlops(flops); else
879: #endif
880: PetscLogFlops(flops);
881: }
882: return 0;
883: }
885: static inline PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
886: {
887: PetscLogDouble flops;
889: if (op != MPI_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
890: flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
891: #if defined(PETSC_HAVE_DEVICE)
892: if (PetscMemTypeDevice(link->leafmtype)) PetscLogGpuFlops(flops); else
893: #endif
894: PetscLogFlops(flops);
895: }
896: return 0;
897: }
899: /* When SF cannot find a proper UnpackAndOp() in the link, it falls back to MPI_Reduce_local.
900: Input Parameters:
901: +sf - The StarForest
902: .link - The link
903: .count - Number of entries to unpack
904: .start - The first index, significant when indices=NULL
905: .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
906: .buf - A contiguous buffer to unpack from
907: -op - Operation after unpack
909: Output Parameters:
910: .data - The data to unpack to
911: */
912: static inline PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
913: {
914: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
915: {
916: PetscInt i;
917: if (indices) {
918: /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc., it operates on the
919: basic units of a root/leaf element-wise. Otherwise, op is meant to operate on a whole root/leaf.
920: */
921: for (i=0; i<count; i++) MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);
922: } else {
923: MPIU_Reduce_local(buf,(char*)data+start*link->unitbytes,count,link->unit,op);
924: }
925: }
926: return 0;
927: #else
928: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
929: #endif
930: }
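/* A case that reaches this fallback (a sketch; myfunc is a hypothetical user reduction function):

     MPI_Op userop;
     MPI_Op_create(myfunc,1,&userop);

   has no h_UnpackAndOp table entry, so PetscSFLinkGetUnpackAndOp() returns NULL and the unpack
   routines below funnel the op through MPI_Reduce_local(), using link->unit so the user op
   sees whole root/leaf entries. */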
932: static inline PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
933: {
934: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
935: {
936: PetscInt i,disp;
937: if (!srcIdx) {
938: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);
939: } else {
940: for (i=0; i<count; i++) {
941: disp = dstIdx? dstIdx[i] : dstStart + i;
942: MPIU_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);
943: }
944: }
945: }
946: return 0;
947: #else
948: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
949: #endif
950: }
952: /*=============================================================================
953: Pack/Unpack/Fetch/Scatter routines
954: ============================================================================*/
956: /* Pack rootdata to rootbuf
957: Input Parameters:
958: + sf - The SF this packing works on.
959: . link - It gives the memtype of the roots and also provides root buffer.
960: . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
961: - rootdata - Where to read the roots.
963: Notes:
964: When rootdata can be used directly as the root buffer, the routine is almost a no-op. After the call, the root data is
965: in a place where the underlying MPI is ready to access it (whether use_gpu_aware_mpi is set or not).
966: */
967: PetscErrorCode PetscSFLinkPackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
968: {
969: const PetscInt *rootindices = NULL;
970: PetscInt count,start;
971: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
972: PetscMemType rootmtype = link->rootmtype;
973: PetscSFPackOpt opt = NULL;
975: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
976: if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip packing */
977: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
978: PetscSFLinkGetPack(link,rootmtype,&Pack);
979: (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
980: }
981: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
982: return 0;
983: }
985: /* Pack leafdata to leafbuf */
986: PetscErrorCode PetscSFLinkPackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
987: {
988: const PetscInt *leafindices = NULL;
989: PetscInt count,start;
990: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
991: PetscMemType leafmtype = link->leafmtype;
992: PetscSFPackOpt opt = NULL;
994: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
995: if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip packing */
996: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
997: PetscSFLinkGetPack(link,leafmtype,&Pack);
998: (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
999: }
1000: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1001: return 0;
1002: }
1004: /* Pack rootdata to rootbuf, which are in the same memory space */
1005: PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1006: {
1007: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1009: if (scope == PETSCSF_REMOTE) { /* Sync the device if rootdata is not on PETSc's default stream */
1010: if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) (*link->SyncDevice)(link);
1011: if (link->PrePack) (*link->PrePack)(sf,link,PETSCSF_ROOT2LEAF); /* Used by SF nvshmem */
1012: }
1013: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1014: if (bas->rootbuflen[scope]) PetscSFLinkPackRootData_Private(sf,link,scope,rootdata);
1015: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1016: return 0;
1017: }
1018: /* Pack leafdata to leafbuf, which are in the same memory space */
1019: PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1020: {
1021: if (scope == PETSCSF_REMOTE) {
1022: if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) (*link->SyncDevice)(link);
1023: if (link->PrePack) (*link->PrePack)(sf,link,PETSCSF_LEAF2ROOT); /* Used by SF nvshmem */
1024: }
1025: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1026: if (sf->leafbuflen[scope]) PetscSFLinkPackLeafData_Private(sf,link,scope,leafdata);
1027: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1028: return 0;
1029: }
1031: PetscErrorCode PetscSFLinkUnpackRootData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1032: {
1033: const PetscInt *rootindices = NULL;
1034: PetscInt count,start;
1035: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1036: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1037: PetscMemType rootmtype = link->rootmtype;
1038: PetscSFPackOpt opt = NULL;
1040: if (!link->rootdirect[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1041: PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);
1042: if (UnpackAndOp) {
1043: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1044: (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1045: } else {
1046: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);
1047: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);
1048: }
1049: }
1050: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1051: return 0;
1052: }
1054: PetscErrorCode PetscSFLinkUnpackLeafData_Private(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1055: {
1056: const PetscInt *leafindices = NULL;
1057: PetscInt count,start;
1058: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1059: PetscMemType leafmtype = link->leafmtype;
1060: PetscSFPackOpt opt = NULL;
1062: if (!link->leafdirect[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1063: PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);
1064: if (UnpackAndOp) {
1065: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1066: (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1067: } else {
1068: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);
1069: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);
1070: }
1071: }
1072: PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);
1073: return 0;
1074: }
1075: /* Unpack rootbuf to rootdata, which are in the same memory space */
1076: PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1077: {
1078: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1080: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1081: if (bas->rootbuflen[scope]) PetscSFLinkUnpackRootData_Private(sf,link,scope,rootdata,op);
1082: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1083: if (scope == PETSCSF_REMOTE) {
1084: if (link->PostUnpack) (*link->PostUnpack)(sf,link,PETSCSF_LEAF2ROOT); /* Used by SF nvshmem */
1085: if (PetscMemTypeDevice(link->rootmtype) && link->SyncDevice && sf->unknown_input_stream) (*link->SyncDevice)(link);
1086: }
1087: return 0;
1088: }
1090: /* Unpack leafbuf to leafdata for remote (common case) or local (rare case when rootmtype != leafmtype) */
1091: PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1092: {
1093: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1094: if (sf->leafbuflen[scope]) PetscSFLinkUnpackLeafData_Private(sf,link,scope,leafdata,op);
1095: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1096: if (scope == PETSCSF_REMOTE) {
1097: if (link->PostUnpack) (*link->PostUnpack)(sf,link,PETSCSF_ROOT2LEAF); /* Used by SF nvshmem */
1098: if (PetscMemTypeDevice(link->leafmtype) && link->SyncDevice && sf->unknown_input_stream) (*link->SyncDevice)(link);
1099: }
1100: return 0;
1101: }
1103: /* FetchAndOp rootdata with rootbuf; it is a kind of Unpack on rootdata, except that it also updates rootbuf */
1104: PetscErrorCode PetscSFLinkFetchAndOpRemote(PetscSF sf,PetscSFLink link,void *rootdata,MPI_Op op)
1105: {
1106: const PetscInt *rootindices = NULL;
1107: PetscInt count,start;
1108: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1109: PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1110: PetscMemType rootmtype = link->rootmtype;
1111: PetscSFPackOpt opt = NULL;
1113: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1114: if (bas->rootbuflen[PETSCSF_REMOTE]) {
1115: /* Do FetchAndOp on rootdata with rootbuf */
1116: PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_REMOTE],&FetchAndOp);
1117: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_REMOTE,&count,&start,&opt,&rootindices);
1118: (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[PETSCSF_REMOTE][rootmtype]);
1119: }
1120: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,PETSCSF_REMOTE,op);
1121: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1122: return 0;
1123: }
1125: PetscErrorCode PetscSFLinkScatterLocal(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void *rootdata,void *leafdata,MPI_Op op)
1126: {
1127: const PetscInt *rootindices = NULL,*leafindices = NULL;
1128: PetscInt count,rootstart,leafstart;
1129: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1130: PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1131: PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype,srcmtype,dstmtype;
1132: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1133: PetscInt buflen = sf->leafbuflen[PETSCSF_LOCAL];
1134: char *srcbuf = NULL,*dstbuf = NULL;
1135: PetscBool dstdups;
1137: if (!buflen) return 0;
1138: if (rootmtype != leafmtype) { /* The cross memory space local scatter is done by pack, copy and unpack */
1139: if (direction == PETSCSF_ROOT2LEAF) {
1140: PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);
1141: srcmtype = rootmtype;
1142: srcbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1143: dstmtype = leafmtype;
1144: dstbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1145: } else {
1146: PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);
1147: srcmtype = leafmtype;
1148: srcbuf = link->leafbuf[PETSCSF_LOCAL][leafmtype];
1149: dstmtype = rootmtype;
1150: dstbuf = link->rootbuf[PETSCSF_LOCAL][rootmtype];
1151: }
1152: (*link->Memcpy)(link,dstmtype,dstbuf,srcmtype,srcbuf,buflen*link->unitbytes);
1153: /* If the above is a device-to-host copy, we have to sync the stream before accessing the buffer on the host */
1154: if (PetscMemTypeHost(dstmtype)) (*link->SyncStream)(link);
1155: if (direction == PETSCSF_ROOT2LEAF) {
1156: PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);
1157: } else {
1158: PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);
1159: }
1160: } else {
1161: dstdups = (direction == PETSCSF_ROOT2LEAF) ? sf->leafdups[PETSCSF_LOCAL] : bas->rootdups[PETSCSF_LOCAL];
1162: dstmtype = (direction == PETSCSF_ROOT2LEAF) ? link->leafmtype : link->rootmtype;
1163: PetscSFLinkGetScatterAndOp(link,dstmtype,op,dstdups,&ScatterAndOp);
1164: if (ScatterAndOp) {
1165: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1166: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1167: if (direction == PETSCSF_ROOT2LEAF) {
1168: (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);
1169: } else {
1170: (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);
1171: }
1172: } else {
1173: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1174: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1175: if (direction == PETSCSF_ROOT2LEAF) {
1176: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);
1177: } else {
1178: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);
1179: }
1180: }
1181: }
1182: return 0;
1183: }
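/* Illustrative sketch, not part of PETSc: when rootdata and leafdata live in the same memory
   space, the ScatterAndOp routine obtained above moves entries directly with no intermediate
   buffer; for op = MPI_REPLACE on PetscScalar with both ends indexed, it is morally equivalent
   to the hypothetical helper below (block size and per-rank opt plans omitted). */
#if 0
static void ScatterAndInsert(PetscInt count,const PetscInt *rootidx,const PetscScalar *root,const PetscInt *leafidx,PetscScalar *leaf)
{
  PetscInt i;
  for (i=0; i<count; i++) leaf[leafidx[i]] = root[rootidx[i]]; /* direct root-to-leaf copy */
}
#endif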
1185: /* Fetch rootdata to leafdata and leafupdate locally */
1186: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1187: {
1188: const PetscInt *rootindices = NULL,*leafindices = NULL;
1189: PetscInt count,rootstart,leafstart;
1190: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1191: PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1192: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1193: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1195: if (!bas->rootbuflen[PETSCSF_LOCAL]) return 0;
1196: if (rootmtype != leafmtype) {
1197: /* The local communication has to go through pack and unpack */
1198: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite sides of CPU and GPU");
1199: } else {
1200: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1201: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1202: PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);
1203: (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);
1204: }
1205: return 0;
1206: }
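/* Illustrative sketch, not part of PETSc: per entry, FetchAndOpLocal with op = MPI_SUM behaves as

     leafupdate[leafidx[i]] = rootdata[rootidx[i]];   (fetch the old root value)
     rootdata[rootidx[i]]  += leafdata[leafidx[i]];   (then op the leaf value into the root)

   i.e., leafupdate receives the root values as they were before the op was applied. */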
1208: /*
1209: Create per-rank pack/unpack optimizations based on index patterns (a worked example follows this routine)
1211: Input Parameters:
1212: + n - Number of destination ranks
1213: . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1214: - idx - [*] Array storing indices
1216: Output Parameters:
1217: + out - Pack optimizations. NULL if no optimization is available.
1218: */
1219: PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1220: {
1221: PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1222: PetscBool optimizable = PETSC_TRUE;
1223: PetscSFPackOpt opt;
1225: PetscMalloc1(1,&opt);
1226: PetscMalloc1(7*n+2,&opt->array);
1227: opt->n = opt->array[0] = n;
1228: opt->offset = opt->array + 1;
1229: opt->start = opt->array + n + 2;
1230: opt->dx = opt->array + 2*n + 2;
1231: opt->dy = opt->array + 3*n + 2;
1232: opt->dz = opt->array + 4*n + 2;
1233: opt->X = opt->array + 5*n + 2;
1234: opt->Y = opt->array + 6*n + 2;
1236: for (r=0; r<n; r++) { /* For each destination rank */
1237: m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1238: p = offset[r];
1239: start = idx[p]; /* First index for this rank */
1240: p++;
1242: /* Search in X dimension */
1243: for (dx=1; dx<m; dx++,p++) {
1244: if (start+dx != idx[p]) break;
1245: }
1247: dydz = m/dx;
1248: X = dydz > 1 ? (idx[p]-start) : dx;
1249: /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1250: if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1251: for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1252: for (i=0; i<dx; i++,p++) {
1253: if (start+X*dy+i != idx[p]) {
1254: if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1255: else goto Z_dimension;
1256: }
1257: }
1258: }
1260: Z_dimension:
1261: dz = m/(dx*dy);
1262: Y = dz > 1 ? (idx[p]-start)/X : dy;
1263: /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1264: if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1265: for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1266: for (j=0; j<dy; j++) {
1267: for (i=0; i<dx; i++,p++) {
1268: if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1269: }
1270: }
1271: }
1272: opt->start[r] = start;
1273: opt->dx[r] = dx;
1274: opt->dy[r] = dy;
1275: opt->dz[r] = dz;
1276: opt->X[r] = X;
1277: opt->Y[r] = Y;
1278: }
1280: finish:
1281: /* If not optimizable, free arrays to save memory */
1282: if (!n || !optimizable) {
1283: PetscFree(opt->array);
1284: PetscFree(opt);
1285: *out = NULL;
1286: } else {
1287: opt->offset[0] = 0;
1288: for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1289: *out = opt;
1290: }
1291: return 0;
1292: }
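/* Worked example, illustrative: for a single rank with

     idx[] = {100,101,102, 110,111,112, 120,121,122}   (m = 9)

   the X search above stops at dx = 3 (the contiguous run 100..102), so X = idx[3]-start = 110-100 = 10;
   the Y search then matches the rows starting at 110 and 120, giving dy = 3, hence dz = m/(dx*dy) = 1
   and Y = dy = 3. The indices thus form a 3x3 sub-block of a row-major array with row length X = 10,
   and the Pack routine can issue dy*dz = 3 contiguous copies of dx = 3 entries each instead of 9
   scalar gathers. */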
1294: static inline PetscErrorCode PetscSFDestroyPackOpt(PetscSF sf,PetscMemType mtype,PetscSFPackOpt *out)
1295: {
1296: PetscSFPackOpt opt = *out;
1298: if (opt) {
1299: PetscSFFree(sf,mtype,opt->array);
1300: PetscFree(opt);
1301: *out = NULL;
1302: }
1303: return 0;
1304: }
1306: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1307: {
1308: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1309: PetscInt i,j;
1311: /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1312: for (i=0; i<2; i++) { /* Set defaults */
1313: sf->leafstart[i] = 0;
1314: sf->leafcontig[i] = PETSC_TRUE;
1315: sf->leafdups[i] = PETSC_FALSE;
1316: bas->rootstart[i] = 0;
1317: bas->rootcontig[i] = PETSC_TRUE;
1318: bas->rootdups[i] = PETSC_FALSE;
1319: }
1321: sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1322: sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1324: if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1325: if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1327: /* Are leaf indices for self and remote contiguous? If so, pack/unpack reduces to a single contiguous copy */
1328: for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1329: if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1330: }
1331: for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1332: if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1333: }
1335: /* If not, see if we can have per-rank optimizations by doing index analysis */
1336: if (!sf->leafcontig[0]) PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);
1337: if (!sf->leafcontig[1]) PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);
1339: /* Are root indices for self and remote contiguous? */
1340: bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1341: bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1343: if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1344: if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1346: for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1347: if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1348: }
1349: for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1350: if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1351: }
1353: if (!bas->rootcontig[0]) PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);
1354: if (!bas->rootcontig[1]) PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);
1356: /* Check dups in indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
1357: if (PetscDefined(HAVE_DEVICE)) {
1358: PetscBool ismulti = (sf->multi == sf) ? PETSC_TRUE : PETSC_FALSE;
1359: if (!sf->leafcontig[0] && !ismulti) PetscCheckDupsInt(sf->leafbuflen[0],sf->rmine,&sf->leafdups[0]);
1360: if (!sf->leafcontig[1] && !ismulti) PetscCheckDupsInt(sf->leafbuflen[1],sf->rmine+sf->roffset[sf->ndranks],&sf->leafdups[1]);
1361: if (!bas->rootcontig[0] && !ismulti) PetscCheckDupsInt(bas->rootbuflen[0],bas->irootloc,&bas->rootdups[0]);
1362: if (!bas->rootcontig[1] && !ismulti) PetscCheckDupsInt(bas->rootbuflen[1],bas->irootloc+bas->ioffset[bas->ndiranks],&bas->rootdups[1]);
1363: }
1364: return 0;
1365: }
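/* Example, illustrative: leaf indices {2,5,2} contain a duplicate, so two device threads
   unpacking with, say, op = MPI_SUM could update leafdata[2] concurrently and must use atomic
   updates; when the check above proves there are no duplicates, the kernels can use plain
   loads and stores instead. */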
1367: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1368: {
1369: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1370: PetscInt i;
1372: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1373: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);
1374: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);
1375: #if defined(PETSC_HAVE_DEVICE)
1376: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);
1377: PetscSFDestroyPackOpt(sf,PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);
1378: #endif
1379: }
1380: return 0;
1381: }