Actual source code: sfpack.c
petsc-3.13.0 2020-03-29
2: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: #include <../src/vec/is/sf/impls/basic/sfbasic.h>
5: #if defined(PETSC_HAVE_CUDA)
6: #include <cuda_runtime.h>
7: #endif
8: /*
9: * MPI_Reduce_local is not really useful because it can't handle sparse data and it vectorizes "in the wrong direction",
10: * therefore we pack data types manually. This file defines packing routines for the standard data types.
11: */
13: #define CPPJoin4(a,b,c,d) a##_##b##_##c##_##d
15: /* Operations working like s += t */
16: #define OP_BINARY(op,s,t) do {(s) = (s) op (t); } while(0) /* binary ops in the middle such as +, *, && etc. */
17: #define OP_FUNCTION(op,s,t) do {(s) = op((s),(t)); } while(0) /* ops like a function, such as PetscMax, PetscMin */
18: #define OP_LXOR(op,s,t) do {(s) = (!(s)) != (!(t));} while(0) /* logical exclusive OR */
19: #define OP_ASSIGN(op,s,t) do {(s) = (t);} while(0)
20: /* Ref MPI MAXLOC */
21: #define OP_XLOC(op,s,t) \
22: do { \
23: if ((s).u == (t).u) (s).i = PetscMin((s).i,(t).i); \
24: else if (!((s).u op (t).u)) s = t; \
25: } while(0)
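/* Editor's illustrative sketch (not part of the original file, excluded from compilation): how the OP_*
   helpers above are applied. CPPJoin4 glues the pieces of a routine name, e.g. CPPJoin4(Pack,PetscReal,8,1)
   becomes Pack_PetscReal_8_1. OP_XLOC implements MPI_MAXLOC/MINLOC semantics on a (value,index) pair:
   keep the winning value, and on ties keep the smaller index. The pair type and helper below are hypothetical. */
#if 0
typedef struct {int u; int i;} IntIntPairSketch;
static void MaxlocSketch(IntIntPairSketch *s,const IntIntPairSketch *t)
{
  IntIntPairSketch a = *s,b = *t;
  OP_XLOC(>,a,b); /* a now holds the MAXLOC of (a,b): larger .u wins; equal .u keeps the smaller .i */
  *s = a;
}
#endif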
27: /* DEF_PackFunc - macro defining a Pack routine
29: Arguments of the macro:
30: +Type Type of the basic data in an entry, e.g., int, PetscInt, PetscReal. It is not the type of an entry.
31: .BS Block size for vectorization. It is a factor of bs.
32: -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const to help compiler optimizations. See below.
34: Arguments of the Pack routine:
35: +count Number of indices in idx[].
36: .start When opt and idx are NULL, it means indices are contiguous & start is the first index; otherwise, not used.
37: .opt Per-pack optimization plan. NULL means no such plan.
38: .idx Indices of entries to be packed.
39: .link Provides a context for the current call, e.g., link->bs, the number of basic types in an entry. For example, if unit is MPI_2INT, then bs=2 and the basic type is int.
40: .unpacked Address of the unpacked data. The entries to be packed are unpacked[idx[i]], for i in [0,count).
41: -packed Address of the packed data.
42: */
43: #define DEF_PackFunc(Type,BS,EQ) \
44: static PetscErrorCode CPPJoin4(Pack,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed) \
45: { \
47: const Type *u = (const Type*)unpacked,*u2; \
48: Type *p = (Type*)packed,*p2; \
49: PetscInt i,j,k,X,Y,r,bs=link->bs; \
50: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
51: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
53: if (!idx) {PetscArraycpy(p,u+start*MBS,MBS*count);}/* idx[] are contiguous */ \
54: else if (opt) { /* has optimizations available */ \
55: p2 = p; \
56: for (r=0; r<opt->n; r++) { \
57: u2 = u + opt->start[r]*MBS; \
58: X = opt->X[r]; \
59: Y = opt->Y[r]; \
60: for (k=0; k<opt->dz[r]; k++) \
61: for (j=0; j<opt->dy[r]; j++) { \
62: PetscArraycpy(p2,u2+(X*Y*k+X*j)*MBS,opt->dx[r]*MBS); \
63: p2 += opt->dx[r]*MBS; \
64: } \
65: } \
66: } else { \
67: for (i=0; i<count; i++) \
68: for (j=0; j<M; j++) /* Decent compilers should eliminate this loop when M = const 1 */ \
69: for (k=0; k<BS; k++) /* Compiler either unrolls (BS=1) or vectorizes (BS=2,4,8,etc) this loop */ \
70: p[i*MBS+j*BS+k] = u[idx[i]*MBS+j*BS+k]; \
71: } \
72: return(0); \
73: }
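/* Editor's illustrative sketch (not part of the original file, excluded from compilation): roughly what
   DEF_PackFunc(PetscReal,2,1) generates for the plain indexed branch, with the macro machinery removed.
   Because EQ=1, M=1 and MBS=2 are compile-time constants, so the inner loop over k can be unrolled or
   vectorized. The "_sketch" name is hypothetical. */
#if 0
static PetscErrorCode Pack_PetscReal_2_1_sketch(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,const void *unpacked,void *packed)
{
  const PetscReal *u = (const PetscReal*)unpacked;
  PetscReal       *p = (PetscReal*)packed;
  PetscInt         i,k;
  for (i=0; i<count; i++)
    for (k=0; k<2; k++) p[i*2+k] = u[idx[i]*2+k]; /* copy entry idx[i] into slot i of the contiguous buffer */
  return(0);
}
#endif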
75: /* DEF_UnpackFunc - macro defining an UnpackAndInsert routine that unpacks data from a contiguous buffer
76: and inserts it into a sparse array.
78: Arguments:
79: +Type Type of the data
80: .BS Block size for vectorization
81: -EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
83: Notes:
84: This macro is not combined with DEF_UnpackAndOp because we want to use memcpy in this macro.
85: */
86: #define DEF_UnpackFunc(Type,BS,EQ) \
87: static PetscErrorCode CPPJoin4(UnpackAndInsert,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
88: { \
90: Type *u = (Type*)unpacked,*u2; \
91: const Type *p = (const Type*)packed; \
92: PetscInt i,j,k,X,Y,r,bs=link->bs; \
93: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
94: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
96: if (!idx) { \
97: u += start*MBS; \
98: if (u != p) {PetscArraycpy(u,p,count*MBS);} \
99: } else if (opt) { /* has optimizations available */ \
100: for (r=0; r<opt->n; r++) { \
101: u2 = u + opt->start[r]*MBS; \
102: X = opt->X[r]; \
103: Y = opt->Y[r]; \
104: for (k=0; k<opt->dz[r]; k++) \
105: for (j=0; j<opt->dy[r]; j++) { \
106: PetscArraycpy(u2+(X*Y*k+X*j)*MBS,p,opt->dx[r]*MBS); \
107: p += opt->dx[r]*MBS; \
108: } \
109: } \
110: } else { \
111: for (i=0; i<count; i++) \
112: for (j=0; j<M; j++) \
113: for (k=0; k<BS; k++) u[idx[i]*MBS+j*BS+k] = p[i*MBS+j*BS+k]; \
114: } \
115: return(0); \
116: }
118: /* DEF_UnpackAndOp - macro defining a UnpackAndOp routine where Op should not be Insert
120: Arguments:
121: +Opname Name of the Op, such as Add, Mult, LAND, etc.
122: .Type Type of the data
123: .BS Block size for vectorization
124: .EQ (bs == BS) ? 1 : 0. EQ is a compile-time const.
125: .Op Operator for the op, such as +, *, &&, ||, PetscMax, PetscMin, etc.
126: .OpApply Macro defining application of the op. Could be OP_BINARY, OP_FUNCTION, OP_LXOR
127: */
128: #define DEF_UnpackAndOp(Type,BS,EQ,Opname,Op,OpApply) \
129: static PetscErrorCode CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,const void *packed) \
130: { \
131: Type *u = (Type*)unpacked,*u2; \
132: const Type *p = (const Type*)packed; \
133: PetscInt i,j,k,X,Y,r,bs=link->bs; \
134: const PetscInt M = (EQ) ? 1 : bs/BS; /* If EQ, then M=1 enables compiler's const-propagation */ \
135: const PetscInt MBS = M*BS; /* MBS=bs. We turn MBS into a compile time const when EQ=1. */ \
137: if (!idx) { \
138: u += start*MBS; \
139: for (i=0; i<count; i++) \
140: for (j=0; j<M; j++) \
141: for (k=0; k<BS; k++) \
142: OpApply(Op,u[i*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
143: } else if (opt) { /* idx[] has patterns */ \
144: for (r=0; r<opt->n; r++) { \
145: u2 = u + opt->start[r]*MBS; \
146: X = opt->X[r]; \
147: Y = opt->Y[r]; \
148: for (k=0; k<opt->dz[r]; k++) \
149: for (j=0; j<opt->dy[r]; j++) { \
150: for (i=0; i<opt->dx[r]*MBS; i++) OpApply(Op,u2[(X*Y*k+X*j)*MBS+i],p[i]); \
151: p += opt->dx[r]*MBS; \
152: } \
153: } \
154: } else { \
155: for (i=0; i<count; i++) \
156: for (j=0; j<M; j++) \
157: for (k=0; k<BS; k++) \
158: OpApply(Op,u[idx[i]*MBS+j*BS+k],p[i*MBS+j*BS+k]); \
159: } \
160: return(0); \
161: }
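/* Editor's illustrative sketch (not part of the original file, excluded from compilation): the effect of an
   instantiation such as DEF_UnpackAndOp(PetscReal,2,1,Add,+,OP_BINARY), i.e. UnpackAndAdd_PetscReal_2_1(),
   on the plain indexed branch. The helper name is hypothetical. */
#if 0
static void UnpackAndAddSketch(PetscReal *u,const PetscReal *p,PetscInt count,const PetscInt *idx)
{
  PetscInt i,k;
  for (i=0; i<count; i++)
    for (k=0; k<2; k++) u[idx[i]*2+k] += p[i*2+k]; /* OP_BINARY with +: accumulate buffer slot i into entry idx[i] */
}
#endif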
163: #define DEF_FetchAndOp(Type,BS,EQ,Opname,Op,OpApply) \
164: static PetscErrorCode CPPJoin4(FetchAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt start,PetscSFPackOpt opt,const PetscInt *idx,void *unpacked,void *packed) \
165: { \
166: Type *u = (Type*)unpacked,*p = (Type*)packed,tmp; \
167: PetscInt i,j,k,r,l,bs=link->bs; \
168: const PetscInt M = (EQ) ? 1 : bs/BS; \
169: const PetscInt MBS = M*BS; \
171: for (i=0; i<count; i++) { \
172: r = (!idx ? start+i : idx[i])*MBS; \
173: l = i*MBS; \
174: for (j=0; j<M; j++) \
175: for (k=0; k<BS; k++) { \
176: tmp = u[r+j*BS+k]; \
177: OpApply(Op,u[r+j*BS+k],p[l+j*BS+k]); \
178: p[l+j*BS+k] = tmp; \
179: } \
180: } \
181: return(0); \
182: }
184: #define DEF_ScatterAndOp(Type,BS,EQ,Opname,Op,OpApply) \
185: static PetscErrorCode CPPJoin4(ScatterAnd##Opname,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt srcStart,PetscSFPackOpt srcOpt,const PetscInt *srcIdx,const void *src,PetscInt dstStart,PetscSFPackOpt dstOpt,const PetscInt *dstIdx,void *dst) \
186: { \
188: const Type *u = (const Type*)src; \
189: Type *v = (Type*)dst; \
190: PetscInt i,j,k,s,t,X,Y,bs = link->bs; \
191: const PetscInt M = (EQ) ? 1 : bs/BS; \
192: const PetscInt MBS = M*BS; \
194: if (!srcIdx) { /* src is contiguous */ \
195: u += srcStart*MBS; \
196: CPPJoin4(UnpackAnd##Opname,Type,BS,EQ)(link,count,dstStart,dstOpt,dstIdx,dst,u); \
197: } else if (srcOpt && !dstIdx) { /* src is 3D, dst is contiguous */ \
198: u += srcOpt->start[0]*MBS; \
199: v += dstStart*MBS; \
200: X = srcOpt->X[0]; Y = srcOpt->Y[0]; \
201: for (k=0; k<srcOpt->dz[0]; k++) \
202: for (j=0; j<srcOpt->dy[0]; j++) { \
203: for (i=0; i<srcOpt->dx[0]*MBS; i++) OpApply(Op,v[i],u[(X*Y*k+X*j)*MBS+i]); \
204: v += srcOpt->dx[0]*MBS; \
205: } \
206: } else { /* all other cases */ \
207: for (i=0; i<count; i++) { \
208: s = (!srcIdx ? srcStart+i : srcIdx[i])*MBS; \
209: t = (!dstIdx ? dstStart+i : dstIdx[i])*MBS; \
210: for (j=0; j<M; j++) \
211: for (k=0; k<BS; k++) OpApply(Op,v[t+j*BS+k],u[s+j*BS+k]); \
212: } \
213: } \
214: return(0); \
215: }
217: #define DEF_FetchAndOpLocal(Type,BS,EQ,Opname,Op,OpApply) \
218: static PetscErrorCode CPPJoin4(FetchAnd##Opname##Local,Type,BS,EQ)(PetscSFLink link,PetscInt count,PetscInt rootstart,PetscSFPackOpt rootopt,const PetscInt *rootidx,void *rootdata,PetscInt leafstart,PetscSFPackOpt leafopt,const PetscInt *leafidx,const void *leafdata,void *leafupdate) \
219: { \
220: Type *rdata = (Type*)rootdata,*lupdate = (Type*)leafupdate; \
221: const Type *ldata = (const Type*)leafdata; \
222: PetscInt i,j,k,r,l,bs = link->bs; \
223: const PetscInt M = (EQ) ? 1 : bs/BS; \
224: const PetscInt MBS = M*BS; \
226: for (i=0; i<count; i++) { \
227: r = (rootidx ? rootidx[i] : rootstart+i)*MBS; \
228: l = (leafidx ? leafidx[i] : leafstart+i)*MBS; \
229: for (j=0; j<M; j++) \
230: for (k=0; k<BS; k++) { \
231: lupdate[l+j*BS+k] = rdata[r+j*BS+k]; \
232: OpApply(Op,rdata[r+j*BS+k],ldata[l+j*BS+k]); \
233: } \
234: } \
235: return(0); \
236: }
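/* Editor's illustrative sketch (not part of the original file, excluded from compilation): the per-entry
   semantics of the FetchAndAddLocal routines generated above. For each matched (root,leaf) pair, leafupdate
   receives the old root value and the root is then updated with the leaf value. The helper is hypothetical. */
#if 0
static void FetchAndAddLocalSketch(PetscReal *root,PetscReal *leafupdate,const PetscReal *leaf)
{
  *leafupdate = *root;  /* fetch the old root value */
  *root      += *leaf;  /* then apply the op (+ here) with the leaf value */
}
#endif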
238: /* Pack, Unpack/Fetch ops */
239: #define DEF_Pack(Type,BS,EQ) \
240: DEF_PackFunc(Type,BS,EQ) \
241: DEF_UnpackFunc(Type,BS,EQ) \
242: DEF_ScatterAndOp(Type,BS,EQ,Insert,=,OP_ASSIGN) \
243: static void CPPJoin4(PackInit_Pack,Type,BS,EQ)(PetscSFLink link) { \
244: link->h_Pack = CPPJoin4(Pack, Type,BS,EQ); \
245: link->h_UnpackAndInsert = CPPJoin4(UnpackAndInsert,Type,BS,EQ); \
246: link->h_ScatterAndInsert= CPPJoin4(ScatterAndInsert,Type,BS,EQ); \
247: }
249: /* Add, Mult ops */
250: #define DEF_Add(Type,BS,EQ) \
251: DEF_UnpackAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
252: DEF_UnpackAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
253: DEF_FetchAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
254: DEF_ScatterAndOp (Type,BS,EQ,Add, +,OP_BINARY) \
255: DEF_ScatterAndOp (Type,BS,EQ,Mult,*,OP_BINARY) \
256: DEF_FetchAndOpLocal(Type,BS,EQ,Add, +,OP_BINARY) \
257: static void CPPJoin4(PackInit_Add,Type,BS,EQ)(PetscSFLink link) { \
258: link->h_UnpackAndAdd = CPPJoin4(UnpackAndAdd, Type,BS,EQ); \
259: link->h_UnpackAndMult = CPPJoin4(UnpackAndMult, Type,BS,EQ); \
260: link->h_FetchAndAdd = CPPJoin4(FetchAndAdd, Type,BS,EQ); \
261: link->h_ScatterAndAdd = CPPJoin4(ScatterAndAdd, Type,BS,EQ); \
262: link->h_ScatterAndMult = CPPJoin4(ScatterAndMult, Type,BS,EQ); \
263: link->h_FetchAndAddLocal = CPPJoin4(FetchAndAddLocal,Type,BS,EQ); \
264: }
266: /* Max, Min ops */
267: #define DEF_Cmp(Type,BS,EQ) \
268: DEF_UnpackAndOp (Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
269: DEF_UnpackAndOp (Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
270: DEF_ScatterAndOp(Type,BS,EQ,Max,PetscMax,OP_FUNCTION) \
271: DEF_ScatterAndOp(Type,BS,EQ,Min,PetscMin,OP_FUNCTION) \
272: static void CPPJoin4(PackInit_Compare,Type,BS,EQ)(PetscSFLink link) { \
273: link->h_UnpackAndMax = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
274: link->h_UnpackAndMin = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
275: link->h_ScatterAndMax = CPPJoin4(ScatterAndMax, Type,BS,EQ); \
276: link->h_ScatterAndMin = CPPJoin4(ScatterAndMin, Type,BS,EQ); \
277: }
279: /* Logical ops.
280: The operator passed to OP_LXOR is not used and would ideally be empty, but we pass || to avoid
281: the compilation warning "empty macro arguments are undefined in ISO C90".
282: */
283: #define DEF_Log(Type,BS,EQ) \
284: DEF_UnpackAndOp (Type,BS,EQ,LAND,&&,OP_BINARY) \
285: DEF_UnpackAndOp (Type,BS,EQ,LOR, ||,OP_BINARY) \
286: DEF_UnpackAndOp (Type,BS,EQ,LXOR,||, OP_LXOR) \
287: DEF_ScatterAndOp(Type,BS,EQ,LAND,&&,OP_BINARY) \
288: DEF_ScatterAndOp(Type,BS,EQ,LOR, ||,OP_BINARY) \
289: DEF_ScatterAndOp(Type,BS,EQ,LXOR,||, OP_LXOR) \
290: static void CPPJoin4(PackInit_Logical,Type,BS,EQ)(PetscSFLink link) { \
291: link->h_UnpackAndLAND = CPPJoin4(UnpackAndLAND, Type,BS,EQ); \
292: link->h_UnpackAndLOR = CPPJoin4(UnpackAndLOR, Type,BS,EQ); \
293: link->h_UnpackAndLXOR = CPPJoin4(UnpackAndLXOR, Type,BS,EQ); \
294: link->h_ScatterAndLAND = CPPJoin4(ScatterAndLAND,Type,BS,EQ); \
295: link->h_ScatterAndLOR = CPPJoin4(ScatterAndLOR, Type,BS,EQ); \
296: link->h_ScatterAndLXOR = CPPJoin4(ScatterAndLXOR,Type,BS,EQ); \
297: }
299: /* Bitwise ops */
300: #define DEF_Bit(Type,BS,EQ) \
301: DEF_UnpackAndOp (Type,BS,EQ,BAND,&,OP_BINARY) \
302: DEF_UnpackAndOp (Type,BS,EQ,BOR, |,OP_BINARY) \
303: DEF_UnpackAndOp (Type,BS,EQ,BXOR,^,OP_BINARY) \
304: DEF_ScatterAndOp(Type,BS,EQ,BAND,&,OP_BINARY) \
305: DEF_ScatterAndOp(Type,BS,EQ,BOR, |,OP_BINARY) \
306: DEF_ScatterAndOp(Type,BS,EQ,BXOR,^,OP_BINARY) \
307: static void CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(PetscSFLink link) { \
308: link->h_UnpackAndBAND = CPPJoin4(UnpackAndBAND, Type,BS,EQ); \
309: link->h_UnpackAndBOR = CPPJoin4(UnpackAndBOR, Type,BS,EQ); \
310: link->h_UnpackAndBXOR = CPPJoin4(UnpackAndBXOR, Type,BS,EQ); \
311: link->h_ScatterAndBAND = CPPJoin4(ScatterAndBAND,Type,BS,EQ); \
312: link->h_ScatterAndBOR = CPPJoin4(ScatterAndBOR, Type,BS,EQ); \
313: link->h_ScatterAndBXOR = CPPJoin4(ScatterAndBXOR,Type,BS,EQ); \
314: }
316: /* Maxloc, Minloc ops */
317: #define DEF_Xloc(Type,BS,EQ) \
318: DEF_UnpackAndOp (Type,BS,EQ,Max,>,OP_XLOC) \
319: DEF_UnpackAndOp (Type,BS,EQ,Min,<,OP_XLOC) \
320: DEF_ScatterAndOp(Type,BS,EQ,Max,>,OP_XLOC) \
321: DEF_ScatterAndOp(Type,BS,EQ,Min,<,OP_XLOC) \
322: static void CPPJoin4(PackInit_Xloc,Type,BS,EQ)(PetscSFLink link) { \
323: link->h_UnpackAndMaxloc = CPPJoin4(UnpackAndMax, Type,BS,EQ); \
324: link->h_UnpackAndMinloc = CPPJoin4(UnpackAndMin, Type,BS,EQ); \
325: link->h_ScatterAndMaxloc = CPPJoin4(ScatterAndMax,Type,BS,EQ); \
326: link->h_ScatterAndMinloc = CPPJoin4(ScatterAndMin,Type,BS,EQ); \
327: }
329: #define DEF_IntegerType(Type,BS,EQ) \
330: DEF_Pack(Type,BS,EQ) \
331: DEF_Add(Type,BS,EQ) \
332: DEF_Cmp(Type,BS,EQ) \
333: DEF_Log(Type,BS,EQ) \
334: DEF_Bit(Type,BS,EQ) \
335: static void CPPJoin4(PackInit_IntegerType,Type,BS,EQ)(PetscSFLink link) { \
336: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
337: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
338: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
339: CPPJoin4(PackInit_Logical,Type,BS,EQ)(link); \
340: CPPJoin4(PackInit_Bitwise,Type,BS,EQ)(link); \
341: }
343: #define DEF_RealType(Type,BS,EQ) \
344: DEF_Pack(Type,BS,EQ) \
345: DEF_Add(Type,BS,EQ) \
346: DEF_Cmp(Type,BS,EQ) \
347: static void CPPJoin4(PackInit_RealType,Type,BS,EQ)(PetscSFLink link) { \
348: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
349: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
350: CPPJoin4(PackInit_Compare,Type,BS,EQ)(link); \
351: }
353: #if defined(PETSC_HAVE_COMPLEX)
354: #define DEF_ComplexType(Type,BS,EQ) \
355: DEF_Pack(Type,BS,EQ) \
356: DEF_Add(Type,BS,EQ) \
357: static void CPPJoin4(PackInit_ComplexType,Type,BS,EQ)(PetscSFLink link) { \
358: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
359: CPPJoin4(PackInit_Add,Type,BS,EQ)(link); \
360: }
361: #endif
363: #define DEF_DumbType(Type,BS,EQ) \
364: DEF_Pack(Type,BS,EQ) \
365: static void CPPJoin4(PackInit_DumbType,Type,BS,EQ)(PetscSFLink link) { \
366: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
367: }
369: /* Maxloc, Minloc */
370: #define DEF_PairType(Type,BS,EQ) \
371: DEF_Pack(Type,BS,EQ) \
372: DEF_Xloc(Type,BS,EQ) \
373: static void CPPJoin4(PackInit_PairType,Type,BS,EQ)(PetscSFLink link) { \
374: CPPJoin4(PackInit_Pack,Type,BS,EQ)(link); \
375: CPPJoin4(PackInit_Xloc,Type,BS,EQ)(link); \
376: }
378: DEF_IntegerType(PetscInt,1,1) /* unit = 1 MPIU_INT */
379: DEF_IntegerType(PetscInt,2,1) /* unit = 2 MPIU_INTs */
380: DEF_IntegerType(PetscInt,4,1) /* unit = 4 MPIU_INTs */
381: DEF_IntegerType(PetscInt,8,1) /* unit = 8 MPIU_INTs */
382: DEF_IntegerType(PetscInt,1,0) /* unit = 1*n MPIU_INTs, n>1 */
383: DEF_IntegerType(PetscInt,2,0) /* unit = 2*n MPIU_INTs, n>1 */
384: DEF_IntegerType(PetscInt,4,0) /* unit = 4*n MPIU_INTs, n>1 */
385: DEF_IntegerType(PetscInt,8,0) /* unit = 8*n MPIU_INTs, n>1. Routines with bigger BS are tried first. */
387: #if defined(PETSC_USE_64BIT_INDICES) /* Do not need (though it is OK) to generate redundant functions if PetscInt is int */
388: DEF_IntegerType(int,1,1)
389: DEF_IntegerType(int,2,1)
390: DEF_IntegerType(int,4,1)
391: DEF_IntegerType(int,8,1)
392: DEF_IntegerType(int,1,0)
393: DEF_IntegerType(int,2,0)
394: DEF_IntegerType(int,4,0)
395: DEF_IntegerType(int,8,0)
396: #endif
398: /* The typedefs are used to get a typename without space that CPPJoin can handle */
399: typedef signed char SignedChar;
400: DEF_IntegerType(SignedChar,1,1)
401: DEF_IntegerType(SignedChar,2,1)
402: DEF_IntegerType(SignedChar,4,1)
403: DEF_IntegerType(SignedChar,8,1)
404: DEF_IntegerType(SignedChar,1,0)
405: DEF_IntegerType(SignedChar,2,0)
406: DEF_IntegerType(SignedChar,4,0)
407: DEF_IntegerType(SignedChar,8,0)
409: typedef unsigned char UnsignedChar;
410: DEF_IntegerType(UnsignedChar,1,1)
411: DEF_IntegerType(UnsignedChar,2,1)
412: DEF_IntegerType(UnsignedChar,4,1)
413: DEF_IntegerType(UnsignedChar,8,1)
414: DEF_IntegerType(UnsignedChar,1,0)
415: DEF_IntegerType(UnsignedChar,2,0)
416: DEF_IntegerType(UnsignedChar,4,0)
417: DEF_IntegerType(UnsignedChar,8,0)
419: DEF_RealType(PetscReal,1,1)
420: DEF_RealType(PetscReal,2,1)
421: DEF_RealType(PetscReal,4,1)
422: DEF_RealType(PetscReal,8,1)
423: DEF_RealType(PetscReal,1,0)
424: DEF_RealType(PetscReal,2,0)
425: DEF_RealType(PetscReal,4,0)
426: DEF_RealType(PetscReal,8,0)
428: #if defined(PETSC_HAVE_COMPLEX)
429: DEF_ComplexType(PetscComplex,1,1)
430: DEF_ComplexType(PetscComplex,2,1)
431: DEF_ComplexType(PetscComplex,4,1)
432: DEF_ComplexType(PetscComplex,8,1)
433: DEF_ComplexType(PetscComplex,1,0)
434: DEF_ComplexType(PetscComplex,2,0)
435: DEF_ComplexType(PetscComplex,4,0)
436: DEF_ComplexType(PetscComplex,8,0)
437: #endif
439: #define PairType(Type1,Type2) Type1##_##Type2
440: typedef struct {int u; int i;} PairType(int,int);
441: typedef struct {PetscInt u; PetscInt i;} PairType(PetscInt,PetscInt);
442: DEF_PairType(PairType(int,int),1,1)
443: DEF_PairType(PairType(PetscInt,PetscInt),1,1)
445: /* If we don't know the basic type, we treat it as a stream of chars or ints */
446: DEF_DumbType(char,1,1)
447: DEF_DumbType(char,2,1)
448: DEF_DumbType(char,4,1)
449: DEF_DumbType(char,1,0)
450: DEF_DumbType(char,2,0)
451: DEF_DumbType(char,4,0)
453: typedef int DumbInt; /* To have a different name than 'int' used above. The name is used to make routine names. */
454: DEF_DumbType(DumbInt,1,1)
455: DEF_DumbType(DumbInt,2,1)
456: DEF_DumbType(DumbInt,4,1)
457: DEF_DumbType(DumbInt,8,1)
458: DEF_DumbType(DumbInt,1,0)
459: DEF_DumbType(DumbInt,2,0)
460: DEF_DumbType(DumbInt,4,0)
461: DEF_DumbType(DumbInt,8,0)
463: #if !defined(PETSC_HAVE_MPI_TYPE_DUP)
464: PETSC_STATIC_INLINE int MPI_Type_dup(MPI_Datatype datatype,MPI_Datatype *newtype)
465: {
466: int ierr;
467: ierr = MPI_Type_contiguous(1,datatype,newtype); if (ierr) return ierr;
468: ierr = MPI_Type_commit(newtype); if (ierr) return ierr;
469: return MPI_SUCCESS;
470: }
471: #endif
473: /*
474: This routine creates a communication link for the given operation. It first looks up its link cache; if
475: there is a free & suitable one, it uses it, otherwise it creates a new one.
477: A link contains buffers and MPI requests for send/recv. It also contains pack/unpack routines to pack/unpack
478: root/leafdata to/from these buffers. Buffers are allocated at our discretion. When we find root/leafdata
479: can be directly passed to MPI, we do not allocate them. Even when we do allocate buffers, we only allocate
480: those needed by the given `sfop` and `op`; in other words, we do lazy memory allocation.
482: The routine also allocates host buffers when the data is on the GPU but gpu-aware MPI is not used.
484: In SFBasic, MPI requests are persistent. They are not initialized until we try to get requests from a link.
486: The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and need to
487: pack/unpack data.
488: */
489: PetscErrorCode PetscSFLinkCreate(PetscSF sf,MPI_Datatype unit,PetscMemType rootmtype,const void *rootdata,PetscMemType leafmtype,const void *leafdata,MPI_Op op,PetscSFOperation sfop,PetscSFLink *mylink)
490: {
491: PetscErrorCode ierr;
492: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
493: PetscInt i,j,k,nrootreqs,nleafreqs,nreqs;
494: PetscSFLink *p,link;
495: PetscSFDirection direction;
496: MPI_Request *reqs = NULL;
497: PetscBool match,rootdirect[2],leafdirect[2];
498: PetscMemType rootmtype_mpi,leafmtype_mpi; /* mtypes seen by MPI */
499: PetscInt rootdirect_mpi,leafdirect_mpi; /* root/leafdirect seen by MPI*/
502: PetscSFSetErrorOnUnsupportedOverlap(sf,unit,rootdata,leafdata);
504: /* Can we directly use root/leafdirect with the given sf, sfop and op? */
505: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
506: if (sfop == PETSCSF_BCAST) {
507: rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
508: leafdirect[i] = (sf->leafcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
509: } else if (sfop == PETSCSF_REDUCE) {
510: leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
511: rootdirect[i] = (bas->rootcontig[i] && op == MPIU_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
512: } else { /* PETSCSF_FETCH */
513: rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
514: leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
515: }
516: }
518: if (sf->use_gpu_aware_mpi) {
519: rootmtype_mpi = rootmtype;
520: leafmtype_mpi = leafmtype;
521: } else {
522: rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
523: }
524: /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on the host and then passed to MPI */
525: rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype)? 1 : 0;
526: leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype)? 1 : 0;
528: direction = (sfop == PETSCSF_BCAST)? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
529: nrootreqs = bas->nrootreqs;
530: nleafreqs = sf->nleafreqs;
532: /* Look for free links in cache */
533: for (p=&bas->avail; (link=*p); p=&link->next) {
534: MPIPetsc_Type_compare(unit,link->unit,&match);
535: if (match) {
536: /* If root/leafdata will be directly passed to MPI, test if the data used to initialize the MPI requests matches the current data.
537: If not, free the old requests. New requests will be lazily initialized when one calls PetscSFLinkGetMPIBuffersAndRequests().
538: */
539: if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
540: reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
541: for (i=0; i<nrootreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&reqs[i]);}}
542: link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
543: }
544: if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
545: reqs = link->leafreqs[direction][leafmtype][1];
546: for (i=0; i<nleafreqs; i++) {if (reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&reqs[i]);}}
547: link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
548: }
549: *p = link->next; /* Remove from available list */
550: goto found;
551: }
552: }
554: PetscNew(&link);
555: PetscSFLinkSetUp_Host(sf,link,unit);
556: PetscCommGetNewTag(PetscObjectComm((PetscObject)sf),&link->tag); /* One tag per link */
558: nreqs = (nrootreqs+nleafreqs)*8;
559: PetscMalloc1(nreqs,&link->reqs);
560: for (i=0; i<nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Initialized to NULL so that we know which need to be freed in Destroy */
562: for (i=0; i<2; i++) { /* Two communication directions */
563: for (j=0; j<2; j++) { /* Two memory types */
564: for (k=0; k<2; k++) { /* root/leafdirect 0 or 1 */
565: link->rootreqs[i][j][k] = link->reqs + nrootreqs*(4*i+2*j+k);
566: link->leafreqs[i][j][k] = link->reqs + nrootreqs*8 + nleafreqs*(4*i+2*j+k);
567: }
568: }
569: }
571: found:
572: if ((rootmtype == PETSC_MEMTYPE_DEVICE || leafmtype == PETSC_MEMTYPE_DEVICE) && !link->deviceinited) {PetscSFLinkSetUp_Device(sf,link,unit);}
574: /* Allocate buffers for root/leafdata */
575: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
576: /* For local communication, buffers are only needed when roots and leaves have different mtypes */
577: if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
578: if (bas->rootbuflen[i]) {
579: if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
580: link->rootbuf[i][rootmtype] = (char*)rootdata + bas->rootstart[i]*link->unitbytes;
581: } else { /* Have to have a separate rootbuf */
582: if (!link->rootbuf_alloc[i][rootmtype]) {
583: PetscMallocWithMemType(rootmtype,bas->rootbuflen[i]*link->unitbytes,(void**)&link->rootbuf_alloc[i][rootmtype]);
584: }
585: link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
586: }
587: }
589: if (sf->leafbuflen[i]) {
590: if (leafdirect[i]) {
591: link->leafbuf[i][leafmtype] = (char*)leafdata + sf->leafstart[i]*link->unitbytes;
592: } else {
593: if (!link->leafbuf_alloc[i][leafmtype]) {
594: PetscMallocWithMemType(leafmtype,sf->leafbuflen[i]*link->unitbytes,(void**)&link->leafbuf_alloc[i][leafmtype]);
595: }
596: link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
597: }
598: }
599: }
601: /* Allocate host buffers for buffering device data when gpu-aware MPI is not used */
602: if (rootmtype == PETSC_MEMTYPE_DEVICE && rootmtype_mpi == PETSC_MEMTYPE_HOST) {
603: if(!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
604: PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
605: }
606: link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
607: }
608: if (leafmtype == PETSC_MEMTYPE_DEVICE && leafmtype_mpi == PETSC_MEMTYPE_HOST) {
609: if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) {
610: PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes,&link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]);
611: }
612: link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
613: }
615: /* Set the `current` state of the link. These fields may change between different SF invocations that reuse the same link */
616: if (sf->persistent) { /* If data is directly passed to MPI and initializes MPI requests, record the data for comparison in future invocations */
617: if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
618: if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
619: }
621: link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
622: link->leafdata = leafdata;
623: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
624: link->rootdirect[i] = rootdirect[i];
625: link->leafdirect[i] = leafdirect[i];
626: }
627: link->rootdirect_mpi = rootdirect_mpi;
628: link->leafdirect_mpi = leafdirect_mpi;
629: link->rootmtype = rootmtype;
630: link->leafmtype = leafmtype;
631: link->rootmtype_mpi = rootmtype_mpi;
632: link->leafmtype_mpi = leafmtype_mpi;
634: link->next = bas->inuse;
635: bas->inuse = link;
636: *mylink = link;
637: return(0);
638: }
640: /* Return root/leaf buffers and MPI requests attached to the link for MPI communication in the given direction.
641: If the sf uses persistent requests and the requests have not been initialized, then initialize them.
642: */
643: PetscErrorCode PetscSFLinkGetMPIBuffersAndRequests(PetscSF sf,PetscSFLink link,PetscSFDirection direction,void **rootbuf, void **leafbuf,MPI_Request **rootreqs,MPI_Request **leafreqs)
644: {
645: PetscErrorCode ierr;
646: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
647: PetscInt i,j,nrootranks,ndrootranks,nleafranks,ndleafranks;
648: const PetscInt *rootoffset,*leafoffset;
649: PetscMPIInt n;
650: MPI_Aint disp;
651: MPI_Comm comm = PetscObjectComm((PetscObject)sf);
652: MPI_Datatype unit = link->unit;
653: const PetscMemType rootmtype_mpi = link->rootmtype_mpi,leafmtype_mpi = link->leafmtype_mpi; /* Used to select buffers passed to MPI */
654: const PetscInt rootdirect_mpi = link->rootdirect_mpi,leafdirect_mpi = link->leafdirect_mpi;
657: /* Initialize persistent MPI requests if not done yet. Currently only SFBasic uses persistent MPI requests */
658: if (sf->persistent) {
659: if (rootreqs && bas->rootbuflen[PETSCSF_REMOTE] && !link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi]) {
660: PetscSFGetRootInfo_Basic(sf,&nrootranks,&ndrootranks,NULL,&rootoffset,NULL);
661: if (direction == PETSCSF_LEAF2ROOT) {
662: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
663: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
664: PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
665: MPI_Recv_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
666: }
667: } else { /* PETSCSF_ROOT2LEAF */
668: for (i=ndrootranks,j=0; i<nrootranks; i++,j++) {
669: disp = (rootoffset[i] - rootoffset[ndrootranks])*link->unitbytes;
670: PetscMPIIntCast(rootoffset[i+1]-rootoffset[i],&n);
671: MPI_Send_init(link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi]+disp,n,unit,bas->iranks[i],link->tag,comm,link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi]+j);
672: }
673: }
674: link->rootreqsinited[direction][rootmtype_mpi][rootdirect_mpi] = PETSC_TRUE;
675: }
677: if (leafreqs && sf->leafbuflen[PETSCSF_REMOTE] && !link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi]) {
678: PetscSFGetLeafInfo_Basic(sf,&nleafranks,&ndleafranks,NULL,&leafoffset,NULL,NULL);
679: if (direction == PETSCSF_LEAF2ROOT) {
680: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
681: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
682: PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
683: MPI_Send_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
684: }
685: } else { /* PETSCSF_ROOT2LEAF */
686: for (i=ndleafranks,j=0; i<nleafranks; i++,j++) {
687: disp = (leafoffset[i] - leafoffset[ndleafranks])*link->unitbytes;
688: PetscMPIIntCast(leafoffset[i+1]-leafoffset[i],&n);
689: MPI_Recv_init(link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi]+disp,n,unit,sf->ranks[i],link->tag,comm,link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi]+j);
690: }
691: }
692: link->leafreqsinited[direction][leafmtype_mpi][leafdirect_mpi] = PETSC_TRUE;
693: }
694: }
695: if (rootbuf) *rootbuf = link->rootbuf[PETSCSF_REMOTE][rootmtype_mpi];
696: if (leafbuf) *leafbuf = link->leafbuf[PETSCSF_REMOTE][leafmtype_mpi];
697: if (rootreqs) *rootreqs = link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi];
698: if (leafreqs) *leafreqs = link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi];
699: return(0);
700: }
703: PetscErrorCode PetscSFLinkGetInUse(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata,PetscCopyMode cmode,PetscSFLink *mylink)
704: {
705: PetscErrorCode ierr;
706: PetscSFLink link,*p;
707: PetscSF_Basic *bas=(PetscSF_Basic*)sf->data;
710: /* Look for a link in use that matches the unit type and root/leafdata */
711: for (p=&bas->inuse; (link=*p); p=&link->next) {
712: PetscBool match;
713: MPIPetsc_Type_compare(unit,link->unit,&match);
714: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) {
715: switch (cmode) {
716: case PETSC_OWN_POINTER: *p = link->next; break; /* Remove from inuse list */
717: case PETSC_USE_POINTER: break;
718: default: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"invalid cmode");
719: }
720: *mylink = link;
721: return(0);
722: }
723: }
724: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Could not find pack");
725: return(0);
726: }
728: PetscErrorCode PetscSFLinkReclaim(PetscSF sf,PetscSFLink *link)
729: {
730: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
733: (*link)->rootdata = NULL;
734: (*link)->leafdata = NULL;
735: (*link)->next = bas->avail;
736: bas->avail = *link;
737: *link = NULL;
738: return(0);
739: }
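/* Editor's illustrative sketch (not part of the original file, excluded from compilation): the typical life
   cycle of a link, pieced together from the routines above. The wrapper name is hypothetical; the real
   callers are the SFBasic/SFNeighbor Begin/End routines, which also do the packing/unpacking,
   MPI_Startall/Waitall, and error checking. */
#if 0
static PetscErrorCode SFBcastLifeCycleSketch(PetscSF sf,MPI_Datatype unit,const void *rootdata,void *leafdata)
{
  PetscSFLink  link;
  void        *rootbuf,*leafbuf;
  MPI_Request *rootreqs,*leafreqs;

  /* Begin: look up or create a link, get buffers and (persistent) requests for root-to-leaf traffic */
  PetscSFLinkCreate(sf,unit,PETSC_MEMTYPE_HOST,rootdata,PETSC_MEMTYPE_HOST,leafdata,MPIU_REPLACE,PETSCSF_BCAST,&link);
  PetscSFLinkGetMPIBuffersAndRequests(sf,link,PETSCSF_ROOT2LEAF,&rootbuf,&leafbuf,&rootreqs,&leafreqs);
  /* ... pack rootdata into rootbuf (unless rootdirect), then MPI_Startall() on the requests ... */

  /* End: find the in-use link by (unit,rootdata,leafdata), wait for MPI, unpack leafbuf into leafdata,
     then return the link to the 'avail' cache for reuse */
  PetscSFLinkGetInUse(sf,unit,rootdata,leafdata,PETSC_OWN_POINTER,&link);
  /* ... MPI_Waitall(), then link->h_UnpackAndInsert() ... */
  PetscSFLinkReclaim(sf,&link);
  return(0);
}
#endif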
741: /* Destroy all links chained in 'avail' */
742: PetscErrorCode PetscSFLinkDestroy(PetscSF sf,PetscSFLink *avail)
743: {
744: PetscErrorCode ierr;
745: PetscSFLink link = *avail,next;
746: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
747: PetscInt i,nreqs = (bas->nrootreqs+sf->nleafreqs)*8;
750: for (; link; link=next) {
751: next = link->next;
752: if (!link->isbuiltin) {MPI_Type_free(&link->unit);}
753: for (i=0; i<nreqs; i++) { /* Persistent reqs must be freed. */
754: if (link->reqs[i] != MPI_REQUEST_NULL) {MPI_Request_free(&link->reqs[i]);}
755: }
756: PetscFree(link->reqs);
757: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
758: #if defined(PETSC_HAVE_CUDA)
759: PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->rootbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
760: PetscFreeWithMemType(PETSC_MEMTYPE_DEVICE,link->leafbuf_alloc[i][PETSC_MEMTYPE_DEVICE]);
761: if (link->stream) {cudaError_t err = cudaStreamDestroy(link->stream);CHKERRCUDA(err); link->stream = NULL;}
762: #endif
763: PetscFree(link->rootbuf_alloc[i][PETSC_MEMTYPE_HOST]);
764: PetscFree(link->leafbuf_alloc[i][PETSC_MEMTYPE_HOST]);
765: }
766: PetscFree(link);
767: }
768: *avail = NULL;
769: return(0);
770: }
772: #if defined(PETSC_USE_DEBUG)
773: /* Error out on unsupported overlapped communications */
774: PetscErrorCode PetscSFSetErrorOnUnsupportedOverlap(PetscSF sf,MPI_Datatype unit,const void *rootdata,const void *leafdata)
775: {
776: PetscErrorCode ierr;
777: PetscSFLink link,*p;
778: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
779: PetscBool match;
782: /* Look up links in use and error out if there is a match. When both rootdata and leafdata are NULL, ignore
783: the potential overlap, since this process does not participate in the communication and the overlap is harmless.
784: */
785: if (rootdata || leafdata) {
786: for (p=&bas->inuse; (link=*p); p=&link->next) {
787: MPIPetsc_Type_compare(unit,link->unit,&match);
788: if (match && (rootdata == link->rootdata) && (leafdata == link->leafdata)) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Overlapped PetscSF with the same rootdata(%p), leafdata(%p) and data type. Undo the overlapping to avoid the error.",rootdata,leafdata);
789: }
790: }
791: return(0);
792: }
793: #endif
795: PetscErrorCode PetscSFLinkSetUp_Host(PetscSF sf,PetscSFLink link,MPI_Datatype unit)
796: {
798: PetscInt nSignedChar=0,nUnsignedChar=0,nInt=0,nPetscInt=0,nPetscReal=0;
799: PetscBool is2Int,is2PetscInt;
800: PetscMPIInt ni,na,nd,combiner;
801: #if defined(PETSC_HAVE_COMPLEX)
802: PetscInt nPetscComplex=0;
803: #endif
806: MPIPetsc_Type_compare_contig(unit,MPI_SIGNED_CHAR, &nSignedChar);
807: MPIPetsc_Type_compare_contig(unit,MPI_UNSIGNED_CHAR,&nUnsignedChar);
808: /* MPI_CHAR is treated below as a dumb type that does not support reduction according to the MPI standard */
809: MPIPetsc_Type_compare_contig(unit,MPI_INT, &nInt);
810: MPIPetsc_Type_compare_contig(unit,MPIU_INT, &nPetscInt);
811: MPIPetsc_Type_compare_contig(unit,MPIU_REAL,&nPetscReal);
812: #if defined(PETSC_HAVE_COMPLEX)
813: MPIPetsc_Type_compare_contig(unit,MPIU_COMPLEX,&nPetscComplex);
814: #endif
815: MPIPetsc_Type_compare(unit,MPI_2INT,&is2Int);
816: MPIPetsc_Type_compare(unit,MPIU_2INT,&is2PetscInt);
817: /* TODO: shall we also handle Fortran MPI_2REAL? */
818: MPI_Type_get_envelope(unit,&ni,&na,&nd,&combiner);
819: link->isbuiltin = (combiner == MPI_COMBINER_NAMED) ? PETSC_TRUE : PETSC_FALSE; /* unit is MPI builtin */
820: link->bs = 1; /* default */
822: if (is2Int) {
823: PackInit_PairType_int_int_1_1(link);
824: link->bs = 1;
825: link->unitbytes = 2*sizeof(int);
826: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
827: link->basicunit = MPI_2INT;
828: link->unit = MPI_2INT;
829: } else if (is2PetscInt) { /* TODO: when is2PetscInt and nPetscInt=2, we don't know which path to take. The two paths support different ops. */
830: PackInit_PairType_PetscInt_PetscInt_1_1(link);
831: link->bs = 1;
832: link->unitbytes = 2*sizeof(PetscInt);
833: link->basicunit = MPIU_2INT;
834: link->isbuiltin = PETSC_TRUE; /* unit is PETSc builtin */
835: link->unit = MPIU_2INT;
836: } else if (nPetscReal) {
837: if (nPetscReal == 8) PackInit_RealType_PetscReal_8_1(link); else if (nPetscReal%8 == 0) PackInit_RealType_PetscReal_8_0(link);
838: else if (nPetscReal == 4) PackInit_RealType_PetscReal_4_1(link); else if (nPetscReal%4 == 0) PackInit_RealType_PetscReal_4_0(link);
839: else if (nPetscReal == 2) PackInit_RealType_PetscReal_2_1(link); else if (nPetscReal%2 == 0) PackInit_RealType_PetscReal_2_0(link);
840: else if (nPetscReal == 1) PackInit_RealType_PetscReal_1_1(link); else if (nPetscReal%1 == 0) PackInit_RealType_PetscReal_1_0(link);
841: link->bs = nPetscReal;
842: link->unitbytes = nPetscReal*sizeof(PetscReal);
843: link->basicunit = MPIU_REAL;
844: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_REAL;}
845: } else if (nPetscInt) {
846: if (nPetscInt == 8) PackInit_IntegerType_PetscInt_8_1(link); else if (nPetscInt%8 == 0) PackInit_IntegerType_PetscInt_8_0(link);
847: else if (nPetscInt == 4) PackInit_IntegerType_PetscInt_4_1(link); else if (nPetscInt%4 == 0) PackInit_IntegerType_PetscInt_4_0(link);
848: else if (nPetscInt == 2) PackInit_IntegerType_PetscInt_2_1(link); else if (nPetscInt%2 == 0) PackInit_IntegerType_PetscInt_2_0(link);
849: else if (nPetscInt == 1) PackInit_IntegerType_PetscInt_1_1(link); else if (nPetscInt%1 == 0) PackInit_IntegerType_PetscInt_1_0(link);
850: link->bs = nPetscInt;
851: link->unitbytes = nPetscInt*sizeof(PetscInt);
852: link->basicunit = MPIU_INT;
853: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_INT;}
854: #if defined(PETSC_USE_64BIT_INDICES)
855: } else if (nInt) {
856: if (nInt == 8) PackInit_IntegerType_int_8_1(link); else if (nInt%8 == 0) PackInit_IntegerType_int_8_0(link);
857: else if (nInt == 4) PackInit_IntegerType_int_4_1(link); else if (nInt%4 == 0) PackInit_IntegerType_int_4_0(link);
858: else if (nInt == 2) PackInit_IntegerType_int_2_1(link); else if (nInt%2 == 0) PackInit_IntegerType_int_2_0(link);
859: else if (nInt == 1) PackInit_IntegerType_int_1_1(link); else if (nInt%1 == 0) PackInit_IntegerType_int_1_0(link);
860: link->bs = nInt;
861: link->unitbytes = nInt*sizeof(int);
862: link->basicunit = MPI_INT;
863: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_INT;}
864: #endif
865: } else if (nSignedChar) {
866: if (nSignedChar == 8) PackInit_IntegerType_SignedChar_8_1(link); else if (nSignedChar%8 == 0) PackInit_IntegerType_SignedChar_8_0(link);
867: else if (nSignedChar == 4) PackInit_IntegerType_SignedChar_4_1(link); else if (nSignedChar%4 == 0) PackInit_IntegerType_SignedChar_4_0(link);
868: else if (nSignedChar == 2) PackInit_IntegerType_SignedChar_2_1(link); else if (nSignedChar%2 == 0) PackInit_IntegerType_SignedChar_2_0(link);
869: else if (nSignedChar == 1) PackInit_IntegerType_SignedChar_1_1(link); else if (nSignedChar%1 == 0) PackInit_IntegerType_SignedChar_1_0(link);
870: link->bs = nSignedChar;
871: link->unitbytes = nSignedChar*sizeof(SignedChar);
872: link->basicunit = MPI_SIGNED_CHAR;
873: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_SIGNED_CHAR;}
874: } else if (nUnsignedChar) {
875: if (nUnsignedChar == 8) PackInit_IntegerType_UnsignedChar_8_1(link); else if (nUnsignedChar%8 == 0) PackInit_IntegerType_UnsignedChar_8_0(link);
876: else if (nUnsignedChar == 4) PackInit_IntegerType_UnsignedChar_4_1(link); else if (nUnsignedChar%4 == 0) PackInit_IntegerType_UnsignedChar_4_0(link);
877: else if (nUnsignedChar == 2) PackInit_IntegerType_UnsignedChar_2_1(link); else if (nUnsignedChar%2 == 0) PackInit_IntegerType_UnsignedChar_2_0(link);
878: else if (nUnsignedChar == 1) PackInit_IntegerType_UnsignedChar_1_1(link); else if (nUnsignedChar%1 == 0) PackInit_IntegerType_UnsignedChar_1_0(link);
879: link->bs = nUnsignedChar;
880: link->unitbytes = nUnsignedChar*sizeof(UnsignedChar);
881: link->basicunit = MPI_UNSIGNED_CHAR;
882: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPI_UNSIGNED_CHAR;}
883: #if defined(PETSC_HAVE_COMPLEX)
884: } else if (nPetscComplex) {
885: if (nPetscComplex == 8) PackInit_ComplexType_PetscComplex_8_1(link); else if (nPetscComplex%8 == 0) PackInit_ComplexType_PetscComplex_8_0(link);
886: else if (nPetscComplex == 4) PackInit_ComplexType_PetscComplex_4_1(link); else if (nPetscComplex%4 == 0) PackInit_ComplexType_PetscComplex_4_0(link);
887: else if (nPetscComplex == 2) PackInit_ComplexType_PetscComplex_2_1(link); else if (nPetscComplex%2 == 0) PackInit_ComplexType_PetscComplex_2_0(link);
888: else if (nPetscComplex == 1) PackInit_ComplexType_PetscComplex_1_1(link); else if (nPetscComplex%1 == 0) PackInit_ComplexType_PetscComplex_1_0(link);
889: link->bs = nPetscComplex;
890: link->unitbytes = nPetscComplex*sizeof(PetscComplex);
891: link->basicunit = MPIU_COMPLEX;
892: if (link->bs == 1) {link->isbuiltin = PETSC_TRUE; link->unit = MPIU_COMPLEX;}
893: #endif
894: } else {
895: MPI_Aint lb,nbyte;
896: MPI_Type_get_extent(unit,&lb,&nbyte);
897: if (lb != 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Datatype with nonzero lower bound %ld\n",(long)lb);
898: if (nbyte % sizeof(int)) { /* If the type size is not a multiple of sizeof(int) */
899: if (nbyte == 4) PackInit_DumbType_char_4_1(link); else if (nbyte%4 == 0) PackInit_DumbType_char_4_0(link);
900: else if (nbyte == 2) PackInit_DumbType_char_2_1(link); else if (nbyte%2 == 0) PackInit_DumbType_char_2_0(link);
901: else if (nbyte == 1) PackInit_DumbType_char_1_1(link); else if (nbyte%1 == 0) PackInit_DumbType_char_1_0(link);
902: link->bs = nbyte;
903: link->unitbytes = nbyte;
904: link->basicunit = MPI_BYTE;
905: } else {
906: nInt = nbyte / sizeof(int);
907: if (nInt == 8) PackInit_DumbType_DumbInt_8_1(link); else if (nInt%8 == 0) PackInit_DumbType_DumbInt_8_0(link);
908: else if (nInt == 4) PackInit_DumbType_DumbInt_4_1(link); else if (nInt%4 == 0) PackInit_DumbType_DumbInt_4_0(link);
909: else if (nInt == 2) PackInit_DumbType_DumbInt_2_1(link); else if (nInt%2 == 0) PackInit_DumbType_DumbInt_2_0(link);
910: else if (nInt == 1) PackInit_DumbType_DumbInt_1_1(link); else if (nInt%1 == 0) PackInit_DumbType_DumbInt_1_0(link);
911: link->bs = nInt;
912: link->unitbytes = nbyte;
913: link->basicunit = MPI_INT;
914: }
915: if (link->isbuiltin) link->unit = unit;
916: }
918: if (!link->isbuiltin) {MPI_Type_dup(unit,&link->unit);}
919: return(0);
920: }
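/* Editor's illustrative sketch (not part of the original file, excluded from compilation): a concrete example
   of the BS/EQ selection above. The wrapper name and the 6-real unit are hypothetical. */
#if 0
static void SFLinkSetUpHostSketch(PetscSF sf,PetscSFLink link)
{
  MPI_Datatype unit;
  MPI_Type_contiguous(6,MPIU_REAL,&unit); /* a unit made of 6 contiguous PetscReals */
  MPI_Type_commit(&unit);
  PetscSFLinkSetUp_Host(sf,link,unit);
  /* PetscSFLinkSetUp_Host() finds nPetscReal = 6: not 8 or 4, but divisible by 2, so it calls
     PackInit_RealType_PetscReal_2_0() (BS=2, EQ=0) and sets link->bs = 6,
     link->unitbytes = 6*sizeof(PetscReal), link->basicunit = MPIU_REAL. Since the unit is not an MPI
     builtin, it is duplicated into link->unit with MPI_Type_dup(). */
}
#endif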
922: PetscErrorCode PetscSFLinkGetUnpackAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*))
923: {
925: *UnpackAndOp = NULL;
926: if (mtype == PETSC_MEMTYPE_HOST) {
927: if (op == MPIU_REPLACE) *UnpackAndOp = link->h_UnpackAndInsert;
928: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->h_UnpackAndAdd;
929: else if (op == MPI_PROD) *UnpackAndOp = link->h_UnpackAndMult;
930: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->h_UnpackAndMax;
931: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->h_UnpackAndMin;
932: else if (op == MPI_LAND) *UnpackAndOp = link->h_UnpackAndLAND;
933: else if (op == MPI_BAND) *UnpackAndOp = link->h_UnpackAndBAND;
934: else if (op == MPI_LOR) *UnpackAndOp = link->h_UnpackAndLOR;
935: else if (op == MPI_BOR) *UnpackAndOp = link->h_UnpackAndBOR;
936: else if (op == MPI_LXOR) *UnpackAndOp = link->h_UnpackAndLXOR;
937: else if (op == MPI_BXOR) *UnpackAndOp = link->h_UnpackAndBXOR;
938: else if (op == MPI_MAXLOC) *UnpackAndOp = link->h_UnpackAndMaxloc;
939: else if (op == MPI_MINLOC) *UnpackAndOp = link->h_UnpackAndMinloc;
940: }
941: #if defined(PETSC_HAVE_CUDA)
942: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
943: if (op == MPIU_REPLACE) *UnpackAndOp = link->d_UnpackAndInsert;
944: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->d_UnpackAndAdd;
945: else if (op == MPI_PROD) *UnpackAndOp = link->d_UnpackAndMult;
946: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->d_UnpackAndMax;
947: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->d_UnpackAndMin;
948: else if (op == MPI_LAND) *UnpackAndOp = link->d_UnpackAndLAND;
949: else if (op == MPI_BAND) *UnpackAndOp = link->d_UnpackAndBAND;
950: else if (op == MPI_LOR) *UnpackAndOp = link->d_UnpackAndLOR;
951: else if (op == MPI_BOR) *UnpackAndOp = link->d_UnpackAndBOR;
952: else if (op == MPI_LXOR) *UnpackAndOp = link->d_UnpackAndLXOR;
953: else if (op == MPI_BXOR) *UnpackAndOp = link->d_UnpackAndBXOR;
954: else if (op == MPI_MAXLOC) *UnpackAndOp = link->d_UnpackAndMaxloc;
955: else if (op == MPI_MINLOC) *UnpackAndOp = link->d_UnpackAndMinloc;
956: } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
957: if (op == MPIU_REPLACE) *UnpackAndOp = link->da_UnpackAndInsert;
958: else if (op == MPI_SUM || op == MPIU_SUM) *UnpackAndOp = link->da_UnpackAndAdd;
959: else if (op == MPI_PROD) *UnpackAndOp = link->da_UnpackAndMult;
960: else if (op == MPI_MAX || op == MPIU_MAX) *UnpackAndOp = link->da_UnpackAndMax;
961: else if (op == MPI_MIN || op == MPIU_MIN) *UnpackAndOp = link->da_UnpackAndMin;
962: else if (op == MPI_LAND) *UnpackAndOp = link->da_UnpackAndLAND;
963: else if (op == MPI_BAND) *UnpackAndOp = link->da_UnpackAndBAND;
964: else if (op == MPI_LOR) *UnpackAndOp = link->da_UnpackAndLOR;
965: else if (op == MPI_BOR) *UnpackAndOp = link->da_UnpackAndBOR;
966: else if (op == MPI_LXOR) *UnpackAndOp = link->da_UnpackAndLXOR;
967: else if (op == MPI_BXOR) *UnpackAndOp = link->da_UnpackAndBXOR;
968: else if (op == MPI_MAXLOC) *UnpackAndOp = link->da_UnpackAndMaxloc;
969: else if (op == MPI_MINLOC) *UnpackAndOp = link->da_UnpackAndMinloc;
970: }
971: #endif
972: return(0);
973: }
975: PetscErrorCode PetscSFLinkGetScatterAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*))
976: {
978: *ScatterAndOp = NULL;
979: if (mtype == PETSC_MEMTYPE_HOST) {
980: if (op == MPIU_REPLACE) *ScatterAndOp = link->h_ScatterAndInsert;
981: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->h_ScatterAndAdd;
982: else if (op == MPI_PROD) *ScatterAndOp = link->h_ScatterAndMult;
983: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->h_ScatterAndMax;
984: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->h_ScatterAndMin;
985: else if (op == MPI_LAND) *ScatterAndOp = link->h_ScatterAndLAND;
986: else if (op == MPI_BAND) *ScatterAndOp = link->h_ScatterAndBAND;
987: else if (op == MPI_LOR) *ScatterAndOp = link->h_ScatterAndLOR;
988: else if (op == MPI_BOR) *ScatterAndOp = link->h_ScatterAndBOR;
989: else if (op == MPI_LXOR) *ScatterAndOp = link->h_ScatterAndLXOR;
990: else if (op == MPI_BXOR) *ScatterAndOp = link->h_ScatterAndBXOR;
991: else if (op == MPI_MAXLOC) *ScatterAndOp = link->h_ScatterAndMaxloc;
992: else if (op == MPI_MINLOC) *ScatterAndOp = link->h_ScatterAndMinloc;
993: }
994: #if defined(PETSC_HAVE_CUDA)
995: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) {
996: if (op == MPIU_REPLACE) *ScatterAndOp = link->d_ScatterAndInsert;
997: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->d_ScatterAndAdd;
998: else if (op == MPI_PROD) *ScatterAndOp = link->d_ScatterAndMult;
999: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->d_ScatterAndMax;
1000: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->d_ScatterAndMin;
1001: else if (op == MPI_LAND) *ScatterAndOp = link->d_ScatterAndLAND;
1002: else if (op == MPI_BAND) *ScatterAndOp = link->d_ScatterAndBAND;
1003: else if (op == MPI_LOR) *ScatterAndOp = link->d_ScatterAndLOR;
1004: else if (op == MPI_BOR) *ScatterAndOp = link->d_ScatterAndBOR;
1005: else if (op == MPI_LXOR) *ScatterAndOp = link->d_ScatterAndLXOR;
1006: else if (op == MPI_BXOR) *ScatterAndOp = link->d_ScatterAndBXOR;
1007: else if (op == MPI_MAXLOC) *ScatterAndOp = link->d_ScatterAndMaxloc;
1008: else if (op == MPI_MINLOC) *ScatterAndOp = link->d_ScatterAndMinloc;
1009: } else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) {
1010: if (op == MPIU_REPLACE) *ScatterAndOp = link->da_ScatterAndInsert;
1011: else if (op == MPI_SUM || op == MPIU_SUM) *ScatterAndOp = link->da_ScatterAndAdd;
1012: else if (op == MPI_PROD) *ScatterAndOp = link->da_ScatterAndMult;
1013: else if (op == MPI_MAX || op == MPIU_MAX) *ScatterAndOp = link->da_ScatterAndMax;
1014: else if (op == MPI_MIN || op == MPIU_MIN) *ScatterAndOp = link->da_ScatterAndMin;
1015: else if (op == MPI_LAND) *ScatterAndOp = link->da_ScatterAndLAND;
1016: else if (op == MPI_BAND) *ScatterAndOp = link->da_ScatterAndBAND;
1017: else if (op == MPI_LOR) *ScatterAndOp = link->da_ScatterAndLOR;
1018: else if (op == MPI_BOR) *ScatterAndOp = link->da_ScatterAndBOR;
1019: else if (op == MPI_LXOR) *ScatterAndOp = link->da_ScatterAndLXOR;
1020: else if (op == MPI_BXOR) *ScatterAndOp = link->da_ScatterAndBXOR;
1021: else if (op == MPI_MAXLOC) *ScatterAndOp = link->da_ScatterAndMaxloc;
1022: else if (op == MPI_MINLOC) *ScatterAndOp = link->da_ScatterAndMinloc;
1023: }
1024: #endif
1025: return(0);
1026: }
1028: PetscErrorCode PetscSFLinkGetFetchAndOp(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*))
1029: {
1031: *FetchAndOp = NULL;
1032: if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1033: if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOp = link->h_FetchAndAdd;
1034: #if defined(PETSC_HAVE_CUDA)
1035: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOp = link->d_FetchAndAdd;
1036: else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOp = link->da_FetchAndAdd;
1037: #endif
1038: return(0);
1039: }
1041: PetscErrorCode PetscSFLinkGetFetchAndOpLocal(PetscSFLink link,PetscMemType mtype,MPI_Op op,PetscBool atomic,PetscErrorCode (**FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*))
1042: {
1044: *FetchAndOpLocal = NULL;
1045: if (op != MPI_SUM && op != MPIU_SUM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for MPI_Op in FetchAndOp");
1046: if (mtype == PETSC_MEMTYPE_HOST) *FetchAndOpLocal = link->h_FetchAndAddLocal;
1047: #if defined(PETSC_HAVE_CUDA)
1048: else if (mtype == PETSC_MEMTYPE_DEVICE && !atomic) *FetchAndOpLocal = link->d_FetchAndAddLocal;
1049: else if (mtype == PETSC_MEMTYPE_DEVICE && atomic) *FetchAndOpLocal = link->da_FetchAndAddLocal;
1050: #endif
1051: return(0);
1052: }
1054: /*=============================================================================
1055: A set of helper routines for Pack/Unpack/Scatter on GPUs
1056: ============================================================================*/
1057: #if defined(PETSC_HAVE_CUDA)
1058: /* If SF does not know which stream root/leafdata is being computed on, it has to sync the device to
1059: make sure the data is ready for packing.
1060: */
1061: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncDeviceBeforePackData(PetscSF sf,PetscSFLink link)
1062: {
1064: if (sf->use_default_stream) return(0);
1065: if (link->rootmtype == PETSC_MEMTYPE_DEVICE || link->leafmtype == PETSC_MEMTYPE_DEVICE) {
1066: cudaError_t cerr = cudaDeviceSynchronize();CHKERRCUDA(cerr);
1067: }
1068: return(0);
1069: }
1071: /* PetscSFLinkSyncStreamAfterPackXxxData routines make sure the remote root/leaf buffers are ready for MPI */
1072: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackRootData(PetscSF sf,PetscSFLink link)
1073: {
1074: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1077: /* Do nothing if we use stream-aware MPI or there is nothing for the remote */
1078: if (sf->use_stream_aware_mpi || link->rootmtype != PETSC_MEMTYPE_DEVICE || !bas->rootbuflen[PETSCSF_REMOTE]) return(0);
1079: /* Sync if we called a packing kernel, or we async-copied rootdata from device to host, or no cudaDeviceSynchronize was called (since the default stream is assumed) */
1080: if (!link->rootdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1081: cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1082: }
1083: return(0);
1084: }
1085: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterPackLeafData(PetscSF sf,PetscSFLink link)
1086: {
1088: /* See comments above */
1089: if (sf->use_stream_aware_mpi || link->leafmtype != PETSC_MEMTYPE_DEVICE || !sf->leafbuflen[PETSCSF_REMOTE]) return(0);
1090: if (!link->leafdirect[PETSCSF_REMOTE] || !sf->use_gpu_aware_mpi || sf->use_default_stream) {
1091: cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1092: }
1093: return(0);
1094: }
1096: /* PetscSFLinkSyncStreamAfterUnpackXxx routines make sure root/leafdata (local & remote) is ready to use for SF callers, when SF
1097: does not know which stream the callers will use.
1098: */
1099: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackRootData(PetscSF sf,PetscSFLink link)
1100: {
1101: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1104: /* Do nothing if we are expected to put rootdata on default stream */
1105: if (sf->use_default_stream || link->rootmtype != PETSC_MEMTYPE_DEVICE) return(0);
1106: /* If we have something local, then we called a scatter kernel (on link->stream) and must sync it;
1107: if we have something remote and we called an unpack kernel, then we must also sync it.
1108: */
1109: if (bas->rootbuflen[PETSCSF_LOCAL] || (bas->rootbuflen[PETSCSF_REMOTE] && !link->rootdirect[PETSCSF_REMOTE])) {
1110: cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1111: }
1112: return(0);
1113: }
1114: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkSyncStreamAfterUnpackLeafData(PetscSF sf,PetscSFLink link)
1115: {
1117: /* See comments above */
1118: if (sf->use_default_stream || link->leafmtype != PETSC_MEMTYPE_DEVICE) return(0);
1119: if (sf->leafbuflen[PETSCSF_LOCAL] || (sf->leafbuflen[PETSCSF_REMOTE] && !link->leafdirect[PETSCSF_REMOTE])) {
1120: cudaError_t cerr = cudaStreamSynchronize(link->stream);CHKERRCUDA(cerr);
1121: }
1122: return(0);
1123: }
1125: /* PetscSFLinkCopyXxxxBufferInCaseNotUseGpuAwareMPI routines are simple: if gpu-aware MPI is not used, we need
1126: to copy the buffer from GPU to CPU before MPI calls, and from CPU to GPU after MPI calls.
1127: */
1128: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1129: {
1131: cudaError_t cerr;
1132: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1135: if (link->rootmtype == PETSC_MEMTYPE_DEVICE && (link->rootmtype_mpi != link->rootmtype) && bas->rootbuflen[PETSCSF_REMOTE]) {
1136: void *h_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1137: void *d_buf = link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1138: size_t count = bas->rootbuflen[PETSCSF_REMOTE]*link->unitbytes;
1139: if (device2host) {
1140: cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1141: PetscLogGpuToCpu(count);
1142: } else {
1143: cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1144: PetscLogCpuToGpu(count);
1145: }
1146: }
1147: return(0);
1148: }
1150: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(PetscSF sf,PetscSFLink link,PetscBool device2host)
1151: {
1153: cudaError_t cerr;
1156: if (link->leafmtype == PETSC_MEMTYPE_DEVICE && (link->leafmtype_mpi != link->leafmtype) && sf->leafbuflen[PETSCSF_REMOTE]) {
1157: void *h_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
1158: void *d_buf = link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_DEVICE];
1159: size_t count = sf->leafbuflen[PETSCSF_REMOTE]*link->unitbytes;
1160: if (device2host) {
1161: cerr = cudaMemcpyAsync(h_buf,d_buf,count,cudaMemcpyDeviceToHost,link->stream);CHKERRCUDA(cerr);
1162: PetscLogGpuToCpu(count);
1163: } else {
1164: cerr = cudaMemcpyAsync(d_buf,h_buf,count,cudaMemcpyHostToDevice,link->stream);CHKERRCUDA(cerr);
1165: PetscLogCpuToGpu(count);
1166: }
1167: }
1168: return(0);
1169: }
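/* Standalone sketch (hypothetical, assuming plain MPI without GPU-aware support): the staging pattern
   the two routines above implement. The device buffer is copied to a host buffer before MPI touches
   it, and the host receive buffer is copied back to the device after MPI has filled it. */
#if 0
#include <mpi.h>
#include <cuda_runtime.h>
static void StagedExchange(double *d_sendbuf,double *d_recvbuf,int n,int peer,cudaStream_t s)
{
  double *h_sendbuf,*h_recvbuf;
  cudaMallocHost((void**)&h_sendbuf,n*sizeof(double));
  cudaMallocHost((void**)&h_recvbuf,n*sizeof(double));
  cudaMemcpyAsync(h_sendbuf,d_sendbuf,n*sizeof(double),cudaMemcpyDeviceToHost,s);
  cudaStreamSynchronize(s); /* the host buffer must be complete before MPI reads it */
  MPI_Sendrecv(h_sendbuf,n,MPI_DOUBLE,peer,0,h_recvbuf,n,MPI_DOUBLE,peer,0,MPI_COMM_WORLD,MPI_STATUS_IGNORE);
  cudaMemcpyAsync(d_recvbuf,h_recvbuf,n*sizeof(double),cudaMemcpyHostToDevice,s);
  cudaStreamSynchronize(s);
  cudaFreeHost(h_sendbuf);
  cudaFreeHost(h_recvbuf);
}
#endif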
1170: #else
1172: #define PetscSFLinkSyncDeviceBeforePackData(a,b) 0
1173: #define PetscSFLinkSyncStreamAfterPackRootData(a,b) 0
1174: #define PetscSFLinkSyncStreamAfterPackLeafData(a,b) 0
1175: #define PetscSFLinkSyncStreamAfterUnpackRootData(a,b) 0
1176: #define PetscSFLinkSyncStreamAfterUnpackLeafData(a,b) 0
1177: #define PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1178: #define PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(a,b,c) 0
1180: #endif
1182: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1183: {
1185: PetscLogDouble flops;
1186: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1190: if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1191: flops = bas->rootbuflen[scope]*link->bs; /* # of roots in buffer x # of scalars in unit */
1192: #if defined(PETSC_HAVE_CUDA)
1193: if (link->rootmtype == PETSC_MEMTYPE_DEVICE) {PetscLogGpuFlops(flops);} else
1194: #endif
1195: {PetscLogFlops(flops);}
1196: }
1197: return(0);
1198: }
1200: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkLogFlopsAfterUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,MPI_Op op)
1201: {
1202: PetscLogDouble flops;
1206: if (op != MPIU_REPLACE && link->basicunit == MPIU_SCALAR) { /* op is a reduction on PetscScalars */
1207: flops = sf->leafbuflen[scope]*link->bs; /* # of leaves in buffer x # of scalars in unit */
1208: #if defined(PETSC_HAVE_CUDA)
1209: if (link->leafmtype == PETSC_MEMTYPE_DEVICE) {PetscLogGpuFlops(flops);} else
1210: #endif
1211: {PetscLogFlops(flops);}
1212: }
1213: return(0);
1214: }
1216: /* When SF cannot find a proper UnpackAndOp() from the link, it falls back to MPI_Reduce_local.
1217: Input Arguments:
1218: +sf - The StarForest
1219: .link - The link
1220: .count - Number of entries to unpack
1221: .start - The first index, significant when indices=NULL
1222: .indices - Indices of entries in <data>. If NULL, it means indices are contiguous and the first is given in <start>
1223: .buf - A contiguous buffer to unpack from
1224: -op - Operation after unpack
1226: Output Arguments:
1227: .data - The data to unpack to
1228: */
1229: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkUnpackDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt start,const PetscInt *indices,void *data,const void *buf,MPI_Op op)
1230: {
1232: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1233: {
1235: PetscInt i;
1236: PetscMPIInt n;
1237: if (indices) {
1238: /* Note we use link->unit instead of link->basicunit. When op can be mapped to MPI_SUM etc., it operates
1239: element-wise on the basic units of a root/leaf. Otherwise, it is meant to operate on a whole root/leaf.
1240: */
1241: for (i=0; i<count; i++) {MPI_Reduce_local((const char*)buf+i*link->unitbytes,(char*)data+indices[i]*link->unitbytes,1,link->unit,op);}
1242: } else {
1243: PetscMPIIntCast(count,&n);
1244: MPI_Reduce_local(buf,(char*)data+start*link->unitbytes,n,link->unit,op);
1245: }
1246: }
1247: #else
1248: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1249: #endif
1250: return(0);
1251: }
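/* Small self-contained sketch (illustrative, not PETSc code) of MPI_Reduce_local, which the fallback
   above relies on: it computes inoutbuf[i] = op(inbuf[i],inoutbuf[i]) locally with no communication,
   i.e. an "unpack with op" applied one unit at a time. */
#if 0
#include <mpi.h>
#include <stdio.h>
int main(int argc,char **argv)
{
  double in[3]    = {1.0,2.0,3.0};
  double inout[3] = {10.0,20.0,30.0};
  MPI_Init(&argc,&argv);
  MPI_Reduce_local(in,inout,3,MPI_DOUBLE,MPI_SUM); /* inout becomes {11,22,33} */
  printf("%g %g %g\n",inout[0],inout[1],inout[2]);
  MPI_Finalize();
  return 0;
}
#endif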
1253: PETSC_STATIC_INLINE PetscErrorCode PetscSFLinkScatterDataWithMPIReduceLocal(PetscSF sf,PetscSFLink link,PetscInt count,PetscInt srcStart,const PetscInt *srcIdx,const void *src,PetscInt dstStart,const PetscInt *dstIdx,void *dst,MPI_Op op)
1254: {
1256: #if defined(PETSC_HAVE_MPI_REDUCE_LOCAL)
1257: {
1259: PetscInt i,disp;
1260: if (!srcIdx) {
1261: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,dstStart,dstIdx,dst,(const char*)src+srcStart*link->unitbytes,op);
1262: } else {
1263: for (i=0; i<count; i++) {
1264: disp = dstIdx? dstIdx[i] : dstStart + i;
1265: MPI_Reduce_local((const char*)src+srcIdx[i]*link->unitbytes,(char*)dst+disp*link->unitbytes,1,link->unit,op);
1266: }
1267: }
1268: }
1269: #else
1270: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No unpacking reduction operation for this MPI_Op");
1271: #endif
1272: return(0);
1273: }
1275: /*=============================================================================
1276: Pack/Unpack/Fetch/Scatter routines
1277: ============================================================================*/
1279: /* Pack rootdata to rootbuf
1280: Input Arguments:
1281: + sf - The SF this packing works on.
1282: . link - It gives the memtype of the roots and also provides root buffer.
1283: . scope - PETSCSF_LOCAL or PETSCSF_REMOTE. Note SF has the ability to do local and remote communications separately.
1284: - rootdata - Where to read the roots.
1286: Notes:
1287: When rootdata can be used directly as the root buffer, the routine is almost a no-op. After the call, the root data is
1288: in a place the underlying MPI can access (whether use_gpu_aware_mpi is set or not).
1289: */
1290: PetscErrorCode PetscSFLinkPackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *rootdata)
1291: {
1292: PetscErrorCode ierr;
1293: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1294: const PetscInt *rootindices = NULL;
1295: PetscInt count,start;
1296: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1297: PetscMemType rootmtype = link->rootmtype;
1298: PetscSFPackOpt opt = NULL;
1302: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1303: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncDeviceBeforePackData(sf,link);}
1304: if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip packing */
1305: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1306: PetscSFLinkGetPack(link,rootmtype,&Pack);
1307: (*Pack)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1308: }
1309: if (scope == PETSCSF_REMOTE) {
1310: PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);
1311: PetscSFLinkSyncStreamAfterPackRootData(sf,link);
1312: }
1313: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1314: return(0);
1315: }
1317: /* Pack leafdata to leafbuf */
1318: PetscErrorCode PetscSFLinkPackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,const void *leafdata)
1319: {
1320: PetscErrorCode ierr;
1321: const PetscInt *leafindices = NULL;
1322: PetscInt count,start;
1323: PetscErrorCode (*Pack)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1324: PetscMemType leafmtype = link->leafmtype;
1325: PetscSFPackOpt opt = NULL;
1328: PetscLogEventBegin(PETSCSF_Pack,sf,0,0,0);
1329: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncDeviceBeforePackData(sf,link);}
1330: if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip packing */
1331: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1332: PetscSFLinkGetPack(link,leafmtype,&Pack);
1333: (*Pack)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1334: }
1335: if (scope == PETSCSF_REMOTE) {
1336: PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE/*device2host*/);
1337: PetscSFLinkSyncStreamAfterPackLeafData(sf,link);
1338: }
1339: PetscLogEventEnd(PETSCSF_Pack,sf,0,0,0);
1340: return(0);
1341: }
1343: /* Unpack rootbuf to rootdata */
1344: PetscErrorCode PetscSFLinkUnpackRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1345: {
1346: PetscErrorCode ierr;
1347: const PetscInt *rootindices = NULL;
1348: PetscInt count,start;
1349: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1350: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1351: PetscMemType rootmtype = link->rootmtype;
1352: PetscSFPackOpt opt = NULL;
1355: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1356: if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1357: if (!link->rootdirect[scope] && bas->rootbuflen[scope]) { /* If rootdata works directly as rootbuf, skip unpacking */
1358: PetscSFLinkGetUnpackAndOp(link,rootmtype,op,bas->rootdups[scope],&UnpackAndOp);
1359: if (UnpackAndOp) {
1360: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1361: (*UnpackAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1362: } else {
1363: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&rootindices);
1364: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,rootindices,rootdata,link->rootbuf[scope][rootmtype],op);
1365: }
1366: }
1367: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);}
1368: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1369: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1370: return(0);
1371: }
1373: /* Unpack leafbuf to leafdata */
1374: PetscErrorCode PetscSFLinkUnpackLeafData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *leafdata,MPI_Op op)
1375: {
1376: PetscErrorCode ierr;
1377: const PetscInt *leafindices = NULL;
1378: PetscInt count,start;
1379: PetscErrorCode (*UnpackAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,const void*) = NULL;
1380: PetscMemType leafmtype = link->leafmtype;
1381: PetscSFPackOpt opt = NULL;
1384: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1385: if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1386: if (!link->leafdirect[scope] && sf->leafbuflen[scope]) { /* If leafdata works directly as leafbuf, skip unpacking */
1387: PetscSFLinkGetUnpackAndOp(link,leafmtype,op,sf->leafdups[scope],&UnpackAndOp);
1388: if (UnpackAndOp) {
1389: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,scope,&count,&start,&opt,&leafindices);
1390: (*UnpackAndOp)(link,count,start,opt,leafindices,leafdata,link->leafbuf[scope][leafmtype]);
1391: } else {
1392: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,scope,&count,&start,&opt,&leafindices);
1393: PetscSFLinkUnpackDataWithMPIReduceLocal(sf,link,count,start,leafindices,leafdata,link->leafbuf[scope][leafmtype],op);
1394: }
1395: }
1396: if (scope == PETSCSF_REMOTE) {PetscSFLinkSyncStreamAfterUnpackLeafData(sf,link);}
1397: PetscSFLinkLogFlopsAfterUnpackLeafData(sf,link,scope,op);
1398: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1399: return(0);
1400: }
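/* Self-contained usage sketch (an assumption about typical callers, not code from this file): a small
   PetscSFBcast whose implementation eventually lands in PetscSFLinkPackRootData() and
   PetscSFLinkUnpackLeafData() above. Error checking is omitted for brevity; run with two or more MPI ranks. */
#if 0
#include <petscsf.h>
int main(int argc,char **argv)
{
  PetscSF     sf;
  PetscSFNode remote[2];
  PetscInt    rootdata[2],leafdata[2];
  PetscMPIInt rank;

  PetscInitialize(&argc,&argv,NULL,NULL);
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
  remote[0].rank = 0; remote[0].index = 0; /* every rank's two leaves point at rank 0's two roots */
  remote[1].rank = 0; remote[1].index = 1;
  PetscSFCreate(PETSC_COMM_WORLD,&sf);
  PetscSFSetGraph(sf,2,2,NULL,PETSC_COPY_VALUES,remote,PETSC_COPY_VALUES);
  rootdata[0] = 10*rank; rootdata[1] = 10*rank+1;
  PetscSFBcastBegin(sf,MPIU_INT,rootdata,leafdata); /* packs rootdata into rootbuf and starts MPI */
  PetscSFBcastEnd(sf,MPIU_INT,rootdata,leafdata);   /* finishes MPI and unpacks leafbuf into leafdata */
  PetscSFDestroy(&sf);
  PetscFinalize();
  return 0;
}
#endif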
1402: /* FetchAndOp rootdata with rootbuf */
1403: PetscErrorCode PetscSFLinkFetchRootData(PetscSF sf,PetscSFLink link,PetscSFScope scope,void *rootdata,MPI_Op op)
1404: {
1405: PetscErrorCode ierr;
1406: const PetscInt *rootindices = NULL;
1407: PetscInt count,start;
1408: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1409: PetscErrorCode (*FetchAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,void*) = NULL;
1410: PetscMemType rootmtype = link->rootmtype;
1411: PetscSFPackOpt opt = NULL;
1414: PetscLogEventBegin(PETSCSF_Unpack,sf,0,0,0);
1415: if (scope == PETSCSF_REMOTE) {PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_FALSE);}
1416: if (bas->rootbuflen[scope]) {
1417: /* Do FetchAndOp on rootdata with rootbuf */
1418: PetscSFLinkGetFetchAndOp(link,rootmtype,op,bas->rootdups[scope],&FetchAndOp);
1419: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,scope,&count,&start,&opt,&rootindices);
1420: (*FetchAndOp)(link,count,start,opt,rootindices,rootdata,link->rootbuf[scope][rootmtype]);
1421: }
1422: if (scope == PETSCSF_REMOTE) {
1423: PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf,link,PETSC_TRUE);
1424: PetscSFLinkSyncStreamAfterUnpackRootData(sf,link);
1425: }
1426: PetscSFLinkLogFlopsAfterUnpackRootData(sf,link,scope,op);
1427: PetscLogEventEnd(PETSCSF_Unpack,sf,0,0,0);
1428: return(0);
1429: }
1431: /* Bcast rootdata to leafdata locally (i.e., only for local communication - PETSCSF_LOCAL) */
1432: PetscErrorCode PetscSFLinkBcastAndOpLocal(PetscSF sf,PetscSFLink link,const void *rootdata,void *leafdata,MPI_Op op)
1433: {
1434: PetscErrorCode ierr;
1435: const PetscInt *rootindices = NULL,*leafindices = NULL;
1436: PetscInt count,rootstart,leafstart;
1437: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1438: PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1439: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1440: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1443: if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1444: if (rootmtype != leafmtype) { /* Uncommon case */
1445: /* The local communication has to go through pack and unpack */
1446: PetscSFLinkPackRootData(sf,link,PETSCSF_LOCAL,rootdata);
1447: PetscSFLinkMemcpy(sf,link,leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],sf->leafbuflen[PETSCSF_LOCAL]*link->unitbytes);
1448: PetscSFLinkUnpackLeafData(sf,link,PETSCSF_LOCAL,leafdata,op);
1449: } else {
1450: PetscSFLinkGetScatterAndOp(link,leafmtype,op,sf->leafdups[PETSCSF_LOCAL],&ScatterAndOp);
1451: if (ScatterAndOp) {
1452: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1453: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1454: (*ScatterAndOp)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata);
1455: } else {
1456: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1457: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1458: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,rootstart,rootindices,rootdata,leafstart,leafindices,leafdata,op);
1459: }
1460: }
1461: return(0);
1462: }
1464: /* Reduce leafdata to rootdata locally */
1465: PetscErrorCode PetscSFLinkReduceLocal(PetscSF sf,PetscSFLink link,const void *leafdata,void *rootdata,MPI_Op op)
1466: {
1467: PetscErrorCode ierr;
1468: const PetscInt *rootindices = NULL,*leafindices = NULL;
1469: PetscInt count,rootstart,leafstart;
1470: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1471: PetscErrorCode (*ScatterAndOp)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,PetscInt,PetscSFPackOpt,const PetscInt*,void*) = NULL;
1472: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1473: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1476: if (!sf->leafbuflen[PETSCSF_LOCAL]) return(0);
1477: if (rootmtype != leafmtype) {
1478: /* The local communication has to go through pack and unpack */
1479: PetscSFLinkPackLeafData(sf,link,PETSCSF_LOCAL,leafdata);
1480: PetscSFLinkMemcpy(sf,link,rootmtype,link->rootbuf[PETSCSF_LOCAL][rootmtype],leafmtype,link->leafbuf[PETSCSF_LOCAL][leafmtype],bas->rootbuflen[PETSCSF_LOCAL]*link->unitbytes);
1481: PetscSFLinkUnpackRootData(sf,link,PETSCSF_LOCAL,rootdata,op);
1482: } else {
1483: PetscSFLinkGetScatterAndOp(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&ScatterAndOp);
1484: if (ScatterAndOp) {
1485: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1486: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1487: (*ScatterAndOp)(link,count,leafstart,leafopt,leafindices,leafdata,rootstart,rootopt,rootindices,rootdata);
1488: } else {
1489: PetscSFLinkGetRootPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1490: PetscSFLinkGetLeafPackOptAndIndices(sf,link,PETSC_MEMTYPE_HOST,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1491: PetscSFLinkScatterDataWithMPIReduceLocal(sf,link,count,leafstart,leafindices,leafdata,rootstart,rootindices,rootdata,op);
1492: }
1493: }
1494: return(0);
1495: }
1497: /* Fetch rootdata into leafupdate and reduce leafdata into rootdata, locally */
1498: PetscErrorCode PetscSFLinkFetchAndOpLocal(PetscSF sf,PetscSFLink link,void *rootdata,const void *leafdata,void *leafupdate,MPI_Op op)
1499: {
1500: PetscErrorCode ierr;
1501: const PetscInt *rootindices = NULL,*leafindices = NULL;
1502: PetscInt count,rootstart,leafstart;
1503: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1504: PetscErrorCode (*FetchAndOpLocal)(PetscSFLink,PetscInt,PetscInt,PetscSFPackOpt,const PetscInt*,void*,PetscInt,PetscSFPackOpt,const PetscInt*,const void*,void*) = NULL;
1505: const PetscMemType rootmtype = link->rootmtype,leafmtype = link->leafmtype;
1506: PetscSFPackOpt leafopt = NULL,rootopt = NULL;
1509: if (!bas->rootbuflen[PETSCSF_LOCAL]) return(0);
1510: if (rootmtype != leafmtype) {
1511: /* The local communication has to go through pack and unpack */
1512: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Doing PetscSFFetchAndOp with rootdata and leafdata on opposite side of CPU and GPU");
1513: } else {
1514: PetscSFLinkGetRootPackOptAndIndices(sf,link,rootmtype,PETSCSF_LOCAL,&count,&rootstart,&rootopt,&rootindices);
1515: PetscSFLinkGetLeafPackOptAndIndices(sf,link,leafmtype,PETSCSF_LOCAL,&count,&leafstart,&leafopt,&leafindices);
1516: PetscSFLinkGetFetchAndOpLocal(link,rootmtype,op,bas->rootdups[PETSCSF_LOCAL],&FetchAndOpLocal);
1517: (*FetchAndOpLocal)(link,count,rootstart,rootopt,rootindices,rootdata,leafstart,leafopt,leafindices,leafdata,leafupdate);
1518: }
1519: return(0);
1520: }
1522: /*
1523: Create per-rank pack/unpack optimizations based on index patterns
1525: Input Parameters:
1526: + n - Number of destination ranks
1527: . offset - [n+1] For the i-th rank, its associated indices are idx[offset[i], offset[i+1]). offset[0] need not be 0.
1528: - idx - [*] Array storing indices
1530: Output Parameters:
1531: . out - Pack optimizations. NULL if no optimization is applicable.
1532: */
1533: PetscErrorCode PetscSFCreatePackOpt(PetscInt n,const PetscInt *offset,const PetscInt *idx,PetscSFPackOpt *out)
1534: {
1536: PetscInt r,p,start,i,j,k,dx,dy,dz,dydz,m,X,Y;
1537: PetscBool optimizable = PETSC_TRUE;
1538: PetscSFPackOpt opt;
1541: PetscMalloc1(1,&opt);
1542: PetscMalloc1(7*n+2,&opt->array);
1543: opt->n = opt->array[0] = n;
1544: opt->offset = opt->array + 1;
1545: opt->start = opt->array + n + 2;
1546: opt->dx = opt->array + 2*n + 2;
1547: opt->dy = opt->array + 3*n + 2;
1548: opt->dz = opt->array + 4*n + 2;
1549: opt->X = opt->array + 5*n + 2;
1550: opt->Y = opt->array + 6*n + 2;
1552: for (r=0; r<n; r++) { /* For each destination rank */
1553: m = offset[r+1] - offset[r]; /* Total number of indices for this rank. We want to see if m can be factored into dx*dy*dz */
1554: p = offset[r];
1555: start = idx[p]; /* First index for this rank */
1556: p++;
1558: /* Search in X dimension */
1559: for (dx=1; dx<m; dx++,p++) {
1560: if (start+dx != idx[p]) break;
1561: }
1563: dydz = m/dx;
1564: X = dydz > 1 ? (idx[p]-start) : dx;
1565: /* Not optimizable if m is not a multiple of dx, or some unrecognized pattern is found */
1566: if (m%dx || X <= 0) {optimizable = PETSC_FALSE; goto finish;}
1567: for (dy=1; dy<dydz; dy++) { /* Search in Y dimension */
1568: for (i=0; i<dx; i++,p++) {
1569: if (start+X*dy+i != idx[p]) {
1570: if (i) {optimizable = PETSC_FALSE; goto finish;} /* The pattern is violated in the middle of an x-walk */
1571: else goto Z_dimension;
1572: }
1573: }
1574: }
1576: Z_dimension:
1577: dz = m/(dx*dy);
1578: Y = dz > 1 ? (idx[p]-start)/X : dy;
1579: /* Not optimizable if m is not a multiple of dx*dy, or some unrecognized pattern is found */
1580: if (m%(dx*dy) || Y <= 0) {optimizable = PETSC_FALSE; goto finish;}
1581: for (k=1; k<dz; k++) { /* Go through Z dimension to see if remaining indices follow the pattern */
1582: for (j=0; j<dy; j++) {
1583: for (i=0; i<dx; i++,p++) {
1584: if (start+X*Y*k+X*j+i != idx[p]) {optimizable = PETSC_FALSE; goto finish;}
1585: }
1586: }
1587: }
1588: opt->start[r] = start;
1589: opt->dx[r] = dx;
1590: opt->dy[r] = dy;
1591: opt->dz[r] = dz;
1592: opt->X[r] = X;
1593: opt->Y[r] = Y;
1594: }
1596: finish:
1597: /* If not optimizable, free arrays to save memory */
1598: if (!n || !optimizable) {
1599: PetscFree(opt->array);
1600: PetscFree(opt);
1601: *out = NULL;
1602: } else {
1603: opt->offset[0] = 0;
1604: for (r=0; r<n; r++) opt->offset[r+1] = opt->offset[r] + opt->dx[r]*opt->dy[r]*opt->dz[r];
1605: *out = opt;
1606: }
1607: return(0);
1608: }
1610: PETSC_STATIC_INLINE PetscErrorCode PetscSFDestroyPackOpt(PetscMemType mtype,PetscSFPackOpt *out)
1611: {
1613: PetscSFPackOpt opt = *out;
1616: if (opt) {
1617: if (mtype == PETSC_MEMTYPE_HOST) {PetscFree(opt->array);}
1618: #if defined(PETSC_HAVE_CUDA)
1619: else {cudaError_t cerr = cudaFree(opt->array);CHKERRCUDA(cerr);opt->array=NULL;}
1620: #endif
1621: PetscFree(opt);
1622: *out = NULL;
1623: }
1624: return(0);
1625: }
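/* Illustrative driver (hypothetical, not part of PETSc) for the index-pattern analysis in
   PetscSFCreatePackOpt() above: the nine indices below form the 3x3 corner of a 10-wide 2D array, so
   the analysis should recover start=5, dx=3, dy=3, dz=1 with strides X=10, Y=3, letting Pack/Unpack
   use strided PetscArraycpy calls instead of per-index gathers. */
#if 0
static PetscErrorCode ExamplePackOpt(void)
{
  PetscSFPackOpt opt;
  const PetscInt offset[2] = {0,9};
  const PetscInt idx[9]    = {5,6,7, 15,16,17, 25,26,27};

  PetscSFCreatePackOpt(1,offset,idx,&opt);
  /* Expected: opt->start[0]=5, opt->dx[0]=3, opt->dy[0]=3, opt->dz[0]=1, opt->X[0]=10, opt->Y[0]=3 */
  PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&opt);
  return(0);
}
#endif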
1627: PetscErrorCode PetscSFSetUpPackFields(PetscSF sf)
1628: {
1630: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1631: PetscInt i,j;
1634: /* [0] for PETSCSF_LOCAL and [1] for PETSCSF_REMOTE in the following */
1635: for (i=0; i<2; i++) { /* Set defaults */
1636: sf->leafstart[i] = 0;
1637: sf->leafcontig[i] = PETSC_TRUE;
1638: sf->leafdups[i] = PETSC_FALSE;
1639: bas->rootstart[i] = 0;
1640: bas->rootcontig[i] = PETSC_TRUE;
1641: bas->rootdups[i] = PETSC_FALSE;
1642: }
1644: sf->leafbuflen[0] = sf->roffset[sf->ndranks];
1645: sf->leafbuflen[1] = sf->roffset[sf->nranks] - sf->roffset[sf->ndranks];
1647: if (sf->leafbuflen[0]) sf->leafstart[0] = sf->rmine[0];
1648: if (sf->leafbuflen[1]) sf->leafstart[1] = sf->rmine[sf->roffset[sf->ndranks]];
1650: /* Are leaf indices for self and remote contiguous? If yes, it is best for pack/unpack */
1651: for (i=0; i<sf->roffset[sf->ndranks]; i++) { /* self */
1652: if (sf->rmine[i] != sf->leafstart[0]+i) {sf->leafcontig[0] = PETSC_FALSE; break;}
1653: }
1654: for (i=sf->roffset[sf->ndranks],j=0; i<sf->roffset[sf->nranks]; i++,j++) { /* remote */
1655: if (sf->rmine[i] != sf->leafstart[1]+j) {sf->leafcontig[1] = PETSC_FALSE; break;}
1656: }
1658: /* If not, see if we can have per-rank optimizations by doing index analysis */
1659: if (!sf->leafcontig[0]) {PetscSFCreatePackOpt(sf->ndranks, sf->roffset, sf->rmine, &sf->leafpackopt[0]);}
1660: if (!sf->leafcontig[1]) {PetscSFCreatePackOpt(sf->nranks-sf->ndranks, sf->roffset+sf->ndranks, sf->rmine, &sf->leafpackopt[1]);}
1662: /* Are root indices for self and remote contiguous? */
1663: bas->rootbuflen[0] = bas->ioffset[bas->ndiranks];
1664: bas->rootbuflen[1] = bas->ioffset[bas->niranks] - bas->ioffset[bas->ndiranks];
1666: if (bas->rootbuflen[0]) bas->rootstart[0] = bas->irootloc[0];
1667: if (bas->rootbuflen[1]) bas->rootstart[1] = bas->irootloc[bas->ioffset[bas->ndiranks]];
1669: for (i=0; i<bas->ioffset[bas->ndiranks]; i++) {
1670: if (bas->irootloc[i] != bas->rootstart[0]+i) {bas->rootcontig[0] = PETSC_FALSE; break;}
1671: }
1672: for (i=bas->ioffset[bas->ndiranks],j=0; i<bas->ioffset[bas->niranks]; i++,j++) {
1673: if (bas->irootloc[i] != bas->rootstart[1]+j) {bas->rootcontig[1] = PETSC_FALSE; break;}
1674: }
1676: if (!bas->rootcontig[0]) {PetscSFCreatePackOpt(bas->ndiranks, bas->ioffset, bas->irootloc, &bas->rootpackopt[0]);}
1677: if (!bas->rootcontig[1]) {PetscSFCreatePackOpt(bas->niranks-bas->ndiranks, bas->ioffset+bas->ndiranks, bas->irootloc, &bas->rootpackopt[1]);}
1679: #if defined(PETSC_HAVE_CUDA)
1680: /* Check for duplicates in the indices so that CUDA unpacking kernels can use cheaper regular instructions instead of atomics when they know there is no chance of data races */
1685: #endif
1687: return(0);
1688: }
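/* Worked example (illustrative values, not from the source): if the remote part of sf->rmine[] is
   {8,9,10,11}, then leafstart[1]=8 and leafcontig[1] stays PETSC_TRUE, so packing the remote leaf
   buffer is a single PetscArraycpy starting at leaf 8. A single out-of-order entry such as
   {8,9,11,10} flips leafcontig[1] to PETSC_FALSE and triggers the PetscSFCreatePackOpt() analysis above. */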
1690: PetscErrorCode PetscSFResetPackFields(PetscSF sf)
1691: {
1693: PetscSF_Basic *bas = (PetscSF_Basic*)sf->data;
1694: PetscInt i;
1697: for (i=PETSCSF_LOCAL; i<=PETSCSF_REMOTE; i++) {
1698: PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&sf->leafpackopt[i]);
1699: PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&sf->leafpackopt_d[i]);
1700: PetscSFDestroyPackOpt(PETSC_MEMTYPE_HOST,&bas->rootpackopt[i]);
1701: PetscSFDestroyPackOpt(PETSC_MEMTYPE_DEVICE,&bas->rootpackopt_d[i]);
1702: }
1703: return(0);
1704: }