VTK-m  2.0
FunctorsOpenMP.h
Go to the documentation of this file.
1 //============================================================================
2 // Copyright (c) Kitware, Inc.
3 // All rights reserved.
4 // See LICENSE.txt for details.
5 //
6 // This software is distributed WITHOUT ANY WARRANTY; without even
7 // the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
8 // PURPOSE. See the above copyright notice for more information.
9 //============================================================================
10 #ifndef vtk_m_cont_openmp_internal_FunctorsOpenMP_h
11 #define vtk_m_cont_openmp_internal_FunctorsOpenMP_h
12 
15 
17 
18 #include <vtkm/BinaryOperators.h>
19 #include <vtkm/BinaryPredicates.h>
20 #include <vtkm/Pair.h>
21 #include <vtkm/Types.h>
22 #include <vtkm/cont/ArrayHandle.h>
24 
25 #include <omp.h>
26 
27 #include <algorithm>
28 #include <type_traits>
29 #include <vector>
30 
31 // Wrap all '#pragma omp ...' calls in this macro so we can disable them in
32 // non-omp builds and avoid a multitude of 'ignoring pragma..." warnings.
33 #ifdef _OPENMP
34 #define VTKM_OPENMP_DIRECTIVE_IMPL(fullDir) _Pragma(#fullDir)
35 #define VTKM_OPENMP_DIRECTIVE(dir) VTKM_OPENMP_DIRECTIVE_IMPL(omp dir)
36 #else // _OPENMP
37 #define VTKM_OPENMP_DIRECTIVE(directive)
38 #endif // _OPENMP
39 
40 // See "OpenMP data sharing" section of
41 // https://www.gnu.org/software/gcc/gcc-9/porting_to.html. OpenMP broke
42 // backwards compatibility regarding const variable handling.
43 // tl;dr, put all const variables accessed from openmp blocks in a
44 // VTKM_OPENMP_SHARED_CONST(var1, var2, ...) macro. This will do The Right Thing
45 // on all gcc.
46 #if defined(VTKM_GCC) && (__GNUC__ < 9)
47 #define VTKM_OPENMP_SHARED_CONST(...)
48 #else
49 #define VTKM_OPENMP_SHARED_CONST(...) shared(__VA_ARGS__)
50 #endif
51 
52 // When defined, supported type / operator combinations will use the OpenMP
53 // reduction(...) clause. Otherwise, all reductions use the general
54 // implementation with a manual reduction once the threads complete.
55 // I don't know how, but the benchmarks currently perform better without the
56 // specializations.
57 //#define VTKM_OPENMP_USE_NATIVE_REDUCTION
58 
59 namespace vtkm
60 {
61 namespace cont
62 {
63 namespace openmp
64 {
65 
66 constexpr static vtkm::Id VTKM_CACHE_LINE_SIZE = 64;
67 constexpr static vtkm::Id VTKM_PAGE_SIZE = 4096;
68 
69 // Returns ceil(num/den) for integral types
70 template <typename T>
71 static constexpr T CeilDivide(const T& numerator, const T& denominator)
72 {
73  return (numerator + denominator - 1) / denominator;
74 }
75 
76 // Computes the number of values per chunk. Note that numChunks + chunkSize may
77 // exceed numVals, so be sure to check upper limits.
78 static void ComputeChunkSize(const vtkm::Id numVals,
79  const vtkm::Id numThreads,
80  const vtkm::Id chunksPerThread,
81  const vtkm::Id bytesPerValue,
82  vtkm::Id& numChunks,
83  vtkm::Id& valuesPerChunk)
84 {
85  // try to evenly distribute pages across chunks:
86  const vtkm::Id bytesIn = numVals * bytesPerValue;
87  const vtkm::Id pagesIn = CeilDivide(bytesIn, VTKM_PAGE_SIZE);
88  // If we don't have enough pages to honor chunksPerThread, ignore it:
89  numChunks = (pagesIn > numThreads * chunksPerThread) ? numThreads * chunksPerThread : numThreads;
90  const vtkm::Id pagesPerChunk = CeilDivide(pagesIn, numChunks);
91  valuesPerChunk = CeilDivide(pagesPerChunk * VTKM_PAGE_SIZE, bytesPerValue);
92 }
93 
94 template <typename T>
96 {
97  using type = T;
98 };
99 
100 template <typename PortalType>
101 struct CleanArrayRefImpl<vtkm::internal::ArrayPortalValueReference<PortalType>>
102 {
103  using type = typename PortalType::ValueType;
104 };
105 
106 template <typename T>
108 
109 template <typename T, typename U>
110 static void DoCopy(T src, U dst, vtkm::Id numVals, std::true_type)
111 {
112  if (numVals)
113  {
114  std::copy(src, src + numVals, dst);
115  }
116 }
117 
118 // Don't use std::copy when type conversion is required because MSVC.
119 template <typename InIterT, typename OutIterT>
120 static void DoCopy(InIterT inIter, OutIterT outIter, vtkm::Id numVals, std::false_type)
121 {
122  using InValueType = CleanArrayRef<typename std::iterator_traits<InIterT>::value_type>;
123  using OutValueType = CleanArrayRef<typename std::iterator_traits<OutIterT>::value_type>;
124 
125  for (vtkm::Id i = 0; i < numVals; ++i)
126  {
127  // The conversion to InputType and then OutputType looks weird, but it is necessary.
128  // *inItr actually returns an ArrayPortalValueReference, which can automatically convert
129  // itself to InputType but not necessarily OutputType. Thus, we first convert to
130  // InputType, and then allow the conversion to OutputType.
131  *(outIter++) = static_cast<OutValueType>(static_cast<InValueType>(*(inIter++)));
132  }
133 }
134 
135 template <typename InIterT, typename OutIterT>
136 static void DoCopy(InIterT inIter, OutIterT outIter, vtkm::Id numVals)
137 {
138  using InValueType = CleanArrayRef<typename std::iterator_traits<InIterT>::value_type>;
139  using OutValueType = CleanArrayRef<typename std::iterator_traits<OutIterT>::value_type>;
140 
141  DoCopy(inIter, outIter, numVals, std::is_same<InValueType, OutValueType>());
142 }
143 
144 
145 template <typename InPortalT, typename OutPortalT>
146 static void CopyHelper(InPortalT inPortal,
147  OutPortalT outPortal,
148  vtkm::Id inStart,
149  vtkm::Id outStart,
150  vtkm::Id numVals)
151 {
152  using InValueT = typename InPortalT::ValueType;
153  using OutValueT = typename OutPortalT::ValueType;
154  constexpr auto isSame = std::is_same<InValueT, OutValueT>();
155 
156  auto inIter = vtkm::cont::ArrayPortalToIteratorBegin(inPortal) + inStart;
157  auto outIter = vtkm::cont::ArrayPortalToIteratorBegin(outPortal) + outStart;
158  vtkm::Id valuesPerChunk;
159 
160  VTKM_OPENMP_DIRECTIVE(parallel default(none) shared(inIter, outIter, valuesPerChunk, numVals)
161  VTKM_OPENMP_SHARED_CONST(isSame))
162  {
163 
164  VTKM_OPENMP_DIRECTIVE(single)
165  {
166  // Evenly distribute full pages to all threads. We manually chunk the
167  // data here so that we can exploit std::copy's memmove optimizations.
168  vtkm::Id numChunks;
169  vtkm::Id numThreads;
171  .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagOpenMP())
172  .GetThreads(numThreads);
173  ComputeChunkSize(numVals, numThreads, 8, sizeof(InValueT), numChunks, valuesPerChunk);
174  }
175 
176  VTKM_OPENMP_DIRECTIVE(for schedule(static))
177  for (vtkm::Id i = 0; i < numVals; i += valuesPerChunk)
178  {
179  vtkm::Id chunkSize = std::min(numVals - i, valuesPerChunk);
180  DoCopy(inIter + i, outIter + i, chunkSize, isSame);
181  }
182  }
183 }
184 
186 {
190 
193  std::vector<vtkm::Id> EndIds;
194 
195  CopyIfHelper() = default;
196 
197  void Initialize(vtkm::Id numValues, vtkm::Id valueSize)
198  {
199  this->NumValues = numValues;
201  .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagOpenMP())
202  .GetThreads(this->NumThreads);
203  this->ValueSize = valueSize;
204 
205  // Evenly distribute pages across the threads. We manually chunk the
206  // data here so that we can exploit std::copy's memmove optimizations.
207  ComputeChunkSize(
208  this->NumValues, this->NumThreads, 8, valueSize, this->NumChunks, this->ChunkSize);
209 
210  this->EndIds.resize(static_cast<std::size_t>(this->NumChunks));
211  }
212 
213  template <typename InIterT, typename StencilIterT, typename OutIterT, typename PredicateT>
214  void CopyIf(InIterT inIter,
215  StencilIterT stencilIter,
216  OutIterT outIter,
217  PredicateT pred,
218  vtkm::Id chunk)
219  {
220  vtkm::Id startPos = std::min(chunk * this->ChunkSize, this->NumValues);
221  vtkm::Id endPos = std::min((chunk + 1) * this->ChunkSize, this->NumValues);
222 
223  vtkm::Id outPos = startPos;
224  for (vtkm::Id inPos = startPos; inPos < endPos; ++inPos)
225  {
226  if (pred(stencilIter[inPos]))
227  {
228  outIter[outPos++] = inIter[inPos];
229  }
230  }
231 
232  this->EndIds[static_cast<std::size_t>(chunk)] = outPos;
233  }
234 
235  template <typename OutIterT>
236  vtkm::Id Reduce(OutIterT data)
237  {
238  vtkm::Id endPos = this->EndIds.front();
239  for (vtkm::Id i = 1; i < this->NumChunks; ++i)
240  {
241  vtkm::Id chunkStart = std::min(i * this->ChunkSize, this->NumValues);
242  vtkm::Id chunkEnd = this->EndIds[static_cast<std::size_t>(i)];
243  vtkm::Id numValuesToCopy = chunkEnd - chunkStart;
244  if (numValuesToCopy > 0 && chunkStart != endPos)
245  {
246  std::copy(data + chunkStart, data + chunkEnd, data + endPos);
247  }
248  endPos += numValuesToCopy;
249  }
250  return endPos;
251  }
252 };
253 
254 #ifdef VTKM_OPENMP_USE_NATIVE_REDUCTION
255 // OpenMP only declares reduction operations for primitive types. This utility
256 // detects if a type T is supported.
257 template <typename T>
258 struct OpenMPReductionSupported : std::false_type
259 {
260 };
261 template <>
262 struct OpenMPReductionSupported<Int8> : std::true_type
263 {
264 };
265 template <>
266 struct OpenMPReductionSupported<UInt8> : std::true_type
267 {
268 };
269 template <>
270 struct OpenMPReductionSupported<Int16> : std::true_type
271 {
272 };
273 template <>
274 struct OpenMPReductionSupported<UInt16> : std::true_type
275 {
276 };
277 template <>
278 struct OpenMPReductionSupported<Int32> : std::true_type
279 {
280 };
281 template <>
282 struct OpenMPReductionSupported<UInt32> : std::true_type
283 {
284 };
285 template <>
286 struct OpenMPReductionSupported<Int64> : std::true_type
287 {
288 };
289 template <>
290 struct OpenMPReductionSupported<UInt64> : std::true_type
291 {
292 };
293 template <>
294 struct OpenMPReductionSupported<Float32> : std::true_type
295 {
296 };
297 template <>
298 struct OpenMPReductionSupported<Float64> : std::true_type
299 {
300 };
301 #else
302 template <typename T>
303 using OpenMPReductionSupported = std::false_type;
304 #endif // VTKM_OPENMP_USE_NATIVE_REDUCTION
305 
307 {
308  // std::is_integral, but adapted to see through vecs and pairs.
309  template <typename T>
310  struct IsIntegral : public std::is_integral<T>
311  {
312  };
313 
314  template <typename T, vtkm::IdComponent Size>
315  struct IsIntegral<vtkm::Vec<T, Size>> : public std::is_integral<T>
316  {
317  };
318 
319  template <typename T, typename U>
320  struct IsIntegral<vtkm::Pair<T, U>>
321  : public std::integral_constant<bool, std::is_integral<T>::value && std::is_integral<U>::value>
322  {
323  };
324 
325  // Generic implementation:
326  template <typename PortalT, typename ReturnType, typename Functor>
327  static ReturnType Execute(PortalT portal, ReturnType init, Functor functorIn, std::false_type)
328  {
329  internal::WrappedBinaryOperator<ReturnType, Functor> f(functorIn);
330 
331  const vtkm::Id numVals = portal.GetNumberOfValues();
332  auto data = vtkm::cont::ArrayPortalToIteratorBegin(portal);
333 
334  bool doParallel = false;
335  vtkm::Id numThreads = 0;
336 
338  .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagOpenMP())
339  .GetThreads(numThreads);
340 
341  std::unique_ptr<ReturnType[]> threadData;
342 
343  VTKM_OPENMP_DIRECTIVE(parallel default(none) firstprivate(f) shared(
344  data, doParallel, numThreads, threadData) VTKM_OPENMP_SHARED_CONST(numVals))
345  {
346  int tid = omp_get_thread_num();
347 
348  VTKM_OPENMP_DIRECTIVE(single)
349  {
350  if (numVals >= numThreads * 2)
351  {
352  doParallel = true;
353  threadData.reset(new ReturnType[static_cast<std::size_t>(numThreads)]);
354  }
355  }
356 
357  if (doParallel)
358  {
359  // Static dispatch to unroll non-integral types:
360  const ReturnType localResult = ReduceHelper::DoParallelReduction<ReturnType>(
361  data, numVals, tid, numThreads, f, IsIntegral<ReturnType>{});
362 
363  threadData[static_cast<std::size_t>(tid)] = localResult;
364  }
365  } // end parallel
366 
367  if (doParallel)
368  {
369  // do the final reduction serially:
370  for (size_t i = 0; i < static_cast<size_t>(numThreads); ++i)
371  {
372  init = f(init, threadData[i]);
373  }
374  }
375  else
376  {
377  // Not enough threads. Do the entire reduction in serial:
378  for (vtkm::Id i = 0; i < numVals; ++i)
379  {
380  init = f(init, data[i]);
381  }
382  }
383 
384  return init;
385  }
386 
387  // non-integer reduction: unroll loop manually.
388  // This gives faster code for floats and non-trivial types.
389  template <typename ReturnType, typename IterType, typename FunctorType>
390  static ReturnType DoParallelReduction(IterType data,
391  const vtkm::Id& numVals,
392  const int& tid,
393  const int& numThreads,
394  FunctorType f,
395  std::false_type /* isIntegral */)
396  {
397  // Use the first (numThreads*2) values for initializing:
398  ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);
399 
400  const vtkm::Id offset = numThreads * 2;
401  const vtkm::Id end = std::max(((numVals / 4) * 4) - 4, offset);
402  const vtkm::Id unrollEnd = end - ((end - offset) % 4);
403  vtkm::Id i = offset;
404 
405 // When initializing the looping iterator to a non integral type, intel compilers will
406 // convert the iterator type to an unsigned value
407 #pragma GCC diagnostic push
408 #pragma GCC diagnostic ignored "-Wsign-conversion"
409  VTKM_OPENMP_DIRECTIVE(for schedule(static))
410  for (i = offset; i < unrollEnd; i += 4)
411 #pragma GCC diagnostic pop
412  {
413  const auto t1 = f(data[i], data[i + 1]);
414  const auto t2 = f(data[i + 2], data[i + 3]);
415  accum = f(accum, t1);
416  accum = f(accum, t2);
417  }
418 
419  // Let the last thread mop up any remaining values as it would
420  // have just accessed the adjacent data
421  if (tid == numThreads - 1)
422  {
423  for (i = unrollEnd; i < numVals; ++i)
424  {
425  accum = f(accum, data[i]);
426  }
427  }
428 
429  return accum;
430  }
431 
432  // Integer reduction: no unrolling. Ints vectorize easily and unrolling can
433  // hurt performance.
434  template <typename ReturnType, typename IterType, typename FunctorType>
435  static ReturnType DoParallelReduction(IterType data,
436  const vtkm::Id& numVals,
437  const int& tid,
438  const int& numThreads,
439  FunctorType f,
440  std::true_type /* isIntegral */)
441  {
442  // Use the first (numThreads*2) values for initializing:
443  ReturnType accum = f(data[2 * tid], data[2 * tid + 1]);
444 
445 // Assign each thread chunks of the remaining values for local reduction
446 #pragma GCC diagnostic push
447 #pragma GCC diagnostic ignored "-Wsign-conversion"
448  VTKM_OPENMP_DIRECTIVE(for schedule(static))
449  for (vtkm::Id i = numThreads * 2; i < numVals; i++)
450 #pragma GCC diagnostic pop
451  {
452  accum = f(accum, data[i]);
453  }
454 
455  return accum;
456  }
457 
458 #ifdef VTKM_OPENMP_USE_NATIVE_REDUCTION
459 
460 // Specialize for vtkm functors with OpenMP special cases:
461 #define VTKM_OPENMP_SPECIALIZE_REDUCE1(FunctorType, PragmaString) \
462  template <typename PortalT, typename ReturnType> \
463  static ReturnType Execute( \
464  PortalT portal, ReturnType value, FunctorType functorIn, std::true_type) \
465  { \
466  const vtkm::Id numValues = portal.GetNumberOfValues(); \
467  internal::WrappedBinaryOperator<ReturnType, FunctorType> f(functorIn); \
468  _Pragma(#PragmaString) for (vtkm::Id i = 0; i < numValues; ++i) \
469  { \
470  value = f(value, portal.Get(i)); \
471  } \
472  return value; \
473  }
474 
475 // Constructing the pragma string inside the _Pragma call doesn't work so
476 // we jump through a hoop:
477 #define VTKM_OPENMP_SPECIALIZE_REDUCE(FunctorType, Operator) \
478  VTKM_OPENMP_SPECIALIZE_REDUCE1(FunctorType, "omp parallel for reduction(" #Operator ":value)")
479 
480  // + (Add, Sum)
481  VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Add, +)
482  VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Sum, +)
483  // * (Multiply, Product)
484  VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Multiply, *)
485  VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Product, *)
486  // - (Subtract)
487  VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Subtract, -)
488  // & (BitwiseAnd)
489  VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::BitwiseAnd, &)
490  // | (BitwiseOr)
491  VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::BitwiseOr, |)
492  // ^ (BitwiseXor)
493  VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::BitwiseXor, ^)
494  // && (LogicalAnd)
495  VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::LogicalAnd, &&)
496  // || (LogicalOr)
497  VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::LogicalOr, ||)
498  // min (Minimum)
499  VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Minimum, min)
500  // max (Maximum)
501  VTKM_OPENMP_SPECIALIZE_REDUCE(vtkm::Maximum, max)
502 
503 #undef VTKM_OPENMP_SPECIALIZE_REDUCE
504 #undef VTKM_OPENMP_SPECIALIZE_REDUCE1
505 
506 #endif // VTKM_OPENMP_USE_NATIVE_REDUCTION
507 };
508 
509 template <typename KeysInArray,
510  typename ValuesInArray,
511  typename KeysOutArray,
512  typename ValuesOutArray,
513  typename BinaryFunctor>
514 void ReduceByKeyHelper(KeysInArray keysInArray,
515  ValuesInArray valuesInArray,
516  KeysOutArray keysOutArray,
517  ValuesOutArray valuesOutArray,
518  BinaryFunctor functor)
519 {
520  using KeyType = typename KeysInArray::ValueType;
521  using ValueType = typename ValuesInArray::ValueType;
522 
523  vtkm::cont::Token token;
524 
525  const vtkm::Id numValues = keysInArray.GetNumberOfValues();
526  auto keysInPortal = keysInArray.PrepareForInput(DeviceAdapterTagOpenMP(), token);
527  auto valuesInPortal = valuesInArray.PrepareForInput(DeviceAdapterTagOpenMP(), token);
528  auto keysIn = vtkm::cont::ArrayPortalToIteratorBegin(keysInPortal);
529  auto valuesIn = vtkm::cont::ArrayPortalToIteratorBegin(valuesInPortal);
530 
531  auto keysOutPortal = keysOutArray.PrepareForOutput(numValues, DeviceAdapterTagOpenMP(), token);
532  auto valuesOutPortal =
533  valuesOutArray.PrepareForOutput(numValues, DeviceAdapterTagOpenMP(), token);
534  auto keysOut = vtkm::cont::ArrayPortalToIteratorBegin(keysOutPortal);
535  auto valuesOut = vtkm::cont::ArrayPortalToIteratorBegin(valuesOutPortal);
536 
537  internal::WrappedBinaryOperator<ValueType, BinaryFunctor> f(functor);
538  vtkm::Id outIdx = 0;
539  vtkm::Id numThreads = 0;
540 
542  .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagOpenMP())
543  .GetThreads(numThreads);
544 
545  VTKM_OPENMP_DIRECTIVE(parallel default(none) firstprivate(keysIn, valuesIn, keysOut, valuesOut, f)
546  shared(numThreads, outIdx) VTKM_OPENMP_SHARED_CONST(numValues))
547  {
548  int tid = omp_get_thread_num();
549 
550  // Determine bounds for this thread's scan operation:
551  vtkm::Id chunkSize = (numValues + numThreads - 1) / numThreads;
552  vtkm::Id scanIdx = std::min(tid * chunkSize, numValues);
553  vtkm::Id scanEnd = std::min(scanIdx + chunkSize, numValues);
554 
555  auto threadKeysBegin = keysOut + scanIdx;
556  auto threadValuesBegin = valuesOut + scanIdx;
557  auto threadKey = threadKeysBegin;
558  auto threadValue = threadValuesBegin;
559 
560  // Reduce each thread's partition:
561  KeyType rangeKey;
562  ValueType rangeValue;
563  for (;;)
564  {
565  if (scanIdx < scanEnd)
566  {
567  rangeKey = keysIn[scanIdx];
568  rangeValue = valuesIn[scanIdx];
569  ++scanIdx;
570 
571  // Locate end of current range:
572  while (scanIdx < scanEnd && static_cast<KeyType>(keysIn[scanIdx]) == rangeKey)
573  {
574  rangeValue = f(rangeValue, valuesIn[scanIdx]);
575  ++scanIdx;
576  }
577 
578  *threadKey = rangeKey;
579  *threadValue = rangeValue;
580  ++threadKey;
581  ++threadValue;
582  }
583  else
584  {
585  break;
586  }
587  }
588 
589  if (tid == 0)
590  {
591  outIdx = static_cast<vtkm::Id>(threadKey - threadKeysBegin);
592  }
593 
594  // Combine the reduction results. Skip tid == 0, since it's already in
595  // the correct location:
596  for (int i = 1; i < numThreads; ++i)
597  {
598 
599  // This barrier ensures that:
600  // 1) Threads remain synchronized through this final reduction loop.
601  // 2) The outIdx variable is initialized by thread 0.
602  // 3) All threads have reduced their partitions.
603  VTKM_OPENMP_DIRECTIVE(barrier)
604 
605  if (tid == i)
606  {
607  // Check if the previous thread's last key matches our first:
608  if (outIdx > 0 && threadKeysBegin < threadKey && keysOut[outIdx - 1] == *threadKeysBegin)
609  {
610  valuesOut[outIdx - 1] = f(valuesOut[outIdx - 1], *threadValuesBegin);
611  ++threadKeysBegin;
612  ++threadValuesBegin;
613  }
614 
615  // Copy reduced partition to final location (if needed)
616  if (threadKeysBegin < threadKey && threadKeysBegin != keysOut + outIdx)
617  {
618  std::copy(threadKeysBegin, threadKey, keysOut + outIdx);
619  std::copy(threadValuesBegin, threadValue, valuesOut + outIdx);
620  }
621 
622  outIdx += static_cast<vtkm::Id>(threadKey - threadKeysBegin);
623 
624  } // end tid == i
625  } // end combine reduction
626  } // end parallel
627 
628  token.DetachFromAll();
629 
630  keysOutArray.Allocate(outIdx, vtkm::CopyFlag::On);
631  valuesOutArray.Allocate(outIdx, vtkm::CopyFlag::On);
632 }
633 
634 template <typename IterT, typename RawPredicateT>
636 {
637  using ValueType = typename std::iterator_traits<IterT>::value_type;
638  using PredicateT = internal::WrappedBinaryOperator<bool, RawPredicateT>;
639 
640  struct Node
641  {
644 
645  // Pad the node out to the size of a cache line to prevent false sharing:
646  static constexpr size_t DataSize = 2 * sizeof(vtkm::Id2);
647  static constexpr size_t NumCacheLines = CeilDivide<size_t>(DataSize, VTKM_CACHE_LINE_SIZE);
648  static constexpr size_t PaddingSize = NumCacheLines * VTKM_CACHE_LINE_SIZE - DataSize;
649  unsigned char Padding[PaddingSize];
650  };
651 
652  IterT Data;
656  std::vector<Node> Nodes;
657  size_t NextNode;
658 
659  UniqueHelper(IterT iter, vtkm::Id numValues, RawPredicateT pred)
660  : Data(iter)
661  , NumValues(numValues)
662  , Predicate(pred)
663  , LeafSize(0)
664  , NextNode(0)
665  {
666  }
667 
669  {
670  vtkm::Id outSize = 0;
671 
672  VTKM_OPENMP_DIRECTIVE(parallel default(shared))
673  {
674  VTKM_OPENMP_DIRECTIVE(single)
675  {
676  this->Prepare();
677 
678  // Kick off task-based divide-and-conquer uniquification:
679  Node* rootNode = this->AllocNode();
680  rootNode->InputRange = vtkm::Id2(0, this->NumValues);
681  this->Uniquify(rootNode);
682  outSize = rootNode->OutputRange[1] - rootNode->OutputRange[0];
683  }
684  }
685 
686  return outSize;
687  }
688 
689 private:
690  void Prepare()
691  {
692  // Figure out how many values each thread should handle:
693  vtkm::Id numThreads = 0;
695  .GetRuntimeConfiguration(vtkm::cont::DeviceAdapterTagOpenMP())
696  .GetThreads(numThreads);
697 
698  vtkm::Id chunksPerThread = 8;
699  vtkm::Id numChunks;
700  ComputeChunkSize(
701  this->NumValues, numThreads, chunksPerThread, sizeof(ValueType), numChunks, this->LeafSize);
702 
703  // Compute an upper-bound of the number of nodes in the tree:
704  std::size_t numNodes = static_cast<std::size_t>(numChunks);
705  while (numChunks > 1)
706  {
707  numChunks = (numChunks + 1) / 2;
708  numNodes += static_cast<std::size_t>(numChunks);
709  }
710  this->Nodes.resize(numNodes);
711  this->NextNode = 0;
712  }
713 
715  {
716  size_t nodeIdx;
717 
718 // GCC emits a false positive "value computed but not used" for this block:
719 #pragma GCC diagnostic push
720 #pragma GCC diagnostic ignored "-Wunused-value"
721 
722  VTKM_OPENMP_DIRECTIVE(atomic capture)
723  {
724  nodeIdx = this->NextNode;
725  ++this->NextNode;
726  }
727 
728 #pragma GCC diagnostic pop
729 
730  VTKM_ASSERT(nodeIdx < this->Nodes.size());
731 
732  return &this->Nodes[nodeIdx];
733  }
734 
735  bool IsLeaf(const vtkm::Id2& range) { return (range[1] - range[0]) <= this->LeafSize; }
736 
737  // Not an strict midpoint, but ensures that the first range will always be
738  // a multiple of the leaf size.
740  {
741  const vtkm::Id n = range[1] - range[0];
742  const vtkm::Id np = this->LeafSize;
743 
744  return CeilDivide(n / 2, np) * np + range[0];
745  }
746 
747  void Uniquify(Node* node)
748  {
749  if (!this->IsLeaf(node->InputRange))
750  {
751  vtkm::Id midpoint = this->ComputeMidpoint(node->InputRange);
752 
753  Node* right = this->AllocNode();
754  Node* left = this->AllocNode();
755 
756  right->InputRange = vtkm::Id2(midpoint, node->InputRange[1]);
757 
758  // Intel compilers seem to have trouble following the 'this' pointer
759  // when launching tasks, resulting in a corrupt task environment.
760  // Explicitly copying the pointer into a local variable seems to fix this.
761  auto explicitThis = this;
762 
763  VTKM_OPENMP_DIRECTIVE(taskgroup)
764  {
765  VTKM_OPENMP_DIRECTIVE(task) { explicitThis->Uniquify(right); }
766 
767  left->InputRange = vtkm::Id2(node->InputRange[0], midpoint);
768  this->Uniquify(left);
769 
770  } // end taskgroup. Both sides of the tree will be completed here.
771 
772  // Combine the ranges in the left side:
773  if (this->Predicate(this->Data[left->OutputRange[1] - 1], this->Data[right->OutputRange[0]]))
774  {
775  ++right->OutputRange[0];
776  }
777 
778  vtkm::Id numVals = right->OutputRange[1] - right->OutputRange[0];
779  DoCopy(this->Data + right->OutputRange[0], this->Data + left->OutputRange[1], numVals);
780 
781  node->OutputRange[0] = left->OutputRange[0];
782  node->OutputRange[1] = left->OutputRange[1] + numVals;
783  }
784  else
785  {
786  auto start = this->Data + node->InputRange[0];
787  auto end = this->Data + node->InputRange[1];
788  end = std::unique(start, end, this->Predicate);
789  node->OutputRange[0] = node->InputRange[0];
790  node->OutputRange[1] = node->InputRange[0] + static_cast<vtkm::Id>(end - start);
791  }
792  }
793 };
794 }
795 }
796 } // end namespace vtkm::cont::openmp
797 
798 #endif // vtk_m_cont_openmp_internal_FunctorsOpenMP_h
vtkm::cont::ArrayPortalToIteratorBegin
VTKM_SUPPRESS_EXEC_WARNINGS VTKM_EXEC_CONT vtkm::cont::ArrayPortalToIterators< PortalType >::IteratorType ArrayPortalToIteratorBegin(const PortalType &portal)
Convenience function for converting an ArrayPortal to a begin iterator.
Definition: ArrayPortalToIterators.h:178
vtkm::cont::openmp::CopyIfHelper::ChunkSize
vtkm::Id ChunkSize
Definition: FunctorsOpenMP.h:192
vtkm::cont::openmp::CopyIfHelper::ValueSize
vtkm::Id ValueSize
Definition: FunctorsOpenMP.h:189
vtkm::cont::openmp::CopyIfHelper::CopyIf
void CopyIf(InIterT inIter, StencilIterT stencilIter, OutIterT outIter, PredicateT pred, vtkm::Id chunk)
Definition: FunctorsOpenMP.h:214
vtkm::cont::openmp::CopyIfHelper::NumChunks
vtkm::Id NumChunks
Definition: FunctorsOpenMP.h:191
vtkm::cont::openmp::CleanArrayRefImpl
Definition: FunctorsOpenMP.h:95
ArrayHandle.h
FunctorsGeneral.h
vtkm::cont::openmp::UniqueHelper::Node
Definition: FunctorsOpenMP.h:640
vtkm
Groups connected points that have the same field value.
Definition: Atomic.h:19
vtkm::Product
Binary Predicate that takes two arguments argument x, and y and returns product (multiplication) of t...
Definition: BinaryOperators.h:56
vtkm::Subtract
Definition: Types.h:242
vtkm::BitwiseOr
Binary Predicate that takes two arguments argument x, and y and returns the bitwise operation x|y
Definition: BinaryOperators.h:168
Types.h
VTKM_ASSERT
#define VTKM_ASSERT(condition)
Definition: Assert.h:43
vtkm::cont::openmp::UniqueHelper::IsLeaf
bool IsLeaf(const vtkm::Id2 &range)
Definition: FunctorsOpenMP.h:735
Pair.h
vtkm::cont::Token::DetachFromAll
VTKM_CONT void DetachFromAll()
Detaches this Token from all resources to allow them to be used elsewhere or deleted.
vtkm::cont::openmp::CopyIfHelper::CopyIfHelper
CopyIfHelper()=default
vtkm::cont::openmp::UniqueHelper::Node::DataSize
static constexpr size_t DataSize
Definition: FunctorsOpenMP.h:646
vtkm::Int16
int16_t Int16
Definition: Types.h:158
vtkm::Maximum
Binary Predicate that takes two arguments argument x, and y and returns the x if x > y otherwise retu...
Definition: BinaryOperators.h:85
vtkm::cont::openmp::UniqueHelper::PredicateT
internal::WrappedBinaryOperator< bool, RawPredicateT > PredicateT
Definition: FunctorsOpenMP.h:638
vtkm::cont::openmp::CopyIfHelper::NumValues
vtkm::Id NumValues
Definition: FunctorsOpenMP.h:187
vtkm::Id
vtkm::Int32 Id
Represents an ID (index into arrays).
Definition: Types.h:191
vtkm::LogicalAnd
Binary Predicate that takes two arguments argument x, and y and returns True if and only if x and y a...
Definition: BinaryPredicates.h:71
vtkm::cont::openmp::UniqueHelper::Node::OutputRange
vtkm::Id2 OutputRange
Definition: FunctorsOpenMP.h:643
vtkm::cont::openmp::UniqueHelper::AllocNode
Node * AllocNode()
Definition: FunctorsOpenMP.h:714
DeviceAdapterTagOpenMP.h
vtkm::Add
Definition: Types.h:222
vtkm::cont::openmp::UniqueHelper::Prepare
void Prepare()
Definition: FunctorsOpenMP.h:690
vtkm::cont::openmp::UniqueHelper::LeafSize
vtkm::Id LeafSize
Definition: FunctorsOpenMP.h:655
vtkm::cont::Token
A token to hold the scope of an ArrayHandle or other object.
Definition: Token.h:35
vtkm::cont::openmp::ReduceHelper::IsIntegral
Definition: FunctorsOpenMP.h:310
vtkm::Sum
Binary Predicate that takes two arguments argument x, and y and returns sum (addition) of the two val...
Definition: BinaryOperators.h:33
vtkm::cont::openmp::CopyIfHelper::EndIds
std::vector< vtkm::Id > EndIds
Definition: FunctorsOpenMP.h:193
vtkm::cont::openmp::ReduceByKeyHelper
void ReduceByKeyHelper(KeysInArray keysInArray, ValuesInArray valuesInArray, KeysOutArray keysOutArray, ValuesOutArray valuesOutArray, BinaryFunctor functor)
Definition: FunctorsOpenMP.h:514
vtkm::cont::openmp::ReduceHelper
Definition: FunctorsOpenMP.h:306
vtkm::Multiply
Definition: Types.h:262
vtkm::Int8
int8_t Int8
Definition: Types.h:156
vtkm::cont::openmp::UniqueHelper::UniqueHelper
UniqueHelper(IterT iter, vtkm::Id numValues, RawPredicateT pred)
Definition: FunctorsOpenMP.h:659
vtkm::cont::openmp::UniqueHelper::Uniquify
void Uniquify(Node *node)
Definition: FunctorsOpenMP.h:747
vtkm::BitwiseXor
Binary Predicate that takes two arguments argument x, and y and returns the bitwise operation x^y
Definition: BinaryOperators.h:191
vtkm::LogicalOr
Binary Predicate that takes two arguments argument x, and y and returns True if and only if x or y is...
Definition: BinaryPredicates.h:83
vtkm::cont::openmp::CopyIfHelper::Initialize
void Initialize(vtkm::Id numValues, vtkm::Id valueSize)
Definition: FunctorsOpenMP.h:197
vtkm::cont::openmp::UniqueHelper::NextNode
size_t NextNode
Definition: FunctorsOpenMP.h:657
vtkm::cont::RuntimeDeviceInformation::GetRuntimeConfiguration
VTKM_CONT vtkm::cont::internal::RuntimeDeviceConfigurationBase & GetRuntimeConfiguration(DeviceAdapterId id, const vtkm::cont::internal::RuntimeDeviceConfigurationOptions &configOptions, int &argc, char *argv[]=nullptr) const
Returns a reference to a RuntimeDeviceConfiguration that will work with the given device.
vtkm::UInt8
uint8_t UInt8
Definition: Types.h:157
vtkm::CopyFlag::On
@ On
vtkm::cont::openmp::UniqueHelper::Nodes
std::vector< Node > Nodes
Definition: FunctorsOpenMP.h:656
ErrorExecution.h
vtkm::cont::openmp::ReduceHelper::Execute
static ReturnType Execute(PortalT portal, ReturnType init, Functor functorIn, std::false_type)
Definition: FunctorsOpenMP.h:327
VTKM_OPENMP_DIRECTIVE
#define VTKM_OPENMP_DIRECTIVE(directive)
Definition: FunctorsOpenMP.h:37
BinaryOperators.h
vtkm::cont::openmp::UniqueHelper::ComputeMidpoint
vtkm::Id ComputeMidpoint(const vtkm::Id2 &range)
Definition: FunctorsOpenMP.h:739
VTKM_OPENMP_SHARED_CONST
#define VTKM_OPENMP_SHARED_CONST(...)
Definition: FunctorsOpenMP.h:49
vtkm::Vec
A short fixed-length array.
Definition: Types.h:767
vtkm::cont::openmp::UniqueHelper
Definition: FunctorsOpenMP.h:635
vtkm::cont::openmp::OpenMPReductionSupported
std::false_type OpenMPReductionSupported
Definition: FunctorsOpenMP.h:303
vtkm::BitwiseAnd
Binary Predicate that takes two arguments argument x, and y and returns the bitwise operation x&y
Definition: BinaryOperators.h:145
vtkm::cont::openmp::CopyIfHelper::Reduce
vtkm::Id Reduce(OutIterT data)
Definition: FunctorsOpenMP.h:236
vtkm::UInt32
uint32_t UInt32
Definition: Types.h:161
vtkm::cont::openmp::UniqueHelper::ValueType
typename std::iterator_traits< IterT >::value_type ValueType
Definition: FunctorsOpenMP.h:637
vtkm::cont::openmp::UniqueHelper::Execute
vtkm::Id Execute()
Definition: FunctorsOpenMP.h:668
vtkm::cont::openmp::UniqueHelper::Node::Padding
unsigned char Padding[PaddingSize]
Definition: FunctorsOpenMP.h:649
vtkm::Float32
float Float32
Definition: Types.h:154
BinaryPredicates.h
vtkm::Int32
int32_t Int32
Definition: Types.h:160
vtkm::Float64
double Float64
Definition: Types.h:155
vtkm::cont::openmp::ReduceHelper::DoParallelReduction
static ReturnType DoParallelReduction(IterType data, const vtkm::Id &numVals, const int &tid, const int &numThreads, FunctorType f, std::false_type)
Definition: FunctorsOpenMP.h:390
vtkm::cont::openmp::ReduceHelper::DoParallelReduction
static ReturnType DoParallelReduction(IterType data, const vtkm::Id &numVals, const int &tid, const int &numThreads, FunctorType f, std::true_type)
Definition: FunctorsOpenMP.h:435
vtkm::cont::openmp::UniqueHelper::Node::NumCacheLines
static constexpr size_t NumCacheLines
Definition: FunctorsOpenMP.h:647
RuntimeDeviceInformation.h
vtkm::cont::openmp::UniqueHelper::Data
IterT Data
Definition: FunctorsOpenMP.h:652
vtkm::cont::openmp::CopyIfHelper
Definition: FunctorsOpenMP.h:185
vtkm::UInt16
uint16_t UInt16
Definition: Types.h:159
vtkm::cont::RuntimeDeviceInformation
A class that can be used to determine if a given device adapter is supported on the current machine a...
Definition: RuntimeDeviceInformation.h:29
vtkm::cont::openmp::CopyIfHelper::NumThreads
vtkm::Id NumThreads
Definition: FunctorsOpenMP.h:188
vtkm::cont::openmp::CleanArrayRef
typename CleanArrayRefImpl< T >::type CleanArrayRef
Definition: FunctorsOpenMP.h:107
vtkm::Pair
A vtkm::Pair is essentially the same as an STL pair object except that the methods (constructors and ...
Definition: Pair.h:29
vtkm::cont::openmp::UniqueHelper::Node::PaddingSize
static constexpr size_t PaddingSize
Definition: FunctorsOpenMP.h:648
vtkm::cont::openmp::CleanArrayRefImpl::type
T type
Definition: FunctorsOpenMP.h:97
vtkm::cont::openmp::UniqueHelper::Predicate
PredicateT Predicate
Definition: FunctorsOpenMP.h:654
vtkm::Minimum
Binary Predicate that takes two arguments argument x, and y and returns the x if x < y otherwise retu...
Definition: BinaryOperators.h:99
vtkm::Id2
vtkm::Vec< vtkm::Id, 2 > Id2
Id2 corresponds to a 2-dimensional index.
Definition: Types.h:885
vtkm::cont::openmp::UniqueHelper::Node::InputRange
vtkm::Id2 InputRange
Definition: FunctorsOpenMP.h:642
vtkm::cont::openmp::UniqueHelper::NumValues
vtkm::Id NumValues
Definition: FunctorsOpenMP.h:653