CUDA Kernel Primitive API - 文章 - 开发者社区

Kernel Primitive API

Kernel Primitive API 对算子 Kernel 实现中的底层代码进行了抽象与功能封装，提供高性能的 Block 级 IO 运算和 Compute 运算。使用 Kernel Primitive API 进行 Kernel 开发可以更加专注计算逻辑的实现，在保证性能的同时大幅减少代码量，同时实现了算子计算与硬件解耦。

本部分为高级开发人员提供了用于 Kernel 开发的 CUDA Kernel Primitive API，该类 API 能够帮助开发者在提升开发效率的同时收获较好的性能。Kernel Primitive API 主要包括 IO API、Compute API 以及 OpFunc。

IO API 能够高效的完成全局内存与寄存器间的数据读写操作。

CUDA Pro Tip: Increase Performance with Vectorized Memory Access

Using CUDA Warp-Level Primitives

Compute API 为通用计算函数，如 ElementwiseBinary、ElementwiseUnary 等；
OpFunc 用于定义 Compute API 中的计算规则，例如实现 Add 操作则需要定义 AddFunctor 用于 ElementwiseBinary 调用，开发者可以直接使用默认的 OpFunc 也可以根据需要进行自定义，具体的实现规则将在 OpFunc 小节中进行详细介绍。

当前 API 均是 Block 级别的多线程 API，开发者可以直接传入当前 Block 的数据指针以及操作类型完成相应的计算，目前仅支持全局数据指针和寄存器指针。

Example

ElementwiseAdd

完成相同 Shape 的两数相加，输入为 InT 类型，输出为 OutT 类型，根据 OpFunc 完成对应的计算。

OpFunc 定义

OpFunc: 用于定义当前数据的计算规则，AddFunctor 定义如下：

  
template <typename InT>  
struct AddFunctor {  
  HOSTDEVICE InT operator()(const InT &a, const InT &b) const { return (a + b); }  
};

Kernel 实现说明

每个线程连续读取 VecSize 个元素 ，根据剩余元素 num 与 VecSize * blockDim.x 的关系，将数据处理分为 2 部分:

第一部分，当 VecSize * blockDim.x > num 表示当前数据处理需要进行边界处理，将 IsBoundary 设置为 true，避免访存越界;
第二部分，不需要进行边界处理，设置 IsBoundary = false。根据当前 block 的数据指针，将数据从全局内存中读取到寄存器中，完成加法操作后，将数据写入全局内存中。

注意此处使用 Init 函数对寄存器 arg0，arg1 进行初始化，避免当 arg0 或者 arg1 作为分母时出现为 0 的情况。

根据 OpFunc 完成两数求和操作，当需要进行两数相乘，可以直接修改对应的 Functor 即可，可以直接复用 Kernel 代码，提升开发效率。

数据处理过程如下：

picture.image

Kernel 代码

  
// Paddle/paddle/phi/kernels/primitive/kernel\_primitives.h  
  
#include "kernel\_primitives/kernel\_primitives.h"  
template<int VecSize, typename InT, typename OutT, typename OpFunc, bool IsBoundary>  
\_\_device\_\_ void ElementwiseAddImpl(InT *in0, InT * in1, OutT * out, OpFunc func, int num) {  
  
  InT arg0[VecSize];  
  InT arg1[VecSize];  
  OutT result[VecSize];  
  
  // init arg0 and arg1  
  Init<InT, VecSize>(arg0, static\_cast<OutT>(1.0f));  
  Init<InT, VecSize>(arg1, static\_cast<OutT>(1.0f));  
  
  // read data from global memory  
  ReadData<InT, InT, VecSize, 1, 1, IsBoundary>(arg0, in0, num);  
  ReadData<InT, InT, VecSize, 1, 1, IsBoundary>(arg1, in1, num);  
  
  // compute resut[i] = args[i] + arg1[i]  
  ElementwiseBinary<InT, OutT, VecSize, 1, 1, OpFunc>(result, arg0, arg1, func);  
  
  // write data  
  WriteData<OutT, VecSize, 1, 1, IsBoundary>(out, result, num);  
}  
  
template<int VecSize, typename InT, typename OutT>  
\_\_global\_\_ void ElementwiseAdd(InT *in0, InT *in1, OutT *out, int size) {  
  
  // get the data offset of this Block  
  int data_offset = VecSize * blockIdx.x * blockDim.x;  
  
  // get the stride offset the block  
  int stride = gridDim.x * blockDim.x * VecSize;  
  
  for (int offset = data_offset; offset < size; offset += stride) {  
    if (offset + blockDim.x * VecSize < size) {  // set IsBoundary = false  
  
      ElementwiseAddImpl<VecSize, InT, OutT, AddFunctor<InT, OutT>, false>(in0 + offset, in1 + offset, out + offset, AddFunctor<InT, OutT>(), size - offset);  
  
    } else {  // left num is smaller than blockDim.x * VecSize, IsBoundary must be true  
  
      ElementwiseAddImpl<VecSize, InT, OutT, AddFunctor<InT, OutT>, true>(in0 + offset, in1 + offset, out + offset, AddFunctor<InT, OutT>(), size - offset);  
  
    }  
  }  
}

Reduce

根据 ReduceOp 中定义的计算规则对最高维度进行规约操作，例如输入为 x[N, H, W, C], axis 取值为 0, 规约后为 out[1, H, W, C]，此处以 ReduceSum 为例进行介绍。

ReduceOp 定义

  
// transform data type  
template <typename Tx, typename Ty = Tx>  
struct IdentityFunctor {  
  HOSTDEVICE explicit inline IdentityFunctor(int n) {}  
  
  HOSTDEVICE inline Ty operator()(const Tx& x) const {  
    return static\_cast<Ty>(x);  
  }  
};  
  
template <typename Tx, typename Ty = Tx>  
struct AddFunctor {  
  inline Ty initial() { return static\_cast<Ty>(0.0f); }  
  
  \_\_device\_\_ \_\_forceinline\_\_ Ty operator()(const Ty &a, const Ty &b) const {  
     return b + a;  
  }  
};

kernel 实现说明

对最高维进行规约操作，将不需要进行规约的维度进行合并，根据 NX 和 blockDim.x 对 H * W * C 进行 block 划分。

对于 blockIdx_1，数据个数小于 blockDim.x * NX，则设置 IsBoundary = true，避免访存越界。

将数据从全局内存中读取到寄存器中，每个线程读取 4 个元素，线程间数据没有依赖，进行线程内规约操作得到最终结果。

将数据从寄存器写入全局内存中。 ReduceSum 数据处理过程如下：

picture.image

kernel 代码

  
template <typename Tx, typename Ty, typename MPType, typename ReduceOp, typename TransformOp, bool IsBoundary = false>  
__device__ void HigherDimImpl(const Tx* x, Ty* y, ReduceOp reducer,  
                             TransformOp transform, MPType init,  
                             int reduce_num, int left_num,  
                             int block_num) {  
  
  const int NY = 2;  
  int idx = blockIdx.x * blockDim.x;  
  int idy = blockIdx.y * block_num; // block\_offset of rows  
  Tx reduce_input[NY];  
  MPType reduce_compute[NY];  
  MPType result = init;  
  
  int block_offset = idy * left_num + idx + blockIdx.z * reduce_num * left_num; // the offset of this block  
  int store_offset = blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num + idx;  
  
  const Tx* input = x + block_offset;  
  
  // how many columns left  
  int num = left_num - idx;  
  
  // how many rows have to be reduced  
  int loop = reduce_num - idy;  
  loop = loop > block_num ? block_size : loop;  
  
  for (int loop_index = 0; loop_index < loop; loop_index += NY) {  
    kps::ReadData<Tx, Tx, 1, NY, 1, IsBoundary>(&reduce_input[0], input + loop_index * left_num, num, NY, 1, left_num);  
    kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_num, 1, 1, TransformOp>(&reduce_compute[0], &reduce_input[0], transform);  
    kps::Reduce<MPType, NY, 1, 1, ReduceOp, kps::details::ReduceMode::kLocalMode>( &result, &reduce_compute[0], reducer, false);  
  }  
  
  Ty temp_data = static\_cast<Ty>(result);  
  kps::WriteData<Ty, 1, 1, 1, IsBoundary>(y + store_offset, &temp_data, num);  
}  
  
template <typename Tx, typename Ty, typename MPType, typename ReduceOp, typename TransformOp>  
\_\_global\_\_ void ReduceHigherDimKernel(const Tx* x, Ty* y, ReduceOp reducer,  
                                      TransformOp transform, MPType init,  
                                      int reduce\_num, int left\_num,  
                                      int blocking\_num) {  
  
  // get the remaining data of this kernel  
  int num = left_num - blockIdx.x * blockDim.x;  
  
  if (num >= blockDim.x) {  
  
    // The remaining data is larger than blockdim.x  
    HigherDimImpl<Tx, Ty, MPType, AddFunctor<Tx, Ty>, IdentityFunctor<Tx, Ty>, false>(  
        x, y, AddFunctor<Tx, Ty>(), IdentityFunctor<Tx, Ty>(), init, reduce_num, left_num, blocking_num);  
  
  } else {  
  
    // The remaining data is smaller than blockdim.x, IsBounary must be true  
    HigherDimImpl<Tx, Ty, MPType, AddFunctor<Tx, Ty>, IdentityFunctor<Tx, Ty>, true>(  
        x, y, AddFunctor<Tx, Ty>(), IdentityFunctor<Tx, Ty>(), init, reduce_num, left_num, blocking_num);  
  
  }  
}

Macro

  
#define KPStream gpuStream\_t  
#define KPDevice phi::GPUContext  
#define \_ptr\_  
#define \_\_simd\_\_  
  
#define THREAD\_ID\_X threadIdx.x  
#define THREAD\_ID\_Y threadIdx.y  
#define THREAD\_ID\_Z threadIdx.z  
  
#define BLOCK\_NUM\_X blockDim.x  
#define BLOCK\_NUM\_Y blockDim.y  
#define BLOCK\_NUM\_Z blockDim.z  
  
#define BLOCK\_ID\_X blockIdx.x  
#define BLOCK\_ID\_Y blockIdx.y  
#define BLOCK\_ID\_Z blockIdx.z  
  
#define GRID\_NUM\_X gridDim.x  
#define GRID\_NUM\_Y gridDim.y  
#define GRID\_NUM\_Z gridDim.z  
  
#define VecSizeL 4  
#define VecSizeM 2  
#define VecSizeS 1  
  
namespace phi {  
namespace kps {  
  
using IndexType = int64\_t;  
  
struct DimConfig {  
  int split_num_x;  
  int split_num_y;  
  int split_num_z;  
  int deal_size_x;  
  int deal_size_y;  
  int deal_size_z;  
  int rem_x;  
  int rem_y;  
  int rem_z;  
  
  HOSTDEVICE explicit inline DimConfig(int split\_x,  
                                       int split\_y,  
                                       int split\_z,  
                                       int size\_x,  
                                       int size\_y,  
                                       int size\_z) {  
    split_num_x = split_x;  
    split_num_y = split_y;  
    split_num_z = split_z;  
    deal_size_x = size_x;  
    deal_size_y = size_y;  
    deal_size_z = size_z;  
  }  
  
  HOSTDEVICE void SetRem(int rem\_nx, int rem\_ny, int rem\_nz) {  
    rem_x = rem_nx;  
    rem_y = rem_ny;  
    rem_z = rem_nz;  
  }  
};  
}  // namespace kps  
}  // namespace phi

IO API

details

VectorType

  
template <typename T, int VecSize>  
struct alignas(sizeof(T) * VecSize) VectorType {  
  T val[VecSize];  
};

The easiest way to use vectorized loads is to use the vector data types defined in the CUDA C/C++ standard headers, such as int2, int4, or float2. You can easily use these types via type casting in C/C++. For example in C++ you can recast the int pointer d_in to an int2 pointer using reinterpret_cast<int2*>(d_in). In C99 you can do the same thing using the casting operator: (int2*(d_in)).

Dereferencing those pointers will cause the compiler to generate the vectorized instructions . However, there is one important caveat: these instructions require aligned data . Device-allocated memory is automatically aligned to a multiple of the size of the data type, but if you offset the pointer the offset must also be aligned . For example reinterpret_cast<int2*>(d_in+1) is invalid because d_in+1 is not aligned to a multiple of sizeof(int2).

You can safely offset arrays if you use an “aligned” offset , as in reinterpret_cast<int2*>(d_in+2). You can also generate vectorized loads using structures as long as the structure is a power of two bytes in size .

FastDivMod

  
/**  
 * Fast division : Replace division in CUDA with multiplication to improve  
 * kernel performance.  
 * 1. Complete the division calculation on the CPU, and record the calculation  
 * results by using the divider and shift\_val.  
 * 2. Set the divisor on the GPU through Div() to complete the calculation.  
 */  
struct FastDivMod {  
  // 1st value represents the result of input number divides by recorded divisor  
  // 2nd value represents the result of input number modulo by recorded divisor  
  using DivModT = VectorType<uint32\_t, 2>;  
  
  FastDivMod() {}  
  HOSTDEVICE FastDivMod(uint32\_t d) : divisor(d) {  
    static\_assert(sizeof(unsigned int) == 4,  
                  "Only Support 32-bit unsigned int.");  
  
    for (shift_val = 0; shift_val < INT_BITS; ++shift_val) {  
      auto shift_limit = 1 << shift_val;  
      if (shift_limit >= divisor) break;  
    }  
    uint64\_t long_one = 1;  
    uint64\_t temp_div =  
        ((long_one << INT_BITS) * ((long_one << shift_val) - divisor)) /  
            divisor +  
        1;  
    multiplier = temp_div;  
  }  
  
  \_\_device\_\_ \_\_forceinline\_\_ uint32\_t Div(uint32\_t n) const {  
    uint32\_t t = __umulhi(n, multiplier);  
    return (t + n) >> shift_val;  
  }  
  
  \_\_device\_\_ \_\_forceinline\_\_ DivModT Divmod(uint32\_t n) const {  
    uint32\_t q = Div(n);  
    DivModT result = {q, n - q * divisor};  
    return result;  
  }  
  
  int32\_t divisor;  
  int32\_t shift_val;  
  uint32\_t multiplier;  
};

BroadcastConfig

  
/**  
 * Configuration of broadcast. Calculate the input data index according to the  
 * index of the output data. if input or output shape is [dim0, dim1] then dims  
 * must be [dim1, dim0].  
 */  
struct BroadcastConfig {  
  FastDivMod divmoders[phi::DDim::kMaxRank];  
  uint32\_t strides[phi::DDim::kMaxRank];  
  int rank{0};  
  
  // BroadcastConfig should be defined on host used on device.  
  BroadcastConfig() {}  
  
  BroadcastConfig(const std::vector<int64\_t>& out_dims,  
                  const std::vector<int64\_t>& in_dims,  
                  int dim_size) {  
    for (int i = 0; i < dim_size; ++i) {  
      divmoders[i] = FastDivMod(out_dims[i]);  
    }  
  
    for (int i = 0; i < dim_size; ++i) {  
      strides[i] = in_dims[i] == 1 ? 0 : 1;  
      strides[i] = (i != 0 && strides[i] != 0)  
                       ? std::accumulate(in_dims.begin(),  
                                         in_dims.begin() + i,  
                                         1,  
                                         std::multiplies<int64\_t>())  
                       : strides[i];  
    }  
    rank = dim_size;  
  }  
};

WriteData

  
template <typename T>  
\_\_device\_\_ \_\_forceinline\_\_ void WriteData(T* dst,  
                                          T* \_\_restrict\_\_ src,  
                                          int num) {  
  for (int i = 0; i < num; i++) {  
    dst[i] = src[i];  
  }  
}

ReadData

  
template <typename T>  
\_\_device\_\_ \_\_forceinline\_\_ void ReadData(T* dst,  
                                         const T* \_\_restrict\_\_ src,  
                                         int num) {  
  for (int i = 0; i < num; i++) {  
    dst[i] = src[i];  
  }  
}

Init

将寄存器 dst 中的所有元素初始化为 init_data。

  
// Paddle/paddle/phi/kernels/primitive/datamover\_primitives.h  
  
/**  
 * @brief Initialize register with init\_data.  
 *  
 * @template paraments  
 * T: Data type of register.  
 * NX: Number of data to initialize.  
 *  
 * @param：  
 * dst: The register pointer of the thread, the size is NX.  
 * init\_data: Initial value.  
 */  
template <typename T, int NX>  
\_\_device\_\_ \_\_forceinline\_\_ void Init(T* dst, T init\_data) {  
#pragma unroll  
  for (int i = 0; i < NX; i++) {  
    dst[i] = init_data;  
  }  
}  
  
template <typename T, int NX>  
\_\_device\_\_ \_\_forceinline\_\_ void Init(T* dst, T init\_data, int read\_lens) {  
#pragma unroll  
  for (int i = 0; i < NX; i++) {  
    dst[i] = init_data;  
  }  
}  
  
/**  
 * The difference from the above function is that  
 * it supports different data types of inputs.  
 */  
template <typename T, typename ArgsT, int Index, int NX>  
\_\_device\_\_ \_\_forceinline\_\_ void Init(ArgsT* dst, T init\_data, int read\_lens) {  
#pragma unroll  
  for (int i = 0; i < NX; i++) {  
    std::get<Index>(dst[i]) = init_data;  
  }  
}

InitWithDataIndex

  
// Paddle/paddle/phi/kernels/primitive/datamover\_primitives.h  
  
/**  
 * @brief Initialize register with data index.  
 *  
 * @template paraments  
 * T: Data type of register.  
 * NX: Number of data to initialize.  
 * NY: Number of data to initialize, NY only can be 1.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 *  
 * @param：  
 * dst: The register pointer of the thread, the size is NX.  
 * init\_data: The register pointer of init data, the size is NX.  
 */  
template <typename T, int NX, int NY>  
\_\_device\_\_ \_\_forceinline\_\_ void InitWithDataIndex(T* dst, int block\_offset) {  
  int thread_offset = block_offset + threadIdx.x * NX;  
#pragma unroll  
  for (int nx = 0; nx < NX; ++nx) {  
    dst[nx] = static\_cast<T>(thread_offset + nx);  
  }  
}

ReadData 2D

  
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, bool IsBoundary = false>  
__device__ void ReadData(Ty* dst, const Tx* __restrict__ src, int size_nx, int size_ny, int stride_nx, int stride_ny);

将 Tx 类型的 2D 数据从全局内存中读取到寄存器，并按照 Ty 类型存储到寄存器 dst 中。

最低维每读取 1 个元素需要偏移 stride_nx 个元素，最高维每读取 1 个元素需要偏移 stride_ny 个元素，直到加载 NX * NY 个数据到寄存器 dst 中。

当 IsBoundary = true 需要保证当前最高维偏移个数不超过 size_ny，列偏移个数不超过 size_nx。

数据处理过程如下：

picture.image

  
/**  
 * @brief Read 2D data from global memory to register according to Tx type, and  
 * store it as Ty type into register.  
 *  
 * @template paraments  
 * Tx: The type of data stored in the global memory.  
 * Ty: The type of data that needs to be stored in registers.  
 * NX: The number of data columns loaded by each thread.  
 * NY: The number of data rows loaded by each thread.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds  
 * judgment. When the number of data processed by the block is less than  
 * NX x NY x blockDim, boundary judgment is required to avoid memory access  
 * crossing the boundary.  
 *  
 * @param：  
 * dst: The register pointer of the thread, the size is NX * NY.  
 * src: The data pointer of the current block.  
 * size\_nx: The maximum offset of the current block is size\_nx elements in the  
 * lowest dimension. The parameters are only calculated when isboundary = true.  
 * size\_ny: The maximum offset of the current block is size\_ny elements in the  
 * first dimension. The parameters are only calculated when isboundary = true.  
 * stride\_nx: Each read one element stride stride\_nx elements in the last dim.  
 * stride\_ny: Each read one element stride stride\_ny elements in the first dim.  
 */  
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>  
__device__ __forceinline__ void ReadData(Ty* dst,  
                                         const Tx* __restrict__ src,  
                                         int size_nx,  
                                         int size_ny,  
                                         int stride_nx,  
                                         int stride_ny) {  
  int thread_offset = threadIdx.x;  
  int left_size_nx = size_nx - thread_offset;  
  
  // Each branch is added for better performance  
  if (NX == 1 && NY == 1) {  // for NX == 1 and NY == 1  
    if (IsBoundary) {  
      if (left_size_nx > 0) {  
        dst[0] = static\_cast<Ty>(src[thread_offset]);  
      }  
    } else {  
      dst[0] = static\_cast<Ty>(src[thread_offset]);  
    }  
  } else if (NX == 1) {  // for NX == 1 and NY != 1  
#pragma unroll  
    for (int idy = 0; idy < NY; ++idy) {  
      if (IsBoundary) {  
        if (idy * stride_ny >= size_ny) {  
          break;  
        }  
      }  
      dst[idy] = static\_cast<Ty>(src[thread_offset + idy * stride_ny]);  
    }  
  } else if (NY == 1) {  // for NY == 1 and NX != 1  
#pragma unroll  
    for (int idx = 0; idx < NX; ++idx) {  
      if (IsBoundary) {  
        if (idx * stride_nx >= left_size_nx) {  
          break;  
        }  
      }  
      dst[idx] = static\_cast<Ty>(src[thread_offset + idx * stride_nx]);  
    }  
  } else {  // for NX != 1 and NY != 1  
#pragma unroll  
    for (int idx = 0; idx < NX; ++idx) {  
      if (IsBoundary) {  
        if (idx * stride_nx >= left_size_nx) {  
          break;  
        }  
      }  
#pragma unroll  
      for (int idy = 0; idy < NY; ++idy) {  
        if (IsBoundary) {  
          if (idy * stride_ny >= size_ny) {  
            break;  
          }  
        }  
        dst[idy * NX + idx] = static\_cast<Ty>(  
            src[thread_offset + idx * stride_nx + idy * stride_ny]);  
      }  
    }  
  }  
}

ReadData 1D

  
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>  
__device__ void ReadData(T* dst, const T* src, int num);

将 T 类型的 1D 数据从全局内存 src 中读取到寄存器 dst 中。

每次连续读取 NX 个数据，当前仅支持 NY = 1，直到加载 NX 个数据到寄存器 dst 中。

当 IsBoundary = true 需要保证 Block 读取的总数据个数不超过 num，以避免访存越界。

当 (NX % 4 = 0 或 NX % 2 = 0) 且 IsBoundary = false 时，向量化加载会有更高的访存效率。

数据处理过程如下：

picture.image

  
/**  
 * @brief Read 1D data from global memory to register. When IsBoundary = false  
 * and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to  
 * improve memory access efficiency.  
 *  
 * @template paraments  
 * T: The type of data.  
 * NX: Each thread load NX data from global memory continuously.  
 * NY: Each thread need to load NY rows, only NY = 1 was supported.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * IsBoundary: Whether to make an out-of-bounds judgment on access to memory.  
 * When the number of data processed by this block is less than  
 * NX x NY x blockDim.x, boundary judgment is required to avoid memory access  
 * crossing the boundary.  
 *  
 * @param：  
 * dst: The register pointer of the thread, the size is NX * NY.  
 * src: The data pointer of the current block.  
 * size: The current block needs to load size data continuously.  
 */  
template <typename T, int NX, int NY, bool IsBoundary = false>  
__device__ __forceinline__ void ReadData(T* dst,  
                                         const T* __restrict__ src,  
                                         int num) {  
  if (IsBoundary) {  // blockDim.x * NX < num  
    int thread_offset = threadIdx.x * NX;  
#pragma unroll  
    for (int idx = 0; idx < NX; ++idx) {  
      if (idx + thread_offset < num) {  
        dst[idx] = src[thread_offset + idx];  
      }  
    }  
  } else {  // blockDim,x * NX > num  
    constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1;  
    constexpr int kVectorsPerThread = NX / kVectorSize;  
    int thread_offset = threadIdx.x * kVectorsPerThread;  
  
    using VecType = details::VectorType<T, kVectorSize>;  
    // vec\_input + 1 步长是 sizeof(VecType) = sizeof(T) * kVectorSize  
    const VecType* vec_input = reinterpret\_cast<const VecType*>(src);  
    VecType vec_temp[kVectorsPerThread];  
  
#pragma unroll  
    for (int i = 0; i < kVectorsPerThread; ++i) {  
      vec_temp[i] = vec_input[thread_offset + i];  
#pragma unroll  
      // 这里的疑问是i=0的时候，只有前 kVectorSize 个数有效，为什么会操作 NX 呢？  
      // 当 i = kVectorsPerThread -1 的时候，NX个数都有效了。所以，随着i增加，dst中的所有数都有效了。但是dst被重复赋值了 kVectorsPerThread，这是有什么目的才这么写的吗？  
      for (int idx = 0; idx < NX; ++idx) {  
        dst[idx] = *(reinterpret\_cast<T*>(vec_temp) + idx);  
      }  
    }  
  }  
}

  
/**  
 * @brief Read 1D data from global memory to register. The difference  
 * from the above function is that it supports different data types of inputs.  
 *  
 * @template paraments  
 * T: The type of data.  
 * NX: Each thread load NX data from global memory continuously.  
 * NY: Each thread need to load NY rows, only NY = 1 was supported.  
 * ArgsT: The Type if dst, ArgsT can be std::tuple<T> or std::tuple<Args>  
 * Index: The index of data stored in dst.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * IsBoundary: Whether to make an out-of-bounds judgment on access to memory.  
 * When the number of data processed by this block is less than  
 * NX x NY x blockDim.x, boundary judgment is required to avoid memory access  
 * crossing the boundary.  
 *  
 * @param：  
 * dst: The register pointer of the thread, the size is NX * NY.  
 * src: The data pointer of the current block.  
 * size: The current block needs to load size data continuously.  
 */  
template <typename T,  
          int NX,  
          int NY,  
          typename ArgsT,  
          int Index,  
          bool IsBoundary = false>  
__device__ __forceinline__ void ReadData(ArgsT* dst,  
                                         const T* __restrict__ src,  
                                         int num,  
                                         int read_lens) {  
  if (IsBoundary) {  // blockDim.x * NX < num  
    int thread_offset = threadIdx.x * NX;  
#pragma unroll  
    for (int idx = 0; idx < NX; ++idx) {  
      if (idx + thread_offset < num) {  
        std::get<Index>(dst[idx]) = src[thread_offset + idx];  
      }  
    }  
  } else {  // blockDim,x * NX = num  
    constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1;  
    constexpr int kVectorsPerThread = NX / kVectorSize;  
    int thread_offset = threadIdx.x * kVectorsPerThread;  
  
    using VecType = details::VectorType<T, kVectorSize>;  
    const VecType* vec_input = reinterpret\_cast<const VecType*>(src);  
    VecType vec_temp[kVectorsPerThread];  
  
#pragma unroll  
    for (int i = 0; i < kVectorsPerThread; ++i) {  
      vec_temp[i] = vec_input[thread_offset + i];  
#pragma unroll  
      for (int idx = 0; idx < NX; ++idx) {  
        std::get<Index>(dst[idx]) = *(reinterpret\_cast<T*>(vec_temp) + idx);  
      }  
    }  
  }  
}

ReadDataBc 2D

  
template <typename T, int NX, int NY, bool IsBoundary = false>  
__device__ void ReadDataBc(T* dst, const T* src,  
                           uint32\_t block_offset,  
                           details::BroadcastConfig<Rank> config,  
                           int total_num_output,  
                           int stride_nx,  
                           int stride_ny);

将需要进行 brodcast 的 2D 数据按照 T 类型从全局内存 src 中读取到寄存器 dst 中，其中 src 为原始输入数据指针，根据 config 计算当前输出数据对应的输入数据坐标，并将坐标对应的数据读取到寄存器中。

数据处理过程如下：

picture.image

  
/**  
 * @brief Read 2D data from global memory to registers with broadcast form.  
 *  
 * @template paraments  
 * T: The type of data stored in the global memory.  
 * NX: The number of data columns loaded by each thread.  
 * NY: The number of data rows loaded by each thread.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.  
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds  
 * judgment. When the number of data processed by the block is less than  
 * NX x NY x blockDim.x, boundary judgment is required to avoid memory access  
 * crossing the boundary.  
 *  
 * @param：  
 * dst: The register pointer of the thread, the size is NX * NY.  
 * src: The original input data pointer of this kernel.  
 * block\_offset: The data offset of this block, blockDim.x * blockIdx.x * NX.  
 * config: Calculation configuration of broadcast. It is used to calculate the  
 * coordinate mapping relationship between output data and input data.  
 * total\_num\_output: Total number of original output.  
 * stride\_nx: Each read one element stride stride\_nx elements in the last dim.  
 * stride\_ny: Each read one element stride stride\_ny elements in the first dim.  
 */  
template <typename T, int NX, int NY, bool IsBoundary = false>  
__device__ __forceinline__ void ReadDataBc(  
    T* dst,  
    const T* __restrict__ src,  
    uint32\_t block_offset,  
    const details::BroadcastConfig& config,  
    int total_num_output,  
    int stride_nx,  
    int stride_ny) {  
  uint32\_t thread_offset = block_offset + threadIdx.x;  
  uint32\_t index_src = 0;  
  
#pragma unroll  
  for (int ny = 0; ny < NY; ++ny) {  
#pragma unroll  
    for (uint32\_t nx = 0; nx < NX; ++nx) {  
      uint32\_t index_output = thread_offset + ny * stride_ny + nx * stride_nx;  
      index_src = 0;  
      if (IsBoundary) {  
        if (index_output >= total_num_output) {  
          break;  
        }  
      }  
#pragma unroll  
      for (int i = 0; i < phi::DDim::kMaxRank; ++i) {  
        if (i >= config.rank) break;  
        auto fast_divmoder = config.divmoders[i].Divmod(index_output);  
        index_output = fast_divmoder.val[0];  
        index_src += fast_divmoder.val[1] * config.strides[i];  
      }  
      dst[nx + ny * NX] = src[index_src];  
    }  
  }  
}

ReadDataBc 1D

  
template <typename T, int NX, int NY, bool IsBoundary = false>  
__device__ __forceinline__ void ReadDataBc(  
    T* dst,  
    const T* __restrict__ src,  
    uint32\_t block_offset,  
    const details::BroadcastConfig& config,  
    int total_num_output,  
    int read_lens = NX)

将需要进行 brodcast 的 1D 数据按照 T 类型从全局内存 src 中读取到寄存器 dst 中，其中 src 为原始输入数据指针，根据 config 计算当前输出数据对应的输入数据坐标，并将坐标对应的数据读取到寄存器中。

数据处理过程如下：

picture.image

  
/**  
 * @brief Read 1D data from global memory to register with broadcast form.  
 *  
 * @template paraments  
 * T: The type of data stored in the global memory.  
 * NX: The number of data continuously loaded by each thread.  
 * NY: The number of data rows loaded by each thread, only NY = 1 was supported.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.  
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds  
 * judgment. When the number of data processed by the block is less than  
 * NX x NY x blockDim.x, boundary judgment is required to avoid memory access  
 * crossing the boundary.  
 *  
 * @param：  
 * dst: The register pointer of the thread, the size is NX * NY.  
 * src: The original input data pointer of kernel.  
 * block\_offset: The data offset of this block, blockDim.x * blockIdx.x * NX;  
 * config: Calculation configuration of broadcast. It is used to calculate the  
 * coordinate mapping relationship between output data and input data.  
 * total\_num\_output: Total number of original output.  
 */  
template <typename T, int NX, int NY, bool IsBoundary = false>  
__device__ __forceinline__ void ReadDataBc(  
    T* dst,  
    const T* __restrict__ src,  
    uint32\_t block_offset,  
    const details::BroadcastConfig& config,  
    int total_num_output,  
    int read_lens = NX) {  
  uint32\_t thread_offset = block_offset + threadIdx.x * NX;  
  uint32\_t index_src = 0;  
  
#pragma unroll  
  for (uint32\_t nx = 0; nx < NX; ++nx) {  
    uint32\_t index_output = thread_offset + nx;  
    index_src = 0;  
    if (IsBoundary) {  
      if (index_output >= total_num_output) {  
        break;  
      }  
    }  
#pragma unroll  
    for (int i = 0; i < phi::DDim::kMaxRank; ++i) {  
      if (i >= config.rank) break;  
      auto fast_divmoder = config.divmoders[i].Divmod(index_output);  
      index_output = fast_divmoder.val[0];  
      index_src += fast_divmoder.val[1] * config.strides[i];  
    }  
    dst[nx] = src[index_src];  
  }  
}

ReadDataReduce

  
template <typename Tx,  
          typename Ty,  
          int NX,  
          int NY,  
          int Rank,  
          typename IndexCal,  
          typename Functor,  
          bool IsBoundary = false>  
__device__ __forceinline__ void ReadDataReduce(Ty* dst,  
                                               const Tx* __restrict__ src,  
                                               int block_offset,  
                                               const IndexCal& index_cal,  
                                               int size_nx,  
                                               int size_ny,  
                                               int stride_nx,  
                                               int stride_ny,  
                                               Functor func,  
                                               bool reduce_last_dim)

根据 index_cal 计算当前输出数据对应的输入数据坐标，将坐标对应的数据从全局内存 src 中读取到寄存器 dst 中。根据是否需要进行规约操作将数据映射成 2D 数据，总是将最后一维所在的维度映射到线程变化最快的维度，保证数据访存效率最高。

数据处理过程如下：

picture.image

  
/**  
 * @brief Read 2D data from global memory to register with reduce form.  
 *  
 * @template paraments  
 * T: The type of data.  
 * NX: The number of data columns loaded by each thread.  
 * NY: The number of data rows loaded by each thread.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.  
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds  
 * judgment. When the number of data processed by the block is less than  
 * NX x NY x blockDim.x, boundary judgment is required to avoid memory access  
 * crossing the boundary.  
 *  
 * @param：  
 * dst: The register pointer of the thread, the size is NX * NY.  
 * src: The input data pointer of this block.  
 * block\_offset: The data offset of this block, blockDim.x * blockIdx.x * NX.  
 * index\_cal: Calculation configuration of Reduce. It is used to calculate the  
 * coordinate mapping relationship between output data and input data.  
 * size\_nx: The current block needs to load size\_nx columns of data, this  
 * parameter will participate in the calculation when isboundary = true.  
 * size\_ny: The current block needs to load size\_ny rows of data, this parameter  
 * will participate in the calculation when isboundary = true.  
 * will be used when IsBoundary = true.  
 * stride\_nx: Each read one element stride stride\_nx columns.  
 * stride\_ny: Each read one element stride stride\_ny raws.  
 * reduce\_last\_dim: Used to indicate whether the dimension of reduce contains  
 * the lowest dimension.  
 */  
template <typename Tx,  
          typename Ty,  
          int NX,  
          int NY,  
          int Rank,  
          typename IndexCal,  
          typename Functor,  
          bool IsBoundary = false>  
__device__ __forceinline__ void ReadDataReduce(Ty* dst,  
                                               const Tx* __restrict__ src,  
                                               int block_offset,  
                                               const IndexCal& index_cal,  
                                               int size_nx,  
                                               int size_ny,  
                                               int stride_nx,  
                                               int stride_ny,  
                                               Functor func,  
                                               bool reduce_last_dim) {  
  int thread_offset = 0;  
  int left_idx = 0;  
  if (reduce_last_dim) {  
    thread_offset = threadIdx.x;  
    left_idx = threadIdx.y;  
  } else {  
    thread_offset = threadIdx.y;  
    left_idx = threadIdx.x;  
  }  
  
  if (NX == 1) {  
#pragma unroll  
    for (int ny = 0; ny < NY; ++ny) {  
      if (IsBoundary) {  
        if (thread_offset >= size_ny) {  
          break;  
        }  
      }  
      uint32\_t index_src = index_cal(thread_offset + block_offset);  
      dst[ny] = static\_cast<Ty>(func(src[index_src]));  
      thread_offset += stride_ny;  
    }  
  } else {  
#pragma unroll  
    for (int nx = 0; nx < NX; ++nx) {  
#pragma unroll  
      for (int ny = 0; ny < NY; ++ny) {  
        if (IsBoundary) {  
          if ((thread_offset >= size_ny) ||  
              (left_idx + nx * stride_nx >= size_nx)) {  
            break;  
          }  
        }  
        uint32\_t index_src = index_cal(thread_offset + block_offset);  
        dst[nx + ny * NX] = static\_cast<Ty>(func(src[index_src]));  
        thread_offset += stride_ny;  
      }  
    }  
  }  
}

WriteData 1D

  
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>  
__device__ void WriteData(T* dst, T* src, int num);

将 T 类型的 1D 数据从寄存器 src 写到全局内存 dst 中。每次连续读取 NX 个数据，当前仅支持 NY = 1，直到写 NX 个数据到全局内存 dst 中。

当 IsBoundary = true 需要保证当前 Block 向全局内从中写的总数据个数不超过 num，以避免访存越界。

当 (NX % 4 = 0 或 NX % 2 = 0) 且 IsBoundary = false 时，会有更高的访存效率。

数据处理过程如下：

picture.image

  
/**  
 * @brief Write 1D data from registers to global memory. When IsBoundary = true  
 * and (NX % 4 == 0 or Nx % 2 == 0), the data will be vectorized to improve the  
 * data loading efficiency  
 *  
 * @template paraments  
 * T: The type of data.  
 * NX: The number of data continuously writed by each thread.  
 * NY: The number of data rows loaded by each thread, only NY = 1 was supported.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds  
 * judgment. When the number of data processed by the block is less than  
 * NX x NY x blockDim.x, boundary judgment is required to avoid memory access  
 * crossing the boundary.  
 *  
 * @param：  
 * dst: The data pointer of the current block.  
 * src: The register pointer, the size is NX * NY.  
 * size: The current block needs to load size elements continuously.  
 */  
template <typename T, int NX, int NY, bool IsBoundary = false>  
__device__ __forceinline__ void WriteData(T* dst,  
                                          T* __restrict__ src,  
                                          int num) {  
  if (IsBoundary) {  
    int thread_offset = threadIdx.x * NX;  
#pragma unroll  
    for (int idx = 0; idx < NX; ++idx) {  
      if ((thread_offset + idx) < num) {  
        dst[thread_offset + idx] = src[idx];  
      }  
    }  
  } else {  
    // Vector type  
    constexpr int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1;  
    constexpr int kVectorsPerThread = NX / kVectorSize;  
  
    int thread_offset = threadIdx.x * kVectorsPerThread;  
    using VecType = details::VectorType<T, kVectorSize>;  
    VecType* vec_dst = reinterpret\_cast<VecType*>(dst);  
    VecType vec_temp[kVectorsPerThread];  
#pragma unroll  
    for (int idx = 0; idx < kVectorsPerThread; ++idx) {  
      vec_temp[idx] = *(reinterpret\_cast<VecType*>(src) + idx);  
      vec_dst[thread_offset + idx] = vec_temp[idx];  
    }  
  }  
}

WriteData 2D

  
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, bool IsBoundary = false>  
__device__ void WriteData(Ty* dst, const Tx* src, int size_nx, int size_ny, int stride_nx, int stride_ny);

将 Tx 类型的 2D 数据从寄存器中写入到全局内存，并按照 Ty 类型存储到全局内存 dst 中。

最低维每写入 1 个元素需要偏移 stride_nx 个元素，最高维每写入 1 个元素需要偏移 stride_ny 个元素，直到将寄存器 NX * NY 个数据全部写入到全局内存 dst 中。

当 IsBoundary = true 需要保证当前最高维偏移个数不超过 size_ny，列偏移个数不超过 size_nx。

数据处理过程如下：

picture.image

  
/**  
 * @brief Write 2D data from register to global memory according to Tx type, and  
 * store it as Ty type.  
 *  
 * @template paraments  
 * Tx: The type of data that needs to be stored in registers.  
 * Ty: The type of data that stored in the global memory.  
 * NX: The number of data columns loaded by each thread.  
 * NY: The number of data rows loaded by each thread.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds  
 * judgment. When the number of data processed by the block is less than  
 * NX x NY x blockDim.x, boundary judgment is required to avoid memory access  
 * crossing the boundary.  
 *  
 * @param：  
 * dst: The data pointer of the current block.  
 * src: The register pointer of the thread, the size is NX * NY.  
 * size\_nx: The maximum offset of the current block is size\_nx elements in the  
 * lowest dimension. The parameters are only calculated when isboundary = true.  
 * size\_ny: The maximum offset of the current block is size\_ny elements in the  
 * first dimension. The parameters are only calculated when isboundary = true.  
 * stride\_nx: Each read one element stride stride\_nx elements in the last dim.  
 * stride\_ny: Each read one element stride stride\_ny elements in the first dim.  
 */  
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>  
__device__ __forceinline__ void WriteData(Ty* dst,  
                                          const Tx* __restrict__ src,  
                                          int size_nx,  
                                          int size_ny,  
                                          int stride_nx,  
                                          int stride_ny) {  
  int thread_offset = threadIdx.x;  
  int left_size_nx = size_nx - thread_offset;  
  
  // Each branch is added for better performance  
  if (NX == 1 && NY == 1) {  // for NX == 1 and NY == 1  
    if (IsBoundary) {  
      if (left_size_nx > 0) {  
        dst[thread_offset] = static\_cast<Ty>(src[0]);  
      }  
    } else {  
      dst[thread_offset] = static\_cast<Ty>(src[0]);  
    }  
  } else if (NX == 1) {  // for NX == 1 and NY != 1  
#pragma unroll  
    for (int idy = 0; idy < NY; ++idy) {  
      if (IsBoundary) {  
        if (idy * stride_ny >= size_ny) {  
          break;  
        }  
      }  
      dst[thread_offset + idy * stride_ny] = static\_cast<Ty>(src[idy]);  
    }  
  } else if (NY == 1) {  // for NY == 1 and NX != 1  
#pragma unroll  
    for (int idx = 0; idx < NX; ++idx) {  
      if (IsBoundary) {  
        if (idx * stride_nx >= left_size_nx) {  
          break;  
        }  
      }  
      dst[thread_offset + idx * stride_nx] = static\_cast<Ty>(src[idx]);  
    }  
  } else {  // for NX != 1 and NY != 1  
#pragma unroll  
    for (int idx = 0; idx < NX; ++idx) {  
      if (IsBoundary) {  
        if (idx * stride_nx >= left_size_nx) {  
          break;  
        }  
      }  
#pragma unroll  
      for (int idy = 0; idy < NY; ++idy) {  
        if (IsBoundary) {  
          if (idy * stride_ny >= size_ny) {  
            break;  
          }  
        }  
        dst[thread_offset + idx * stride_nx + idy * stride_ny] =  
            static\_cast<Ty>(src[idy * NX + idx]);  
      }  
    }  
  }  
}

OpFunc

details

  
// paddle/phi/kernels/primitive/functor\_primitives.h  
  
namespace Eigen{  
namespace numext{  
template <>  
HOSTDEVICE inline float16 exp(const float16& a) {  
  return float16(::expf(static\_cast<float>(a)));  
}  
  
template <>  
HOSTDEVICE inline phi::dtype::bfloat16 log(const phi::dtype::bfloat16& a) {  
  return phi::dtype::bfloat16(::logf(static\_cast<float>(a)));  
}  
}  
}  
  
static __device__ __forceinline__ phi::dtype::float16 Exp(  
    phi::dtype::float16 x) {  
  return ::Eigen::numext::exp(x);  
}  
  
static \_\_device\_\_ \_\_forceinline\_\_ float Exp(float x) { return expf(x); }  
  
static \_\_device\_\_ \_\_forceinline\_\_ double Exp(double x) { return exp(x); }  
  
static __device__ __forceinline__ phi::dtype::float16 Log(  
    phi::dtype::float16 x) {  
  return ::Eigen::numext::log(x);  
}  
  
static \_\_device\_\_ \_\_forceinline\_\_ float Log(float x) { return logf(x); }  
  
static \_\_device\_\_ \_\_forceinline\_\_ double Log(double x) { return log(x); }

Unary Functor

ExpFunctor

  
/**  
 * @brief Default unary exp functor  
 */  
template <typename Tx, typename Ty = Tx>  
struct ExpFunctor {  
  HOSTDEVICE inline ExpFunctor() {}  
  
  HOSTDEVICE explicit inline ExpFunctor(int n) {}  
  
  HOSTDEVICE inline Ty operator()(const Tx x) const {  
    return static\_cast<Ty>(details::Exp(x));  
  }  
};

IdentityFunctor

  
/**  
 * @brief Default unary identity functor  
 */  
template <typename Tx, typename Ty = Tx>  
struct IdentityFunctor {  
  HOSTDEVICE inline IdentityFunctor() {}  
  
  HOSTDEVICE explicit inline IdentityFunctor(int n) {}  
  
  HOSTDEVICE inline Ty operator()(const Tx x) const {  
    return static\_cast<Ty>(x);  
  }  
};

DivideFunctor

  
// paddle/mt/Paddle/paddle/phi/common/amp\_type\_traits.h  
namespace phi {  
namespace dtype {  
  
template <typename T>  
class MPTypeTrait {  
 public:  
  using Type = T;  
};  
  
template <>  
class MPTypeTrait<phi::dtype::float16> {  
 public:  
  using Type = float;  
};  
  
template <>  
class MPTypeTrait<phi::dtype::bfloat16> {  
 public:  
  using Type = float;  
};  
  
}  // namespace dtype  
}  // namespace phi

  
/**  
 * @brief Default unary div functor. Divide by a constant  
 */  
template <typename Tx, typename Ty = Tx>  
struct DivideFunctor {  
 private:  
  using MPType = typename ::phi::dtype::MPTypeTrait<Tx>::Type;  
  
 public:  
  HOSTDEVICE inline DivideFunctor() { n_inv = static\_cast<MPType>(1.0f); }  
  
  HOSTDEVICE explicit inline DivideFunctor(int n) : n\_inv((MPType)(1.0 / n)) {}  
  
  HOSTDEVICE inline Ty operator()(const Tx x) const {  
    return static\_cast<Ty>(static\_cast<MPType>(x) * n_inv);  
  }  
  
 private:  
  MPType n_inv;  
};

InverseFunctor

  
/**  
 * @brief Default inverse functor  
 */  
template <typename Tx, typename Ty = Tx>  
struct InverseFunctor {  
  HOSTDEVICE inline InverseFunctor() {}  
  
  HOSTDEVICE explicit inline InverseFunctor(int n) {}  
  
  HOSTDEVICE inline Ty operator()(const Tx x) const {  
    return static\_cast<Ty>(-x);  
  }  
};

SquareFunctor

  
/**  
 * @brief Default unary square functor  
 */  
template <typename Tx, typename Ty = Tx>  
struct SquareFunctor {  
  HOSTDEVICE inline SquareFunctor() {}  
  
  HOSTDEVICE explicit inline SquareFunctor(int n) {}  
  
  HOSTDEVICE inline Ty operator()(const Tx x) const {  
    return static\_cast<Ty>(x) * static\_cast<Ty>(x);  
  }  
};

Binary Functor

MinFunctor

  
/**  
 * @brief Default binary min functor  
 */  
template <typename T>  
struct MinFunctor {  
  inline T initial() { return static\_cast<T>(std::numeric_limits<T>::max()); }  
  
  \_\_device\_\_ \_\_forceinline\_\_ T operator()(const T a, const T b) const {  
    return (b < a) ? b : a;  
  }  
};

MaxFunctor

  
/**  
 * @brief Default binary max functor  
 */  
template <typename T>  
struct MaxFunctor {  
  inline T initial() {  
    return static\_cast<T>(std::numeric_limits<T>::lowest());  
  }  
  
  \_\_device\_\_ \_\_forceinline\_\_ T operator()(const T a, const T b) const {  
    return (b > a) ? b : a;  
  }  
};

AddFunctor

  
/**  
 * @brief Default binary add functor  
 */  
template <typename T>  
struct AddFunctor {  
  inline T initial() { return static\_cast<T>(0.0f); }  
  
  \_\_device\_\_ \_\_forceinline\_\_ T operator()(const T a, const T b) const {  
    return b + a;  
  }  
};

MulFunctor

  
/**  
 * @brief Default binary add functor  
 */  
template <typename T>  
struct MulFunctor {  
  inline T initial() { return static\_cast<T>(1.0f); }  
  
  \_\_device\_\_ \_\_forceinline\_\_ T operator()(const T a, const T b) const {  
    return b * a;  
  }  
};

LogicalOrFunctor

  
/**  
 * @brief Default binary logic or functor  
 */  
template <typename T>  
struct LogicalOrFunctor {  
  inline T initial() { return static\_cast<T>(false); }  
  
  \_\_device\_\_ \_\_forceinline\_\_ T operator()(const T a, const T b) const {  
    return b || a;  
  }  
};

LogicalAndFunctor

  
/**  
 * @brief Default binary logic and functor  
 */  
template <typename T>  
struct LogicalAndFunctor {  
  inline T initial() { return static\_cast<T>(true); }  
  
  \_\_device\_\_ \_\_forceinline\_\_ T operator()(const T a, const T b) const {  
    return b && a;  
  }  
};

SubFunctor

  
/**  
 * @brief Default binary sub functor  
 */  
template <typename T>  
struct SubFunctor {  
  inline T initial() { return static\_cast<T>(0.0f); }  
  
  inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }  
};

DivFunctor

  
/**  
 * @brief Default binary div functor  
 */  
template <typename T, typename Enable = void>  
struct DivFunctor {  
  inline T initial() { return static\_cast<T>(1.0f); }  
  
  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }  
};  
  
template <typename T>  
struct DivFunctor<T,  
                  typename std::enable\_if<std::is\_integral<T>::value>::type> {  
  inline T initial() { return static\_cast<T>(1.0f); }  
  
  inline HOSTDEVICE T operator()(const T a, const T b) const {  
    // For int32/int64, need to check whether the divison is zero.  
    PADDLE_ENFORCE_NE(b,  
                      0,  
                      phi::errors::InvalidArgument(  
                          "Integer division by zero encountered "  
                          "in (floor) divide. Please check the input value."));  
    return a / b;  
  }  
};

FloorDivFunctor

  
/**  
 * @brief Default binary floor divide functor  
 */  
template <typename T>  
struct FloorDivFunctor {  
  inline T initial() { return static\_cast<T>(1.0f); }  
  
  inline HOSTDEVICE T operator()(const T a, const T b) const {  
    PADDLE_ENFORCE_NE(b,  
                      0,  
                      phi::errors::InvalidArgument(  
                          "Integer division by zero encountered "  
                          "in (floor) divide. Please check the input value."));  
    return static\_cast<T>(std::trunc(a / b));  
  }  
};

Compute API

details

  
constexpr int kReduceMaxThread = 128;  
constexpr int kWarpSize = 32;  
  
// kGlobalMode: block reduce, each block gets an output;  
// kLocalMode: thread reduce, each thread gets an output;  
enum ReduceMode { kGlobalMode, kLocalMode };

SharedMemoryIndex

  
/**  
 * @brief Will be used in BlockYReduce, get the index of reduce\_num in shared  
 * memory.  
 */  
\_\_device\_\_ \_\_forceinline\_\_ int SharedMemoryIndex(int index) {  
  return (threadIdx.y + index) * blockDim.x + threadIdx.x;  
}

WarpReduce

Using CUDA Warp-Level Primitives

picture.image

  
#define FULL\_MASK 0xffffffff  
for (int offset = 16; offset > 0; offset /= 2)  
    val += __shfl_down_sync(FULL_MASK, val, offset);

一个 warp 由32条线组成，每个线程占据一条线。对于warp中位于X通道的线程，__shfl_down_sync(FULL_MASK, val, offset)从X+offset获取val变量的值。数据交换是在寄存器之间进行的，比通过共享内存更有效，后者需要一个加载、一个存储和一个额外的寄存器来保存地址。

picture.image

  
// paddle/phi/backends/gpu/cuda/cuda\_device\_function.h  
  
#define FULL\_WARP\_MASK 0xFFFFFFFF  
#define CREATE\_SHFL\_MASK(mask, predicate) \  
  mask = \_\_ballot\_sync(FULL\_WARP\_MASK, (predicate))  
  
  
template <typename T>  
\_\_forceinline\_\_ \_\_device\_\_ T  
CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) {  
  return __shfl_down_sync(mask, val, static\_cast<unsigned>(delta), width);  
}  
  
template <>  
__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync(  
    unsigned mask, phi::dtype::float16 val, int delta, int width) {  
  return phi::dtype::float16(__shfl_down_sync(  
      mask, val.to_half(), static\_cast<unsigned>(delta), width));  
}  
  
template <>  
__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync(  
    unsigned mask, phi::dtype::bfloat16 val, int delta, int width) {  
#if defined(PADDLE\_CUDA\_BF16)  
  return phi::dtype::bfloat16(__shfl_down_sync(  
      mask, val.to_nv_bfloat16(), static\_cast<unsigned>(delta), width));  
#else  
  PADDLE_ENFORCE(  
      false, "\_\_shfl\_down\_sync with bfloat16 is not supported on cuda <= 11.");  
#endif  
}  
  
template <>  
__forceinline__ __device__ phi::dtype::complex<float> CudaShuffleDownSync(  
    unsigned mask, phi::dtype::complex<float> val, int delta, int width) {  
  float real = static\_cast<float>(__shfl_down_sync(  
      mask, static\_cast<float>(val.real), static\_cast<unsigned>(delta), width));  
  float imag = static\_cast<float>(__shfl_down_sync(  
      mask, static\_cast<float>(val.imag), static\_cast<unsigned>(delta), width));  
  return phi::dtype::complex<float>(real, imag);  
}

  
template <typename T, typename ReduceOp>  
\_\_device\_\_ \_\_forceinline\_\_ T WarpReduce(T val, ReduceOp reducer) {  
  unsigned mask = 0u;  
  CREATE_SHFL_MASK(mask, true);  
  for (int stride = details::kWarpSize / 2; stride > 0; stride >>= 1) {  
    T temp = phi::backends::gpu::CudaShuffleDownSync(mask, val, stride);  
    val = reducer(val, temp);  
  }  
  return val;  
}

BlockXReduce

picture.image

  
/* e.g.  
 * |---------block---------|  
 * |warp0|warp1|warp2|warp3|  
 * |0~31|32~63|64~95|96~127|  ---->blockDim.x = 128  
 *  \|/  \|/   \|/    \|/     ---->1. First WarpReduce in each warp  
 * res0  res1  res2  res3     ---->2. Store result of each warp to shared memory  
 *   \    \    /     /        ---->3. Load the result above from shared memory  
 *        res                         to warp0 and process the second WarpReduce  
 */  
  
/**  
 * @brief BlockXReduce reduce along blockDim.x.  
 */  
template <typename T, typename ReduceOp>  
\_\_device\_\_ \_\_forceinline\_\_ T BlockXReduce(T val, ReduceOp reducer) {  
  __syncthreads();  
  using details::kWarpSize;  
  __shared__ T shared[2 * kWarpSize];  
  int block_dim_x = blockDim.x;  
  if (blockDim.x > kWarpSize) {  
    // Bit operation can be used when kWarpSize is 32 or 64 now  
    // 32 or 64  
    constexpr int rshift_val =  
        (kWarpSize != 32) ? ((kWarpSize == 64) ? 6 : 5) : 5;  
    block_dim_x = blockDim.x >> rshift_val;  
    int lane = threadIdx.x & (kWarpSize - 1);  
    // thread id  
    int tid = threadIdx.y * blockDim.x + threadIdx.x;  
    // warp id  
    int wid = tid >> rshift_val;  
    // block id  
    int bid = threadIdx.y;  
    val = WarpReduce(val, reducer);  
    if (lane == 0) {  
      shared[wid] = val;  
    }  
    __syncthreads();  
    val = shared[bid * block_dim_x + lane];  
  }  
  
  unsigned mask = 0u;  
  CREATE_SHFL_MASK(mask, true);  
  for (int stride = 1; stride < block_dim_x; stride <<= 1) {  
    T temp = phi::backends::gpu::CudaShuffleDownSync(mask, val, stride);  
    val = reducer(val, temp);  
  }  
  __syncthreads();  
  if (threadIdx.x == 0) {  
    shared[threadIdx.y] = val;  
  }  
  __syncthreads();  
  return shared[threadIdx.y];  
}

BlockYReduce

  
/**  
 * @brief BlockYReduce reduce along blockDim.y.  
 */  
template <typename T, typename ReduceOp>  
\_\_device\_\_ \_\_forceinline\_\_ T BlockYReduce(T val, ReduceOp reducer) {  
  __shared__ T shared_memory[1024];  
  shared_memory[SharedMemoryIndex(0)] = val;  
  for (int stride = blockDim.y / 2; stride > 0; stride >>= 1) {  
    __syncthreads();  
    if (threadIdx.y < stride && threadIdx.y + stride < blockDim.y) {  
      T temp = shared_memory[SharedMemoryIndex(stride)];  
      val = reducer(val, temp);  
    }  
    shared_memory[SharedMemoryIndex(0)] = val;  
  }  
  __syncthreads();  
  return shared_memory[threadIdx.x];  
}

Swap

  
/**  
 * @brief Swap data  
 */  
template <typename T>  
\_\_device\_\_ \_\_forceinline\_\_ void Swap(T* first\_value, T* second\_value) {  
  T t_value;  
  t_value = (*first_value);  
  (*first_value) = (*second_value);  
  (*second_value) = t_value;  
}

Comparator

  
/**  
 * @brief Swap data according to  monotonic\_type.  
 */  
template <typename T>  
\_\_device\_\_ \_\_forceinline\_\_ void Comparator(T* first\_value,  
                                           T* second\_value,  
                                           int monotonic\_type) {  
  if (((*first_value) > (*second_value)) == monotonic_type) {  
    Swap<T>(first_value, second_value);  
  }  
}

ComparatorWithIndex

  
/**  
 * @brief Swap data and data index according to  monotonic\_type.  
 */  
template <typename T, typename IndexType>  
\_\_device\_\_ \_\_forceinline\_\_ void ComparatorWithIndex(T* first\_value,  
  
                                                    T* second\_value,  
                                                    IndexType* first\_index,  
                                                    IndexType* second\_index,  
                                                    int monotonic\_type) {  
  if ((*first_value > (*second_value)) == monotonic_type) {  
    // swap value  
    Swap<T>(first_value, second_value);  
    // swap index  
    Swap<IndexType>(first_index, second_index);  
  }  
}

GetLastPow2

获取小于当前数据的最大2的幂。

  
static inline int GetLastPow2(int n) {  
  n = log2(n);  
  n = max(0, n);  
  return std::pow(2, n);  
}

  
/**  
 * @brief get the last pow of 2  
 */  
\_\_device\_\_ inline int GetLastPow2(int n) {  
  n |= (n >> 1);  
  n |= (n >> 2);  
  n |= (n >> 4);  
  n |= (n >> 8);  
  n |= (n >> 16);  
  return std::max(1, n - (n >> 1));  
}

  
// https://blog.csdn.net/m0\_38086244/article/details/116654857  
  
unsigned hight\_bit(unsigned x){//0010 1100 0000 0000 0000 0000 0000 0000 0000 0001  
 x= x|(x>>1);              //0011 1110 0000 0000 0000 0000 0000 0000 0000 0000  
 x= x|(x>>2);              //0011 1111 1000 0000 0000 0000 0000 0000 0000 0000  
 x= x|(x>>4);              //0011 1111 1111 1000 0000 0000 0000 0000 0000 0000  
 x= x|(x>>8);              //0011 1111 1111 1111 1111 1000 0000 0000 0000 0000  
 x= x|(x>>16);             //0011 1111 1111 1111 1111 1111 1111 1111 1111 1111  
 x= x|(x>>32);  
 // 如果数特别大， 这里感觉会溢出， 所以这里只使用于小于数据最大值1/2的数。  
 return (x+1) >> 1;        //0100 0000 0000 0000 0000 0000 0000 0000 0000 0000

  
->>>>>>>> 0.145996 ms // 第一种  
--------> 0.011963 ms // 第二种  
-<<<<<<<< 0.007812 ms // 第三种实现： 真香

ElementwiseUnary

  
  
/**  
 * @brief Perform unary calculation according to OpFunc. Shape of input and  
 * output are the same.  
 *  
 * @template paraments  
 * InT: The data type of in.  
 * OutT: The data type of out.  
 * NX: The number of data columns loaded by each thread.  
 * NY: The number of data rows loaded by each thread.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * OpFunc: Compute functor which has an operator() as following:  
 *     template <typename InT, typename OutT>  
 *     struct XxxFunctor {  
 *       HOSTDEVICE OutT operator()(const InT& a) const {  
 *         return ...;  
 *       }  
 *     };  
 *  
 * @param：  
 * out: The register pointer of out, the size is NX * NY.  
 * in: The register pointer of in, the size is NX * NY.  
 * compute: Compute function which was declared like OpFunc<InT, OutT>().  
 */  
template <typename InT, typename OutT, int NX, int NY, class OpFunc>  
\_\_device\_\_ \_\_forceinline\_\_ void ElementwiseUnary(OutT* out,  
                                                 const InT* in,  
                                                 OpFunc compute) {  
#pragma unroll  
  for (int idx = 0; idx < NX * NY; idx++) {  
    out[idx] = static\_cast<OutT>(compute(in[idx]));  
  }  
}

ElementwiseBinary

  
  
/**  
 * @brief Binary calculation according to OpFunc. Shape of The input and output  
 * are the same.  
 *  
 * @template paraments  
 * InT: The data type of in1 and in2.  
 * OutT: The data type of out.  
 * NX: The number of data columns computed by each thread.  
 * NY: The number of data rows computed by each thread.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * OpFunc: Compute functor which has an operator() as following:  
 *     template <typename InT>  
 *     struct XxxFunctor {  
 *       HOSTDEVICE InT operator()(const InT& a, const InT& b) const {  
 *         return ...;  
 *       }  
 *     };  
 *  
 * @param：  
 * out: The register pointer of out, the size is NX * NY.  
 * in1: The register pointer of fist input, size is NX * NY.  
 * in2: The register pointer of second input, size is NX * NY.  
 * compute: Compute function which was declared like OpFunc<InT>().  
 */  
template <typename InT, typename OutT, int NX, int NY, class OpFunc>  
\_\_device\_\_ \_\_forceinline\_\_ void ElementwiseBinary(OutT* out,  
                                                  const InT* in1,  
                                                  const InT* in2,  
                                                  OpFunc compute) {  
#pragma unroll  
  for (int idx = 0; idx < NX * NY; ++idx) {  
    out[idx] = static\_cast<OutT>(compute(in1[idx], in2[idx]));  
  }  
}

ElementwiseTernary

  
  
/**  
 * @brief Ternary calculation according to OpFunc. Shape of input and output  
 * are the same.  
 *  
 * @template paraments  
 * InT: The data type of in1 and in2.  
 * OutT: The data type of out.  
 * NX: The number of data columns loaded by each thread.  
 * NY: The number of data rows loaded by each thread.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * OpFunc: Compute functor which has an operator() as following  
 *     template <typename InT>  
 *     struct XxxFunctor {  
 *       HOSTDEVICE InT operator()(const InT& a, const InT& b, const InT& c)  
 * const {  
 *         return ...;  
 *       }  
 *     };  
 *  
 * @param  
 * out: The register pointer of out, the size is NX * NY.  
 * in1: The register pointer of fist input, size is NX * NY.  
 * in2: The register pointer of second input, size is NX * NY.  
 * in3: The register pointer of third input, size is NX * NY.  
 * compute: Compute function which was declared like OpFunc<InT>().  
 */  
template <typename InT, typename OutT, int NX, int NY, class OpFunc>  
\_\_device\_\_ \_\_forceinline\_\_ void ElementwiseTernary(  
    OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {  
#pragma unroll  
  for (int idx = 0; idx < NX * NY; ++idx) {  
    out[idx] = static\_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));  
  }  
}

ElementwiseAny

  
  
/**  
 * @brief Multivariate calculation according to OpFunc. Shape of inputs and  
 * output are the same.  
 *  
 * @template paraments  
 * InT: The data type of in1, in2 and in3.  
 * OutT: The data type of out.  
 * NX: The number of data columns loaded by each thread.  
 * NY: The number of data rows loaded by each thread.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * Arity: The size of ins.  
 * OpFunc: Compute functor which has an operator() as following:  
 *     template <typename InT>  
 *     struct XxxFunctor {  
 *       HOSTDEVICE InT operator()(const InT* args) const {  
 *         return ...;  
 *       }  
 *     };  
 *  
 * @param  
 * out: The register pointer of out, the size is NX * NY.  
 * ins: A pointers of array consisting of multiple inputs.  
 * compute: Compute function which was declared like OpFunc<InT>().  
 */  
template <typename InT, typename OutT, int NX, int NY, int Arity, class OpFunc>  
\_\_device\_\_ \_\_forceinline\_\_ void ElementwiseAny(OutT* out,  
                                               InT (*ins)[NX * NY],  
                                               OpFunc compute) {  
  InT args[Arity];  
#pragma unroll  
  for (int idx = 0; idx < NX * NY; ++idx) {  
#pragma unroll  
    for (int j = 0; j < Arity; ++j) {  
      args[j] = ins[j][idx];  
    }  
    out[idx] = static\_cast<OutT>(compute(args));  
  }  
}

CycleBinary

  
  
/**  
 * @brief Binary calculation according to OpFunc. Shape of in1 and in2 are the  
 * different. Shape of in1 is [1, NX], but in2's shape is [NY, NX], the output  
 * shape is [NY, NX].  
 *  
 * @template paraments  
 * InT: The data type of in1 and in2.  
 * OutT: The data type of out.  
 * NX: The number of data columns loaded by each thread.  
 * NY: The number of data rows loaded by each thread.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * OpFunc: Compute functor which has an operator() as following  
 *     template <typename InT, typename OutT>  
 *     struct XxxFunctor {  
 *       HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {  
 *         return ...;  
 *       }  
 *     };  
 *  
 * @param  
 * out: The register pointer of out, the size is NX * NY.  
 * in1: The register pointer of fist input, size is NX * 1.  
 * in2: The register pointer of second input, size is NX * NY.  
 * compute: Compute function which was declared like OpFunc<InT, OutT>().  
 */  
template <typename InT, typename OutT, int NX, int NY, class OpFunc>  
\_\_device\_\_ \_\_forceinline\_\_ void CycleBinary(OutT* out,  
                                            const InT* in1,  
                                            const InT* in2,  
                                            OpFunc compute) {  
#pragma unroll  
  for (int idx = 0; idx < NX; idx++) {  
#pragma unroll  
    for (int idy = 0; idy < NY; idy++) {  
      out[idx + idy * NX] =  
          static\_cast<OutT>(compute(in1[idx], in2[idx + idy * NX]));  
    }  
  }  
}

ElementwiseConstant

  
  
/*  
 * @brief Fill register with a constant according to OpFunc  
 *  
 * @template paraments  
 * InT: The data type of in1 and in2.  
 * OutT: The data type of out.  
 * NX: The number of data columns loaded by each thread.  
 * NY: The number of data rows loaded by each thread.  
 * GPU was supported.  
 * OpFunc: Compute functor which has an operator() as following  
 *     template <typename InT>  
 *     struct XxxFunctor {  
 *       HOSTDEVICE InT operator()()  
 * const {  
 *         return a;  
 *       }  
 *     };  
 *  
 * @param  
 * out: The register pointer of out, the size is NX * NY.  
 * compute: Compute function which was declared like OpFunc<InT>().  
 */  
template <typename InT, typename OutT, int NX, int NY, class OpFunc>  
\_\_device\_\_ \_\_forceinline\_\_ void ElementwiseConstant(OutT* out, OpFunc compute) {  
#pragma unroll  
  for (int idx = 0; idx < NX * NY; idx++) {  
    out[idx] = static\_cast<OutT>(compute());  
  }  
}

ElementwiseRandom

  
  
/*  
 * @brief Get ReturnsCount random data fromm compute according to state, state  
 * can be curandStatePhilox4\_32\_10\_t, hiprandStatePhilox4\_32\_10\_t which has beed  
 * initialized.  
 *  
 * @template paraments  
 * StateType: the type of state, can be curandStatePhilox4\_32\_10\_t or  
 * hiprandStatePhilox4\_32\_10\_t.  
 * OutT: the type of out register.  
 * ReturnsCount: The number of random data generated by OpFunc.  
 * GPU was supported.  
 * OpFunc: Compute functor which has an operator() as following  
 *     template <typename T>  
 *     struct XxxFunctor {  
 *       HOSTDEVICE InT operator()(StateType state)  
 * const {  
 *         return ranomd(state);  // Returns ReturnsCount random numbers with  
 * data type T  
 *       }  
 *     };  
 *  
 * @param  
 * out: The register pointer of out, the size is NX * NY.  
 * compute: Compute function which was declared like OpFunc<T>().  
 */  
  
template <typename StateType, typename OutT, int ReturnsCount, class OpFunc>  
\_\_device\_\_ \_\_forceinline\_\_ void ElementwiseRandom(OutT* out,  
                                                  OpFunc compute,  
                                                  StateType* state) {  
  auto random_tuple = compute(state);  
#pragma unroll  
  for (int i = 0; i < ReturnsCount; i++) {  
    out[i] = static\_cast<OutT>((&random_tuple.x)[i]);  
  }  
}

Reduce

http://courses.cms.caltech.edu/cs179/Old/2016\_lectures/cs179\_2016\_lec07.pdf

Optimizing Parallel Reduction in CUDA

根据 reducer 对 in 中的数据进行数据规约，输入 in 的 Shape 为 [NY, NX]。

当 Mode = kLocalMode 时，对 in 沿着 NX 方向进行规约，完成线程内规约，out 为[NY, 1]；

当 Mode = kGlobalMode 时，使用共享内存完成 block 内线程间的规约操作，in 和 out 的 size 相同，均为[NY, NX]。

ReduceMax 数据处理过程如下：

picture.image

  
/**  
 * @brief The Reduce provides collective methods for computing a parallelReduce(T* out, const T* in, ReduceFunctor reducer, bool reduce\_last\_dim  
 * reduction of items partitioned across a CUDA block and intra thread. When  
 * ReduceMode == kLocalMode, use shared memory to reduce between threads.When  
 * ReduceMode == kGlobalMode, thread reduce along nx.  
 *  
 * @template paraments  
 * T: The type of data.  
 * NX: The number of data continuously loaded by each thread.  
 * NY: The number of data rows loaded by each thread, only NY = 1 was supported.  
 * threadIdx.x is used as the thread index. Currently only GPU was supported.  
 * ReduceFunctor: Compute functor which has an operator() as following  
 *     template <typename InT>  
 *     struct ReduceFunctor {  
 *       HOSTDEVICE InT operator()(const InT& a, const InT& b) const {  
 *         return ...;  
 *       }  
 *     };  
 * ReduceMode: Reduce mode, can be kLocalMode, kGlobalMode.  
 *  
 * @param  
 * out: The register pointer of out, the size is NX * NY.  
 * in: The register pointer of in, the size is NX * NY.  
 * reducer: Compute function which was declared like ReduceFunctor<InT>().  
 * reduce\_last\_dim: if the last dim gets involved in reduction.  
 */  
template <typename T,  
          int NX,  
          int NY,  
          class ReduceFunctor,  
          details::ReduceMode Mode>  
\_\_device\_\_ \_\_forceinline\_\_ void Reduce(T* out,  
                                       const T* in,  
                                       ReduceFunctor reducer,  
                                       bool reduce\_last\_dim) {  
  int block_index = blockDim.y;  
  
  if (Mode == details::ReduceMode::kGlobalMode) {  
    bool block_reduce_y = (!reduce_last_dim) && (block_index > 1);  
    // when reduce is not required for the last dim, and reduce num has been  
    // split into multiple threads  
    if (block_reduce_y) {  
#pragma unroll  
      for (int i = 0; i < NY * NX; i++) {  // reduce along blockdim.y  
        out[i] = details::BlockYReduce<T, ReduceFunctor>(out[i], reducer);  
      }  
    }  
  
    // when last dimension need to be reduced  
    if (reduce_last_dim) {  
#pragma unroll  
      for (int i = 0; i < NY * NX; i++) {  // reduce along blockDim.x  
        out[i] = details::BlockXReduce<T, ReduceFunctor>(out[i], reducer);  
      }  
    }  
  } else {  // else  kLocalMode  
#pragma unroll  
    for (int i = 0; i < NY; ++i) {  
#pragma unroll  
      for (int j = 0; j < NX; ++j) {  
        out[i] = reducer(out[i], in[i * NX + j]);  
      }  
    }  
  }  
}

Cumsum

对数据数组的全前缀和操作通常称为 scan。我们使用这个更简单的术语(它来自APL编程语言[Iverson 1962])。刚刚定义的扫描是 exclusive scan，因为结果中的每个元素j是输入数组中j之前但不包括j的所有元素的和。在includsive scan中，对包括j在内的所有元素求和。通过将结果数组右移一个元素并插入Identity，可以从includsive scan生成exclusive scan。同样，通过将结果数组向左移动并在最后插入扫描的最后一个元素和输入数组的最后一个元素的和(Blelloch 1990)，可以从 exclusive scan生成 includsive scan。

扫描有许多用途，包括但不限于排序、词法分析、字符串比较、多项式求值、流压缩以及并行构建直方图和数据结构(图、树等)。

一般来说，全前缀和可以用来将某些顺序计算转换成等价的并行计算.

  
# Sequential  
out[0] = 0;   
for j from 1 to n do    
  out[j] = out[j-1] + f(in[j-1]);  
    
# Parallel  
forall j in parallel do  
  temp[j] = f(in[j]);  
all_prefix_sums(out, temp);

GPU Programming

http://courses.cms.caltech.edu/cs179/

A Work-Efficient Parallel Scan

建议看下这篇blog：https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda

我们将使用一种在并行计算中经常出现的算法模式:平衡树。其思想是在输入数据上构建一棵平衡的二叉树，并 sweep it to and from the root 以计算前缀和。一个有n个叶子的二叉树有d=log2(n)层，每层d有2^d个节点。如果我们对每个节点执行一次add，那么我们将在树的一次遍历中执行O(n)个add。

picture.image

我们构建的树不是一个实际的数据结构，而是一个概念，用于确定每个线程在遍历的每一步做什么。在这种高效工作的扫描算法中，我们在共享内存中的数组上执行相应的操作。该算法由两个阶段组成:缩减阶段(也称为上扫阶段)和下扫阶段。在reduce阶段，我们从叶子遍历树到根，在树的内部节点计算部分和。这也称为并行缩减，因为在此阶段之后，根节点(数组中的最后一个节点)保存数组中所有节点的和。算法3给出了简化阶段的伪代码。

  
# The Up-Sweep (Reduce) Phase of a Work-Efficient Sum Scan Algorithm (After Blelloch 1990)  
for d = 0 to log2(n) - 1do  
  for all k = 0 to n -1 by 2^(d+1) in parallel do  
    x[k + 2^(d+1) - 1] = x[k + 2^d - 1] + x[k + 2^(d+1) -1]

picture.image

在下扫阶段，我们从根结点向下遍历树，使用reduce阶段的部分和在数组上构建扫描。我们首先在树的根插入0，在每个步骤中，当前级别的每个节点将其自己的值传递给其左子节点，并将其值和其左子节点的前值的总和传递给其右子节点。

  
# The Down-Sweep Phase of a Work-Efficient Parallel Sum Scan Algorithm (After Blelloch 1990)  
x[n – 1] U2190.GIF 0  
for d = log2(n) – 1 down to 0 do  
      for all k = 0 to n – 1 by 2^d +1 in parallel do  
          t = x[k +  2^d  – 1]  
          x[k +  2^d  – 1] = x[k +  2^(d+1) – 1]  
          x[k +  2^(d+1) – 1] = t +  x[k +  2^d+1） – 1]

picture.image

Avoiding Bank Conflicts

如果我们在运行CUDA的GPU上检查这个扫描的操作，我们会发现它遭受许多共享内存库冲突。这些会损害每次访问共享内存的性能，并显著影响总体性能。

picture.image

正如 NVIDIA CUDA 编程指南(NVIDIA 2007)中所描述的， 共享内存是由多个bank组成的 。

当同一个warp中的多个线程访问同一bank时，除非该warp中的所有线程访问同一32位字内的相同地址，否则会发生bank冲突 。

访问单个Bank的线程数量称为Bank冲突degree 。Bank冲突导致对内存bank的多次访问序列化，因此，一个有 degree-n bank 冲突的共享内存访问需要的处理周期是没有冲突的访问的n倍。

在大多数CUDA计算中，如果在访问__shared__内存数组时小心谨慎，Bank 冲突是可以避免的。

我们可以通过在计算的每个共享内存数组索引中添加可变数量的填充来避免扫描中的大多数Bank冲突 。

具体来说， 我们将索引的值除以共享内存Bank的数量添加到索引中 。

picture.image

Simple Padding Applied to Shared Memory Addresses Can Eliminate High-Degree Bank Conflicts During Tree-Based Algorithms Like Scan

  
# Macro Used for Computing Bank-Conflict-Free Shared Memory Array Indices  
#define NUM\_BANKS 16   
#define LOG\_NUM\_BANKS 4   
#define CONFLICT\_FREE\_OFFSET(n) \       
   ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS))

Arrays of Arbitrary Size

前几节给出的算法扫描单个线程块中的数组。这对于小数组来说很好，可以将一个块中的最大线程数增加两倍(因为每个线程加载和处理两个元素)。在NVIDIA 8系列gpu上，这将我们限制到最多1024个元素。此外，数组大小必须是2的幂。在本节中，我们将解释如何扩展该算法以扫描任意(非二次幂)维的大型数组。

基本的想法很简单。我们将大数组划分为块，每个块可以被单个线程块扫描，然后我们扫描这些块，并将每个块的总和写入另一个块和数组。然后，我们扫描块和，生成一个块增量数组，这些块增量被添加到各自块中的所有元素。

更详细地说，设N是输入数组中的元素数量，B是一个块中处理的元素数量。我们分配了N/B线程块，每个线程有B/2个线程。(这里我们假设N是B的倍数，下一段我们将扩展到任意维度。)在NVIDIA 8系列gpu上，B的典型选择是128。我们使用前几节的扫描算法对每个块i进行独立扫描，将扫描结果存储到输出数组的顺序位置。我们对扫描算法做了一个小的修改。在将块i(清单39-2中标记为B的代码块)的最后一个元素归零之前，我们将值(块i的总和)存储到一个辅助数组SUMS中。然后以同样的方式扫描sum，将结果写入一个数组INCR。然后我们将INCR[i]添加到块i的所有元素中，使用一个简单的统一的添加内核，每个线程的N/B线程块调用INCR[i]。

picture.image

Algorithm for Performing a Sum Scan on a Large Array of Values

处理非二次幂很容易。我们只需将数组填充到块大小B的下一个倍数。扫描算法不依赖于数组末尾之后的元素，因此我们不必为最后一个块使用特殊情况。

  
/*  
 * @brief Complete the prefix and in the block, each thread calculates 2 data,  
 * the size of out and in is 2, and BlockDim.x must be less then 512.  
 *  
 * @template paraments  
 * InT: the type of input register.  
 * OutT: the type of out register.  
 * GPU was supported.  
 * OpFunc: Compute functor which has an operator() as following  
 *     template <typename T>  
 *     struct XxxFunctor {  
 *       HOSTDEVICE InT operator()(T a, T b)  
 * const {  
 *         return a + b;  
 *       }  
 *     };  
 *  
 * @param  
 * out: The register pointer of out, the size is 2;  
 * in: The register pointer of input, the size is 2;  
 * compute: Compute function which was declared like OpFunc<T>().  
 */  
  
#define SHARED\_SIZE\_LIMIT 512  
template <typename InT, typename OutT, class OpFunc>  
\_\_device\_\_ \_\_forceinline\_\_ void Cumsum(OutT* out,  
                                       const InT* in,  
                                       OpFunc compute) {  
  constexpr int kSize = SHARED_SIZE_LIMIT * 2 + (SHARED_SIZE_LIMIT * 2) / 32;  
  __shared__ InT temp[kSize];  
  int stride_size = blockDim.x;  
  int tidx = threadIdx.x;  
  temp[tidx + tidx / 32] = in[0];  
  temp[stride_size + tidx + (stride_size + tidx) / 32] = in[1];  
  for (int stride = 1; stride <= stride_size; stride *= 2) {  
    __syncthreads();  
    int index = (tidx + 1) * 2 * stride - 1;  
    if (index < (blockDim.x * 2)) {  
      temp[index + index / 32] =  
          compute(temp[index + index / 32],  
                  temp[index - stride + (index - stride) / 32]);  
    }  
  }  
  for (int stride = (blockDim.x * 2) / 4; stride > 0; stride /= 2) {  
    __syncthreads();  
    int index = (tidx + 1) * 2 * stride - 1;  
    if ((index + stride) < (blockDim.x * 2)) {  
      temp[index + stride + (stride + index) / 32] =  
          compute(temp[index + stride + (stride + index) / 32],  
                  temp[index + (index) / 32]);  
    }  
  }  
  
  __syncthreads();  
  out[0] = static\_cast<OutT>(temp[tidx + tidx / 32]);  
  out[1] =  
      static\_cast<OutT>(temp[tidx + stride_size + (tidx + stride_size) / 32]);  
}  
#undef SHARED\_SIZE\_LIMIT

discounted-cumsum

https://github.com/toshas/torch-discounted-cumsum

Fast Discounted Cumulative Sums in PyTorch

传统的顺序算法在循环中执行输出元素的计算。对于一个大小为N的输入，它需要O(N)次操作，需要O(N)个时间步来完成。

提出的并行算法总共需要O(N log N)操作，但只需要O(log N)时间步长，这在许多涉及大输入的应用中是一个相当大的权衡。

picture.image

Applications of Scan

Stream Compaction

非正式地说，流压缩是一种过滤操作:它从一个输入向量中选择这个向量的一个子集，并将这个子集打包成一个密集的输出向量。更正式地说，流压缩接受一个输入向量vi和一个谓词p，并且只输出vi中p(vi)为真的元素，保持输入元素的顺序。

picture.image

Stream Compaction Example

流压缩需要两个步骤，扫描和分散。

第一步生成一个临时向量，其中传递谓词的元素被设置为1，其他元素被设置为0。然后我们扫描这个临时向量。对于传递谓词的每个元素，扫描结果现在在输出向量中包含该元素的目标地址。
第二步使用扫描生成的地址将输入元素分散到输出向量。

picture.image

Scan and Scatter

Sort

GPU Programming

picture.image

双调排序Bitonic Sort，适合并行计算的排序算法： https://blog.csdn.net/xbinworld/article/details/76408595

Bitonic Sort（双调排序）基础 https://blog.csdn.net/jiange\_zh/article/details/49533477

双调序列

双调序列(Bitonic Sequence)是指由一个非严格增序列X和非严格减序列Y构成的序列，比如序列（23,10,8,3,5,7,11,78）。

定义:一个序列a1,a2,…,an是双调序列(Bitonic Sequence)，如果：

(1)存在一个ak(1≤k≤n), 使得a1≥…≥ak≤…≤an成立；或者

(2)序列能够循环移位满足条件(1)

Batcher定理

将任意一个长为2n的双调序列A分为等长的两半X和Y，将X中的元素与Y中的元素一一按原序比较，即a[i]与a[i+n] (i < n)比较，将较大者放入MAX序列，较小者放入MIN序列。则得到的MAX和MIN序列仍然是双调序列，并且MAX序列中的任意一个元素不小于MIN序列中的任意一个元素。

Bitonic merge（双调合并）

假设我们有一个双调序列，则我们根据Batcher定理，将该序列划分成2个双调序列，然后继续对每个双调序列递归划分，得到更短的双调序列，直到得到的子序列长度为1为止。这时的输出序列按单调递增顺序排列。

由于每次划分后问题长度都会减半，故所需要的划分次数为log n。

这个应用双调划分来对双调序列进行排序的过程称为Bitonic merge（双调合并）。

picture.image

合并一个有16个元素的双调序列

picture.image

双调合并网络

Bitonic Sort（双调排序）

对于两个元素x，y，如果x<=y，则x，y都位于双调序列的递增部分，而递减部分没有元素，如果x>=y，则x，y都位于双调序列的递减部分，而递增部分没有元素，于是x和y构成一个双调序列。因此， 任何无序的序列都是由若干个只有2个元素的双调序列连接而成 。

于是，对于一个无序序列，我们按照递增和递减顺序合并相邻的双调序列，按照双调序列的定义，通过连接递增和递减序列得到的序列是双调的。最终，我们可以将若干个只有2个元素的双调序列合并成1个有n个元素的双调序列。

picture.image

将无序的输入序列转换成双调序列

如上图，最终再对得到的双调序列进行一次双调合并，即可得到有序序列。

a % b = a & (b - 1)

  
/*  
 * @brief Sort data in this block, each thread calculates 2 data, the size of  
 * out and in is 2, and BlockDim.x must be less then 512.  
 *  
 * @template paraments  
 * InT: the type of input register.  
 * OutT: the type of out register.  
 * GPU was supported.  
 *  
 * @param  
 * out: The register pointer of out, the size is 2.  
 * in: The register pointer of input, the size is 2.  
 * num: The num of this block  
 * monotonic\_type: if monotonic\_type = 1 then sorted in ascending order, eles  
 * sorted in descending.  
 */  
#define SHARED\_SIZE\_LIMIT 1024  
// each thread load 2 data from global memory so SHARED\_SIZE\_LIMIT must  
// larger than blockDim.x * 2  
template <typename InT, typename OutT>  
\_\_device\_\_ \_\_forceinline\_\_ void Sort(OutT* out,  
                                     const InT* in,  
                                     int num,  
                                     int monotonic\_type) {  
  int upper_bound = blockDim.x;  
  // update upper\_bound  
  upper_bound = std::min(details::GetLastPow2(num), upper_bound);  
  // shareMem for value and index  num must smaller than SHARED\_SIZE\_LIMIT / 2  
  __shared__ InT value[SHARED_SIZE_LIMIT];  
  int stride_size = blockDim.x;  
  // shareMem's size must larger than blockDim * 2  
  // Copy value from in  
  value[threadIdx.x] = in[0];  
  value[threadIdx.x + stride_size] = in[1];  
  // make bitonicSort  
  for (int size = 2; size < upper_bound; size <<= 1) {  
    int bitonic_type = (threadIdx.x & (size / 2)) != 0;  
    for (int stride = size / 2; stride > 0; stride >>= 1) {  
      __syncthreads();  
      int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));  
      details::Comparator<InT>(&value[pos], &value[pos + stride], bitonic_type);  
    }  
  }  
  // make bitonicMerge  
  for (int stride = stride_size; stride > 0; stride >>= 1) {  
    __syncthreads();  
    int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));  
    // last sort when monotonic\_type = 1 then increase  
    details::Comparator<InT>(&value[pos], &value[pos + stride], monotonic_type);  
  }  
  __syncthreads();  
  out[0] = static\_cast<OutT>(value[threadIdx.x]);  
  out[1] = static\_cast<OutT>(value[threadIdx.x + stride_size]);  
}

  
/*  
 * @brief Sort data with data\_index in this block, each thread calculates 2  
 * data, the size of out and in is 2, and BlockDim.x must be less then 512.  
 *  
 * @template paraments  
 * InT: The type of input register.  
 * OutT: The type of out register.  
 * IndexType: The type of index.  
 * GPU was supported.  
 *  
 * @param  
 * out: The register pointer of out, the size is 2.  
 * out\_index: The register pointer of out\_index, the size is 2.  
 * in: The register pointer of input, the size is 2.  
 * in\_index: The register pointer of in\_index, the size is 2.  
 * num: The num of this block.  
 * monotonic\_type: if monotonic\_type = 1 then sorted in ascending order, eles  
 * sorted in escending.  
 */  
template <typename InT, typename OutT, typename IndexType>  
\_\_device\_\_ \_\_forceinline\_\_ void Sort(OutT* out,  
                                     IndexType* out\_index,  
                                     const InT* in,  
                                     IndexType* in\_index,  
                                     int num,  
                                     int monotonic\_type) {  
  int upper_bound = blockDim.x;  
  // update upper\_bound  
  upper_bound = std::min(details::GetLastPow2(num), upper_bound);  
  // shareMem for value and index  num must smaller than SHARED\_SIZE\_LIMIT / 2  
  __shared__ InT value[SHARED_SIZE_LIMIT];  
  // shareMem's size must larger than blockDim * 2  
  __shared__ IndexType index[SHARED_SIZE_LIMIT];  
  // Copy value and index from in and in\_index  
  int stride_size = blockDim.x;  
  value[threadIdx.x] = in[0];  
  value[threadIdx.x + stride_size] = in[1];  
  // index  
  index[threadIdx.x] = in_index[0];  
  index[threadIdx.x + stride_size] = in_index[1];  
  // make bitonicSort  
  for (int size = 2; size < upper_bound; size <<= 1) {  
    int bitonic_type = (threadIdx.x & (size / 2)) != 0;  
    for (int stride = size / 2; stride > 0; stride >>= 1) {  
      __syncthreads();  
      int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));  
      details::ComparatorWithIndex<InT, IndexType>(&value[pos],  
                                                   &value[pos + stride],  
                                                   &index[pos],  
                                                   &index[pos + stride],  
                                                   bitonic_type);  
    }  
  }  
  
  for (int stride = stride_size; stride > 0; stride >>= 1) {  
    __syncthreads();  
    int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));  
    // last sort when monotonic\_type = 1 then increase  
    details::ComparatorWithIndex<InT, IndexType>(&value[pos],  
                                                 &value[pos + stride],  
                                                 &index[pos],  
                                                 &index[pos + stride],  
                                                 monotonic_type);  
  }  
  
  __syncthreads();  
  out[0] = static\_cast<OutT>(value[threadIdx.x]);  
  out[1] = static\_cast<OutT>(value[threadIdx.x + stride_size]);  
  out_index[0] = index[threadIdx.x];  
  out_index[1] = index[threadIdx.x + stride_size];  
}

参考文献

picture.image