torch cummin/cummax


This article focuses on the implementation of the torch.cummin API (cummax is analogous).

  
torch.cummin(input, dim, *, out=None)

Returns a namedtuple (values, indices) where values is the cumulative minimum of the elements of input in the dimension dim, and indices is the index location of each minimum value found in the dimension dim.


  
>>> a = torch.randn(10)  
>>> a  
tensor([-0.2284, -0.6628,  0.0975,  0.2680, -1.3298, -0.4220, -0.3885,  1.1762,  
     0.9165,  1.6684])  
>>> torch.cummin(a, dim=0)  
torch.return_types.cummin(  
    values=tensor([-0.2284, -0.6628, -0.6628, -0.6628, -1.3298, -1.3298, -1.3298, -1.3298,  
    -1.3298, -1.3298]),  
    indices=tensor([0, 1, 1, 1, 4, 4, 4, 4, 4, 4]))  

forward

  
# aten/src/ATen/native/native_functions.yaml
- func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)
  device_check: NoCheck   # TensorIterator
  variants: function, method
  dispatch:
    CompositeExplicitAutograd: cummin

For the meaning of each field, see aten/src/ATen/native/README.md.
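As a quick illustration of the `variants: function, method` field above (my own example, not from the original article), the same op is exposed both as a free function and as a tensor method:

import torch

x = torch.tensor([2.0, 3.0, 1.0])
# "variants: function, method" means both spellings resolve to the same kernel.
print(torch.cummin(x, dim=0))
print(x.cummin(dim=0))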

backward

  
# tools/autograd/derivatives.yaml  
- name: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)  
  self: cummaxmin_backward(grad, self, indices, dim)
  values: self_t.gather(dim, indices)
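To make the `self: cummaxmin_backward(grad, self, indices, dim)` entry concrete, here is a minimal autograd check (my own sketch, run on CPU): the gradient w.r.t. the input is the output gradient scattered back to the positions stored in indices.

import torch

x = torch.tensor([3.0, 1.0, 2.0, 0.5], requires_grad=True)
values, indices = torch.cummin(x, dim=0)   # indices = [0, 1, 1, 3]
values.backward(torch.ones_like(values))

# cummaxmin_backward amounts to scattering the upstream gradient
# onto the recorded indices:
expected = torch.zeros(4).scatter_add(0, indices, torch.ones(4))
print(x.grad)     # tensor([1., 2., 0., 1.])
print(expected)   # same values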

at::native::cummin

  
// aten/src/ATen/native/ReduceOps.cpp  
std::tuple<Tensor, Tensor> cummin(const Tensor& self, Dimname dim) {  
  return at::cummin(self, dimname_to_position(self, dim));  
}  
  
std::tuple<Tensor&, Tensor&> cummin_out(const Tensor& self, Dimname dim, Tensor& values, Tensor& indices) {
  return at::cummin_out(values, indices, self, dimname_to_position(self, dim));  
}  

  
// aten/src/ATen/native/ReduceOps.cpp  
std::tuple<Tensor, Tensor> cummin(const Tensor& self, int64_t dim) {
  auto values = at::empty(self.sizes(), self.options());  
  auto indices = at::empty(self.sizes(), self.options().dtype(at::kLong));  
  at::cummin_out(values, indices, self, dim);  
  return std::make_tuple(values, indices);  
}  

  
std::tuple<Tensor&, Tensor&> cummin_out(const Tensor& self, int64_t dim, Tensor& values, Tensor& indices) {
  check_scalar_type_device_layout_equal(values, self);  
  check_scalar_type_device_layout_equal(indices, at::empty({0}, self.options().dtype(at::kLong)));  
  {  
    NoNamesGuard guard;  
    at::native::resize_output(values, self.sizes());  
    at::native::resize_output(indices, self.sizes());  
    if(self.dim() == 0) {  
      values.fill_(self);  
      indices.fill_(0);  
    } else if(self.numel() != 0) {  
      dim = maybe_wrap_dim(dim, self.dim());  
      at::_cummin_helper(self, values, indices, dim);  
    }  
  }  
  namedinference::propagate_names(values, self);  
  namedinference::propagate_names(indices, self);  
  return std::forward_as_tuple(values, indices);  
}  
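Two behaviors of cummin_out are easy to observe from Python (a small sketch of my own, not part of the original article): the self.dim() == 0 branch fills values with the scalar itself and indices with 0, and the out= variant writes into pre-allocated tensors after resizing them to self.sizes().

import torch

# 0-dim input: values == the scalar, indices == 0 (the self.dim() == 0 branch).
print(torch.cummin(torch.tensor(2.5), dim=0))

# out= variant: values/indices are resized to self.sizes(); indices must be int64.
v = torch.empty(4)
i = torch.empty(4, dtype=torch.long)
torch.cummin(torch.tensor([3.0, 1.0, 2.0, 0.5]), dim=0, out=(v, i))
print(v, i)   # tensor([3., 1., 1., 0.5]) tensor([0, 1, 1, 3])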

at::_cummin_helper

  
# aten/src/ATen/native/native_functions.yaml
- func: _cummin_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()
  variants: function
  dispatch:
    CPU: cummin_helper_cpu
    CUDA: cummin_helper_cuda

cummin_helper_cpu

  
// aten/src/ATen/native/ReduceOps.cpp  
void cummin_helper_cpu(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) {
  AT_DISPATCH_ALL_TYPES_AND2(kBool, kBFloat16,
    self.scalar_type(), "cummin_cpu",
    [&] {
      at::native::tensor_dim_apply3<scalar_t, int64_t>(self, values, indices, dim, cummax_cummin_helper<scalar_t, int64_t, std::less_equal<scalar_t>>);
    });  
}  

cummax_cummin_helper
  
template<typename T1, typename T2, typename Operation>  
void cummax_cummin_helper(const T1* self_data, T1* values_data, T2* indices_data,
          int self_dim_size, int self_stride, int values_stride, int indices_stride) {
      Operation op;  
      T1 out = self_data[0];  
      int idx = 0;  
      for (const auto i : c10::irange(self_dim_size)) {  
        T1 curr_elem = self_data[i*self_stride];  
        if(isnan_(curr_elem) || (!isnan_(out) && op(curr_elem, out))) {  
            out = self_data[i*self_stride];  
            idx = i;  
        }  
        values_data[i*values_stride] = out;  
        indices_data[i*indices_stride] = idx;  
      }  
}  
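The loop keeps a running extremum out and its index idx for one 1-D slice: NaNs win immediately and then stick, because !isnan_(out) blocks any further update, and the <= comparison moves the index forward on ties. A Python sketch mirroring the CPU loop (cummin_1d_reference is a name of my own, for illustration only):

import math
import torch

def cummin_1d_reference(xs):
    # Mirrors cummax_cummin_helper with Operation = std::less_equal.
    values, indices = [], []
    out, idx = xs[0], 0
    for i, cur in enumerate(xs):
        if math.isnan(cur) or (not math.isnan(out) and cur <= out):
            out, idx = cur, i
        values.append(out)
        indices.append(idx)
    return values, indices

a = [3.0, 1.0, 1.0, 2.0, float("nan"), 0.5]
print(cummin_1d_reference(a))
print(torch.cummin(torch.tensor(a), dim=0))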

cummin_helper_cuda

  
// aten/src/ATen/native/cuda/ScanKernels.cpp  
void cummin_helper_cuda(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) {
  TensorArg output_arg{ values, "output", 1 };  
  TensorArg indices_arg{ indices, "indices", 2 };  
  TensorArg input_arg{ self, "input", 3 };  
  checkAllSameGPU(__func__, {output_arg, indices_arg, input_arg});  
  
  auto values_ = contiguous_out_arg(values);  
  auto indices_ = contiguous_out_arg(indices);  
  launch_cummin_cuda_kernel(self, *values_, *indices_, dim);  
  if (!values.is_same(*values_)) {  
    values.copy_(*values_);  
  }  
  if (!indices.is_same(*indices_)) {  
    indices.copy_(*indices_);  
  }  
}  

launch_cummin_cuda_kernel

  
// aten/src/ATen/native/cuda/ScanKernels.cu  
void launch_cummin_cuda_kernel(const TensorBase& self, const TensorBase& values, const TensorBase& indices, int64_t dim) {
  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16,
    self.scalar_type(), "cummin_cuda", [&]() {
    scalar_t init = self.is_floating_point() ? std::numeric_limits<scalar_t>::infinity() : std::numeric_limits<scalar_t>::max();
    scan_dim_with_indices<scalar_t>(self, values, indices, dim, init, std::less_equal<scalar_t>());
  });  
}  

  
template<typename scalar_t, typename BinaryFunction>
void scan_dim_with_indices(const TensorBase& self, const TensorBase& values, const TensorBase& indices,
     int64_t dim, scalar_t init, BinaryFunction binary_op) {
  int ndim = self.dim();
  auto self_ = self.expect_contiguous();
  TORCH_INTERNAL_ASSERT(values.is_contiguous() && indices.is_contiguous());
  if (dim == ndim - 1) {
    scan_innermost_dim_with_indices<scalar_t>(*self_, values, indices, init, binary_op);
  } else {
    scan_outer_dim_with_indices<scalar_t>(*self_, values, indices, dim, init, binary_op);
  }  
}  
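Two details of the CUDA path are worth noting (my reading of the code above, not statements from the original article): launch_cummin_cuda_kernel seeds the scan with the identity element of min, +inf for floating types and the dtype's maximum for integral types, so the first real element always replaces it; and scan_dim_with_indices dispatches to a kernel specialized for the innermost (contiguous) dimension versus outer dimensions. A small sketch of the identity choice (cummin_identity is a hypothetical helper of my own):

import torch

# Mirrors the `init` expression: min(init, x) == x for every x of the dtype
# (bool is handled separately in C++ via numeric_limits and is omitted here).
def cummin_identity(dtype):
    if dtype.is_floating_point:
        return float("inf")
    return torch.iinfo(dtype).max

print(cummin_identity(torch.float32))  # inf
print(cummin_identity(torch.int32))    # 2147483647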

at::native::cummaxmin_backward

  
//  aten/src/ATen/native/ReduceOps.cpp  
Tensor cummaxmin_backward(const Tensor& grad, const Tensor& input, const Tensor& indices, int64_t dim) {
  if (input.numel() == 0) {  
    return input;  
  }  
  auto result = at::zeros(input.sizes(), input.options());  
  
  // for composite compliance, use out-of-place variant of  
  // `scatter_add` if `indices` or `grad` is a Tensor Subclass.
  if (areAnyTensorSubclassLike({indices, grad})) {  
    return result.scatter_add(dim, indices, grad);  
  }  
  return result.scatter_add_(dim, indices, grad);  
}  
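A line-by-line Python re-expression of cummaxmin_backward (an illustrative sketch of my own, not the actual binding):

import torch

def cummaxmin_backward_py(grad, input, indices, dim):
    # Empty input: gradient is empty as well.
    if input.numel() == 0:
        return input
    # Route each upstream gradient entry to the index where the running
    # minimum/maximum was taken from.
    return torch.zeros_like(input).scatter_add_(dim, indices, grad)

x = torch.tensor([[3.0, 1.0, 2.0], [0.5, 4.0, 0.2]])
values, indices = torch.cummin(x, dim=1)
print(cummaxmin_backward_py(torch.ones_like(values), x, indices, dim=1))
# tensor([[1., 2., 0.],
#         [2., 0., 1.]])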

scatter_add

  
# aten/src/ATen/native/native_functions.yaml
- func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor  
  structured_delegate: scatter_add.out  
  variants: function, method  
  
- func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)  
  structured_delegate: scatter_add.out  
  variants: method  
  
- func: scatter_add.out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!)  
  structured: True  
  variants: function  
  dispatch:  
    CPU, CUDA: scatter_add  
    MPS: scatter_add_mps_out  
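For reference, the semantics scatter_add provides in the 1-D case (a quick example of my own): out[index[i]] += src[i].

import torch

out = torch.zeros(3)
index = torch.tensor([0, 1, 1, 2])
src = torch.tensor([1.0, 2.0, 3.0, 4.0])
# Duplicate indices accumulate: position 1 receives 2.0 + 3.0.
print(out.scatter_add(0, index, src))   # tensor([1., 5., 4.])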

scatter_add

  
// aten/src/ATen/TensorMeta.h  
// Use this to define the prototype for an implementation.  This takes only  
// one argument, which is the name of the dispatch key entry you're  
// implementing.  
//  
// Example usage:  
//  
//    TORCH_IMPL_FUNC(add_cpu) (
//      Tensor& result, const Tensor& self, const Tensor& other  
//    ) {  
//      ... do the actual implementation ...  
//    }  
//  
#define TORCH_IMPL_FUNC(name) void structured_##name::impl

  
// aten/src/ATen/TensorMeta.h  
// Use this to define the prototype for a meta function.  There are two  
// versions; one that takes one argument (just the operator name), or FUNC2  
// variant that takes two arguments (operator name and overload name).  
//  
// Example usage:  
//  
//    TORCH_META_FUNC2(add, Tensor) (
//      const Tensor& self, const Tensor& other
//    ) {
//      ... compute sizes and options ...
//      set_output(sizes, options);
//    }
//
#define TORCH_META_FUNC(name) void structured_##name::meta

  
// aten/src/ATen/native/TensorAdvancedIndexing.cpp  
TORCH_META_FUNC(scatter_add)  
(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
  scatter_meta_impl(*this, self, dim, index, src, "add");  
}  
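The meta function only validates arguments and computes the output's shape and dtype; no data is touched. This is observable with meta tensors (a small check of my own, assuming a recent PyTorch build):

import torch

self_ = torch.empty(2, 3, device="meta")
index = torch.empty(2, 3, dtype=torch.long, device="meta")
src = torch.empty(2, 3, device="meta")
# Only the meta function runs here: shape/dtype propagation, no computation.
out = torch.scatter_add(self_, 0, index, src)
print(out.shape, out.device)   # torch.Size([2, 3]) meta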

  
// aten/src/ATen/native/TensorAdvancedIndexing.cpp  
TORCH_IMPL_FUNC(scatter_add)  
(const Tensor& self,  
 int64_t dim,
 const Tensor& index,  
 const Tensor& src,  
 const Tensor& out) {  
  auto mut_out = const_cast<Tensor&>(out);
  dim = maybe_wrap_dim(dim, self.dim());  
  
  if (!self.is_same(mut_out)) {  
    mut_out.copy_(self);  
  }  
  
  if (index.numel() == 0) return;  
  
  if (globalContext().deterministicAlgorithms() && self.device().type() == DeviceType::CUDA && self.dim() == 1) {  
    TORCH_CHECK(index.dim() == 1 && src.dim() == 1, "index and src should be 1D tensors when self is a 1D tensor, "  
      "but their dims are ", index.dim(), " and ", src.dim(), ", respectively");  
    TORCH_CHECK(index.numel() == src.numel(), "index and src should have same number of elements for 1D tensors, "  
      "but got ", index.numel(), " versus ", src.numel());  
    TORCH_CHECK(dim == 0, "dim should be zero for 1D self tensor, but got ", dim);  
    torch::List<c10::optional<Tensor>> indices;  
    indices.reserve(1);  
    indices.push_back(index);  
    mut_out.index_put_(indices, src, true);  
  } else {  
    scatter_add_stub(self.device().type(), mut_out, dim, index, src);  
  }  
}  
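The deterministic 1-D branch rewrites the scatter as index_put_ with accumulate=True; for 1-D tensors the two produce the same result (a sketch of the equivalence, my own example):

import torch

index = torch.tensor([0, 1, 1, 2])
src = torch.tensor([1.0, 2.0, 3.0, 4.0])

out_scatter = torch.zeros(3).scatter_add_(0, index, src)
out_index_put = torch.zeros(3).index_put_((index,), src, accumulate=True)
print(torch.equal(out_scatter, out_index_put))   # True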

scatter_add_stub

  
// aten/src/ATen/native/cpu/ScatterGatherKernel.cpp  
REGISTER_DISPATCH(scatter_add_stub, &scatter_add_cpu_kernel);  
  
void scatter_add_cpu_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
  cpu_scatter_gather_base_kernel<>()(
    self, dim, index, src,
    "scatter_add_", reduce_add);
}

  
// aten/src/ATen/native/cuda/ScatterGatherKernel.cu  
REGISTER_DISPATCH(scatter_add_stub, &scatter_add_cuda_kernel);  
  
void scatter_add_cuda_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
  // See Note [Writing Nondeterministic Operations]
  // Nondeterministic because of atomicAdd usage
  globalContext().alertNotDeterministic("scatter_add_cuda_kernel");
  cuda_scatter_gather_base_kernel</*is_scatter_like=*/true, /*cast_to_opaque=*/false>()(
    self, dim, index, src,
    "scatter_add_cuda_", reduce_add);
}  
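Because the CUDA kernel accumulates with atomicAdd, the summation order over duplicate indices is not fixed, so floating-point results can differ between runs; alertNotDeterministic turns this into an error when deterministic algorithms are requested. A hedged sketch of how that surfaces to users (behavior assumed from the code above, not verified output):

import torch

# With deterministic mode enabled, scatter_add_ on a CUDA tensor that is not
# 1-D (so the index_put_ fallback above does not apply) is expected to raise
# a RuntimeError pointing at scatter_add_cuda_kernel.
torch.use_deterministic_algorithms(True)
if torch.cuda.is_available():
    out = torch.zeros(2, 3, device="cuda")
    index = torch.zeros(2, 3, dtype=torch.long, device="cuda")
    src = torch.ones(2, 3, device="cuda")
    try:
        out.scatter_add_(1, index, src)
    except RuntimeError as e:
        print(e)   # mentions the nondeterministic scatter_add_cuda_kernel
torch.use_deterministic_algorithms(False)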

cpu_scatter_gather_base_kernel

  
// aten/src/ATen/native/cpu/ScatterGatherKernel.cpp  
template <bool is_scatter_like = true>  
struct cpu_scatter_gather_base_kernel {  
  
  template <typename func_t>
  void operator()(const Tensor& self, int64_t dim,
    const Tensor& index, const Scalar& value,
    const std::string& method_name, func_t& kernel_func) {
    ...
  }

  template <typename func_t>
  void operator()(const Tensor& self, int64_t dim,
    const Tensor& index, const Tensor& src,
    const std::string& method_name, func_t& kernel_func) {
    ...
  }

  void operator()(const Tensor& self, int64_t dim,
    const Tensor& index, const Tensor& src,
    const std::string& method_name, ReduceMinimum& kernel_func) {
  
    auto iter = TensorIteratorConfig()  
      .check_all_same_dtype(false)  
      .resize_outputs(false)  
      // NOLINTNEXTLINE(bugprone-argument-comment)  
      .declare_static_shape(index.sizes(), /*squash_dim=*/dim)
      .add_output(self)  
      .add_input(src)  
      .add_input(index)  
      .build();  
  
    auto self_dim_stride = ensure_nonempty_stride(self, dim);  
    auto self_dim_size = ensure_nonempty_size(self, dim);  
  
    auto index_dim_stride = ensure_nonempty_stride(index, dim);  
    auto index_dim_size = ensure_nonempty_size(index, dim);  
  
    auto src_dim_stride = ensure_nonempty_stride(src, dim);  
    auto src_dim_size = ensure_nonempty_size(src, dim);  
  
    auto index_upper_bound = is_scatter_like ? self_dim_size : src_dim_size;  
  
    int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / index_dim_size);
  
    AT_DISPATCH_ALL_TYPES_AND3(  
      ScalarType::Bool, ScalarType::Half, ScalarType::BFloat16, iter.dtype(),  
      "scatter\_gather\_tensor\_cpu\_reduce\_amin", [&] {  
        constexpr auto SELF_ITER_STRIDE_IDX = 0;  
        constexpr auto INDEX_ITER_STRIDE_IDX = 2;  
        constexpr auto SRC_ITER_STRIDE_IDX = 1;  
        auto loop = [&](char** data, const int64_t* strides, int64_t n) {
          auto* self_data_bytes = data[SELF_ITER_STRIDE_IDX];  
          auto* index_data_bytes = data[INDEX_ITER_STRIDE_IDX];  
          auto* src_data_bytes = data[SRC_ITER_STRIDE_IDX];  
          // we change the order of TensorIterator-dim loop  
          // vs dim-TensorIterator loop order depending on  
          // whether dim is the last dimension  
          if (dim == self.dim() - 1) {
            for (const auto nelem : c10::irange(n)) {  
              (void)nelem; //Suppress unused variable warning  
              // dim loop is a separate code block  
              // for better performance  
              _cpu_scatter_gather_dim_loop<is_scatter_like>()(  
                 (scalar_t*)self_data_bytes, self_dim_stride,
                 (int64_t*)index_data_bytes, index_dim_stride,
                 (scalar_t*)src_data_bytes, src_dim_stride,
                 dim, index_dim_size, index_upper_bound,  
                 kernel_func  
               );  
  
              self_data_bytes += strides[SELF_ITER_STRIDE_IDX];  
              index_data_bytes += strides[INDEX_ITER_STRIDE_IDX];  
              src_data_bytes += strides[SRC_ITER_STRIDE_IDX];  
            }  
          }  
          else {  
            for (const auto i : c10::irange(index_dim_size)) {  
              auto* self_data = self_data_bytes;  
              auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride);
              auto* src_data = src_data_bytes;  
              for (const auto nelem : c10::irange(n)) {  
                (void)nelem; //Suppress unused variable warning  
                int64_t idx_dim = *(int64_t*)index_data;
                // we are not putting idx_dim in the error message because it disables
                // loop optimization in clang-7
                TORCH_CHECK(idx_dim >= 0 && idx_dim < index_upper_bound,
                            "index ", *(int64_t*)index_data,
                            " is out of bounds for dimension ", dim,  
                            " with size ", index_upper_bound);  
  
                kernel_func(  
                  (scalar_t*)self_data + (is_scatter_like ? idx_dim : i) * self_dim_stride,
                  (scalar_t*)src_data + (is_scatter_like ? i : idx_dim) * src_dim_stride);
  
                self_data += strides[SELF_ITER_STRIDE_IDX];  
                index_data += strides[INDEX_ITER_STRIDE_IDX];  
                src_data += strides[SRC_ITER_STRIDE_IDX];  
              }  
            }  
          }  
        };  
        iter.for_each(loop, grain_size);  
      }  
    );  
  }  
};  
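Stripped of the TensorIterator and stride bookkeeping, what the scatter-like loop computes is: for every position of index, accumulate src at that position into self with the dim-th coordinate replaced by the index value. A slow Python reference (scatter_add_reference is a name of my own, for illustration only):

import itertools
import torch

def scatter_add_reference(self, dim, index, src):
    # For every position `pos` of `index`, replace pos[dim] with index[pos]
    # and accumulate src[pos] there. This is the scatter-like direction
    # (is_scatter_like == true) of the kernel above.
    out = self.clone()
    for pos in itertools.product(*(range(s) for s in index.shape)):
        target = list(pos)
        target[dim] = int(index[pos])
        out[tuple(target)] += src[pos]
    return out

self_ = torch.zeros(2, 3)
index = torch.tensor([[0, 1, 2], [2, 1, 0]])
src = torch.arange(6, dtype=torch.float32).reshape(2, 3)
print(scatter_add_reference(self_, 1, index, src))
print(torch.scatter_add(self_, 1, index, src))   # matches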
