本门关注torch cummin API的实现。
def torch.cummin(input, dim, *, out=None)
Returns a namedtuple (values, indices) where values is the cumulative minimum of elements of input in the dimension dim. And indices is the index location of each minimum value found in the dimension dim.
>>> a = torch.randn(10)
>>> a
tensor([-0.2284, -0.6628, 0.0975, 0.2680, -1.3298, -0.4220, -0.3885, 1.1762,
0.9165, 1.6684])
>>> torch.cummin(a, dim=0)
torch.return_types.cummin(
values=tensor([-0.2284, -0.6628, -0.6628, -0.6628, -1.3298, -1.3298, -1.3298, -1.3298,
-1.3298, -1.3298]),
indices=tensor([0, 1, 1, 1, 4, 4, 4, 4, 4, 4]))
forward
# aten/src/ATen/native/native\_functions.yaml
- func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)
device\_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CompositeExplicitAutograd: cummin
关于字段的含义参看:
aten/src/ATen/native/README.md
backward
# tools/autograd/derivatives.yaml
- name: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)
self: cummaxmin\_backward(grad, self, indices, dim)
values: self\_t.gather(dim, indices)
at::native::cummin
// aten/src/ATen/native/ReduceOps.cpp
std::tuple<Tensor, Tensor> cummin(const Tensor& self, Dimname dim) {
return at::cummin(self, dimname_to_position(self, dim));
}
std::tuple<Tensor&, Tensor&> cummin\_out(const Tensor& self, Dimname dim, Tensor& values, Tensor& indices) {
return at::cummin_out(values, indices, self, dimname_to_position(self, dim));
}
// aten/src/ATen/native/ReduceOps.cpp
std::tuple<Tensor, Tensor> cummin(const Tensor& self, int64\_t dim) {
auto values = at::empty(self.sizes(), self.options());
auto indices = at::empty(self.sizes(), self.options().dtype(at::kLong));
at::cummin_out(values, indices, self, dim);
return std::make_tuple(values, indices);
}
std::tuple<Tensor&, Tensor&> cummin\_out(const Tensor& self, int64\_t dim, Tensor& values, Tensor& indices) {
check_scalar_type_device_layout_equal(values, self);
check_scalar_type_device_layout_equal(indices, at::empty({0}, self.options().dtype(at::kLong)));
{
NoNamesGuard guard;
at::native::resize_output(values, self.sizes());
at::native::resize_output(indices, self.sizes());
if(self.dim() == 0) {
values.fill_(self);
indices.fill_(0);
} else if(self.numel() != 0) {
dim = maybe_wrap_dim(dim, self.dim());
at::_cummin_helper(self, values, indices, dim);
}
}
namedinference::propagate_names(values, self);
namedinference::propagate_names(indices, self);
return std::forward_as_tuple(values, indices);
}
at::_cummin_helper
# aten/src/ATen/native/native\_functions.yaml
- func: \_cummin\_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()
variants: function
dispatch:
CPU: cummin\_helper\_cpu
CUDA: cummin\_helper\_cuda
cummin_helper_cpu
// aten/src/ATen/native/ReduceOps.cpp
void cummin\_helper\_cpu(const Tensor& self, Tensor& values, Tensor& indices, int64\_t dim) {
AT_DISPATCH_ALL_TYPES_AND2(kBool, kBFloat16,
self.scalar_type(), "cummin\_cpu",
[&] {
at::native::tensor_dim_apply3<scalar\_t, int64\_t>(self, values, indices, dim, cummax_cummin_helper<scalar\_t, int64\_t, std::less_equal<scalar\_t>>);
});
}
cummax_cummin_helper
template<typename T1, typename T2, typename Operation>
void cummax\_cummin\_helper(const T1* self\_data, T1* values\_data, T2* indices\_data,
int self\_dim\_size, int self\_stride, int values\_stride, int indices\_stride) {
Operation op;
T1 out = self_data[0];
int idx = 0;
for (const auto i : c10::irange(self_dim_size)) {
T1 curr_elem = self_data[i*self_stride];
if(isnan_(curr_elem) || (!isnan_(out) && op(curr_elem, out))) {
out = self_data[i*self_stride];
idx = i;
}
values_data[i*values_stride] = out;
indices_data[i*indices_stride] = idx;
}
}
cummin_helper_cuda
// aten/src/ATen/native/cuda/ScanKernels.cpp
void cummin\_helper\_cuda(const Tensor& self, Tensor& values, Tensor& indices, int64\_t dim) {
TensorArg output_arg{ values, "output", 1 };
TensorArg indices_arg{ indices, "indices", 2 };
TensorArg input_arg{ self, "input", 3 };
checkAllSameGPU(__func__, {output_arg, indices_arg, input_arg});
auto values_ = contiguous_out_arg(values);
auto indices_ = contiguous_out_arg(indices);
launch_cummin_cuda_kernel(self, *values_, *indices_, dim);
if (!values.is_same(*values_)) {
values.copy_(*values_);
}
if (!indices.is_same(*indices_)) {
indices.copy_(*indices_);
}
}
launch_cummin_cuda_kernel
// aten/src/ATen/native/cuda/ScanKernels.cu
void launch\_cummin\_cuda\_kernel(const TensorBase& self, const TensorBase& values, const TensorBase& indices, int64\_t dim) {
AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16,
self.scalar_type(), "cummin\_cuda", [&]() {
scalar\_t init = self.is_floating_point() ? std::numeric_limits<scalar\_t>::infinity() : std::numeric_limits<scalar\_t>::max();
scan_dim_with_indices<scalar\_t>(self, values, indices, dim, init, std::less_equal<scalar\_t>());
});
}
template<typename scalar\_t, typename BinaryFunction>
void scan\_dim\_with\_indices(const TensorBase& self, const TensorBase& values, const TensorBase& indices, //int64\_t dim) {
int64\_t dim, scalar\_t init, BinaryFunction binary\_op) {
int ndim = self.dim();
auto self_ = self.expect_contiguous();
TORCH_INTERNAL_ASSERT(values.is_contiguous() && indices.is_contiguous());
if (dim == ndim - 1) {
scan_innermost_dim_with_indices<scalar\_t>(*self_, values, indices, init, binary_op);
} else {
scan_outer_dim_with_indices<scalar\_t>(*self_, values, indices, dim, init, binary_op);
}
}
at::native::cummaxmin_backward
// aten/src/ATen/native/ReduceOps.cpp
Tensor cummaxmin\_backward(const Tensor& grad, const Tensor& input, const Tensor& indices, int64\_t dim) {
if (input.numel() == 0) {
return input;
}
auto result = at::zeros(input.sizes(), input.options());
// for composite compliance, use out-of-place variant of
// `scatter\_add` if `indices` or `grad` is a Tensor Subclass.
if (areAnyTensorSubclassLike({indices, grad})) {
return result.scatter_add(dim, indices, grad);
}
return result.scatter_add_(dim, indices, grad);
}
scatter_add
# aten/src/ATen/native/native\_functions.yaml
- func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
structured_delegate: scatter_add.out
variants: function, method
- func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
structured_delegate: scatter_add.out
variants: method
- func: scatter_add.out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!)
structured: True
variants: function
dispatch:
CPU, CUDA: scatter_add
MPS: scatter_add_mps_out
scatter_add
// aten/src/ATen/TensorMeta.h
// Use this to define the prototype for an implementation. This takes only
// one argument, which is the name of the dispatch key entry you're
// implementing.
//
// Example usage:
//
// TORCH\_IMPL\_FUNC(add\_cpu) (
// Tensor& result, const Tensor& self, const Tensor& other
// ) {
// ... do the actual implementation ...
// }
//
#define TORCH\_IMPL\_FUNC(name) void structured\_##name::impl
// aten/src/ATen/TensorMeta.h
// Use this to define the prototype for a meta function. There are two
// versions; one that takes one argument (just the operator name), or FUNC2
// variant that takes two arguments (operator name and overload name).
//
// Example usage:
//
// TORCH\_META\_FUNC2(add, Tensor) (
// const Tensor& self, const Tensor& other
// ) {
// ... compute sizes and options ...
// set\_output(sizes, options);
// }
//
#define TORCH\_META\_FUNC(name) void structured\_##name::meta
// aten/src/ATen/native/TensorAdvancedIndexing.cpp
TORCH_META_FUNC(scatter_add)
(const Tensor& self, int64\_t dim, const Tensor& index, const Tensor& src) {
scatter_meta_impl(*this, self, dim, index, src, "add");
}
// aten/src/ATen/native/TensorAdvancedIndexing.cpp
TORCH_IMPL_FUNC(scatter_add)
(const Tensor& self,
int64\_t dim,
const Tensor& index,
const Tensor& src,
const Tensor& out) {
auto mut_out = const\_cast<Tensor&>(out);
dim = maybe_wrap_dim(dim, self.dim());
if (!self.is_same(mut_out)) {
mut_out.copy_(self);
}
if (index.numel() == 0) return;
if (globalContext().deterministicAlgorithms() && self.device().type() == DeviceType::CUDA && self.dim() == 1) {
TORCH_CHECK(index.dim() == 1 && src.dim() == 1, "index and src should be 1D tensors when self is a 1D tensor, "
"but their dims are ", index.dim(), " and ", src.dim(), ", respectively");
TORCH_CHECK(index.numel() == src.numel(), "index and src should have same number of elements for 1D tensors, "
"but got ", index.numel(), " versus ", src.numel());
TORCH_CHECK(dim == 0, "dim should be zero for 1D self tensor, but got ", dim);
torch::List<c10::optional<Tensor>> indices;
indices.reserve(1);
indices.push_back(index);
mut_out.index_put_(indices, src, true);
} else {
scatter_add_stub(self.device().type(), mut_out, dim, index, src);
}
}
scatter_add_stub
// aten/src/ATen/native/cpu/ScatterGatherKernel.cpp
REGISTER_DISPATCH(scatter_add_stub, &scatter_add_cpu_kernel);
void scatter\_add\_cpu\_kernel(const Tensor& self, int64\_t dim, const Tensor& index, const Tensor& src) {
cpu_scatter_gather_base_kernel<>()(
self, dim, index, src,
"scatter\_add\_", reduce_add);
}
// aten/src/ATen/native/cuda/ScatterGatherKernel.cu
REGISTER_DISPATCH(scatter_add_stub, &scatter_add_cuda_kernel);
void scatter\_add\_cuda\_kernel(const Tensor& self, int64\_t dim, const Tensor& index, const Tensor& src) {
// See Note [Writing Nondeterministic Operations]
// Nondeterministic because of atomicAdd usage
globalContext().alertNotDeterministic("scatter\_add\_cuda\_kernel");
cuda_scatter_gather_base_kernel</*is_scatter_like=*/true, /*cast\_to\_opaque=*/false>()(
self, dim, index, src,
"scatter\_add\_cuda\_", reduce_add);
}
cpu_scatter_gather_base_kernel
// aten/src/ATen/native/cpu/ScatterGatherKernel.cpp
template <bool is_scatter_like = true>
struct cpu_scatter_gather_base_kernel {
template <typename func\_t>
void operator()(const Tensor& self, int64\_t dim,
const Tensor& index, const Scalar& value,
const std::string& method_name, func\_t& kernel_func) {
...
}
template <typename func\_t>
void operator()(const Tensor& self, int64\_t dim,
const Tensor& index, const Tensor& src,
const std::string& method_name, func\_t& kernel_func) {
...
}
void operator()(const Tensor& self, int64\_t dim,
const Tensor& index, const Tensor& src,
const std::string& method_name, ReduceMinimum& kernel_func) {
auto iter = TensorIteratorConfig()
.check_all_same_dtype(false)
.resize_outputs(false)
// NOLINTNEXTLINE(bugprone-argument-comment)
.declare_static_shape(index.sizes(), /*squash\_dim=*/dim)
.add_output(self)
.add_input(src)
.add_input(index)
.build();
auto self_dim_stride = ensure_nonempty_stride(self, dim);
auto self_dim_size = ensure_nonempty_size(self, dim);
auto index_dim_stride = ensure_nonempty_stride(index, dim);
auto index_dim_size = ensure_nonempty_size(index, dim);
auto src_dim_stride = ensure_nonempty_stride(src, dim);
auto src_dim_size = ensure_nonempty_size(src, dim);
auto index_upper_bound = is_scatter_like ? self_dim_size : src_dim_size;
int64\_t grain_size = std::max((int64\_t) 1, at::internal::GRAIN_SIZE / index_dim_size);
AT_DISPATCH_ALL_TYPES_AND3(
ScalarType::Bool, ScalarType::Half, ScalarType::BFloat16, iter.dtype(),
"scatter\_gather\_tensor\_cpu\_reduce\_amin", [&] {
constexpr auto SELF_ITER_STRIDE_IDX = 0;
constexpr auto INDEX_ITER_STRIDE_IDX = 2;
constexpr auto SRC_ITER_STRIDE_IDX = 1;
auto loop = [&](char** data, const int64\_t* strides, int64\_t n) {
auto* self_data_bytes = data[SELF_ITER_STRIDE_IDX];
auto* index_data_bytes = data[INDEX_ITER_STRIDE_IDX];
auto* src_data_bytes = data[SRC_ITER_STRIDE_IDX];
// we change the order of TensorIterator-dim loop
// vs dim-TensorIterator loop order depending on
// whether dim is the last dimension
if (dim== self.dim() - 1) {
for (const auto nelem : c10::irange(n)) {
(void)nelem; //Suppress unused variable warning
// dim loop is a separate code block
// for better performance
_cpu_scatter_gather_dim_loop<is_scatter_like>()(
(scalar\_t*)self_data_bytes, self_dim_stride,
(int64\_t*)index_data_bytes, index_dim_stride,
(scalar\_t*)src_data_bytes, src_dim_stride,
dim, index_dim_size, index_upper_bound,
kernel_func
);
self_data_bytes += strides[SELF_ITER_STRIDE_IDX];
index_data_bytes += strides[INDEX_ITER_STRIDE_IDX];
src_data_bytes += strides[SRC_ITER_STRIDE_IDX];
}
}
else {
for (const auto i : c10::irange(index_dim_size)) {
auto* self_data = self_data_bytes;
auto* index_data = (char*)((int64\_t*)index_data_bytes + i * index_dim_stride);
auto* src_data = src_data_bytes;
for (const auto nelem : c10::irange(n)) {
(void)nelem; //Suppress unused variable warning
int64\_t idx_dim = *(int64\_t*)index_data;
// we are not putting idx\_dim in the error message because it disables
// loop optimization in clang-7
TORCH_CHECK(idx_dim >= 0 && idx_dim < index_upper_bound,
"index ", *(int64\_t*)index_data,
" is out of bounds for dimension ", dim,
" with size ", index_upper_bound);
kernel_func(
(scalar\_t*)self_data + (is_scatter_like ? idx_dim : i) * self_dim_stride,
(scalar\_t*)src_data + (is_scatter_like ? i : idx_dim) * src_dim_stride);
self_data += strides[SELF_ITER_STRIDE_IDX];
index_data += strides[INDEX_ITER_STRIDE_IDX];
src_data += strides[SRC_ITER_STRIDE_IDX];
}
}
}
};
iter.for_each(loop, grain_size);
}
);
}
};
参考文献
- https://pytorch.org/docs/stable/generated/torch.cummin.html
- https://pytorch.org/docs/stable/generated/torch.Tensor.scatter\_add\_.html#torch.Tensor.scatter\_add\_
