This article focuses on the implementation of the torch.cummin API.
torch.cummin(input, dim, *, out=None)

Returns a namedtuple (values, indices), where values is the cumulative minimum of the elements of input in the dimension dim, and indices is the index location of each minimum value found in the dimension dim.
>>> a = torch.randn(10)
>>> a
tensor([-0.2284, -0.6628, 0.0975, 0.2680, -1.3298, -0.4220, -0.3885, 1.1762,
0.9165, 1.6684])
>>> torch.cummin(a, dim=0)
torch.return_types.cummin(
values=tensor([-0.2284, -0.6628, -0.6628, -0.6628, -1.3298, -1.3298, -1.3298, -1.3298,
-1.3298, -1.3298]),
indices=tensor([0, 1, 1, 1, 4, 4, 4, 4, 4, 4]))
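Before diving into the dispatch chain, a small sanity check makes the semantics concrete (my own illustration, not part of the official docs): cummin is just a running minimum along dim.

# Illustration only (not from the PyTorch docs): cummin is a running minimum.
import torch

a = torch.randn(10)
values, indices = torch.cummin(a, dim=0)   # the namedtuple unpacks directly

expected = torch.empty_like(a)
running = a[0]
for i in range(a.numel()):
    if a[i] < running:
        running = a[i]
    expected[i] = running

assert torch.equal(values, expected)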
forward
# aten/src/ATen/native/native_functions.yaml
- func: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)
  device_check: NoCheck   # TensorIterator
  variants: function, method
  dispatch:
    CompositeExplicitAutograd: cummin
For the meaning of these fields, see:
aten/src/ATen/native/README.md
backward
# tools/autograd/derivatives.yaml
- name: cummin(Tensor self, int dim) -> (Tensor values, Tensor indices)
  self: cummaxmin_backward(grad, self, indices, dim)
  values: self_t.gather(dim, indices)
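The two entries cover both differentiation modes: `self:` is the reverse-mode formula (cummaxmin_backward, analyzed below), and `values:` is the forward-mode rule, which simply gathers the input tangent at the recorded indices. A hedged forward-mode sketch (my own; it assumes forward AD is enabled for cummin, which the `values:` entry implies):

# Sketch of the forward-mode rule `values: self_t.gather(dim, indices)`.
import torch
from torch.autograd.forward_ad import dual_level, make_dual, unpack_dual

x = torch.tensor([3.0, 1.0, 2.0, 0.5])
t = torch.tensor([10.0, 20.0, 30.0, 40.0])   # tangent of x

with dual_level():
    values, indices = torch.cummin(make_dual(x, t), dim=0)
    _, tangent = unpack_dual(values)
    assert torch.equal(tangent, t.gather(0, indices))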
at::native::cummin
// aten/src/ATen/native/ReduceOps.cpp
std::tuple<Tensor, Tensor> cummin(const Tensor& self, Dimname dim) {
  return at::cummin(self, dimname_to_position(self, dim));
}

std::tuple<Tensor&, Tensor&> cummin_out(const Tensor& self, Dimname dim, Tensor& values, Tensor& indices) {
  return at::cummin_out(values, indices, self, dimname_to_position(self, dim));
}

// aten/src/ATen/native/ReduceOps.cpp
std::tuple<Tensor, Tensor> cummin(const Tensor& self, int64_t dim) {
  auto values = at::empty(self.sizes(), self.options());
  auto indices = at::empty(self.sizes(), self.options().dtype(at::kLong));
  at::cummin_out(values, indices, self, dim);
  return std::make_tuple(values, indices);
}

std::tuple<Tensor&, Tensor&> cummin_out(const Tensor& self, int64_t dim, Tensor& values, Tensor& indices) {
  check_scalar_type_device_layout_equal(values, self);
  check_scalar_type_device_layout_equal(indices, at::empty({0}, self.options().dtype(at::kLong)));
  {
    NoNamesGuard guard;
    at::native::resize_output(values, self.sizes());
    at::native::resize_output(indices, self.sizes());
    if (self.dim() == 0) {
      values.fill_(self);
      indices.fill_(0);
    } else if (self.numel() != 0) {
      dim = maybe_wrap_dim(dim, self.dim());
      at::_cummin_helper(self, values, indices, dim);
    }
  }
  namedinference::propagate_names(values, self);
  namedinference::propagate_names(indices, self);
  return std::forward_as_tuple(values, indices);
}
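The out-of-place overload just allocates values/indices and defers to cummin_out, which checks dtype/device/layout, resizes the outputs, handles the 0-dim and empty cases, and otherwise calls _cummin_helper. From Python, the out= tuple is presumably routed to this out overload; a quick sketch of that path (my own illustration):

# Sketch of the out= path (my own illustration).
import torch

a = torch.randn(4, 5)
values = torch.empty(0)                      # resized to a.sizes() by resize_output
indices = torch.empty(0, dtype=torch.long)   # must be int64, as checked above
torch.cummin(a, dim=1, out=(values, indices))

assert values.shape == a.shape and indices.dtype == torch.long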
at::_cummin_helper
# aten/src/ATen/native/native_functions.yaml
- func: _cummin_helper(Tensor self, Tensor(a!) values, Tensor(b!) indices, int dim) -> ()
  variants: function
  dispatch:
    CPU: cummin_helper_cpu
    CUDA: cummin_helper_cuda
cummin_helper_cpu
// aten/src/ATen/native/ReduceOps.cpp
void cummin_helper_cpu(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) {
  AT_DISPATCH_ALL_TYPES_AND2(kBool, kBFloat16,
    self.scalar_type(), "cummin_cpu",
    [&] {
      at::native::tensor_dim_apply3<scalar_t, int64_t>(self, values, indices, dim, cummax_cummin_helper<scalar_t, int64_t, std::less_equal<scalar_t>>);
    });
}
cummax_cummin_helper
template<typename T1, typename T2, typename Operation>
void cummax_cummin_helper(const T1* self_data, T1* values_data, T2* indices_data,
                          int self_dim_size, int self_stride, int values_stride, int indices_stride) {
  Operation op;
  T1 out = self_data[0];
  int idx = 0;
  for (const auto i : c10::irange(self_dim_size)) {
    T1 curr_elem = self_data[i * self_stride];
    if (isnan_(curr_elem) || (!isnan_(out) && op(curr_elem, out))) {
      out = self_data[i * self_stride];
      idx = i;
    }
    values_data[i * values_stride] = out;
    indices_data[i * indices_stride] = idx;
  }
}
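tensor_dim_apply3 applies this helper to every 1-D slice along dim. In plain Python the per-slice loop looks like this (my own rendering): a running minimum that propagates NaN (once a NaN is seen it poisons the rest of the scan) and, because the comparator is std::less_equal, records the last index among equal minima.

# Python rendering of cummax_cummin_helper specialized for cummin.
import math

def cummin_1d(xs):
    values, indices = [], []
    out, idx = xs[0], 0
    for i, cur in enumerate(xs):
        if math.isnan(cur) or (not math.isnan(out) and cur <= out):
            out, idx = cur, i
        values.append(out)
        indices.append(idx)
    return values, indices

# cummin_1d([2.0, 2.0, 3.0, float('nan'), 1.0])
# -> values  [2.0, 2.0, 2.0, nan, nan]
#    indices [0, 1, 1, 3, 3]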
cummin_helper_cuda
// aten/src/ATen/native/cuda/ScanKernels.cpp
void cummin_helper_cuda(const Tensor& self, Tensor& values, Tensor& indices, int64_t dim) {
  TensorArg output_arg{ values, "output", 1 };
  TensorArg indices_arg{ indices, "indices", 2 };
  TensorArg input_arg{ self, "input", 3 };
  checkAllSameGPU(__func__, {output_arg, indices_arg, input_arg});

  auto values_ = contiguous_out_arg(values);
  auto indices_ = contiguous_out_arg(indices);
  launch_cummin_cuda_kernel(self, *values_, *indices_, dim);
  if (!values.is_same(*values_)) {
    values.copy_(*values_);
  }
  if (!indices.is_same(*indices_)) {
    indices.copy_(*indices_);
  }
}
launch_cummin_cuda_kernel
// aten/src/ATen/native/cuda/ScanKernels.cu
void launch_cummin_cuda_kernel(const TensorBase& self, const TensorBase& values, const TensorBase& indices, int64_t dim) {
  AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Bool, at::ScalarType::Half, at::ScalarType::BFloat16,
    self.scalar_type(), "cummin_cuda", [&]() {
      scalar_t init = self.is_floating_point() ? std::numeric_limits<scalar_t>::infinity() : std::numeric_limits<scalar_t>::max();
      scan_dim_with_indices<scalar_t>(self, values, indices, dim, init, std::less_equal<scalar_t>());
    });
}

template<typename scalar_t, typename BinaryFunction>
void scan_dim_with_indices(const TensorBase& self, const TensorBase& values, const TensorBase& indices, //int64_t dim) {
     int64_t dim, scalar_t init, BinaryFunction binary_op) {
  int ndim = self.dim();
  auto self_ = self.expect_contiguous();
  TORCH_INTERNAL_ASSERT(values.is_contiguous() && indices.is_contiguous());
  if (dim == ndim - 1) {
    scan_innermost_dim_with_indices<scalar_t>(*self_, values, indices, init, binary_op);
  } else {
    scan_outer_dim_with_indices<scalar_t>(*self_, values, indices, dim, init, binary_op);
  }
}
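Unlike the CPU loop, the CUDA path is written as a generic inclusive scan, so it needs an identity element init with min(init, x) == x for every x of the dtype; hence +inf for floating-point types and the dtype maximum for integral types. A small check of that reading (my own, not from the kernel):

# Why init is +inf / dtype max: it must be neutral for the min scan.
import torch

for dtype in (torch.float32, torch.int32):
    init = float('inf') if dtype.is_floating_point else torch.iinfo(dtype).max
    x = torch.arange(5, dtype=dtype)
    assert torch.equal(torch.minimum(torch.full_like(x, init), x), x)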
at::native::cummaxmin_backward
// aten/src/ATen/native/ReduceOps.cpp
Tensor cummaxmin_backward(const Tensor& grad, const Tensor& input, const Tensor& indices, int64_t dim) {
  if (input.numel() == 0) {
    return input;
  }
  auto result = at::zeros(input.sizes(), input.options());

  // for composite compliance, use out-of-place variant of
  // `scatter_add` if `indices` or `grad` is a Tensor Subclass.
  if (areAnyTensorSubclassLike({indices, grad})) {
    return result.scatter_add(dim, indices, grad);
  }
  return result.scatter_add_(dim, indices, grad);
}
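So the reverse-mode gradient of cummin is simply the incoming grad scattered back (and accumulated) onto the positions where each running minimum was found. A quick check of that equivalence (my own example):

# The backward of cummin equals zeros(...).scatter_add_(dim, indices, grad).
import torch

x = torch.tensor([3.0, 1.0, 2.0, 0.5], requires_grad=True)
values, indices = torch.cummin(x, dim=0)
values.sum().backward()                      # grad w.r.t. values is all ones

grad = torch.ones_like(values)
manual = torch.zeros_like(x).scatter_add_(0, indices, grad)
assert torch.equal(x.grad, manual)           # tensor([1., 2., 0., 1.])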
scatter_add
# aten/src/ATen/native/native_functions.yaml
- func: scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor
  structured_delegate: scatter_add.out
  variants: function, method

- func: scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)
  structured_delegate: scatter_add.out
  variants: method

- func: scatter_add.out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) out) -> Tensor(a!)
  structured: True
  variants: function
  dispatch:
    CPU, CUDA: scatter_add
    MPS: scatter_add_mps_out
scatter_add
// aten/src/ATen/TensorMeta.h
// Use this to define the prototype for an implementation. This takes only
// one argument, which is the name of the dispatch key entry you're
// implementing.
//
// Example usage:
//
//    TORCH_IMPL_FUNC(add_cpu) (
//      Tensor& result, const Tensor& self, const Tensor& other
//    ) {
//      ... do the actual implementation ...
//    }
//
#define TORCH_IMPL_FUNC(name) void structured_##name::impl

// aten/src/ATen/TensorMeta.h
// Use this to define the prototype for a meta function. There are two
// versions; one that takes one argument (just the operator name), or FUNC2
// variant that takes two arguments (operator name and overload name).
//
// Example usage:
//
//    TORCH_META_FUNC2(add, Tensor) (
//      const Tensor& self, const Tensor& other
//    ) {
//      ... compute sizes and options ...
//      set_output(sizes, options);
//    }
//
#define TORCH_META_FUNC(name) void structured_##name::meta

// aten/src/ATen/native/TensorAdvancedIndexing.cpp
TORCH_META_FUNC(scatter_add)
(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
  scatter_meta_impl(*this, self, dim, index, src, "add");
}
// aten/src/ATen/native/TensorAdvancedIndexing.cpp
TORCH_IMPL_FUNC(scatter_add)
(const Tensor& self,
 int64_t dim,
 const Tensor& index,
 const Tensor& src,
 const Tensor& out) {
  auto mut_out = const_cast<Tensor&>(out);
  dim = maybe_wrap_dim(dim, self.dim());

  if (!self.is_same(mut_out)) {
    mut_out.copy_(self);
  }

  if (index.numel() == 0) return;

  if (globalContext().deterministicAlgorithms() && self.device().type() == DeviceType::CUDA && self.dim() == 1) {
    TORCH_CHECK(index.dim() == 1 && src.dim() == 1, "index and src should be 1D tensors when self is a 1D tensor, "
      "but their dims are ", index.dim(), " and ", src.dim(), ", respectively");
    TORCH_CHECK(index.numel() == src.numel(), "index and src should have same number of elements for 1D tensors, "
      "but got ", index.numel(), " versus ", src.numel());
    TORCH_CHECK(dim == 0, "dim should be zero for 1D self tensor, but got ", dim);
    torch::List<c10::optional<Tensor>> indices;
    indices.reserve(1);
    indices.push_back(index);
    mut_out.index_put_(indices, src, true);
  } else {
    scatter_add_stub(self.device().type(), mut_out, dim, index, src);
  }
}
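The deterministic CUDA branch rewrites a 1-D scatter_add as index_put_ with accumulate=True, which has a deterministic implementation; everything else goes through scatter_add_stub. The two formulations agree, which is easy to check on CPU (my own example):

# scatter_add_ vs. index_put_(..., accumulate=True) on a 1-D tensor.
import torch

out = torch.zeros(5)
index = torch.tensor([0, 2, 2, 4])
src = torch.tensor([1.0, 2.0, 3.0, 4.0])

a = out.clone().scatter_add_(0, index, src)
b = out.clone()
b.index_put_((index,), src, accumulate=True)

assert torch.equal(a, b)   # tensor([1., 0., 5., 0., 4.])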
scatter_add_stub
// aten/src/ATen/native/cpu/ScatterGatherKernel.cpp
REGISTER_DISPATCH(scatter_add_stub, &scatter_add_cpu_kernel);

void scatter_add_cpu_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
  cpu_scatter_gather_base_kernel<>()(
    self, dim, index, src,
    "scatter_add_", reduce_add);
}

// aten/src/ATen/native/cuda/ScatterGatherKernel.cu
REGISTER_DISPATCH(scatter_add_stub, &scatter_add_cuda_kernel);

void scatter_add_cuda_kernel(const Tensor& self, int64_t dim, const Tensor& index, const Tensor& src) {
  // See Note [Writing Nondeterministic Operations]
  // Nondeterministic because of atomicAdd usage
  globalContext().alertNotDeterministic("scatter_add_cuda_kernel");
  cuda_scatter_gather_base_kernel</*is_scatter_like=*/true, /*cast_to_opaque=*/false>()(
    self, dim, index, src,
    "scatter_add_cuda_", reduce_add);
}
cpu_scatter_gather_base_kernel
// aten/src/ATen/native/cpu/ScatterGatherKernel.cpp
template <bool is_scatter_like = true>
struct cpu_scatter_gather_base_kernel {
  template <typename func_t>
  void operator()(const Tensor& self, int64_t dim,
    const Tensor& index, const Scalar& value,
    const std::string& method_name, func_t& kernel_func) {
    ...
  }

  template <typename func_t>
  void operator()(const Tensor& self, int64_t dim,
    const Tensor& index, const Tensor& src,
    const std::string& method_name, func_t& kernel_func) {
    ...
  }

  void operator()(const Tensor& self, int64_t dim,
    const Tensor& index, const Tensor& src,
    const std::string& method_name, ReduceMinimum& kernel_func) {
    auto iter = TensorIteratorConfig()
      .check_all_same_dtype(false)
      .resize_outputs(false)
      // NOLINTNEXTLINE(bugprone-argument-comment)
      .declare_static_shape(index.sizes(), /*squash_dim=*/dim)
      .add_output(self)
      .add_input(src)
      .add_input(index)
      .build();

    auto self_dim_stride = ensure_nonempty_stride(self, dim);
    auto self_dim_size = ensure_nonempty_size(self, dim);

    auto index_dim_stride = ensure_nonempty_stride(index, dim);
    auto index_dim_size = ensure_nonempty_size(index, dim);

    auto src_dim_stride = ensure_nonempty_stride(src, dim);
    auto src_dim_size = ensure_nonempty_size(src, dim);

    auto index_upper_bound = is_scatter_like ? self_dim_size : src_dim_size;

    int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / index_dim_size);

    AT_DISPATCH_ALL_TYPES_AND3(
      ScalarType::Bool, ScalarType::Half, ScalarType::BFloat16, iter.dtype(),
      "scatter_gather_tensor_cpu_reduce_amin", [&] {
        constexpr auto SELF_ITER_STRIDE_IDX = 0;
        constexpr auto INDEX_ITER_STRIDE_IDX = 2;
        constexpr auto SRC_ITER_STRIDE_IDX = 1;
        auto loop = [&](char** data, const int64_t* strides, int64_t n) {
          auto* self_data_bytes = data[SELF_ITER_STRIDE_IDX];
          auto* index_data_bytes = data[INDEX_ITER_STRIDE_IDX];
          auto* src_data_bytes = data[SRC_ITER_STRIDE_IDX];
          // we change the order of TensorIterator-dim loop
          // vs dim-TensorIterator loop order depending on
          // whether dim is the last dimension
          if (dim == self.dim() - 1) {
            for (const auto nelem : c10::irange(n)) {
              (void)nelem; // Suppress unused variable warning
              // dim loop is a separate code block
              // for better performance
              _cpu_scatter_gather_dim_loop<is_scatter_like>()(
                (scalar_t*)self_data_bytes, self_dim_stride,
                (int64_t*)index_data_bytes, index_dim_stride,
                (scalar_t*)src_data_bytes, src_dim_stride,
                dim, index_dim_size, index_upper_bound,
                kernel_func
              );

              self_data_bytes += strides[SELF_ITER_STRIDE_IDX];
              index_data_bytes += strides[INDEX_ITER_STRIDE_IDX];
              src_data_bytes += strides[SRC_ITER_STRIDE_IDX];
            }
          }
          else {
            for (const auto i : c10::irange(index_dim_size)) {
              auto* self_data = self_data_bytes;
              auto* index_data = (char*)((int64_t*)index_data_bytes + i * index_dim_stride);
              auto* src_data = src_data_bytes;
              for (const auto nelem : c10::irange(n)) {
                (void)nelem; // Suppress unused variable warning
                int64_t idx_dim = *(int64_t*)index_data;
                // we are not putting idx_dim in the error message because it disables
                // loop optimization in clang-7
                TORCH_CHECK(idx_dim >= 0 && idx_dim < index_upper_bound,
                  "index ", *(int64_t*)index_data,
                  " is out of bounds for dimension ", dim,
                  " with size ", index_upper_bound);

                kernel_func(
                  (scalar_t*)self_data + (is_scatter_like ? idx_dim : i) * self_dim_stride,
                  (scalar_t*)src_data + (is_scatter_like ? i : idx_dim) * src_dim_stride);

                self_data += strides[SELF_ITER_STRIDE_IDX];
                index_data += strides[INDEX_ITER_STRIDE_IDX];
                src_data += strides[SRC_ITER_STRIDE_IDX];
              }
            }
          }
        };
        iter.for_each(loop, grain_size);
      }
    );
  }
};
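Stripped of the TensorIterator and stride bookkeeping, the scatter_add loop above amounts to: for every position in index, add src at that position into self at the same position with the dim coordinate replaced by the index value, after a bounds check. A reference implementation of that semantics (my own sketch, ignoring parallelism):

# Reference semantics of scatter_add (my own sketch).
import itertools
import torch

def scatter_add_reference(self, dim, index, src):
    out = self.clone()
    for pos in itertools.product(*(range(s) for s in index.shape)):
        tgt = list(pos)
        tgt[dim] = int(index[pos])
        # mirrors the TORCH_CHECK bounds test in the kernel above
        assert 0 <= tgt[dim] < out.shape[dim]
        out[tuple(tgt)] += src[pos]
    return out

x = torch.zeros(3, 4)
idx = torch.tensor([[0, 1, 2, 0], [2, 0, 0, 1]])
src = torch.ones(2, 4)
assert torch.equal(scatter_add_reference(x, 0, idx, src),
                   x.scatter_add(0, idx, src))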
References
- https://pytorch.org/docs/stable/generated/torch.cummin.html
- https://pytorch.org/docs/stable/generated/torch.Tensor.scatter_add_.html#torch.Tensor.scatter_add_