#pragma once #include #include #include std::vector dispatch_cuda(torch::Tensor input, int dim, bool descending, torch::optional> out);