
Progress with CUDA version of torch_stablesort.

master
Stanislaw Adaszewski, 4 years ago
parent commit 62af8fc5f1
6 changed files with 231 additions and 213 deletions

  1. src/torch_stablesort/dispatch.h (+1, -0)
  2. src/torch_stablesort/setup.py (+8, -5)
  3. src/torch_stablesort/torch_stablesort_cpu.cpp (+115, -0)
  4. src/torch_stablesort/torch_stablesort_cpu.h (+1, -109)
  5. src/torch_stablesort/torch_stablesort_cuda.cu (+105, -0)
  6. src/torch_stablesort/torch_stablesort_cuda.h (+1, -99)

src/torch_stablesort/dispatch.h (+1, -0)

@@ -1,5 +1,6 @@
 #pragma once
 #include <torch/extension.h>
+#include <utility>
 template<template<typename T> class F, typename R, typename... Ts>
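Only the template declaration of dispatch() is visible in this hunk; its body is unchanged by the commit. For orientation, a dtype dispatcher matching this signature would look roughly as follows (a sketch, not the file's verbatim contents; the exact set of supported dtypes is an assumption). The newly added <utility> header is presumably for the std::forward used in this kind of forwarding:

    template<template<typename T> class F, typename R, typename... Ts>
    R dispatch(torch::Tensor input, Ts&& ...args) {
        // Instantiate F<T> for the runtime dtype of `input` and forward
        // the remaining arguments to its operator().
        switch (input.scalar_type()) {
        case torch::ScalarType::Double:
            return F<double>()(input, std::forward<Ts>(args)...);
        case torch::ScalarType::Float:
            return F<float>()(input, std::forward<Ts>(args)...);
        case torch::ScalarType::Long:
            return F<int64_t>()(input, std::forward<Ts>(args)...);
        case torch::ScalarType::Int:
            return F<int32_t>()(input, std::forward<Ts>(args)...);
        default:
            throw std::runtime_error("Unsupported dtype");
        }
    }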


src/torch_stablesort/setup.py (+8, -5)

@@ -3,8 +3,11 @@ from torch.utils import cpp_extension
 setup(name='torch_stablesort',
       py_modules=['torch_stablesort'],
-      ext_modules=[cpp_extension.CppExtension('torch_stablesort_cpp',
-          ['torch_stablesort.cpp'],
-          extra_compile_args=['-I/pstore/home/adaszews/scratch/thrust',
-              '-fopenmp', '-ggdb', '-std=c++1z'])],
-      cmdclass={'build_ext': cpp_extension.BuildExtension})
+      ext_modules=[cpp_extension.CUDAExtension('torch_stablesort_cpp',
+          ['torch_stablesort.cpp', 'torch_stablesort_cpu.cpp', 'torch_stablesort_cuda.cu'],
+          extra_compile_args={
+              'cxx': ['-fopenmp', '-ggdb', '-std=c++1z'],
+              'nvcc': ['-I/pstore/home/adaszews/scratch/thrust',
+                  '-ccbin', '/pstore/data/data_science/app/modules/anaconda3-2020.07/bin/x86_64-conda_cos6-linux-gnu-gcc',
+                  '-std=c++14']
+          })],
+      cmdclass={'build_ext': cpp_extension.BuildExtension})
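The build switches from CppExtension to CUDAExtension, compiling the two new per-device translation units alongside torch_stablesort.cpp with separate flag sets for the host compiler ('cxx') and nvcc. The glue file torch_stablesort.cpp itself is not part of this diff; presumably it exposes a single entry point that routes by device, along the lines of this sketch (the function and docstring names are assumptions, not shown in this commit):

    #include <torch/extension.h>
    #include "torch_stablesort_cpu.h"
    #include "torch_stablesort_cuda.h"

    // Sketch: pick the CPU or CUDA implementation based on the
    // device of the input tensor.
    std::vector<torch::Tensor> stable_sort(torch::Tensor input,
        int dim, bool descending,
        torch::optional<std::tuple<torch::Tensor, torch::Tensor>> out) {

        if (input.device().type() == torch::DeviceType::CUDA)
            return dispatch_cuda(input, dim, descending, out);
        return dispatch_cpu(input, dim, descending, out);
    }

    PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
        m.def("stable_sort", &stable_sort, "Row-wise stable sort");
    }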

src/torch_stablesort/torch_stablesort_cpu.cpp (+115, -0)

@@ -0,0 +1,115 @@
#include <torch/extension.h>

#include <algorithm>
#include <vector>

#include "dispatch.h"
#include "torch_stablesort_cpu.h"

template<bool descending, typename T>
struct stable_sort_impl {
    std::vector<torch::Tensor> operator()(
        torch::Tensor input,
        int dim,
        torch::optional<std::tuple<torch::Tensor, torch::Tensor>> out
    ) const {

        if (input.is_sparse())
            throw std::runtime_error("Sparse tensors are not supported");

        if (input.device().type() != torch::DeviceType::CPU)
            throw std::runtime_error("Only CPU tensors are supported");

        if (out != torch::nullopt)
            throw std::runtime_error("out argument is not supported");

        auto in = (dim != -1) ?
            torch::transpose(input, dim, -1) :
            input;

        // transpose() may return a non-contiguous view, so make the
        // tensor contiguous before flattening it to 2-D with view()
        in = in.contiguous();
        auto in_sizes = in.sizes().vec(); // copy: `in` is reassigned below
        in = in.view({ -1, in.size(-1) });

        auto in_outer_stride = in.stride(-2);
        auto in_inner_stride = in.stride(-1);
        auto pin = static_cast<T*>(in.data_ptr());

        auto x = in.clone();
        auto x_outer_stride = x.stride(-2);
        auto x_inner_stride = x.stride(-1);
        auto n_cols = x.size(1);
        auto n_rows = x.size(0);
        auto px = static_cast<T*>(x.data_ptr());

        auto y = torch::empty({ n_rows, n_cols },
            torch::TensorOptions().dtype(torch::kInt64));
        auto y_outer_stride = y.stride(-2);
        auto y_inner_stride = y.stride(-1);
        auto py = static_cast<int64_t*>(y.data_ptr());

        // sort each row independently
        #pragma omp parallel for
        for (decltype(n_rows) i = 0; i < n_rows; i++) {
            std::vector<int64_t> indices(n_cols);
            for (decltype(n_cols) k = 0; k < n_cols; k++) {
                indices[k] = k;
            }

            std::stable_sort(std::begin(indices), std::end(indices),
                [pin, i, in_outer_stride, in_inner_stride](const auto &a, const auto &b) {
                    auto va = pin[i * in_outer_stride + a * in_inner_stride];
                    auto vb = pin[i * in_outer_stride + b * in_inner_stride];
                    if constexpr(descending)
                        return (vb < va);
                    else
                        return (va < vb);
                });

            // scatter sorted values and the sorting permutation
            for (decltype(n_cols) k = 0; k < n_cols; k++) {
                py[i * y_outer_stride + k * y_inner_stride] = indices[k];
                px[i * x_outer_stride + k * x_inner_stride] =
                    pin[i * in_outer_stride + indices[k] * in_inner_stride];
            }
        }

        x = x.view(in_sizes);
        y = y.view(in_sizes);

        x = (dim == -1) ?
            x :
            torch::transpose(x, dim, -1).contiguous();

        y = (dim == -1) ?
            y :
            torch::transpose(y, dim, -1).contiguous();

        return { x, y };
    }
};

template <typename T>
struct stable_sort_impl_desc: stable_sort_impl<true, T> {};

template <typename T>
struct stable_sort_impl_asc: stable_sort_impl<false, T> {};

std::vector<torch::Tensor> dispatch_cpu(torch::Tensor input,
    int dim,
    bool descending,
    torch::optional<std::tuple<torch::Tensor, torch::Tensor>> out) {

    if (descending)
        return dispatch<stable_sort_impl_desc, std::vector<torch::Tensor>>(
            input, dim, out);
    else
        return dispatch<stable_sort_impl_asc, std::vector<torch::Tensor>>(
            input, dim, out);
}
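A worked example of what the CPU path returns (a hedged sketch that links against the code above; the expected outputs follow by hand from the comparator semantics):

    #include <torch/torch.h>
    #include <iostream>
    #include "torch_stablesort_cpu.h"

    int main() {
        // Ties at value 3 must keep their original left-to-right order.
        auto t = torch::tensor({{3.0f, 1.0f, 3.0f, 2.0f}});
        auto res = dispatch_cpu(t, -1, /*descending=*/false, torch::nullopt);
        std::cout << res[0] << std::endl; // values:  1 2 3 3
        std::cout << res[1] << std::endl; // indices: 1 3 0 2 (0 stays before 2)
        return 0;
    }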

src/torch_stablesort/torch_stablesort_cpu.h (+1, -109)

@@ -5,115 +5,7 @@
 #include <vector>
 #include <tuple>
 #include "dispatch.h"
-template<bool descending, typename T>
-struct stable_sort_impl {
-    std::vector<torch::Tensor> operator()(
-        torch::Tensor input,
-        int dim,
-        torch::optional<std::tuple<torch::Tensor, torch::Tensor>> out
-    ) const {
-        if (input.is_sparse())
-            throw std::runtime_error("Sparse tensors are not supported");
-        if (input.device().type() != torch::DeviceType::CPU)
-            throw std::runtime_error("Only CPU tensors are supported");
-        if (out != torch::nullopt)
-            throw std::runtime_error("out argument is not supported");
-        auto in = (dim != -1) ?
-            torch::transpose(input, dim, -1) :
-            input;
-        auto in_sizes = in.sizes();
-        // std::cout << "in_sizes: " << in_sizes << std::endl;
-        in = in.view({ -1, in.size(-1) }).contiguous();
-        auto in_outer_stride = in.stride(-2);
-        auto in_inner_stride = in.stride(-1);
-        auto pin = static_cast<T*>(in.data_ptr());
-        auto x = in.clone();
-        auto x_outer_stride = x.stride(-2);
-        auto x_inner_stride = x.stride(-1);
-        auto n_cols = x.size(1);
-        auto n_rows = x.size(0);
-        auto px = static_cast<T*>(x.data_ptr());
-        auto y = torch::empty({ n_rows, n_cols },
-            torch::TensorOptions().dtype(torch::kInt64));
-        auto y_outer_stride = y.stride(-2);
-        auto y_inner_stride = y.stride(-1);
-        auto py = static_cast<int64_t*>(y.data_ptr());
-        #pragma omp parallel for
-        for (decltype(n_rows) i = 0; i < n_rows; i++) {
-            std::vector<int64_t> indices(n_cols);
-            for (decltype(n_cols) k = 0; k < n_cols; k++) {
-                indices[k] = k;
-            }
-            std::stable_sort(std::begin(indices), std::end(indices),
-                [pin, i, in_outer_stride, in_inner_stride](const auto &a, const auto &b) {
-                    auto va = pin[i * in_outer_stride + a * in_inner_stride];
-                    auto vb = pin[i * in_outer_stride + b * in_inner_stride];
-                    if constexpr(descending)
-                        return (vb < va);
-                    else
-                        return (va < vb);
-                });
-            for (decltype(n_cols) k = 0; k < n_cols; k++) {
-                py[i * y_outer_stride + k * y_inner_stride] = indices[k];
-                px[i * x_outer_stride + k * x_inner_stride] =
-                    pin[i * in_outer_stride + indices[k] * in_inner_stride];
-            }
-        }
-        // std::cout << "Here" << std::endl;
-        x = x.view(in_sizes);
-        y = y.view(in_sizes);
-        x = (dim == -1) ?
-            x :
-            torch::transpose(x, dim, -1).contiguous();
-        y = (dim == -1) ?
-            y :
-            torch::transpose(y, dim, -1).contiguous();
-        // std::cout << "Here 2" << std::endl;
-        return { x, y };
-    }
-};
-template <typename T>
-struct stable_sort_impl_desc: stable_sort_impl<true, T> {};
-template <typename T>
-struct stable_sort_impl_asc: stable_sort_impl<false, T> {};
 std::vector<torch::Tensor> dispatch_cpu(torch::Tensor input,
     int dim,
     bool descending,
-    torch::optional<std::tuple<torch::Tensor, torch::Tensor>> out) {
-    if (descending)
-        return dispatch<stable_sort_impl_desc, std::vector<torch::Tensor>>(
-            input, dim, out);
-    else
-        return dispatch<stable_sort_impl_asc, std::vector<torch::Tensor>>(
-            input, dim, out);
-}
+    torch::optional<std::tuple<torch::Tensor, torch::Tensor>> out);

src/torch_stablesort/torch_stablesort_cuda.cu (+105, -0)

@@ -0,0 +1,105 @@
#include <cassert>
#include <vector>

#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>

#include "dispatch.h"
#include "torch_stablesort_cuda.h"

template<bool descending, typename T>
struct stable_sort_impl_cuda {
    std::vector<torch::Tensor> operator()(
        torch::Tensor input,
        int dim,
        torch::optional<std::tuple<torch::Tensor, torch::Tensor>> out
    ) const {

        if (input.is_sparse())
            throw std::runtime_error("Sparse tensors are not supported");

        if (input.device().type() != torch::DeviceType::CUDA)
            throw std::runtime_error("Only CUDA tensors are supported");

        if (out != torch::nullopt)
            throw std::runtime_error("out argument is not supported");

        auto x = input.clone();
        if (dim != -1)
            x = torch::transpose(x, dim, -1);

        // transpose() may return a non-contiguous view, so make the
        // tensor contiguous before flattening it to 2-D with view()
        x = x.contiguous();
        auto x_sizes = x.sizes().vec(); // copy: `x` is reassigned below
        x = x.view({ -1, x.size(-1) });

        auto x_outer_stride = x.stride(-2);
        auto x_inner_stride = x.stride(-1);
        auto n_cols = x.size(1);
        auto n_rows = x.size(0);
        auto px = x.data_ptr<T>();

        assert(x_inner_stride == 1);

        // index matrix: every row is 0, 1, ..., n_cols - 1
        auto y = torch::arange(0, n_cols, 1, torch::TensorOptions()
                .dtype(torch::kInt32)
                .device(x.device()))
            .repeat({ n_rows, 1 });

        auto y_outer_stride = y.stride(-2);
        auto y_inner_stride = y.stride(-1);
        auto py = y.data_ptr<int32_t>();

        assert(y_inner_stride == 1);

        // sort each row in place, carrying the indices along as values
        for (decltype(n_rows) i = 0; i < n_rows; i++) {
            auto ind_beg = thrust::device_pointer_cast(py + i * y_outer_stride);

            auto val_beg = thrust::device_pointer_cast(px + i * x_outer_stride);
            auto val_end = thrust::device_pointer_cast(px + i * x_outer_stride +
                n_cols * x_inner_stride);

            if constexpr(descending)
                thrust::stable_sort_by_key(thrust::device, val_beg, val_end, ind_beg,
                    thrust::greater<T>());
            else
                thrust::stable_sort_by_key(thrust::device, val_beg, val_end, ind_beg);
        }

        x = x.view(x_sizes);
        y = y.view(x_sizes);

        x = (dim == -1) ?
            x :
            torch::transpose(x, dim, -1).contiguous();

        y = (dim == -1) ?
            y :
            torch::transpose(y, dim, -1).contiguous();

        return { x, y };
    }
};

template <typename T>
struct stable_sort_impl_desc_cuda: stable_sort_impl_cuda<true, T> {};

template <typename T>
struct stable_sort_impl_asc_cuda: stable_sort_impl_cuda<false, T> {};

std::vector<torch::Tensor> dispatch_cuda(torch::Tensor input,
    int dim,
    bool descending,
    torch::optional<std::tuple<torch::Tensor, torch::Tensor>> out) {

    if (descending)
        return dispatch<stable_sort_impl_desc_cuda, std::vector<torch::Tensor>>(
            input, dim, out);
    else
        return dispatch<stable_sort_impl_asc_cuda, std::vector<torch::Tensor>>(
            input, dim, out);
}
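The kernel of the CUDA path is thrust::stable_sort_by_key, called once per row: it sorts the row's values in place and applies the identical permutation to the index row, preserving the relative order of equal keys. A standalone illustration with hypothetical values (compile with nvcc):

    #include <thrust/device_vector.h>
    #include <thrust/sort.h>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> h_keys{ 3.f, 1.f, 3.f, 2.f };
        std::vector<int> h_idx{ 0, 1, 2, 3 };
        thrust::device_vector<float> keys(h_keys.begin(), h_keys.end());
        thrust::device_vector<int> idx(h_idx.begin(), h_idx.end());

        // Sorts keys and carries idx along; equal keys keep their order.
        thrust::stable_sort_by_key(keys.begin(), keys.end(), idx.begin());

        for (int i = 0; i < 4; i++) // prints 1:1 2:3 3:0 3:2
            std::printf("%g:%d ", (float)keys[i], (int)idx[i]);
        std::printf("\n");
        return 0;
    }

Note that the loop in the hunk above issues one device sort per row, so rows are processed sequentially on the GPU.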

src/torch_stablesort/torch_stablesort_cuda.h (+1, -99)

@@ -2,108 +2,10 @@
 #include <torch/extension.h>
 #include <thrust/sort.h>
 #include <thrust/device_ptr.h>
 #include <thrust/execution_policy.h>
 #include <vector>
 #include <tuple>
 #include "dispatch.h"
-template<bool descending, typename T>
-struct stable_sort_impl_cuda {
-    std::vector<torch::Tensor> operator()(
-        torch::Tensor input,
-        int dim,
-        torch::optional<std::tuple<torch::Tensor, torch::Tensor>> out
-    ) const {
-        if (input.is_sparse())
-            throw std::runtime_error("Sparse tensors are not supported");
-        if (input.device().type() != torch::DeviceType::CUDA)
-            throw std::runtime_error("Only CUDA tensors are supported");
-        if (out != torch::nullopt)
-            throw std::runtime_error("out argument is not supported");
-        auto x = input.clone();
-        if (dim != -1)
-            x = torch::transpose(x, dim, -1);
-        auto x_sizes = x.sizes();
-        x = x.view({ -1, x.size(-1) }).contiguous();
-        auto x_outer_stride = x.stride(-2);
-        auto x_inner_stride = x.stride(-1);
-        auto n_cols = x.size(1);
-        auto n_rows = x.size(0);
-        auto px = x.data_ptr<T>();
-        assert(x_inner_stride == 1);
-        auto y = torch::repeat_interleave(
-            torch::arange(0, n_cols, 1, torch::TensorOptions()
-                .dtype(torch::kInt32)
-                .device(x.device())),
-            torch::ones(n_rows, torch::TensorOptions()
-                .dtype(torch::kInt32)
-                .device(x.device()))
-        );
-        auto y_outer_stride = y.stride(-2);
-        auto y_inner_stride = y.stride(-1);
-        auto py = y.data_ptr<int32_t>();
-        assert(y_inner_stride == 1);
-        for (decltype(n_rows) i = 0; i < n_rows; i++) {
-            auto ind_beg = thrust::device_pointer_cast(py + i * y_outer_stride);
-            auto val_beg = thrust::device_pointer_cast(px + i * x_outer_stride);
-            auto val_end = thrust::device_pointer_cast(px + i * x_outer_stride +
-                n_cols * x_inner_stride);
-            if constexpr(descending)
-                thrust::stable_sort_by_key(thrust::device, val_beg, val_end, ind_beg,
-                    thrust::greater<T>());
-            else
-                thrust::stable_sort_by_key(thrust::device, val_beg, val_end, ind_beg);
-        }
-        x = x.view(x_sizes);
-        y = y.view(x_sizes);
-        x = (dim == -1) ?
-            x :
-            torch::transpose(x, dim, -1).contiguous();
-        y = (dim == -1) ?
-            y :
-            torch::transpose(y, dim, -1).contiguous();
-        return { x, y };
-    }
-};
-template <typename T>
-struct stable_sort_impl_desc_cuda: stable_sort_impl_cuda<true, T> {};
-template <typename T>
-struct stable_sort_impl_asc_cuda: stable_sort_impl_cuda<false, T> {};
 std::vector<torch::Tensor> dispatch_cuda(torch::Tensor input,
     int dim,
     bool descending,
-    torch::optional<std::tuple<torch::Tensor, torch::Tensor>> out) {
-    if (descending)
-        return dispatch<stable_sort_impl_desc_cuda, std::vector<torch::Tensor>>(
-            input, dim, out);
-    else
-        return dispatch<stable_sort_impl_asc_cuda, std::vector<torch::Tensor>>(
-            input, dim, out);
-}
+    torch::optional<std::tuple<torch::Tensor, torch::Tensor>> out);
