Commit bb8fcbda by Wenjie Huang

remove submodules metis and gklib

parent 41b0c86a
.gitmodules
[submodule "third_party/METIS"]
    path = third_party/METIS
    url = https://github.com/KarypisLab/METIS
[submodule "third_party/GKlib"]
    path = third_party/GKlib
    url = https://github.com/KarypisLab/GKlib
CMakeLists.txt
@@ -3,6 +3,8 @@ cmake_minimum_required(VERSION 3.15)
 project(starrygl_ops VERSION 0.1)
 option(WITH_PYTHON "Link to Python when building" ON)
+option(WITH_CUDA "Link to CUDA when building" ON)
+option(WITH_METIS "Link to METIS when building" ON)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -17,24 +19,39 @@ if(WITH_PYTHON)
     include_directories(${Python3_INCLUDE_DIRS})
 endif()
+if(WITH_CUDA)
+    add_definitions(-DWITH_CUDA)
+    find_package(CUDA REQUIRED)
+    include_directories(${CUDA_INCLUDE_DIRS})
+    set(CUDA_LIBRARIES "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart.so")
+endif()
+if(WITH_METIS)
+    set(GKLIB_DIR "third_party/GKlib")
+    set(METIS_DIR "third_party/METIS")
+    add_subdirectory(${GKLIB_DIR})
+endif()
 find_package(Torch REQUIRED)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
-find_package(OpenMP REQUIRED)
-# add_subdirectory(third_party/cccl-2.2.0)
-# add_subdirectory(third_party/cutlass-3.2.1)
-# add_library(${PROJECT_NAME} SHARED csrc/custom_ops.cpp csrc/custom_kernel.cu)
-add_library(${PROJECT_NAME} SHARED csrc/custom_ops.cpp)
+include_directories(${TORCH_INCLUDE_DIRS})
+add_compile_options(${TORCH_CXX_FLAGS})
+# find_package(OpenMP REQUIRED)
+include_directories("csrc/include")
+file(GLOB_RECURSE UVM_SRCS "csrc/uvm/*.cpp")
+file(GLOB_RECURSE METIS_SRCS "csrc/uvm/*.cpp")
+add_library(${PROJECT_NAME} SHARED csrc/export.cpp ${UVM_SRCS} ${METIS_SRCS})
 if(WITH_PYTHON)
     find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
     target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_PYTHON_LIBRARY})
 endif()
 target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})
-target_link_libraries(${PROJECT_NAME} PRIVATE OpenMP::OpenMP_CXX)
-# target_link_libraries(${PROJECT_NAME} PRIVATE CCCL::CCCL)
+# target_link_libraries(${PROJECT_NAME} PRIVATE OpenMP::OpenMP_CXX)
 target_compile_definitions(${PROJECT_NAME} PRIVATE -DTORCH_EXTENSION_NAME=lib${PROJECT_NAME})
// #include <cub/block/block_reduce.cuh>
// Old extension entry point (csrc/custom_ops.cpp), superseded by csrc/export.cpp below.
#include <torch/extension.h>
#include <omp.h>

torch::Tensor add(torch::Tensor a, torch::Tensor b) {
    return a + b;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("add", &add, "a function implemented using pybind11");
}
#include "extension.h"
#include "uvm.h"
torch::Tensor add(torch::Tensor a, torch::Tensor b) {
return a + b;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("add", &add, "a function implemented using pybind11");
m.def("uvm_storage_new", &uvm_storage_new, "return storage of unified virtual memory");
m.def("uvm_storage_to_cuda", &uvm_storage_to_cuda, "share uvm storage with another cuda device");
m.def("uvm_storage_to_cpu", &uvm_storage_to_cpu, "share uvm storage with cpu");
m.def("uvm_storage_advise", &uvm_storage_advise, "apply cudaMemAdvise() to uvm storage");
m.def("uvm_storage_prefetch", &uvm_storage_prefetch, "apply cudaMemPrefetchAsync() to uvm storage");
py::enum_<cudaMemoryAdvise>(m, "cudaMemoryAdvise")
.value("cudaMemAdviseSetAccessedBy", cudaMemoryAdvise::cudaMemAdviseSetAccessedBy)
.value("cudaMemAdviseUnsetAccessedBy", cudaMemoryAdvise::cudaMemAdviseUnsetAccessedBy)
.value("cudaMemAdviseSetPreferredLocation", cudaMemoryAdvise::cudaMemAdviseSetPreferredLocation)
.value("cudaMemAdviseUnsetPreferredLocation", cudaMemoryAdvise::cudaMemAdviseUnsetPreferredLocation)
.value("cudaMemAdviseSetReadMostly", cudaMemoryAdvise::cudaMemAdviseSetReadMostly)
.value("cudaMemAdviseUnsetReadMostly", cudaMemoryAdvise::cudaMemAdviseUnsetReadMostly);
}
// Shared header (csrc/include/extension.h): pulls in the Torch extension and CUDA context APIs.
#pragma once
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#pragma once
#include "extension.h"

// Assert that a tensor lives on a CUDA device.
#define CHECK_CUDA(x) \
    AT_ASSERTM(x.device().is_cuda(), #x " must be a CUDA Tensor")
// UVM storage API (csrc/include/uvm.h), implemented under csrc/uvm/.
#pragma once
#include "extension.h"

// Allocate a new storage backed by cudaMallocManaged() on the given CUDA device.
at::Storage uvm_storage_new(const std::size_t size_bytes, const at::DeviceIndex cuda_device);
// Create aliasing views of the same managed allocation on another CUDA device or on the CPU.
at::Storage uvm_storage_to_cuda(const at::Storage& storage, const at::DeviceIndex cuda_device);
at::Storage uvm_storage_to_cpu(const at::Storage& storage);
// Apply cudaMemAdvise() / cudaMemPrefetchAsync() to the whole allocation.
void uvm_storage_advise(const at::Storage& storage, const cudaMemoryAdvise advise);
void uvm_storage_prefetch(const at::Storage& storage);
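A minimal usage sketch (not part of this commit) of the API above: allocate a UVM-backed float tensor on one device and prefetch it there. The make_uvm_tensor helper and the use of Tensor::set_() to wrap the storage are illustrative assumptions.

// Hypothetical example: a UVM-backed float tensor on device `dev`.
#include <torch/torch.h>
#include "uvm.h"

at::Tensor make_uvm_tensor(int64_t numel, at::DeviceIndex dev) {
    // One float per element, allocated with cudaMallocManaged() on `dev`.
    at::Storage storage = uvm_storage_new(numel * sizeof(float), dev);
    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, dev);
    // Point a fresh tensor at the managed storage (offset 0, contiguous stride).
    at::Tensor t = at::empty({0}, options);
    t.set_(storage, /*storage_offset=*/0, /*size=*/{numel}, /*stride=*/{1});
    // Prefer GPU residency for the pages, then prefetch them onto the device.
    uvm_storage_advise(storage, cudaMemAdviseSetPreferredLocation);
    uvm_storage_prefetch(storage);
    return t;
}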
#include <torch/torch.h>

// Placeholder stubs for METIS-based graph partitioning (not implemented yet);
// fail loudly instead of falling off the end of a non-void function.
at::Tensor metis_partition() {
    TORCH_CHECK(false, "metis_partition: not implemented");
}
at::Tensor metis_mt_partition() {
    TORCH_CHECK(false, "metis_mt_partition: not implemented");
}
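A hedged sketch (not part of this commit) of what metis_partition() might eventually do: run METIS_PartGraphKway() over a CSR graph held in int64 tensors. The tensor arguments, the metis_partition_sketch name, and a 64-bit IDXTYPEWIDTH build of METIS are all assumptions.

#include <torch/torch.h>
#include <metis.h>

at::Tensor metis_partition_sketch(at::Tensor rowptr, at::Tensor col, int64_t num_parts) {
    idx_t nvtxs  = rowptr.numel() - 1;  // CSR: rowptr has nvtxs + 1 entries
    idx_t ncon   = 1;                   // one balance constraint
    idx_t nparts = num_parts;
    idx_t objval = 0;
    at::Tensor part = at::empty({nvtxs}, rowptr.options());
    // Assumes METIS built with IDXTYPEWIDTH=64 so idx_t matches int64 tensors.
    int status = METIS_PartGraphKway(
        &nvtxs, &ncon,
        rowptr.data_ptr<idx_t>(), col.data_ptr<idx_t>(),
        nullptr, nullptr, nullptr,      // vwgt, vsize, adjwgt: unweighted graph
        &nparts, nullptr, nullptr,      // tpwgts, ubvec: default balance
        nullptr,                        // options: METIS defaults
        &objval, part.data_ptr<idx_t>());
    TORCH_CHECK(status == METIS_OK, "METIS_PartGraphKway failed with status ", status);
    return part;                        // partition id per vertex
}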
// Implementation of the UVM storage helpers declared in uvm.h
// (compiled via the "csrc/uvm/*.cpp" glob in the new CMakeLists.txt).
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

// Owns a cudaMallocManaged() allocation and frees it, on the owning device, when destroyed.
struct CUDAManagedContext {
    void* _ptr;
    at::DeviceIndex _dev;

    CUDAManagedContext(void* ptr, const at::DeviceIndex cuda_device): _ptr(ptr), _dev(cuda_device) {}

    ~CUDAManagedContext() {
        at::cuda::CUDAGuard device_guard(this->_dev);
        AT_CUDA_CHECK(cudaFree(this->_ptr));
    }

    static void release(void* ptr) {
        delete static_cast<CUDAManagedContext*>(ptr);
    }
};

// Holds a reference to the owning Storage, so aliasing Storages produced by
// uvm_storage_to_cuda()/uvm_storage_to_cpu() keep the allocation alive.
struct CUDAManagedContextViewer {
    at::Storage _storage;

    CUDAManagedContextViewer(at::Storage storage): _storage(std::move(storage)) {}

    static void release(void* ptr) {
        delete static_cast<CUDAManagedContextViewer*>(ptr);
    }
};
at::Storage uvm_storage_new(const std::size_t size_bytes, const at::DeviceIndex cuda_device) {
    at::cuda::CUDAGuard device_guard(cuda_device);

    void* ptr;
    AT_CUDA_CHECK(cudaMallocManaged(&ptr, size_bytes));

    // Inner storage: owns the managed allocation through CUDAManagedContext.
    auto storage = at::Storage(
        at::Storage::use_byte_size_t(),
        size_bytes,
        at::DataPtr(
            ptr, new CUDAManagedContext(ptr, cuda_device),
            &CUDAManagedContext::release,
            {at::DeviceType::CUDA, cuda_device}
        ),
        /* allocator= */ nullptr,
        /* resizable= */ false
    );

    // Outer storage: aliases the same pointer through a viewer that shares
    // ownership of the inner storage, so every view extends its lifetime.
    return at::Storage(
        at::Storage::use_byte_size_t(),
        size_bytes,
        at::DataPtr(
            ptr, new CUDAManagedContextViewer(storage),
            &CUDAManagedContextViewer::release,
            {at::DeviceType::CUDA, cuda_device}
        ),
        /* allocator= */ nullptr,
        /* resizable= */ false
    );
}

// A storage is UVM-backed iff its DataPtr deleter is the viewer's release function.
bool uvm_storage_check(const at::Storage& storage) {
    return storage.data_ptr().get_deleter() == &CUDAManagedContextViewer::release;
}
at::Storage uvm_storage_to_cuda(const at::Storage& storage, const at::DeviceIndex cuda_device) {
    AT_ASSERT(uvm_storage_check(storage), "must be uvm storage");
    auto& data_ptr = storage.data_ptr();
    auto ctx_view = static_cast<CUDAManagedContextViewer*>(data_ptr.get_context());

    // Same managed pointer, new viewer, reported as living on `cuda_device`.
    at::cuda::CUDAGuard device_guard(cuda_device);
    return at::Storage(
        at::Storage::use_byte_size_t(),
        storage.nbytes(),
        at::DataPtr(
            data_ptr.get(),
            new CUDAManagedContextViewer(ctx_view->_storage),
            &CUDAManagedContextViewer::release,
            {at::DeviceType::CUDA, cuda_device}
        ),
        /* allocator= */ nullptr,
        /* resizable= */ false
    );
}

at::Storage uvm_storage_to_cpu(const at::Storage& storage) {
    AT_ASSERT(uvm_storage_check(storage), "must be uvm storage");
    auto& data_ptr = storage.data_ptr();
    auto ctx_view = static_cast<CUDAManagedContextViewer*>(data_ptr.get_context());

    // Same managed pointer, new viewer, reported as a CPU storage.
    return at::Storage(
        at::Storage::use_byte_size_t(),
        storage.nbytes(),
        at::DataPtr(
            data_ptr.get(),
            new CUDAManagedContextViewer(ctx_view->_storage),
            &CUDAManagedContextViewer::release,
            {at::DeviceType::CPU}
        ),
        /* allocator= */ nullptr,
        /* resizable= */ false
    );
}
void uvm_storage_advise(const at::Storage& storage, const cudaMemoryAdvise advise) {
    AT_ASSERT(uvm_storage_check(storage), "must be uvm storage");

    // Advise for the device the storage reports, or for the CPU otherwise.
    at::cuda::OptionalCUDAGuard device_guard;
    at::DeviceIndex hint_device = cudaCpuDeviceId;
    if (storage.device_type() == at::DeviceType::CUDA) {
        hint_device = storage.device().index();
        device_guard.set_index(hint_device);
    }
    AT_CUDA_CHECK(cudaMemAdvise(
        storage.data(),
        storage.nbytes(),
        // static_cast<enum cudaMemoryAdvise>(advise),
        advise,
        hint_device
    ));
}

void uvm_storage_prefetch(const at::Storage& storage) {
    AT_ASSERT(uvm_storage_check(storage), "must be uvm storage");

    // Prefetch to the storage's device on its current stream, or to the CPU.
    at::cuda::OptionalCUDAGuard device_guard;
    cudaStream_t stream = 0;
    at::DeviceIndex hint_device = cudaCpuDeviceId;
    if (storage.device_type() == at::DeviceType::CUDA) {
        hint_device = storage.device().index();
        device_guard.set_index(hint_device);
        stream = at::cuda::getCurrentCUDAStream().stream();
    }
    AT_CUDA_CHECK(cudaMemPrefetchAsync(
        storage.data(),
        storage.nbytes(),
        hint_device,
        stream
    ));
}
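A small illustration (not part of this commit) of how the viewer sharing above behaves: uvm_storage_to_cpu() and uvm_storage_to_cuda() return storages that alias the same cudaMallocManaged() buffer, and the buffer is freed only after the original storage and every view are gone. The function name below is hypothetical, and the example assumes a second GPU (cuda:1) is present.

#include "uvm.h"

void share_uvm_storage_example(const at::Storage& uvm) {
    at::Storage cpu_view  = uvm_storage_to_cpu(uvm);      // device = CPU, same pointer
    at::Storage gpu1_view = uvm_storage_to_cuda(uvm, 1);  // device = cuda:1, same pointer
    // All three storages expose the same data pointer; the allocation outlives
    // `uvm` as long as any view still holds the shared viewer context.
    TORCH_CHECK(cpu_view.data() == uvm.data());
    TORCH_CHECK(gpu1_view.data() == uvm.data());
}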
Subproject commit f5ae915a84d3bbf1508b529af90292dd7085b9ec