Commit 223bb6c0 by Tianxing Wang

initial commit

.vscode
.idea
build
cmake-build-debug
.cache
cmake_minimum_required(VERSION 3.15)
set(CUDAToolkit_ROOT /home/wtx/.local/cuda-11.7)
set(CMAKE_CUDA_ARCHITECTURES 86)
set(CMAKE_CUDA_COMPILER /home/wtx/.local/cuda-11.7/bin/nvcc)
project(uvm_utils CXX CUDA)
option(WITH_PYTHON "Link to Python when building" ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# list(APPEND CMAKE_PREFIX_PATH /home/wtx/miniconda3/pkgs/pytorch-1.13.1-py3.10_cuda11.7_cudnn8.5.0_0/lib/python3.10/site-packages)
list(APPEND CMAKE_PREFIX_PATH /home/wtx/local/pkg/libtorch/share/cmake/Torch)
set(Python3_ROOT_DIR /home/wtx/miniconda3/envs/dgl )
# set(CMAKE_CUDA_COMPILER /home/wtx/.local/cuda/bin)
# set(CUDA_NVCC_EXECUTABLE /home/wtx/.local/cuda/bin)
# set(ENV{CUDA_HOME} /home/wtx/.local/cuda)
# set(ENV{Torch_DIR} /home/wtx/miniconda3/pkgs/pytorch-1.13.1-py3.10_cuda11.7_cudnn8.5.0_0/lib/python3.10/site-packages/torch/share/cmake/Torch/)
set(CUDAToolkit_INCLUDE_DIR /home/wtx/.local/cuda11.7+cudnn8.9)
if(WITH_PYTHON)
add_definitions(-DWITH_PYTHON)
find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
include_directories(${Python3_INCLUDE_DIRS})
message(STATUS "python3 is found:" ${Python3_INCLUDE_DIRS} )
endif()
set(ENV{Torch_DIR} /home/wtx/miniconda3/pkgs/pytorch-1.13.1-py3.10_cuda11.7_cudnn8.5.0_0/lib/python3.10/site-packages/torch/share/cmake/Torch)
find_package(Torch REQUIRED)
include_directories(${TORCH_INCLUDE_DIRS})
message(STATUS "torch: ${TORCH_INCLUDE_DIRS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
include_directories(${CMAKE_SOURCE_DIR}/include)
message(STATUS ${CMAKE_SOURCE_DIR})
find_package(OpenMP REQUIRED)
# add_library(${PROJECT_NAME} SHARED uvm/uvm_utils.cu)
# add_library(${PROJECT_NAME} SHARED csrc/custom_ops.cpp)
# add_library(${PROJECT_NAME} SHARED )
# file(GLOB SOURCES memory_utils/*.cpp memory_utils/*.cu)
file(GLOB SOURCES memory_utils/*.cpp memory_utils/*.cu)
message(STATUS "sources files: ${SOURCES}")
add_library(${PROJECT_NAME} SHARED ${SOURCES})
if(WITH_PYTHON)
find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_PYTHON_LIBRARY})
endif()
message(STATUS "TORCH_LIBRARIES: ${TORCH_LIBRARIES}")
target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})
target_link_libraries(${PROJECT_NAME} PRIVATE OpenMP::OpenMP_CXX)
target_compile_definitions(${PROJECT_NAME} PRIVATE -DTORCH_EXTENSION_NAME=lib${PROJECT_NAME})
# set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" OUTPUT_NAME "_C")
install(TARGETS ${PROJECT_NAME} DESTINATION "${CMAKE_SOURCE_DIR}/install")
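# A typical out-of-source build from the repository root (a sketch; the
# hard-coded toolchain/Torch paths above assume this machine's layout):
#   cmake -B build -S .
#   cmake --build build
#   cmake --install build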
#ifndef CUSTOM_OPS_DISPATCH_H
#define CUSTOM_OPS_DISPATCH_H
#define DISPATCH_TO_CUDA(name, function) \
m.impl(name, torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(function)))
#define DISPATCH_TO_CPU(name, function) \
m.impl(name, torch::dispatch(c10::DispatchKey::CPU, TORCH_FN(function)))
#define DISPATCH_TO_META(name, function) \
m.impl(name, torch::dispatch(c10::DispatchKey::Meta, TORCH_FN(function)))
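// Example (a sketch of how these macros are used later in this repo): inside
// a TORCH_LIBRARY_FRAGMENT(uvm, m) block,
//   DISPATCH_TO_CUDA("uvm_to_cpu", uvm_to_cpu);
// expands to
//   m.impl("uvm_to_cpu", torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(uvm_to_cpu)));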
#endif //CUSTOM_OPS_DISPATCH_H
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <ATen/ATen.h>
namespace uvm {
using Tensor = at::Tensor;
/// Allocate the ATen Tensor with unified managed memory (UVM),
/// set its preferred location to CPU, and establish access from self.device
Tensor new_managed_tensor(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool copy_data);
Tensor new_managed_tensor_meta(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool copy_data);
// Allocate the ATen Tensor with host-mapped memory
Tensor new_host_mapped_tensor(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool copy_data);
// Allocate the ATen Tensor with unified managed memory (UVM) or host-mapped
// memory.
Tensor new_unified_tensor(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool is_host_mapped,
bool copy_data);
/// Allocate the ATen Tensor with unified managed memory (UVM)
Tensor new_vanilla_managed_tensor(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool copy_data);
///@ingroup cumem-utils
/// Check if a tensor is allocated with UVM
bool uvm_storage(const Tensor& t);
///@ingroup cumem-utils
/// Check if a tensor is allocated with UVM *AND* is not on a CPU
bool is_uvm_tensor(const Tensor& t);
///@ingroup cumem-utils
/// Convert a UVM tensor to a CPU tensor
Tensor uvm_to_cpu(const Tensor& t);
///@ingroup cumem-utils
/// Create a UVM tensor on the same device as prototype sharing
/// the same uvm storage as t
Tensor uvm_to_device(const Tensor& t, const Tensor& prototype);
///@ingroup cumem-utils
/// Call cudaMemAdvise on UVM Storage. The hint enum is generated in Python
/// (fbgemm_gpu.uvm) using data returned from a C++ op.
void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise);
///@ingroup cumem-utils
/// Call cudaMemPrefetchAsync on UVM Storage
void uvm_cuda_mem_prefetch_async(
const Tensor& t,
c10::optional<Tensor> device_t);
///@ingroup cumem-utils
/// Call madvise(..MADV_DONTFORK) on the UVM storage. This is a workaround for
/// an issue where the UVM kernel driver unmaps UVM storage pages from the page
/// table on fork - causing slowdown on the next access from a CPU.
void uvm_mem_advice_dont_fork(const Tensor& t);
///@ingroup cumem-utils
/// Copy a contiguous UVM Tensor (uvm_storage(t) is true) into a CPU Tensor.
/// The copy uses a single-threaded memcpy.
Tensor uvm_to_cpu_clone(const Tensor& t);
} // namespace uvm
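// Usage sketch (untested; assumes libuvm_utils is linked and a CUDA device is
// available). `proto` only supplies the dtype/device for the allocation:
//
//   at::Tensor proto = at::empty(
//       {0}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0));
//   at::Tensor t =
//       uvm::new_managed_tensor(proto, {1024, 64}, /*copy_data=*/false);
//   bool managed = uvm::uvm_storage(t);       // true: backed by UVM storage
//   at::Tensor cpu_view = uvm::uvm_to_cpu(t); // shares the same UVM pages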
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <ATen/ATen.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cstring>
#include "common.h"
// #include "cumem_utils.h"
#include "dispatch.h"
#include "uvm_utils.h"
// #include "enum_utils.h"
// namespace fbgemm_gpu {
// FBGEMM_GPU_ENUM_CREATE_TAG(uvm)
// } // namespace fbgemm_gpu
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <ATen/ATen.h>
using Tensor = at::Tensor;
namespace uvm {
Tensor new_unified_tensor_cpu(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool is_host_mapped,
bool copy_data);
} // namespace uvm
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "common.h"
#include <c10/util/logging_is_not_google_glog.h>
using Tensor = at::Tensor;
using std::vector;
namespace uvm {
Tensor new_unified_tensor_cpu(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool is_host_mapped,
bool copy_data) {
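// CPU stub: no UVM or host-mapped allocation happens here; an empty tensor
// with self's options is returned as a placeholder.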
return at::empty({0}, self.options());
}
} // namespace uvm
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "common.cuh"
using namespace at;
namespace uvm {
// Freeing host/uvm memory with cudaFree[Host] requires a cuda context.
// If a uvm tensor is released from an arbitrary thread without a context,
// cuda helpfully creates a new default context on the default device.
// If the default device has not been used before in this process, cuda
// also needs to allocate a device context. Creating a device context
// requires device resources and may fail with an out-of-memory error,
// which in turn causes cudaFree[Host] to fail.
// The solution is simply to remember the device from the allocation context
// and set the correct device in the thread before calling cudaFree[Host].
namespace {
struct CUDAHostMappedContext {
void *ptr_;
int cuda_device_;
CUDAHostMappedContext(void *ptr, int cuda_device)
: ptr_(ptr), cuda_device_(cuda_device) {};
~CUDAHostMappedContext() {
at::cuda::OptionalCUDAGuard device_guard;
device_guard.set_index(cuda_device_);
AT_CUDA_CHECK(cudaHostUnregister(ptr_));
free(ptr_);
}
static void release(void *ptr) {
delete static_cast<CUDAHostMappedContext *>(ptr);
}
};
struct CUDAManagedContext {
void *ptr_;
int cuda_device_;
CUDAManagedContext(void *ptr, int cuda_device)
: ptr_(ptr), cuda_device_(cuda_device) {};
~CUDAManagedContext() {
at::cuda::OptionalCUDAGuard device_guard;
device_guard.set_index(cuda_device_);
AT_CUDA_CHECK(cudaFree(ptr_));
}
static void release(void *ptr) {
delete static_cast<CUDAManagedContext *>(ptr);
}
};
// Keep a reference to the UVM memory allocation from the associated
// CPU Tensor to prevent lifetime issues (use after free)
struct CUDAManagedIndirectContext {
Storage storage_;
CUDAManagedIndirectContext(Storage storage) : storage_(std::move(storage)) {};
static void release(void *ptr) {
delete static_cast<CUDAManagedIndirectContext *>(ptr);
}
};
// Get the default strides from the input Tensor dimensions
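// For example, sizes {2, 3, 4} give row-major strides {12, 4, 1}: the last
// dimension has stride 1 and each earlier stride is the product of the
// sizes to its right.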
std::vector<int64_t> defaultStrides(IntArrayRef sizes) {
std::vector<int64_t> strides(sizes.size());
int64_t stride = 1;
for (size_t i = sizes.size(); i > 0; --i) {
strides[i - 1] = stride;
stride *= sizes[i - 1];
}
return strides;
}
// Allocate the ATen Tensor with unified managed memory (UVM)
Tensor new_managed_tensor_internal(
const Tensor &self,
const std::vector<std::int64_t> &sizes,
bool copy_data) {
at::cuda::OptionalCUDAGuard device_guard;
device_guard.set_index(self.get_device());
auto strides = defaultStrides(sizes);
size_t size_bytes =
at::detail::computeStorageNbytes(sizes, strides, self.dtype().itemsize());
void *ptr;
if (copy_data) {
TORCH_CHECK(self.sizes().vec() == sizes);
}
AT_CUDA_CHECK(cudaMallocManaged(&ptr, size_bytes));
if (copy_data) {
if (self.is_cpu()) {
AT_CUDA_CHECK(cudaMemcpy(ptr, self.data_ptr(), size_bytes, cudaMemcpyHostToHost));
} else if (self.is_cuda()) {
AT_CUDA_CHECK(cudaMemcpy(ptr, self.data_ptr(), size_bytes, cudaMemcpyDeviceToHost));
}
}
// The memory allocated above can be accessed from CUDA and CPU.
// However, Storage requires a specific device, and we need to retain the cuda
// device for releasing the memory (see "Freeing host/uvm memory with cudaFree
// .." above). To access the memory from devices other than the one used for
// allocation we need a new Storage object (with the new device) referring to
// the original storage object. We force this indirection even for newly
// allocated Tensors for code unification.
auto real_storage = Storage(
Storage::use_byte_size_t(),
size_bytes,
at::DataPtr(
ptr,
new CUDAManagedContext(ptr, self.get_device()),
&CUDAManagedContext::release,
{at::DeviceType::CUDA, self.device().index()}),
nullptr, /* allocator */
/*resizable=*/false);
auto indirect_storage = Storage(
Storage::use_byte_size_t(),
size_bytes,
at::DataPtr(
ptr,
new CUDAManagedIndirectContext(real_storage),
&CUDAManagedIndirectContext::release,
{at::DeviceType::CUDA, self.device().index()}),
nullptr, /* allocator */
/*resizable=*/false);
return at::empty({0}, self.options())
.set_(indirect_storage, 0, sizes, strides);
}
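// Round [ptr, ptr + size) out to whole pages so the range can be passed to
// madvise. For example, with 4 KiB pages, ptr = 0x1003 and size = 0x2000
// adjust to (0x1000, 0x3000): the start is rounded down to its page and the
// length reaches the next page boundary after ptr + size (0x4000).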
std::tuple<void *, size_t> adjust_to_page_boundaries(void *ptr, size_t size) {
static uint64_t page_mask = ([]() -> uint64_t {
uint64_t page_size = (uint64_t) sysconf(_SC_PAGESIZE);
return (page_size - 1);
})();
uint64_t raw_ptr = (uint64_t) ptr;
uint64_t raw_ptr_adjusted = raw_ptr & ~page_mask;
uint64_t raw_ptr_end_adjusted = (raw_ptr + size + page_mask) & ~page_mask;
uint64_t size_adjusted = raw_ptr_end_adjusted - raw_ptr_adjusted;
return std::make_tuple((void *) raw_ptr_adjusted, (size_t) size_adjusted);
}
} // namespace
// Allocate a cuda Tensor with unified managed memory (UVM)
// Then set the preferred data location to CPU (host memory)
// And establish mappings on the cuda device to the host memory
Tensor new_managed_tensor(
const Tensor &self,
const std::vector<std::int64_t> &sizes,
bool copy_data) {
at::cuda::OptionalCUDAGuard device_guard;
device_guard.set_index(self.get_device());
Tensor t = new_managed_tensor_internal(self, sizes, copy_data);
void *ptr = t.data_ptr();
size_t size_bytes = t.storage().nbytes();
// Set preferred memory location to host memory
AT_CUDA_CHECK(cudaMemAdvise(
ptr, size_bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
// User hint "accessed by": the GPU establishes a direct mapping of the data
// in CPU memory, so no page faults are generated
AT_CUDA_CHECK(cudaMemAdvise(
ptr, size_bytes, cudaMemAdviseSetAccessedBy, at::cuda::current_device()));
C10_CUDA_KERNEL_LAUNCH_CHECK();
// Work around fork issue - see uvm_mem_advice_dont_fork for details
auto adjusted = adjust_to_page_boundaries(ptr, size_bytes);
int result =
madvise(std::get<0>(adjusted), std::get<1>(adjusted), MADV_DONTFORK);
TORCH_CHECK(result == 0)
return t;
}
// copy_data is unused here; it only keeps the signature consistent with new_managed_tensor
Tensor new_managed_tensor_meta(
const Tensor &self,
const std::vector<std::int64_t> &sizes,
bool copy_data) {
return at::empty(sizes, self.options());
}
// Allocate a cuda Tensor with unified managed memory (UVM) without the
// additional steps taken by new_managed_tensor above
Tensor new_vanilla_managed_tensor(
const Tensor &self,
const std::vector<std::int64_t> &sizes,
bool copy_data) {
at::cuda::OptionalCUDAGuard device_guard;
device_guard.set_index(self.get_device());
return new_managed_tensor_internal(self, sizes, copy_data);
}
// Allocate the ATen Tensor with host-mapped memory
Tensor new_host_mapped_tensor(
const Tensor &self,
const std::vector<std::int64_t> &sizes,
bool copy_data) {
at::cuda::OptionalCUDAGuard device_guard;
device_guard.set_index(self.get_device());
auto strides = defaultStrides(sizes);
size_t size_bytes =
at::detail::computeStorageNbytes(sizes, strides, self.dtype().itemsize());
if (copy_data) {
TORCH_CHECK(self.sizes().vec() == sizes);
}
// When using cudaHostAlloc for large allocations, we found that it can
// potentially take a global lock and lock out CUDA APIs from other processes.
// The main cost in cudaHostAlloc is faulting/mapping the pages. So, instead
// of using this cuda API, we can do regular malloc, pre-fault the pages, and
// then do cudaHostRegister with GPU mapping flags to lock the pages, so we
// can minimize the cost while holding this global lock.
void *const ptr = malloc(size_bytes);
// Pre-fault/map the pages by setting the first byte of the page
// TODO: parallelize the mapping of pages with a threadpool executor
const size_t pageSize = (size_t) sysconf(_SC_PAGESIZE);
// Round ptr up to the next page boundary (add pageSize - 1, then mask off the
// low bits); if ptr is already page-aligned it is left unchanged. The loop
// below then writes one byte per page to pre-fault/map the pages.
uintptr_t alignedPtr = (((uintptr_t) ptr + pageSize - 1) & ~(pageSize - 1));
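// (e.g. with a 4 KiB page, an address ending in 0x234 rounds up to the next
// 0x...000 boundary; a page-aligned ptr is left unchanged)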
for (uintptr_t p = alignedPtr; p < ((uintptr_t) ptr + size_bytes);
p += pageSize) {
memset((void *) p, 0, 1);
}
AT_CUDA_CHECK(cudaHostRegister(
ptr, size_bytes, cudaHostRegisterMapped | cudaHostRegisterPortable));
if (copy_data) {
if (self.is_cpu()) {
memcpy(ptr, self.data_ptr(), size_bytes);
} else if (self.is_cuda()) {
AT_CUDA_CHECK(cudaMemcpy(ptr, self.data_ptr(), size_bytes, cudaMemcpyDeviceToHost));
}
}
void *dev_ptr;
AT_CUDA_CHECK(cudaHostGetDevicePointer(&dev_ptr, ptr, 0));
auto storage = Storage(
Storage::use_byte_size_t(),
size_bytes,
at::DataPtr(
dev_ptr,
new CUDAHostMappedContext(ptr, self.get_device()),
&CUDAHostMappedContext::release,
{at::DeviceType::CUDA, self.device().index()}),
nullptr, /* allocator */
/*resizable=*/false);
return at::empty({0}, self.options())
.set_(std::move(storage), 0, sizes, strides);
}
// Allocate the ATen Tensor with UVM or host-mapped memory
Tensor new_unified_tensor(
const Tensor &self,
const std::vector<std::int64_t> &sizes,
bool is_host_mapped,
bool copy_data) {
if (is_host_mapped) {
VLOG(2) << "Allocate the ATen Tensor with cudaHostAlloc";
return new_host_mapped_tensor(self, sizes, copy_data);
} else {
VLOG(2) << "Allocate the ATen Tensor with cudaMallocManaged";
return new_managed_tensor(self, sizes, copy_data);
}
}
// Check if a tensor is allocated with UVM (CPU or GPU Tensor)
bool uvm_storage(const Tensor &t) {
auto deleter = t.storage().data_ptr().get_deleter();
return deleter == &CUDAManagedIndirectContext::release ||
deleter == &CUDAHostMappedContext::release;
}
// Check if a tensor is allocated with UVM but is not a CPU Tensor
bool is_uvm_tensor(const Tensor &t) {
if (t.device().is_cpu()) {
return false;
}
return uvm_storage(t);
}
// Convert a UVM tensor to a CPU tensor
Tensor uvm_to_cpu(const Tensor &t) {
TORCH_CHECK(is_uvm_tensor(t));
// Don't copy the storage - just keep a reference to the original storage
auto *tcontext =
t.storage().data_ptr().cast_context<CUDAManagedIndirectContext>(
&CUDAManagedIndirectContext::release);
TORCH_CHECK(tcontext != nullptr)
auto *ocontext =
tcontext->storage_.data_ptr().cast_context<CUDAManagedContext>(
&CUDAManagedContext::release);
auto storage = Storage(
Storage::use_byte_size_t(),
t.storage().nbytes(),
at::DataPtr(
ocontext->ptr_,
new CUDAManagedIndirectContext(tcontext->storage_),
&CUDAManagedIndirectContext::release,
{at::DeviceType::CPU}),
nullptr, /* allocator */
/*resizable=*/false);
return at::empty({0}, t.options().device(Device::Type::CPU))
.set_(std::move(storage), t.storage_offset(), t.sizes(), t.strides());
}
// Create a new UVM tensor sharing storage with t on the same device as
// prototype
Tensor uvm_to_device(const Tensor &t, const Tensor &prototype) {
TORCH_CHECK(is_uvm_tensor(t));
// Don't copy the storage - just keep a reference to the original storage
auto *tcontext =
t.storage().data_ptr().cast_context<CUDAManagedIndirectContext>(
&CUDAManagedIndirectContext::release);
TORCH_CHECK(tcontext != nullptr)
auto device = prototype.device();
auto *ocontext =
tcontext->storage_.data_ptr().cast_context<CUDAManagedContext>(
&CUDAManagedContext::release);
auto storage = Storage(
Storage::use_byte_size_t(),
t.storage().nbytes(),
at::DataPtr(
ocontext->ptr_,
new CUDAManagedIndirectContext(tcontext->storage_),
&CUDAManagedIndirectContext::release,
device),
nullptr, /* allocator */
/*resizable=*/false);
return at::empty({0}, t.options().device(device))
.set_(std::move(storage), t.storage_offset(), t.sizes(), t.strides());
}
namespace {
int64_t uvm_get_guard_index(const Tensor &t) {
TORCH_CHECK(uvm_storage(t));
int cuda_device_index;
if (t.is_cpu()) {
auto *tcontext =
t.storage().data_ptr().cast_context<CUDAManagedIndirectContext>(
&CUDAManagedIndirectContext::release);
TORCH_CHECK(tcontext != nullptr)
auto *ocontext =
tcontext->storage_.data_ptr().cast_context<CUDAManagedContext>(
&CUDAManagedContext::release);
TORCH_CHECK(ocontext != nullptr)
cuda_device_index = static_cast<int64_t>(ocontext->cuda_device_);
} else {
TORCH_CHECK(t.is_cuda());
cuda_device_index = t.get_device();
}
return cuda_device_index;
}
} // namespace
void uvm_cuda_mem_advise(const Tensor &t, int64_t cuda_memory_advise) {
// Call cudaMemAdvise on a UVM tensor
// See cudaMemoryAdvise enum (automatically exported to python fbgemm_gpu.uvm
// namespace) for valid values and interface stub.
at::cuda::OptionalCUDAGuard device_guard;
int64_t cuda_device_index = uvm_get_guard_index(t);
int hint_device;
if (t.is_cpu()) {
hint_device = cudaCpuDeviceId;
} else {
TORCH_CHECK(t.is_cuda());
hint_device = static_cast<int>(cuda_device_index);
}
void *ptr = t.data_ptr();
size_t size_bytes = at::detail::computeStorageNbytes(
t.sizes(), t.strides(), t.dtype().itemsize());
device_guard.set_index(cuda_device_index);
// FIXME: some advanced "cudaMemAdvise" flags are not supported by HIP.
AT_CUDA_CHECK(cudaMemAdvise(
ptr,
size_bytes,
static_cast<enum cudaMemoryAdvise>(cuda_memory_advise),
hint_device));
return;
}
void uvm_cuda_mem_prefetch_async(
const Tensor &t,
c10::optional<Tensor> device_t) {
// Call cudaMemPrefetchAsync on Tensor
at::cuda::OptionalCUDAGuard device_guard;
TORCH_CHECK(uvm_storage(t));
TORCH_CHECK(t.is_cuda() || (t.is_cpu() && device_t.has_value()));
TORCH_CHECK(!device_t.has_value() || device_t.value().is_cuda());
int prefetch_device =
(t.is_cpu()) ? cudaCpuDeviceId : static_cast<int>(t.get_device());
const Tensor &context_t = device_t.has_value() ? device_t.value() : t;
void *ptr = t.data_ptr();
size_t size_bytes = at::detail::computeStorageNbytes(
t.sizes(), t.strides(), t.dtype().itemsize());
device_guard.set_index(context_t.get_device());
auto stream = at::cuda::getCurrentCUDAStream();
AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, prefetch_device, stream));
return;
}
void uvm_mem_advice_dont_fork(const Tensor &t) {
// During fork() the uvm driver is called to copy VMA for UVM space.
// The uvm driver then removes pmap entries for both child and parent.
// Re-establishing the mappings for the parent is slow.
// This works around the issue by setting the UVM VMA to not be copied
// into the child.
TORCH_CHECK(uvm_storage(t));
void *ptr = t.data_ptr();
size_t size_bytes = at::detail::computeStorageNbytes(
t.sizes(), t.strides(), t.dtype().itemsize());
auto adjusted = adjust_to_page_boundaries(ptr, size_bytes);
int result =
madvise(std::get<0>(adjusted), std::get<1>(adjusted), MADV_DONTFORK);
TORCH_CHECK_EQ(result, 0);
return;
}
Tensor uvm_to_cpu_clone(const Tensor &t) {
TORCH_CHECK(uvm_storage(t));
TORCH_CHECK(t.is_contiguous());
Tensor cpu_clone = at::empty_like(t, t.options().device(kCPU));
size_t size_bytes = at::detail::computeStorageNbytes(
t.sizes(), t.strides(), t.dtype().itemsize());
memcpy(cpu_clone.data_ptr(), t.data_ptr(), size_bytes);
return cpu_clone;
}
// FBGEMM_GPU_ENUM_GLOGAL(uvm)
// FBGEMM_GPU_ENUM_REGISTER_START(uvm, cudaMemory, Advise){
// FBGEMM_GPU_ENUM_ITEM(
// cudaMem,
// AdviseSetReadMostly,
// cudaMemAdviseSetReadMostly),
// FBGEMM_GPU_ENUM_ITEM(
// cudaMem,
// AdviseUnsetReadMostly,
// cudaMemAdviseUnsetReadMostly),
// FBGEMM_GPU_ENUM_ITEM(
// cudaMem,
// AdviseSetPreferredLocation,
// cudaMemAdviseSetPreferredLocation),
// FBGEMM_GPU_ENUM_ITEM(
// cudaMem,
// AdviseUnsetPreferredLocation,
// cudaMemAdviseUnsetPreferredLocation),
// FBGEMM_GPU_ENUM_ITEM(
// cudaMem,
// AdviseSetAccessedBy,
// cudaMemAdviseSetAccessedBy),
// FBGEMM_GPU_ENUM_ITEM(
// cudaMem,
// AdviseUnsetAccessedBy,
// cudaMemAdviseUnsetAccessedBy),
// } FBGEMM_GPU_ENUM_REGISTER_END
} // namespace uvm
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <torch/library.h>
#include "common.cuh"
//#define DISPATCH_TO_CPU(name, function) \
// m.impl(name, torch::dispatch(c10::DispatchKey::CPU, TORCH_FN(function)))
//
//#define DISPATCH_TO_META(name, function) \
// m.impl(name, torch::dispatch(c10::DispatchKey::Meta, TORCH_FN(function)))
using Tensor = at::Tensor;
namespace uvm {
TORCH_LIBRARY_FRAGMENT(uvm, m) {
m.def("is_uvm_tensor(Tensor t) -> bool", TORCH_FN(is_uvm_tensor));
m.def("uvm_storage(Tensor t) -> bool", TORCH_FN(uvm_storage));
m.def(
"uvm_to_device(Tensor self, Tensor prototype) -> Tensor",
TORCH_FN(uvm_to_device));
m.def("uvm_to_cpu(Tensor t) -> Tensor");
m.def("new_managed_tensor(Tensor self, int[] sizes, bool copy_data) -> Tensor");
m.def("new_host_mapped_tensor(Tensor self, int[] sizes, bool copy_data) -> Tensor");
m.def(
"new_unified_tensor(Tensor self, int[] sizes, bool is_host_mapped, bool copy_data) -> Tensor");
m.def("new_vanilla_managed_tensor(Tensor self, int[] sizes, bool copy_data) -> Tensor");
m.def(
"cuda_mem_advise(Tensor t, int advice) -> ()",
TORCH_FN(uvm_cuda_mem_advise));
m.def(
"cuda_mem_prefetch_async(Tensor t, Tensor? device_t) -> ()",
TORCH_FN(uvm_cuda_mem_prefetch_async));
m.def(
"uvm_mem_advice_dont_fork(Tensor t) -> ()",
TORCH_FN(uvm_mem_advice_dont_fork));
m.def("uvm_to_cpu_clone(Tensor t) -> Tensor", TORCH_FN(uvm_to_cpu_clone));
// m.def(FBGEMM_GPU_ENUM_OP(uvm, fbgemm_gpu_uvm_enum_query));
}
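// Once this library is loaded from Python via torch.ops.load_library (as in
// the notebook further below), the schemas defined above are reachable as
// torch.ops.uvm.<name>, e.g. torch.ops.uvm.new_unified_tensor(t, sizes,
// is_host_mapped, copy_data).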
TORCH_LIBRARY_FRAGMENT(uvm, m) {
DISPATCH_TO_CPU("new_unified_tensor", new_unified_tensor_cpu);
DISPATCH_TO_META("new_managed_tensor_meta", new_managed_tensor_meta);
}
} // namespace uvm
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <torch/library.h>
#include "common.cuh"
namespace uvm {
TORCH_LIBRARY_FRAGMENT(uvm, m) {
DISPATCH_TO_CUDA("uvm_to_cpu", uvm_to_cpu);
DISPATCH_TO_CUDA("new_managed_tensor", new_managed_tensor);
DISPATCH_TO_META("new_managed_tensor", new_managed_tensor_meta);
DISPATCH_TO_CUDA("new_host_mapped_tensor", new_host_mapped_tensor);
DISPATCH_TO_CUDA("new_unified_tensor", new_unified_tensor);
DISPATCH_TO_CUDA("new_vanilla_managed_tensor", new_vanilla_managed_tensor);
}
} // namespace uvm
import subprocess
from distutils.sysconfig import get_python_lib
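# Configure the CMake build in ./build, putting the Python site-packages
# directory on CMAKE_PREFIX_PATH (presumably so find_package(Torch) can locate
# the pip-installed PyTorch cmake config).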
cmake_args = [
"-DCMAKE_PREFIX_PATH=" + get_python_lib(),
]
subprocess.check_call(["cmake", "-B", "build"] + cmake_args)
# subprocess.check_call1
cmake_minimum_required(VERSION 3.10)
set(CUDAToolkit_ROOT /home/wtx/.local/cuda-11.7)
set(CMAKE_CUDA_ARCHITECTURES 86)
set(CMAKE_CUDA_COMPILER /home/wtx/.local/cuda-11.7/bin/nvcc)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
list(APPEND CMAKE_PREFIX_PATH /home/wtx/local/pkg/libtorch/share/cmake/Torch)
set(Python3_ROOT_DIR /home/wtx/miniconda3/envs/dgl )
set(CUDAToolkit_INCLUDE_DIR /home/wtx/.local/cuda11.7+cudnn8.9)
project(uvm_test CXX CUDA)
set(ENV{Torch_DIR} /home/wtx/miniconda3/pkgs/pytorch-1.13.1-py3.10_cuda11.7_cudnn8.5.0_0/lib/python3.10/site-packages/torch/share/cmake/Torch)
find_package(Torch REQUIRED)
include_directories(${TORCH_INCLUDE_DIRS})
message(STATUS "torch: ${TORCH_INCLUDE_DIRS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
# include_directories(/home/wtx/workspace/cpp_project/uvm/memory_utils_old)
include_directories("../include")
add_executable(uvm main.cpp)
set(ENV{LD_LIBRARY_PATH} "$ENV{LD_LIBRARY_PATH}:/home/wtx/workspace/cpp_project/uvm/test")
target_link_libraries(uvm /home/wtx/workspace/cpp_project/uvm/test/libuvm_utils.so)
#include "uvm_utils.h"
#include <iostream>
using std::cout;
using std::endl;
int main(){
auto t = at::ones({1,1}, c10::TensorOptions());
cout << uvm::uvm_storage(t) << endl;
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import torch\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"a = torch.tensor([[1,2,3],[4,5,6]])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([2, 3])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"b = a.t()\n",
"b[0,0] = 100"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"id(b.storage()) == id(a.storage())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor(100)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a[0,0]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0, 0)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.storage_offset(), b.storage_offset()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((3, 1), (1, 3))"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.stride(), b.stride()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"c = b[(0,2),:]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(tensor([[100, 4],\n",
" [ 3, 6]]),\n",
" tensor([[100, 4],\n",
" [ 2, 5],\n",
" [ 3, 6]]),\n",
" tensor([[100, 2, 3],\n",
" [ 4, 5, 6]]))"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c, b, a"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2, 1)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c.stride()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"a = torch.arange(1,11).reshape(2,5)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 1, 2, 3, 4, 5],\n",
" [ 6, 7, 8, 9, 10]])"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 1, 6],\n",
" [ 2, 7],\n",
" [ 3, 8],\n",
" [ 4, 9],\n",
" [ 5, 10]])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b = a.t()\n",
"b"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([2, 3])"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c = b[(1,3,4),:].t()\n",
"c.shape"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([10, 2])"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = torch.arange(20).reshape(10,2)\n",
"a.shape"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"torch.Size([3, 2])\n",
" 8\n",
" 9\n",
" 10\n",
" 11\n",
" 18\n",
" 19\n",
"[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 6]\n",
"0\n",
"(2, 1)\n"
]
}
],
"source": [
"b = a[(4,5,9), :]\n",
"print(id(b.storage()) == id(a.storage()))\n",
"print(b.shape)\n",
"print(b.storage())\n",
"print(b.storage_offset())\n",
"print(b.stride())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dgl",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import torch \n",
"\n",
"torch.ops.load_library(\"cmake-build-debug/libuvm_utils.so\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"a = torch.arange(1,10).reshape(3,3).cuda()\n",
"a = torch.empty(0).cuda()\n",
"# x = torch.ops.fbgemm.new_managed_tensor(a, (3,3))\n",
"# print(x)\n",
"# torch.ops.fbgemm.is_uvm_tensor(x)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<module 'torch.ops.uvm' from 'torch.ops'>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.ops.uvm"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[1., 1.],\n",
" [1., 1.]])\n",
"<bound method Tensor.storage of tensor([])>\n",
"tensor([])\n"
]
}
],
"source": [
"y = torch.ones((2,2))\n",
"z = torch.ops.uvm.new_unified_tensor(y,(2,2),True,True)\n",
"print(y)\n",
"print(z.storage)\n",
"print(z)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([])\n"
]
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dgl",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"libcudart.so.12: cannot open shared object file: No such file or directory\n"
]
},
{
"ename": "AttributeError",
"evalue": "'_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m~/miniconda3/envs/fbgemm/lib/python3.10/site-packages/torch/_ops.py:565\u001b[0m, in \u001b[0;36m_OpNamespace.__getattr__\u001b[0;34m(self, op_name)\u001b[0m\n\u001b[1;32m 564\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 565\u001b[0m op, overload_names \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39;49m_C\u001b[39m.\u001b[39;49m_jit_get_operation(qualified_op_name)\n\u001b[1;32m 566\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 567\u001b[0m \u001b[39m# Turn this into AttributeError so getattr(obj, key, default)\u001b[39;00m\n\u001b[1;32m 568\u001b[0m \u001b[39m# works (this is called by TorchScript with __origin__)\u001b[39;00m\n",
"\u001b[0;31mRuntimeError\u001b[0m: No such operator fbgemm::jagged_2d_to_dense",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/home/wtx/workspace/cpp_project/uvm/uvm.ipynb Cell 1\u001b[0m line \u001b[0;36m2\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B10.214.211.184/home/wtx/workspace/cpp_project/uvm/uvm.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mtorch\u001b[39;00m\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2B10.214.211.184/home/wtx/workspace/cpp_project/uvm/uvm.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=1'>2</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mfbgemm_gpu\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B10.214.211.184/home/wtx/workspace/cpp_project/uvm/uvm.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=2'>3</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mfbgemm_gpu\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39muvm\u001b[39;00m \u001b[39mimport\u001b[39;00m cudaMemAdvise, cudaMemPrefetchAsync, cudaMemoryAdvise\n",
"File \u001b[0;32m~/miniconda3/envs/fbgemm/lib/python3.10/site-packages/fbgemm_gpu/__init__.py:22\u001b[0m\n\u001b[1;32m 19\u001b[0m open_source: \u001b[39mbool\u001b[39m \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[39m# Re-export docs\u001b[39;00m\n\u001b[0;32m---> 22\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m \u001b[39mimport\u001b[39;00m _fbgemm_gpu_docs \u001b[39m# noqa: F401, E402\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[39m# Re-export the version string from the auto-generated version file\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_fbgemm_gpu_version\u001b[39;00m \u001b[39mimport\u001b[39;00m __version__ \u001b[39m# noqa: F401, E402\u001b[39;00m\n",
"File \u001b[0;32m~/miniconda3/envs/fbgemm/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py:19\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39madd_docs\u001b[39m(method, docstr):\n\u001b[1;32m 15\u001b[0m method\u001b[39m.\u001b[39m\u001b[39m__doc__\u001b[39m \u001b[39m=\u001b[39m docstr\n\u001b[1;32m 18\u001b[0m add_docs(\n\u001b[0;32m---> 19\u001b[0m torch\u001b[39m.\u001b[39;49mops\u001b[39m.\u001b[39;49mfbgemm\u001b[39m.\u001b[39;49mjagged_2d_to_dense,\n\u001b[1;32m 20\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[39mjagged_2d_to_dense(values, x_offsets, max_sequence_length) -> Tensor\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \n\u001b[1;32m 23\u001b[0m \u001b[39mConverts a jagged tensor, with a 2D values array into a dense tensor, padding with zeros.\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \n\u001b[1;32m 25\u001b[0m \u001b[39mArgs:\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[39m values (Tensor): 2D tensor containing the values of the jagged tensor.\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \n\u001b[1;32m 28\u001b[0m \u001b[39m x_offsets (Tensor): 1D tensor containing the starting point of each jagged row in the values tensor.\u001b[39;00m\n\u001b[1;32m 29\u001b[0m \n\u001b[1;32m 30\u001b[0m \u001b[39m max_sequence_length (int): Maximum length of any row in the jagged dimension.\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \n\u001b[1;32m 32\u001b[0m \u001b[39mReturns:\u001b[39;00m\n\u001b[1;32m 33\u001b[0m \u001b[39m Tensor: The padded dense tensor\u001b[39;00m\n\u001b[1;32m 34\u001b[0m \n\u001b[1;32m 35\u001b[0m \u001b[39mExample:\u001b[39;00m\n\u001b[1;32m 36\u001b[0m \u001b[39m >>> values = torch.tensor([[1,1],[2,2],[3,3],[4,4]])\u001b[39;00m\n\u001b[1;32m 37\u001b[0m \u001b[39m >>> x_offsets = torch.tensor([0, 1, 3])\u001b[39;00m\n\u001b[1;32m 38\u001b[0m \u001b[39m >>> torch.ops.fbgemm.jagged_2d_to_dense(values, x_offsets, 3)\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \u001b[39m tensor([[[1, 1],\u001b[39;00m\n\u001b[1;32m 40\u001b[0m \u001b[39m [0, 0],\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \u001b[39m [0, 0]],\u001b[39;00m\n\u001b[1;32m 42\u001b[0m \u001b[39m [[2, 2],\u001b[39;00m\n\u001b[1;32m 43\u001b[0m \u001b[39m [3, 3],\u001b[39;00m\n\u001b[1;32m 44\u001b[0m \u001b[39m [0, 0]]])\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \n\u001b[1;32m 46\u001b[0m \u001b[39m\"\"\"\u001b[39;00m,\n\u001b[1;32m 47\u001b[0m )\n\u001b[1;32m 49\u001b[0m \u001b[39m# Example:\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[39m#\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[39m# >>> t = torch.arange(4)\u001b[39;00m\n\u001b[1;32m 54\u001b[0m add_docs(\n\u001b[1;32m 55\u001b[0m torch\u001b[39m.\u001b[39mops\u001b[39m.\u001b[39mfbgemm\u001b[39m.\u001b[39mjagged_1d_to_dense,\n\u001b[1;32m 56\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[39m\"\"\"\u001b[39;00m,\n\u001b[1;32m 81\u001b[0m )\n",
"File \u001b[0;32m~/miniconda3/envs/fbgemm/lib/python3.10/site-packages/torch/_ops.py:569\u001b[0m, in \u001b[0;36m_OpNamespace.__getattr__\u001b[0;34m(self, op_name)\u001b[0m\n\u001b[1;32m 565\u001b[0m op, overload_names \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39m_C\u001b[39m.\u001b[39m_jit_get_operation(qualified_op_name)\n\u001b[1;32m 566\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 567\u001b[0m \u001b[39m# Turn this into AttributeError so getattr(obj, key, default)\u001b[39;00m\n\u001b[1;32m 568\u001b[0m \u001b[39m# works (this is called by TorchScript with __origin__)\u001b[39;00m\n\u001b[0;32m--> 569\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mAttributeError\u001b[39;00m(\n\u001b[1;32m 570\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m'\u001b[39m\u001b[39m_OpNamespace\u001b[39m\u001b[39m'\u001b[39m\u001b[39m \u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mname\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m object has no attribute \u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mop_name\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 571\u001b[0m ) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 573\u001b[0m \u001b[39m# let the script frontend know that op is identical to the builtin op\u001b[39;00m\n\u001b[1;32m 574\u001b[0m \u001b[39m# with qualified_op_name\u001b[39;00m\n\u001b[1;32m 575\u001b[0m torch\u001b[39m.\u001b[39mjit\u001b[39m.\u001b[39m_builtins\u001b[39m.\u001b[39m_register_builtin(op, qualified_op_name)\n",
"\u001b[0;31mAttributeError\u001b[0m: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense'"
]
}
],
"source": [
"import torch\n",
"import fbgemm_gpu\n",
"from fbgemm_gpu.uvm import cudaMemAdvise, cudaMemPrefetchAsync, cudaMemoryAdvise"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "RuntimeError",
"evalue": "The NVIDIA driver on your system is too old (found version 11070). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/home/wtx/workspace/cpp_project/uvm/uvm.ipynb Cell 2\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2B10.214.211.184/home/wtx/workspace/cpp_project/uvm/uvm.ipynb#W1sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m ref \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39;49mzeros(\u001b[39m1\u001b[39;49m, dtype\u001b[39m=\u001b[39;49mtorch\u001b[39m.\u001b[39;49mfloat32, device\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mcuda:0\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B10.214.211.184/home/wtx/workspace/cpp_project/uvm/uvm.ipynb#W1sdnNjb2RlLXJlbW90ZQ%3D%3D?line=1'>2</a>\u001b[0m x \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mops\u001b[39m.\u001b[39mfbgemm\u001b[39m.\u001b[39mnew_managed_tensor(ref, (\u001b[39m10\u001b[39m, \u001b[39m10\u001b[39m))\n",
"File \u001b[0;32m~/miniconda3/envs/fbgemm/lib/python3.10/site-packages/torch/cuda/__init__.py:298\u001b[0m, in \u001b[0;36m_lazy_init\u001b[0;34m()\u001b[0m\n\u001b[1;32m 296\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mCUDA_MODULE_LOADING\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m os\u001b[39m.\u001b[39menviron:\n\u001b[1;32m 297\u001b[0m os\u001b[39m.\u001b[39menviron[\u001b[39m\"\u001b[39m\u001b[39mCUDA_MODULE_LOADING\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mLAZY\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m--> 298\u001b[0m torch\u001b[39m.\u001b[39;49m_C\u001b[39m.\u001b[39;49m_cuda_init()\n\u001b[1;32m 299\u001b[0m \u001b[39m# Some of the queued calls may reentrantly call _lazy_init();\u001b[39;00m\n\u001b[1;32m 300\u001b[0m \u001b[39m# we need to just return without initializing in that case.\u001b[39;00m\n\u001b[1;32m 301\u001b[0m \u001b[39m# However, we must not let any *other* threads in!\u001b[39;00m\n\u001b[1;32m 302\u001b[0m _tls\u001b[39m.\u001b[39mis_initializing \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n",
"\u001b[0;31mRuntimeError\u001b[0m: The NVIDIA driver on your system is too old (found version 11070). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver."
]
}
],
"source": [
"ref = torch.zeros(1, dtype=torch.float32, device=\"cuda:0\")\n",
"x = torch.ops.fbgemm.new_managed_tensor(ref, (10, 10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"values = torch.tensor([[1,1],[2,2],[3,3],[4,4]])\n",
"x_offsets = torch.tensor([0, 4, 4])\n",
"m = torch.ops.fbgemm.jagged_2d_to_dense(values, x_offsets, 3)\n",
"m"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"torch.ops.fbgemm.dense_to_jagged(m, [x_offsets])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y = torch.randn(10, 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x[:] = y"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y[:] = x"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"z = x + 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"torch.ops.fbgemm.uvm_storage(x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"torch.ops.fbgemm.fbgemm_gpu_uvm_enum_query()[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cudaMemAdvise(x, cudaMemoryAdvise.cudaMemAdviseSetReadMostly)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cudaMemPrefetchAsync(x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"torch.ops.fbgemm.is_uvm_tensor(x[:1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"torch.ops.fbgemm.uvm_to_cpu(x).data_ptr() == x.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x2 = torch.ops.fbgemm.uvm_to_device(x, x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x[:1] = 100"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fbgemm_gpu.split_table_batched_embeddings_ops import EmbeddingLocation, SplitTableBatchedEmbeddingBagsCodegen, ComputeDevice, DenseTableBatchedEmbeddingBagsCodegen"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embedding_specs = [\n",
" (3, 8, EmbeddingLocation.DEVICE, ComputeDevice.CUDA),\n",
" (5, 4, EmbeddingLocation.MANAGED, ComputeDevice.CUDA),\n",
"]\n",
"\n",
"tbe = SplitTableBatchedEmbeddingBagsCodegen(embedding_specs)\n",
"tbe.init_embedding_weights_uniform(-1, 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tbe.weights_dev.copy_(torch.arange(3, device=\"cuda:0\").view(-1, 1).repeat(1, 8).flatten())\n",
"tbe.weights_uvm.copy_((torch.arange(5) + 3).view(-1, 1).repeat(1, 4).flatten())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tbe.split_embedding_weights()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"0 1\n",
"2 0 1\n",
"2 0\n",
"3 1\n",
"4 2 0\n",
"0\n",
"\"\"\"\n",
"indices = torch.tensor([0, 1, 2, 0, 1, 2, 0, 3, 1, 4, 2, 0, 0], device=\"cuda\", dtype=torch.long)\n",
"offsets = torch.tensor([0, 2, 5, 7, 9, 12, 13], device=\"cuda\", dtype=torch.long)\n",
"tbe(indices, offsets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# torch.ops.fbgemm.FloatToFused8BitRowwiseQuantized\n",
"# torch.ops.fbgemm.FloatToFP8RowwiseQuantized"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"DenseTableBatchedEmbeddingBagsCodegen"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "llm",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}