Commit 223bb6c0 by Tianxing Wang

initial commit

.vscode
.idea
build
cmake-build-debug
.cache
cmake_minimum_required(VERSION 3.15)
set(CUDAToolkit_ROOT /home/wtx/.local/cuda-11.7)
set(CMAKE_CUDA_ARCHITECTURES 86)
set(CMAKE_CUDA_COMPILER /home/wtx/.local/cuda-11.7/bin/nvcc)
project(uvm_utils CXX CUDA)
option(WITH_PYTHON "Link to Python when building" ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# list(APPEND CMAKE_PREFIX_PATH /home/wtx/miniconda3/pkgs/pytorch-1.13.1-py3.10_cuda11.7_cudnn8.5.0_0/lib/python3.10/site-packages)
list(APPEND CMAKE_PREFIX_PATH /home/wtx/local/pkg/libtorch/share/cmake/Torch)
set(Python3_ROOT_DIR /home/wtx/miniconda3/envs/dgl )
# set(CMAKE_CUDA_COMPILER /home/wtx/.local/cuda/bin)
# set(CUDA_NVCC_EXECUTABLE /home/wtx/.local/cuda/bin)
# set(ENV{CUDA_HOME} /home/wtx/.local/cuda)
# set(ENV{Torch_DIR} /home/wtx/miniconda3/pkgs/pytorch-1.13.1-py3.10_cuda11.7_cudnn8.5.0_0/lib/python3.10/site-packages/torch/share/cmake/Torch/)
set(CUDAToolkit_INCLUDE_DIR /home/wtx/.local/cuda11.7+cudnn8.9)
if(WITH_PYTHON)
add_definitions(-DWITH_PYTHON)
find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
include_directories(${Python3_INCLUDE_DIRS})
message(STATUS "python3 is found:" ${Python3_INCLUDE_DIRS} )
endif()
set(ENV{Torch_DIR} /home/wtx/miniconda3/pkgs/pytorch-1.13.1-py3.10_cuda11.7_cudnn8.5.0_0/lib/python3.10/site-packages/torch/share/cmake/Torch)
find_package(Torch REQUIRED)
include_directories(${TORCH_INCLUDE_DIRS})
message(STATUS "torch: ${TORCH_INCLUDE_DIRS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
include_directories(${CMAKE_SOURCE_DIR}/include)
message(STATUS ${CMAKE_SOURCE_DIR})
find_package(OpenMP REQUIRED)
# add_library(${PROJECT_NAME} SHARED uvm/uvm_utils.cu)
# add_library(${PROJECT_NAME} SHARED csrc/custom_ops.cpp)
# add_library(${PROJECT_NAME} SHARED )
# file(GLOB SOURCES memory_utils/*.cpp memory_utils/*.cu)
file(GLOB SOURCES memory_utils/*.cpp memory_utils/*.cu)
message(STATUS "sources files: ${SOURCES}")
add_library(${PROJECT_NAME} SHARED ${SOURCES})
if(WITH_PYTHON)
find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_PYTHON_LIBRARY})
endif()
message(STATUS "TORCH_LIBRARIES: ${TORCH_LIBRARIES}")
target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})
target_link_libraries(${PROJECT_NAME} PRIVATE OpenMP::OpenMP_CXX)
target_compile_definitions(${PROJECT_NAME} PRIVATE -DTORCH_EXTENSION_NAME=lib${PROJECT_NAME})
# set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" OUTPUT_NAME "_C")
install(TARGETS ${PROJECT_NAME} DESTINATION "${CMAKE_SOURCE_DIR}/install")
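# A typical out-of-source build from the repository root (a sketch; the
# hard-coded toolchain/Torch paths above assume this machine's layout):
#   cmake -B build -S .
#   cmake --build build
#   cmake --install build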
#ifndef CUSTOM_OPS_DISPATCH_H
#define CUSTOM_OPS_DISPATCH_H
#define DISPATCH_TO_CUDA(name, function) \
m.impl(name, torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(function)))
#define DISPATCH_TO_CPU(name, function) \
m.impl(name, torch::dispatch(c10::DispatchKey::CPU, TORCH_FN(function)))
#define DISPATCH_TO_META(name, function) \
m.impl(name, torch::dispatch(c10::DispatchKey::Meta, TORCH_FN(function)))
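// Example (a sketch of how these macros are used later in this repo): inside
// a TORCH_LIBRARY_FRAGMENT(uvm, m) block,
//   DISPATCH_TO_CUDA("uvm_to_cpu", uvm_to_cpu);
// expands to
//   m.impl("uvm_to_cpu", torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(uvm_to_cpu)));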
#endif //CUSTOM_OPS_DISPATCH_H
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <ATen/ATen.h>
namespace uvm {
using Tensor = at::Tensor;
/// Allocate the ATen Tensor with unified managed memory (UVM),
/// set its preferred location to CPU, and establish access from self.device
Tensor new_managed_tensor(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool copy_data);
Tensor new_managed_tensor_meta(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool copy_data);
// Allocate the ATen Tensor with host-mapped memory
Tensor new_host_mapped_tensor(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool copy_data);
// Allocate the ATen Tensor with unified managed memory (UVM) or host-mapped
// memory.
Tensor new_unified_tensor(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool is_host_mapped,
bool copy_data);
/// Allocate the ATen Tensor with unified managed memory (UVM)
Tensor new_vanilla_managed_tensor(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool copy_data);
///@ingroup cumem-utils
/// Check if a tensor is allocated with UVM
bool uvm_storage(const Tensor& t);
///@ingroup cumem-utils
/// Check if a tensor is allocated with UVM *AND* is not on a CPU
bool is_uvm_tensor(const Tensor& t);
///@ingroup cumem-utils
/// Convert a UVM tensor to a CPU tensor
Tensor uvm_to_cpu(const Tensor& t);
///@ingroup cumem-utils
/// Create a UVM tensor on the same device as prototype sharing
/// the same uvm storage as t
Tensor uvm_to_device(const Tensor& t, const Tensor& prototype);
///@ingroup cumem-utils
/// Call cudaMemAdvise on UVM Storage. The hint enum is generated in Python
/// (fbgemm_gpu.uvm) using data returned from a C++ op.
void uvm_cuda_mem_advise(const Tensor& t, int64_t cuda_memory_advise);
///@ingroup cumem-utils
/// Call cudaMemPrefetchAsync on UVM Storage
void uvm_cuda_mem_prefetch_async(
const Tensor& t,
c10::optional<Tensor> device_t);
///@ingroup cumem-utils
/// Call madvise(..MADV_DONTFORK) on the UVM storage. This is a workaround for
/// an issue where the UVM kernel driver unmaps UVM storage pages from the page
/// table on fork - causing slowdown on the next access from a CPU.
void uvm_mem_advice_dont_fork(const Tensor& t);
///@ingroup cumem-utils
/// Copy a contiguous UVM Tensor (uvm_storage(t) is true) into a CPU Tensor.
/// The copy uses a single-threaded memcpy.
Tensor uvm_to_cpu_clone(const Tensor& t);
} // namespace uvm
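// Usage sketch (untested; assumes libuvm_utils is linked and a CUDA device is
// available). `proto` only supplies the dtype/device for the allocation:
//
//   at::Tensor proto = at::empty(
//       {0}, at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0));
//   at::Tensor t =
//       uvm::new_managed_tensor(proto, {1024, 64}, /*copy_data=*/false);
//   bool managed = uvm::uvm_storage(t);       // true: backed by UVM storage
//   at::Tensor cpu_view = uvm::uvm_to_cpu(t); // shares the same UVM pages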
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <ATen/ATen.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cstring>
#include "common.h"
// #include "cumem_utils.h"
#include "dispatch.h"
#include "uvm_utils.h"
// #include "enum_utils.h"
// namespace fbgemm_gpu {
// FBGEMM_GPU_ENUM_CREATE_TAG(uvm)
// } // namespace fbgemm_gpu
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#pragma once
#include <ATen/ATen.h>
using Tensor = at::Tensor;
namespace uvm {
Tensor new_unified_tensor_cpu(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool is_host_mapped,
bool copy_data);
} // namespace uvm
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "common.h"
#include <c10/util/logging_is_not_google_glog.h>
using Tensor = at::Tensor;
using std::vector;
namespace uvm {
Tensor new_unified_tensor_cpu(
const Tensor& self,
const std::vector<std::int64_t>& sizes,
bool is_host_mapped,
bool copy_data) {
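// CPU stub: no UVM or host-mapped allocation happens here; an empty tensor
// with self's options is returned as a placeholder.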
return at::empty({0}, self.options());
}
} // namespace uvm
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "common.cuh"
using namespace at;
namespace uvm {
// Freeing host/uvm memory with cudaFree[Host] requires a cuda context.
// If a uvm tensor is released from an arbitrary thread without a context,
// cuda helpfully creates a new default context on the default device.
// If the default device has not been used before in this process, cuda
// also needs to allocate a device context. Creating a device context
// requires device resources and may fail with an out-of-memory error,
// which in turn causes cudaFree[Host] to fail.
// The solution is simply to remember the device from the allocation context
// and set the correct device in the thread before calling cudaFree[Host].
namespace {
struct CUDAHostMappedContext {
void *ptr_;
int cuda_device_;
CUDAHostMappedContext(void *ptr, int cuda_device)
: ptr_(ptr), cuda_device_(cuda_device) {};
~CUDAHostMappedContext() {
at::cuda::OptionalCUDAGuard device_guard;
device_guard.set_index(cuda_device_);
AT_CUDA_CHECK(cudaHostUnregister(ptr_));
free(ptr_);
}
static void release(void *ptr) {
delete static_cast<CUDAHostMappedContext *>(ptr);
}
};
struct CUDAManagedContext {
void *ptr_;
int cuda_device_;
CUDAManagedContext(void *ptr, int cuda_device)
: ptr_(ptr), cuda_device_(cuda_device) {};
~CUDAManagedContext() {
at::cuda::OptionalCUDAGuard device_guard;
device_guard.set_index(cuda_device_);
AT_CUDA_CHECK(cudaFree(ptr_));
}
static void release(void *ptr) {
delete static_cast<CUDAManagedContext *>(ptr);
}
};
// Keep a reference to the UVM memory allocation from the associated
// CPU Tensor to prevent lifetime issues (use after free)
struct CUDAManagedIndirectContext {
Storage storage_;
CUDAManagedIndirectContext(Storage storage) : storage_(std::move(storage)) {};
static void release(void *ptr) {
delete static_cast<CUDAManagedIndirectContext *>(ptr);
}
};
// Get the default strides from the input Tensor dimensions
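// For example, sizes {2, 3, 4} give row-major strides {12, 4, 1}: the last
// dimension has stride 1 and each earlier stride is the product of the
// sizes to its right.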
std::vector<int64_t> defaultStrides(IntArrayRef sizes) {
std::vector<int64_t> strides(sizes.size());
int64_t stride = 1;
for (size_t i = sizes.size(); i > 0; --i) {
strides[i - 1] = stride;
stride *= sizes[i - 1];
}
return strides;
}
// Allocate the ATen Tensor with unified managed memory (UVM)
Tensor new_managed_tensor_internal(
const Tensor &self,
const std::vector<std::int64_t> &sizes,
bool copy_data) {
at::cuda::OptionalCUDAGuard device_guard;
device_guard.set_index(self.get_device());
auto strides = defaultStrides(sizes);
size_t size_bytes =
at::detail::computeStorageNbytes(sizes, strides, self.dtype().itemsize());
void *ptr;
if (copy_data) {
TORCH_CHECK(self.sizes().vec() == sizes);
}
AT_CUDA_CHECK(cudaMallocManaged(&ptr, size_bytes));
if (copy_data) {
if (self.is_cpu()) {
AT_CUDA_CHECK(cudaMemcpy(ptr, self.data_ptr(), size_bytes, cudaMemcpyHostToHost));
} else if (self.is_cuda()) {
AT_CUDA_CHECK(cudaMemcpy(ptr, self.data_ptr(), size_bytes, cudaMemcpyDeviceToHost));
}
}
// The memory allocated above can be accessed from CUDA and CPU.
// However, Storage requires a specific device, and we need to retain the cuda
// device for releasing the memory (see "Freeing host/uvm memory with cudaFree
// .." above). To access the memory from devices other than the one used for
// allocation we need a new Storage object (with the new device) referring to
// the original storage object. We force this indirection even for newly
// allocated Tensors for code unification.
auto real_storage = Storage(
Storage::use_byte_size_t(),
size_bytes,
at::DataPtr(
ptr,
new CUDAManagedContext(ptr, self.get_device()),
&CUDAManagedContext::release,
{at::DeviceType::CUDA, self.device().index()}),
nullptr, /* allocator */
/*resizable=*/false);
auto indirect_storage = Storage(
Storage::use_byte_size_t(),
size_bytes,
at::DataPtr(
ptr,
new CUDAManagedIndirectContext(real_storage),
&CUDAManagedIndirectContext::release,
{at::DeviceType::CUDA, self.device().index()}),
nullptr, /* allocator */
/*resizable=*/false);
return at::empty({0}, self.options())
.set_(indirect_storage, 0, sizes, strides);
}
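// Round [ptr, ptr + size) out to whole pages so the range can be passed to
// madvise. For example, with 4 KiB pages, ptr = 0x1003 and size = 0x2000
// adjust to (0x1000, 0x3000): the start is rounded down to its page and the
// length reaches the next page boundary after ptr + size (0x4000).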
std::tuple<void *, size_t> adjust_to_page_boundaries(void *ptr, size_t size) {
static uint64_t page_mask = ([]() -> uint64_t {
uint64_t page_size = (uint64_t) sysconf(_SC_PAGESIZE);
return (page_size - 1);
})();
uint64_t raw_ptr = (uint64_t) ptr;
uint64_t raw_ptr_adjusted = raw_ptr & ~page_mask;
uint64_t raw_ptr_end_adjusted = (raw_ptr + size + page_mask) & ~page_mask;
uint64_t size_adjusted = raw_ptr_end_adjusted - raw_ptr_adjusted;
return std::make_tuple((void *) raw_ptr_adjusted, (size_t) size_adjusted);
}
} // namespace
// Allocate a cuda Tensor with unified managed memory (UVM)
// Then set the preferred data location to CPU (host memory)
// And establish mappings on the cuda device to the host memory
Tensor new_managed_tensor(
const Tensor &self,
const std::vector<std::int64_t> &sizes,
bool copy_data) {
at::cuda::OptionalCUDAGuard device_guard;
device_guard.set_index(self.get_device());
Tensor t = new_managed_tensor_internal(self, sizes, copy_data);
void *ptr = t.data_ptr();
size_t size_bytes = t.storage().nbytes();
// Set preferred memory location to host memory
AT_CUDA_CHECK(cudaMemAdvise(
ptr, size_bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
// User hint "accessed by": the GPU establishes a direct mapping of the data
// in CPU memory, so no page faults are generated
AT_CUDA_CHECK(cudaMemAdvise(
ptr, size_bytes, cudaMemAdviseSetAccessedBy, at::cuda::current_device()));
C10_CUDA_KERNEL_LAUNCH_CHECK();
// Work around fork issue - see uvm_mem_advice_dont_fork for details
auto adjusted = adjust_to_page_boundaries(ptr, size_bytes);
int result =
madvise(std::get<0>(adjusted), std::get<1>(adjusted), MADV_DONTFORK);
TORCH_CHECK(result == 0)
return t;
}
// copy_data is unused here; it only keeps the signature consistent with new_managed_tensor
Tensor new_managed_tensor_meta(
const Tensor &self,
const std::vector<std::int64_t> &sizes,
bool copy_data) {
return at::empty(sizes, self.options());
}
// Allocate a cuda Tensor with unified managed memory (UVM) without the
// additional steps taken by new_managed_tensor above
Tensor new_vanilla_managed_tensor(
const Tensor &self,
const std::vector<std::int64_t> &sizes,
bool copy_data) {
at::cuda::OptionalCUDAGuard device_guard;
device_guard.set_index(self.get_device());
return new_managed_tensor_internal(self, sizes, copy_data);
}
// Allocate the ATen Tensor with host-mapped memory
Tensor new_host_mapped_tensor(
const Tensor &self,
const std::vector<std::int64_t> &sizes,
bool copy_data) {
at::cuda::OptionalCUDAGuard device_guard;
device_guard.set_index(self.get_device());
auto strides = defaultStrides(sizes);
size_t size_bytes =
at::detail::computeStorageNbytes(sizes, strides, self.dtype().itemsize());
if (copy_data) {
TORCH_CHECK(self.sizes().vec() == sizes);
}
// When using cudaHostAlloc for large allocations, we found that it can
// potentially take a global lock and lock out CUDA APIs from other processes.
// The main cost in cudaHostAlloc is faulting/mapping the pages. So, instead
// of using this cuda API, we can do regular malloc, pre-fault the pages, and
// then do cudaHostRegister with GPU mapping flags to lock the pages, so we
// can minimize the cost while holding this global lock.
void *const ptr = malloc(size_bytes);
// Pre-fault/map the pages by setting the first byte of the page
// TODO: parallelize the mapping of pages with a threadpool executor
const size_t pageSize = (size_t) sysconf(_SC_PAGESIZE);
// Round ptr up to the next page boundary (add pageSize - 1, then mask off the
// low bits); if ptr is already page-aligned it is left unchanged. The loop
// below then writes one byte per page to pre-fault/map the pages.
uintptr_t alignedPtr = (((uintptr_t) ptr + pageSize - 1) & ~(pageSize - 1));
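// (e.g. with a 4 KiB page, an address ending in 0x234 rounds up to the next
// 0x...000 boundary; a page-aligned ptr is left unchanged)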
for (uintptr_t p = alignedPtr; p < ((uintptr_t) ptr + size_bytes);
p += pageSize) {
memset((void *) p, 0, 1);
}
AT_CUDA_CHECK(cudaHostRegister(
ptr, size_bytes, cudaHostRegisterMapped | cudaHostRegisterPortable));
if (copy_data) {
if (self.is_cpu()) {
memcpy(ptr, self.data_ptr(), size_bytes);
} else if (self.is_cuda()) {
AT_CUDA_CHECK(cudaMemcpy(ptr, self.data_ptr(), size_bytes, cudaMemcpyDeviceToHost));
}
}
void *dev_ptr;
AT_CUDA_CHECK(cudaHostGetDevicePointer(&dev_ptr, ptr, 0));
auto storage = Storage(
Storage::use_byte_size_t(),
size_bytes,
at::DataPtr(
dev_ptr,
new CUDAHostMappedContext(ptr, self.get_device()),
&CUDAHostMappedContext::release,
{at::DeviceType::CUDA, self.device().index()}),
nullptr, /* allocator */
/*resizable=*/false);
return at::empty({0}, self.options())
.set_(std::move(storage), 0, sizes, strides);
}
// Allocate the ATen Tensor with UVM or host-mapped memory
Tensor new_unified_tensor(
const Tensor &self,
const std::vector<std::int64_t> &sizes,
bool is_host_mapped,
bool copy_data) {
if (is_host_mapped) {
VLOG(2) << "Allocate the ATen Tensor with cudaHostAlloc";
return new_host_mapped_tensor(self, sizes, copy_data);
} else {
VLOG(2) << "Allocate the ATen Tensor with cudaMallocManaged";
return new_managed_tensor(self, sizes, copy_data);
}
}
// Check if a tensor is allocated with UVM (CPU or GPU Tensor)
bool uvm_storage(const Tensor &t) {
auto deleter = t.storage().data_ptr().get_deleter();
return deleter == &CUDAManagedIndirectContext::release ||
deleter == &CUDAHostMappedContext::release;
}
// Check if a tensor is allocated with UVM but is not a CPU Tensor
bool is_uvm_tensor(const Tensor &t) {
if (t.device().is_cpu()) {
return false;
}
return uvm_storage(t);
}
// Convert a UVM tensor to a CPU tensor
Tensor uvm_to_cpu(const Tensor &t) {
TORCH_CHECK(is_uvm_tensor(t));
// Don't copy the storage - just keep a reference to the original storage
auto *tcontext =
t.storage().data_ptr().cast_context<CUDAManagedIndirectContext>(
&CUDAManagedIndirectContext::release);
TORCH_CHECK(tcontext != nullptr)
auto *ocontext =
tcontext->storage_.data_ptr().cast_context<CUDAManagedContext>(
&CUDAManagedContext::release);
auto storage = Storage(
Storage::use_byte_size_t(),
t.storage().nbytes(),
at::DataPtr(
ocontext->ptr_,
new CUDAManagedIndirectContext(tcontext->storage_),
&CUDAManagedIndirectContext::release,
{at::DeviceType::CPU}),
nullptr, /* allocator */
/*resizable=*/false);
return at::empty({0}, t.options().device(Device::Type::CPU))
.set_(std::move(storage), t.storage_offset(), t.sizes(), t.strides());
}
// Create a new UVM tensor sharing storage with t on the same device as
// prototype
Tensor uvm_to_device(const Tensor &t, const Tensor &prototype) {
TORCH_CHECK(is_uvm_tensor(t));
// Don't copy the storage - just keep a reference to the original storage
auto *tcontext =
t.storage().data_ptr().cast_context<CUDAManagedIndirectContext>(
&CUDAManagedIndirectContext::release);
TORCH_CHECK(tcontext != nullptr)
auto device = prototype.device();
auto *ocontext =
tcontext->storage_.data_ptr().cast_context<CUDAManagedContext>(
&CUDAManagedContext::release);
auto storage = Storage(
Storage::use_byte_size_t(),
t.storage().nbytes(),
at::DataPtr(
ocontext->ptr_,
new CUDAManagedIndirectContext(tcontext->storage_),
&CUDAManagedIndirectContext::release,
device),
nullptr, /* allocator */
/*resizable=*/false);
return at::empty({0}, t.options().device(device))
.set_(std::move(storage), t.storage_offset(), t.sizes(), t.strides());
}
namespace {
int64_t uvm_get_guard_index(const Tensor &t) {
TORCH_CHECK(uvm_storage(t));
int cuda_device_index;
if (t.is_cpu()) {
auto *tcontext =
t.storage().data_ptr().cast_context<CUDAManagedIndirectContext>(
&CUDAManagedIndirectContext::release);
TORCH_CHECK(tcontext != nullptr)
auto *ocontext =
tcontext->storage_.data_ptr().cast_context<CUDAManagedContext>(
&CUDAManagedContext::release);
TORCH_CHECK(ocontext != nullptr)
cuda_device_index = static_cast<int64_t>(ocontext->cuda_device_);
} else {
TORCH_CHECK(t.is_cuda());
cuda_device_index = t.get_device();
}
return cuda_device_index;
}
} // namespace
void uvm_cuda_mem_advise(const Tensor &t, int64_t cuda_memory_advise) {
// Call cudaMemAdvise on a UVM tensor
// See cudaMemoryAdvise enum (automatically exported to python fbgemm_gpu.uvm
// namespace) for valid values and interface stub.
at::cuda::OptionalCUDAGuard device_guard;
int64_t cuda_device_index = uvm_get_guard_index(t);
int hint_device;
if (t.is_cpu()) {
hint_device = cudaCpuDeviceId;
} else {
TORCH_CHECK(t.is_cuda());
hint_device = static_cast<int>(cuda_device_index);
}
void *ptr = t.data_ptr();
size_t size_bytes = at::detail::computeStorageNbytes(
t.sizes(), t.strides(), t.dtype().itemsize());
device_guard.set_index(cuda_device_index);
// FIXME: some advanced "cudaMemAdvise" flags are not supported by HIP.
AT_CUDA_CHECK(cudaMemAdvise(
ptr,
size_bytes,
static_cast<enum cudaMemoryAdvise>(cuda_memory_advise),
hint_device));
return;
}
void uvm_cuda_mem_prefetch_async(
const Tensor &t,
c10::optional<Tensor> device_t) {
// Call cudaMemPrefetchAsync on Tensor
at::cuda::OptionalCUDAGuard device_guard;
TORCH_CHECK(uvm_storage(t));
TORCH_CHECK(t.is_cuda() || (t.is_cpu() && device_t.has_value()));
TORCH_CHECK(!device_t.has_value() || device_t.value().is_cuda());
int prefetch_device =
(t.is_cpu()) ? cudaCpuDeviceId : static_cast<int>(t.get_device());
const Tensor &context_t = device_t.has_value() ? device_t.value() : t;
void *ptr = t.data_ptr();
size_t size_bytes = at::detail::computeStorageNbytes(
t.sizes(), t.strides(), t.dtype().itemsize());
device_guard.set_index(context_t.get_device());
auto stream = at::cuda::getCurrentCUDAStream();
AT_CUDA_CHECK(cudaMemPrefetchAsync(ptr, size_bytes, prefetch_device, stream));
return;
}
void uvm_mem_advice_dont_fork(const Tensor &t) {
// During fork() the uvm driver is called to copy VMA for UVM space.
// The uvm driver then removes pmap entries for both child and parent.
// Re-establishing the mappings for the parent is slow.
// This works around the issue by setting the UVM VMA to not be copied
// into the child.
TORCH_CHECK(uvm_storage(t));
void *ptr = t.data_ptr();
size_t size_bytes = at::detail::computeStorageNbytes(
t.sizes(), t.strides(), t.dtype().itemsize());
auto adjusted = adjust_to_page_boundaries(ptr, size_bytes);
int result =
madvise(std::get<0>(adjusted), std::get<1>(adjusted), MADV_DONTFORK);
TORCH_CHECK_EQ(result, 0);
return;
}
Tensor uvm_to_cpu_clone(const Tensor &t) {
TORCH_CHECK(uvm_storage(t));
TORCH_CHECK(t.is_contiguous());
Tensor cpu_clone = at::empty_like(t, t.options().device(kCPU));
size_t size_bytes = at::detail::computeStorageNbytes(
t.sizes(), t.strides(), t.dtype().itemsize());
memcpy(cpu_clone.data_ptr(), t.data_ptr(), size_bytes);
return cpu_clone;
}
// FBGEMM_GPU_ENUM_GLOGAL(uvm)
// FBGEMM_GPU_ENUM_REGISTER_START(uvm, cudaMemory, Advise){
// FBGEMM_GPU_ENUM_ITEM(
// cudaMem,
// AdviseSetReadMostly,
// cudaMemAdviseSetReadMostly),
// FBGEMM_GPU_ENUM_ITEM(
// cudaMem,
// AdviseUnsetReadMostly,
// cudaMemAdviseUnsetReadMostly),
// FBGEMM_GPU_ENUM_ITEM(
// cudaMem,
// AdviseSetPreferredLocation,
// cudaMemAdviseSetPreferredLocation),
// FBGEMM_GPU_ENUM_ITEM(
// cudaMem,
// AdviseUnsetPreferredLocation,
// cudaMemAdviseUnsetPreferredLocation),
// FBGEMM_GPU_ENUM_ITEM(
// cudaMem,
// AdviseSetAccessedBy,
// cudaMemAdviseSetAccessedBy),
// FBGEMM_GPU_ENUM_ITEM(
// cudaMem,
// AdviseUnsetAccessedBy,
// cudaMemAdviseUnsetAccessedBy),
// } FBGEMM_GPU_ENUM_REGISTER_END
} // namespace uvm
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <torch/library.h>
#include "common.cuh"
//#define DISPATCH_TO_CPU(name, function) \
// m.impl(name, torch::dispatch(c10::DispatchKey::CPU, TORCH_FN(function)))
//
//#define DISPATCH_TO_META(name, function) \
// m.impl(name, torch::dispatch(c10::DispatchKey::Meta, TORCH_FN(function)))
using Tensor = at::Tensor;
namespace uvm {
TORCH_LIBRARY_FRAGMENT(uvm, m) {
m.def("is_uvm_tensor(Tensor t) -> bool", TORCH_FN(is_uvm_tensor));
m.def("uvm_storage(Tensor t) -> bool", TORCH_FN(uvm_storage));
m.def(
"uvm_to_device(Tensor self, Tensor prototype) -> Tensor",
TORCH_FN(uvm_to_device));
m.def("uvm_to_cpu(Tensor t) -> Tensor");
m.def("new_managed_tensor(Tensor self, int[] sizes, bool copy_data) -> Tensor");
m.def("new_host_mapped_tensor(Tensor self, int[] sizes, bool copy_data) -> Tensor");
m.def(
"new_unified_tensor(Tensor self, int[] sizes, bool is_host_mapped, bool copy_data) -> Tensor");
m.def("new_vanilla_managed_tensor(Tensor self, int[] sizes, bool copy_data) -> Tensor");
m.def(
"cuda_mem_advise(Tensor t, int advice) -> ()",
TORCH_FN(uvm_cuda_mem_advise));
m.def(
"cuda_mem_prefetch_async(Tensor t, Tensor? device_t) -> ()",
TORCH_FN(uvm_cuda_mem_prefetch_async));
m.def(
"uvm_mem_advice_dont_fork(Tensor t) -> ()",
TORCH_FN(uvm_mem_advice_dont_fork));
m.def("uvm_to_cpu_clone(Tensor t) -> Tensor", TORCH_FN(uvm_to_cpu_clone));
// m.def(FBGEMM_GPU_ENUM_OP(uvm, fbgemm_gpu_uvm_enum_query));
}
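// Once this library is loaded from Python via torch.ops.load_library (as in
// the notebook further below), the schemas defined above are reachable as
// torch.ops.uvm.<name>, e.g. torch.ops.uvm.new_unified_tensor(t, sizes,
// is_host_mapped, copy_data).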
TORCH_LIBRARY_FRAGMENT(uvm, m) {
DISPATCH_TO_CPU("new_unified_tensor", new_unified_tensor_cpu);
DISPATCH_TO_META("new_managed_tensor_meta", new_managed_tensor_meta);
}
} // namespace uvm
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
#include <torch/library.h>
#include "common.cuh"
namespace uvm {
TORCH_LIBRARY_FRAGMENT(uvm, m) {
DISPATCH_TO_CUDA("uvm_to_cpu", uvm_to_cpu);
DISPATCH_TO_CUDA("new_managed_tensor", new_managed_tensor);
DISPATCH_TO_META("new_managed_tensor", new_managed_tensor_meta);
DISPATCH_TO_CUDA("new_host_mapped_tensor", new_host_mapped_tensor);
DISPATCH_TO_CUDA("new_unified_tensor", new_unified_tensor);
DISPATCH_TO_CUDA("new_vanilla_managed_tensor", new_vanilla_managed_tensor);
}
} // namespace uvm
import subprocess
from distutils.sysconfig import get_python_lib
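# Configure the CMake build in ./build, putting the Python site-packages
# directory on CMAKE_PREFIX_PATH (presumably so find_package(Torch) can locate
# the pip-installed PyTorch cmake config).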
cmake_args = [
"-DCMAKE_PREFIX_PATH=" + get_python_lib(),
]
subprocess.check_call(["cmake", "-B", "build"] + cmake_args)
# subprocess.check_call1
cmake_minimum_required(VERSION 3.10)
set(CUDAToolkit_ROOT /home/wtx/.local/cuda-11.7)
set(CMAKE_CUDA_ARCHITECTURES 86)
set(CMAKE_CUDA_COMPILER /home/wtx/.local/cuda-11.7/bin/nvcc)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
list(APPEND CMAKE_PREFIX_PATH /home/wtx/local/pkg/libtorch/share/cmake/Torch)
set(Python3_ROOT_DIR /home/wtx/miniconda3/envs/dgl )
set(CUDAToolkit_INCLUDE_DIR /home/wtx/.local/cuda11.7+cudnn8.9)
project(uvm_test CXX CUDA)
set(ENV{Torch_DIR} /home/wtx/miniconda3/pkgs/pytorch-1.13.1-py3.10_cuda11.7_cudnn8.5.0_0/lib/python3.10/site-packages/torch/share/cmake/Torch)
find_package(Torch REQUIRED)
include_directories(${TORCH_INCLUDE_DIRS})
message(STATUS "torch: ${TORCH_INCLUDE_DIRS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
# include_directories(/home/wtx/workspace/cpp_project/uvm/memory_utils_old)
include_directories("../include")
add_executable(uvm main.cpp)
set(ENV{LD_LIBRARY_PATH} "$ENV{LD_LIBRARY_PATH}:/home/wtx/workspace/cpp_project/uvm/test")
target_link_libraries(uvm /home/wtx/workspace/cpp_project/uvm/test/libuvm_utils.so)
#include "uvm_utils.h"
#include <iostream>
using std::cout;
using std::endl;
int main(){
auto t = at::ones({1,1}, c10::TensorOptions());
cout << uvm::uvm_storage(t) << endl;
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import torch\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"a = torch.tensor([[1,2,3],[4,5,6]])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([2, 3])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"b = a.t()\n",
"b[0,0] = 100"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"id(b.storage()) == id(a.storage())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor(100)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a[0,0]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0, 0)"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.storage_offset(), b.storage_offset()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((3, 1), (1, 3))"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a.stride(), b.stride()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"c = b[(0,2),:]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(tensor([[100, 4],\n",
" [ 3, 6]]),\n",
" tensor([[100, 4],\n",
" [ 2, 5],\n",
" [ 3, 6]]),\n",
" tensor([[100, 2, 3],\n",
" [ 4, 5, 6]]))"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c, b, a"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2, 1)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c.stride()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"a = torch.arange(1,11).reshape(2,5)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 1, 2, 3, 4, 5],\n",
" [ 6, 7, 8, 9, 10]])"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[ 1, 6],\n",
" [ 2, 7],\n",
" [ 3, 8],\n",
" [ 4, 9],\n",
" [ 5, 10]])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"b = a.t()\n",
"b"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([2, 3])"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"c = b[(1,3,4),:].t()\n",
"c.shape"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"torch.Size([10, 2])"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a = torch.arange(20).reshape(10,2)\n",
"a.shape"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"torch.Size([3, 2])\n",
" 8\n",
" 9\n",
" 10\n",
" 11\n",
" 18\n",
" 19\n",
"[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 6]\n",
"0\n",
"(2, 1)\n"
]
}
],
"source": [
"b = a[(4,5,9), :]\n",
"print(id(b.storage()) == id(a.storage()))\n",
"print(b.shape)\n",
"print(b.storage())\n",
"print(b.storage_offset())\n",
"print(b.stride())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dgl",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import torch \n",
"\n",
"torch.ops.load_library(\"cmake-build-debug/libuvm_utils.so\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"a = torch.arange(1,10).reshape(3,3).cuda()\n",
"a = torch.empty(0).cuda()\n",
"# x = torch.ops.fbgemm.new_managed_tensor(a, (3,3))\n",
"# print(x)\n",
"# torch.ops.fbgemm.is_uvm_tensor(x)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<module 'torch.ops.uvm' from 'torch.ops'>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.ops.uvm"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[1., 1.],\n",
" [1., 1.]])\n",
"<bound method Tensor.storage of tensor([])>\n",
"tensor([])\n"
]
}
],
"source": [
"y = torch.ones((2,2))\n",
"z = torch.ops.uvm.new_unified_tensor(y,(2,2),True,True)\n",
"print(y)\n",
"print(z.storage)\n",
"print(z)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([])\n"
]
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "dgl",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"libcudart.so.12: cannot open shared object file: No such file or directory\n"
]
},
{
"ename": "AttributeError",
"evalue": "'_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m~/miniconda3/envs/fbgemm/lib/python3.10/site-packages/torch/_ops.py:565\u001b[0m, in \u001b[0;36m_OpNamespace.__getattr__\u001b[0;34m(self, op_name)\u001b[0m\n\u001b[1;32m 564\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 565\u001b[0m op, overload_names \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39;49m_C\u001b[39m.\u001b[39;49m_jit_get_operation(qualified_op_name)\n\u001b[1;32m 566\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 567\u001b[0m \u001b[39m# Turn this into AttributeError so getattr(obj, key, default)\u001b[39;00m\n\u001b[1;32m 568\u001b[0m \u001b[39m# works (this is called by TorchScript with __origin__)\u001b[39;00m\n",
"\u001b[0;31mRuntimeError\u001b[0m: No such operator fbgemm::jagged_2d_to_dense",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/home/wtx/workspace/cpp_project/uvm/uvm.ipynb Cell 1\u001b[0m line \u001b[0;36m2\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B10.214.211.184/home/wtx/workspace/cpp_project/uvm/uvm.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mtorch\u001b[39;00m\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2B10.214.211.184/home/wtx/workspace/cpp_project/uvm/uvm.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=1'>2</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mfbgemm_gpu\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B10.214.211.184/home/wtx/workspace/cpp_project/uvm/uvm.ipynb#W0sdnNjb2RlLXJlbW90ZQ%3D%3D?line=2'>3</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mfbgemm_gpu\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39muvm\u001b[39;00m \u001b[39mimport\u001b[39;00m cudaMemAdvise, cudaMemPrefetchAsync, cudaMemoryAdvise\n",
"File \u001b[0;32m~/miniconda3/envs/fbgemm/lib/python3.10/site-packages/fbgemm_gpu/__init__.py:22\u001b[0m\n\u001b[1;32m 19\u001b[0m open_source: \u001b[39mbool\u001b[39m \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[39m# Re-export docs\u001b[39;00m\n\u001b[0;32m---> 22\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m \u001b[39mimport\u001b[39;00m _fbgemm_gpu_docs \u001b[39m# noqa: F401, E402\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[39m# Re-export the version string from the auto-generated version file\u001b[39;00m\n\u001b[1;32m 25\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m_fbgemm_gpu_version\u001b[39;00m \u001b[39mimport\u001b[39;00m __version__ \u001b[39m# noqa: F401, E402\u001b[39;00m\n",
"File \u001b[0;32m~/miniconda3/envs/fbgemm/lib/python3.10/site-packages/fbgemm_gpu/_fbgemm_gpu_docs.py:19\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39madd_docs\u001b[39m(method, docstr):\n\u001b[1;32m 15\u001b[0m method\u001b[39m.\u001b[39m\u001b[39m__doc__\u001b[39m \u001b[39m=\u001b[39m docstr\n\u001b[1;32m 18\u001b[0m add_docs(\n\u001b[0;32m---> 19\u001b[0m torch\u001b[39m.\u001b[39;49mops\u001b[39m.\u001b[39;49mfbgemm\u001b[39m.\u001b[39;49mjagged_2d_to_dense,\n\u001b[1;32m 20\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \u001b[39mjagged_2d_to_dense(values, x_offsets, max_sequence_length) -> Tensor\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \n\u001b[1;32m 23\u001b[0m \u001b[39mConverts a jagged tensor, with a 2D values array into a dense tensor, padding with zeros.\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \n\u001b[1;32m 25\u001b[0m \u001b[39mArgs:\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[39m values (Tensor): 2D tensor containing the values of the jagged tensor.\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \n\u001b[1;32m 28\u001b[0m \u001b[39m x_offsets (Tensor): 1D tensor containing the starting point of each jagged row in the values tensor.\u001b[39;00m\n\u001b[1;32m 29\u001b[0m \n\u001b[1;32m 30\u001b[0m \u001b[39m max_sequence_length (int): Maximum length of any row in the jagged dimension.\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \n\u001b[1;32m 32\u001b[0m \u001b[39mReturns:\u001b[39;00m\n\u001b[1;32m 33\u001b[0m \u001b[39m Tensor: The padded dense tensor\u001b[39;00m\n\u001b[1;32m 34\u001b[0m \n\u001b[1;32m 35\u001b[0m \u001b[39mExample:\u001b[39;00m\n\u001b[1;32m 36\u001b[0m \u001b[39m >>> values = torch.tensor([[1,1],[2,2],[3,3],[4,4]])\u001b[39;00m\n\u001b[1;32m 37\u001b[0m \u001b[39m >>> x_offsets = torch.tensor([0, 1, 3])\u001b[39;00m\n\u001b[1;32m 38\u001b[0m \u001b[39m >>> torch.ops.fbgemm.jagged_2d_to_dense(values, x_offsets, 3)\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \u001b[39m tensor([[[1, 1],\u001b[39;00m\n\u001b[1;32m 40\u001b[0m \u001b[39m [0, 0],\u001b[39;00m\n\u001b[1;32m 41\u001b[0m \u001b[39m [0, 0]],\u001b[39;00m\n\u001b[1;32m 42\u001b[0m \u001b[39m [[2, 2],\u001b[39;00m\n\u001b[1;32m 43\u001b[0m \u001b[39m [3, 3],\u001b[39;00m\n\u001b[1;32m 44\u001b[0m \u001b[39m [0, 0]]])\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \n\u001b[1;32m 46\u001b[0m \u001b[39m\"\"\"\u001b[39;00m,\n\u001b[1;32m 47\u001b[0m )\n\u001b[1;32m 49\u001b[0m \u001b[39m# Example:\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[39m#\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[39m# >>> t = torch.arange(4)\u001b[39;00m\n\u001b[1;32m 54\u001b[0m add_docs(\n\u001b[1;32m 55\u001b[0m torch\u001b[39m.\u001b[39mops\u001b[39m.\u001b[39mfbgemm\u001b[39m.\u001b[39mjagged_1d_to_dense,\n\u001b[1;32m 56\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[39m\"\"\"\u001b[39;00m,\n\u001b[1;32m 81\u001b[0m )\n",
"File \u001b[0;32m~/miniconda3/envs/fbgemm/lib/python3.10/site-packages/torch/_ops.py:569\u001b[0m, in \u001b[0;36m_OpNamespace.__getattr__\u001b[0;34m(self, op_name)\u001b[0m\n\u001b[1;32m 565\u001b[0m op, overload_names \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39m_C\u001b[39m.\u001b[39m_jit_get_operation(qualified_op_name)\n\u001b[1;32m 566\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 567\u001b[0m \u001b[39m# Turn this into AttributeError so getattr(obj, key, default)\u001b[39;00m\n\u001b[1;32m 568\u001b[0m \u001b[39m# works (this is called by TorchScript with __origin__)\u001b[39;00m\n\u001b[0;32m--> 569\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mAttributeError\u001b[39;00m(\n\u001b[1;32m 570\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m'\u001b[39m\u001b[39m_OpNamespace\u001b[39m\u001b[39m'\u001b[39m\u001b[39m \u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mname\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m object has no attribute \u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mop_name\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 571\u001b[0m ) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 573\u001b[0m \u001b[39m# let the script frontend know that op is identical to the builtin op\u001b[39;00m\n\u001b[1;32m 574\u001b[0m \u001b[39m# with qualified_op_name\u001b[39;00m\n\u001b[1;32m 575\u001b[0m torch\u001b[39m.\u001b[39mjit\u001b[39m.\u001b[39m_builtins\u001b[39m.\u001b[39m_register_builtin(op, qualified_op_name)\n",
"\u001b[0;31mAttributeError\u001b[0m: '_OpNamespace' 'fbgemm' object has no attribute 'jagged_2d_to_dense'"
]
}
],
"source": [
"import torch\n",
"import fbgemm_gpu\n",
"from fbgemm_gpu.uvm import cudaMemAdvise, cudaMemPrefetchAsync, cudaMemoryAdvise"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "RuntimeError",
"evalue": "The NVIDIA driver on your system is too old (found version 11070). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/home/wtx/workspace/cpp_project/uvm/uvm.ipynb Cell 2\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2B10.214.211.184/home/wtx/workspace/cpp_project/uvm/uvm.ipynb#W1sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m ref \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39;49mzeros(\u001b[39m1\u001b[39;49m, dtype\u001b[39m=\u001b[39;49mtorch\u001b[39m.\u001b[39;49mfloat32, device\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mcuda:0\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B10.214.211.184/home/wtx/workspace/cpp_project/uvm/uvm.ipynb#W1sdnNjb2RlLXJlbW90ZQ%3D%3D?line=1'>2</a>\u001b[0m x \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mops\u001b[39m.\u001b[39mfbgemm\u001b[39m.\u001b[39mnew_managed_tensor(ref, (\u001b[39m10\u001b[39m, \u001b[39m10\u001b[39m))\n",
"File \u001b[0;32m~/miniconda3/envs/fbgemm/lib/python3.10/site-packages/torch/cuda/__init__.py:298\u001b[0m, in \u001b[0;36m_lazy_init\u001b[0;34m()\u001b[0m\n\u001b[1;32m 296\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m\"\u001b[39m\u001b[39mCUDA_MODULE_LOADING\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m os\u001b[39m.\u001b[39menviron:\n\u001b[1;32m 297\u001b[0m os\u001b[39m.\u001b[39menviron[\u001b[39m\"\u001b[39m\u001b[39mCUDA_MODULE_LOADING\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mLAZY\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m--> 298\u001b[0m torch\u001b[39m.\u001b[39;49m_C\u001b[39m.\u001b[39;49m_cuda_init()\n\u001b[1;32m 299\u001b[0m \u001b[39m# Some of the queued calls may reentrantly call _lazy_init();\u001b[39;00m\n\u001b[1;32m 300\u001b[0m \u001b[39m# we need to just return without initializing in that case.\u001b[39;00m\n\u001b[1;32m 301\u001b[0m \u001b[39m# However, we must not let any *other* threads in!\u001b[39;00m\n\u001b[1;32m 302\u001b[0m _tls\u001b[39m.\u001b[39mis_initializing \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n",
"\u001b[0;31mRuntimeError\u001b[0m: The NVIDIA driver on your system is too old (found version 11070). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver."
]
}
],
"source": [
"ref = torch.zeros(1, dtype=torch.float32, device=\"cuda:0\")\n",
"x = torch.ops.fbgemm.new_managed_tensor(ref, (10, 10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"values = torch.tensor([[1,1],[2,2],[3,3],[4,4]])\n",
"x_offsets = torch.tensor([0, 4, 4])\n",
"m = torch.ops.fbgemm.jagged_2d_to_dense(values, x_offsets, 3)\n",
"m"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"torch.ops.fbgemm.dense_to_jagged(m, [x_offsets])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y = torch.randn(10, 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x[:] = y"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y[:] = x"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"z = x + 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"torch.ops.fbgemm.uvm_storage(x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"torch.ops.fbgemm.fbgemm_gpu_uvm_enum_query()[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cudaMemAdvise(x, cudaMemoryAdvise.cudaMemAdviseSetReadMostly)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cudaMemPrefetchAsync(x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"torch.ops.fbgemm.is_uvm_tensor(x[:1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"torch.ops.fbgemm.uvm_to_cpu(x).data_ptr() == x.data_ptr()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x2 = torch.ops.fbgemm.uvm_to_device(x, x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x[:1] = 100"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"x"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fbgemm_gpu.split_table_batched_embeddings_ops import EmbeddingLocation, SplitTableBatchedEmbeddingBagsCodegen, ComputeDevice, DenseTableBatchedEmbeddingBagsCodegen"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embedding_specs = [\n",
" (3, 8, EmbeddingLocation.DEVICE, ComputeDevice.CUDA),\n",
" (5, 4, EmbeddingLocation.MANAGED, ComputeDevice.CUDA),\n",
"]\n",
"\n",
"tbe = SplitTableBatchedEmbeddingBagsCodegen(embedding_specs)\n",
"tbe.init_embedding_weights_uniform(-1, 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tbe.weights_dev.copy_(torch.arange(3, device=\"cuda:0\").view(-1, 1).repeat(1, 8).flatten())\n",
"tbe.weights_uvm.copy_((torch.arange(5) + 3).view(-1, 1).repeat(1, 4).flatten())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tbe.split_embedding_weights()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"0 1\n",
"2 0 1\n",
"2 0\n",
"3 1\n",
"4 2 0\n",
"0\n",
"\"\"\"\n",
"indices = torch.tensor([0, 1, 2, 0, 1, 2, 0, 3, 1, 4, 2, 0, 0], device=\"cuda\", dtype=torch.long)\n",
"offsets = torch.tensor([0, 2, 5, 7, 9, 12, 13], device=\"cuda\", dtype=torch.long)\n",
"tbe(indices, offsets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# torch.ops.fbgemm.FloatToFused8BitRowwiseQuantized\n",
"# torch.ops.fbgemm.FloatToFP8RowwiseQuantized"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"DenseTableBatchedEmbeddingBagsCodegen"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "llm",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}