Commit f081fb39 by tianxing wang

finish fifo cache

parent 52df1e52
...@@ -10,7 +10,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS on) ...@@ -10,7 +10,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS on)
# gpucache # gpucache
file(GLOB SOURCE_FILES file(GLOB SOURCE_FILES
# ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/* # ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/*
# ${CMAKE_CURRENT_SOURCE_DIR}/src/hash/*.cuh ${CMAKE_CURRENT_SOURCE_DIR}/src/hash/*.cu
${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu
) )
...@@ -35,8 +35,9 @@ target_compile_options(${cache_lib_name} PRIVATE ${TORCH_CXX_FLAGS} -O3) ...@@ -35,8 +35,9 @@ target_compile_options(${cache_lib_name} PRIVATE ${TORCH_CXX_FLAGS} -O3)
target_link_libraries(${cache_lib_name} PRIVATE ${TORCH_LIBRARIES}) target_link_libraries(${cache_lib_name} PRIVATE ${TORCH_LIBRARIES})
target_compile_definitions(${cache_lib_name} PRIVATE -DTORCH_EXTENSION_NAME=lib${cache_lib_name}) target_compile_definitions(${cache_lib_name} PRIVATE -DTORCH_EXTENSION_NAME=lib${cache_lib_name})
find_library(TORCH_PYTHON_LIBRARY torch_python "${TORCH_INSTALL_PREFIX}/lib" REQUIRED) find_library(TORCH_PYTHON_LIBRARY torch_python "${TORCH_INSTALL_PREFIX}/lib")
target_link_libraries(${cache_lib_name} PRIVATE ${TORCH_PYTHON_LIBRARY}) target_link_libraries(${cache_lib_name} PRIVATE ${TORCH_PYTHON_LIBRARY})
message(STATUS "TORCH_PYTHON_LIBRARY: " ${TORCH_PYTHON_LIBRARY} )
......
No preview for this file type
...@@ -2,7 +2,7 @@ TODO ...@@ -2,7 +2,7 @@ TODO
[] LRU [] LRU
[] FIFO [] FIFO
[] LFU [] LFU
......
#pragma once #pragma once
#include <cstdint> #include <cstdint>
namespace gpucache { namespace gpucache {
struct CacheConfig { struct CacheConfig {
...@@ -48,9 +48,6 @@ namespace gpucache { ...@@ -48,9 +48,6 @@ namespace gpucache {
virtual void Put(cudaStream_t stream, uint32_t num_keys, KeyType *keys, ElemType *values,uint32_t *n_evict, KeyType* evict_keys) = 0; virtual void Put(cudaStream_t stream, uint32_t num_keys, KeyType *keys, ElemType *values,uint32_t *n_evict, KeyType* evict_keys) = 0;
// virtual void* Mutex() = 0;
virtual void Clear() = 0; virtual void Clear() = 0;
virtual uint32_t MaxQueryNum() = 0; virtual uint32_t MaxQueryNum() = 0;
......
#pragma once #pragma once
#include <c10/cuda/CUDAStream.h> #include <c10/cuda/CUDAStream.h>
#include <torch/extension.h> #include <torch/extension.h>
#include <cstdint> #include <cstdint>
#include <assert.h> #include <assert.h>
#include <stdio.h> #include <stdio.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <vector> #include <memory>
#include "utils.cuh"
#include "cache.h"
#include "hash/hash_function.cuh"
#define CUDA_CHECK(call) \ #define CUDA_CHECK(call) \
{ \ { \
const cudaError_t error = call; \ const cudaError_t error = call; \
if (error != cudaSuccess) \ if (error != cudaSuccess) \
...@@ -20,3 +22,5 @@ ...@@ -20,3 +22,5 @@
cudaGetErrorString(error)); \ cudaGetErrorString(error)); \
} \ } \
} }
#include "common.cuh"
#include "lru_cache.h"
#include <pybind11/pybind11.h>
namespace py = pybind11;
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
py::class_<gpucache::CacheConfig> cfg(m,"CacheConfig");
cfg.def(py::init<>())
.def(py::init<gpucache::CacheConfig::CacheEvictStrategy, uint64_t, uint32_t, uint32_t, uint32_t, int8_t,uint32_t>())
.def_readwrite("strategy", &gpucache::CacheConfig::strategy)
.def_readwrite("capacity",&gpucache::CacheConfig::capacity)
.def_readwrite("key_size", &gpucache::CacheConfig::keySize)
.def_readwrite("value_size",&gpucache::CacheConfig::valueSize)
.def_readwrite("max_query_num", &gpucache::CacheConfig::maxQueryNum)
.def_readwrite("device_id",&gpucache::CacheConfig::deviceId)
.def_readwrite("dim",&gpucache::CacheConfig::dim);
py::enum_<gpucache::CacheConfig::CacheEvictStrategy>(cfg,"CacheEvictStrategy")
.value("LRU",gpucache::CacheConfig::CacheEvictStrategy::LRU)
.value("LFU",gpucache::CacheConfig::CacheEvictStrategy::LFU)
.value("FIFO",gpucache::CacheConfig::CacheEvictStrategy::FIFO)
.export_values();
py::class_<gpucache::LRUCacheWrapper> lru_cache(m, "LRUCache");
lru_cache
.def(py::init<at::Tensor, gpucache::CacheConfig>())
.def("Get",&gpucache::LRUCacheWrapper::Get,"get values for keys, find_mask return whether each key exists in cache")
.def("Put",&gpucache::LRUCacheWrapper::Put,"put key-value pairs")
.def("Strategy",&gpucache::LRUCacheWrapper::Strategy,"get evict strategy")
.def("Capacity",&gpucache::LRUCacheWrapper::Capacity,"return cache capacity")
.def("KeySize",&gpucache::LRUCacheWrapper::KeySize,"return key size")
.def("ValueSize",&gpucache::LRUCacheWrapper::ValueSize,"return value size")
.def("MaxQueryNum",&gpucache::LRUCacheWrapper::MaxQueryNum,"return max number of keys to get or key-values to put once")
.def("Clear",&gpucache::LRUCacheWrapper::Clear,"clear cache")
.def("Device",&gpucache::LRUCacheWrapper::DeviceId,"return device id")
.def("Dim",&gpucache::LRUCacheWrapper::Dim,"return value dim");
m.def("NewLRUCache", &gpucache::NewLRUCache, "create a lru cache",py::return_value_policy::reference);
}
#include "lru_cache.h"
#include "fifo_cache.h"
#include <pybind11/pybind11.h>
namespace py = pybind11;
// Python bindings for the gpucache extension module.
// Registers CacheConfig (with its nested CacheEvictStrategy enum), the LRUCache and
// FIFOCache wrapper classes, and the NewLRUCache / NewFIFOCache factory functions.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
/* CacheConfig */
/*-----------------------------------------------------------------------------------------------------------------------------------*/
py::class_<gpucache::CacheConfig> cfg(m,"CacheConfig");
// Two constructors: default, and a positional one taking seven arguments.
// The Python attribute names below mirror the C++ member names (camelCase).
cfg.def(py::init<>())
.def(py::init<gpucache::CacheConfig::CacheEvictStrategy, uint64_t, uint32_t, uint32_t, uint32_t, int8_t,uint32_t>())
.def_readwrite("strategy", &gpucache::CacheConfig::strategy)
.def_readwrite("capacity",&gpucache::CacheConfig::capacity)
.def_readwrite("keySize", &gpucache::CacheConfig::keySize)
.def_readwrite("valueSize",&gpucache::CacheConfig::valueSize)
.def_readwrite("maxQueryNum", &gpucache::CacheConfig::maxQueryNum)
.def_readwrite("deviceId",&gpucache::CacheConfig::deviceId)
.def_readwrite("dim",&gpucache::CacheConfig::dim);
// The enum is nested under CacheConfig; export_values() additionally makes LRU / LFU / FIFO
// reachable directly as CacheConfig.LRU etc.
py::enum_<gpucache::CacheConfig::CacheEvictStrategy>(cfg,"CacheEvictStrategy")
.value("LRU",gpucache::CacheConfig::CacheEvictStrategy::LRU)
.value("LFU",gpucache::CacheConfig::CacheEvictStrategy::LFU)
.value("FIFO",gpucache::CacheConfig::CacheEvictStrategy::FIFO)
.export_values();
/*-----------------------------------------------------------------------------------------------------------------------------------*/
/* lrucache */
/*-----------------------------------------------------------------------------------------------------------------------------------*/
py::class_<gpucache::lrucache::LRUCacheWrapper> lru_cache(m, "LRUCache");
// Note: the Python method is named "Device" while the C++ method is DeviceId.
lru_cache
.def(py::init<at::Tensor, gpucache::CacheConfig>())
.def("Get",&gpucache::lrucache::LRUCacheWrapper::Get,"get values for keys, find_mask return whether each key exists in cache")
.def("Put",&gpucache::lrucache::LRUCacheWrapper::Put,"put key-value pairs")
.def("Strategy",&gpucache::lrucache::LRUCacheWrapper::Strategy,"get evict strategy")
.def("Capacity",&gpucache::lrucache::LRUCacheWrapper::Capacity,"return cache capacity")
.def("KeySize",&gpucache::lrucache::LRUCacheWrapper::KeySize,"return key size")
.def("ValueSize",&gpucache::lrucache::LRUCacheWrapper::ValueSize,"return value size")
.def("MaxQueryNum",&gpucache::lrucache::LRUCacheWrapper::MaxQueryNum,"return max number of keys to get or key-values to put once")
.def("Clear",&gpucache::lrucache::LRUCacheWrapper::Clear,"clear cache")
.def("Device",&gpucache::lrucache::LRUCacheWrapper::DeviceId,"return device id")
.def("Dim",&gpucache::lrucache::LRUCacheWrapper::Dim,"return value dim");
// NOTE(review): the factory is declared to return std::unique_ptr; the `reference` policy
// presumably has no effect for smart-pointer returns -- confirm against the pybind11 docs.
m.def("NewLRUCache", &gpucache::lrucache::NewLRUCache, "create a lru cache",py::return_value_policy::reference);
/*-----------------------------------------------------------------------------------------------------------------------------------*/
/* fifocache */
/*-----------------------------------------------------------------------------------------------------------------------------------*/
py::class_<gpucache::fifocache::FIFOCacheWrapper> fifo_cache(m, "FIFOCache");
// Same method set and Python names as the LRUCache binding above.
fifo_cache
.def(py::init<at::Tensor, gpucache::CacheConfig>())
.def("Get",&gpucache::fifocache::FIFOCacheWrapper::Get,"get values for keys, find_mask return whether each key exists in cache")
.def("Put",&gpucache::fifocache::FIFOCacheWrapper::Put,"put key-value pairs")
.def("Strategy",&gpucache::fifocache::FIFOCacheWrapper::Strategy,"get evict strategy")
.def("Capacity",&gpucache::fifocache::FIFOCacheWrapper::Capacity,"return cache capacity")
.def("KeySize",&gpucache::fifocache::FIFOCacheWrapper::KeySize,"return key size")
.def("ValueSize",&gpucache::fifocache::FIFOCacheWrapper::ValueSize,"return value size")
.def("MaxQueryNum",&gpucache::fifocache::FIFOCacheWrapper::MaxQueryNum,"return max number of keys to get or key-values to put once")
.def("Clear",&gpucache::fifocache::FIFOCacheWrapper::Clear,"clear cache")
.def("Device",&gpucache::fifocache::FIFOCacheWrapper::DeviceId,"return device id")
.def("Dim",&gpucache::fifocache::FIFOCacheWrapper::Dim,"return value dim");
m.def("NewFIFOCache", &gpucache::fifocache::NewFIFOCache, "create a fifo cache",py::return_value_policy::reference);
/*-----------------------------------------------------------------------------------------------------------------------------------*/
}
#pragma once
#include <torch/extension.h>
#include "cache.h"
namespace gpucache {
namespace fifocache {
// Type-erased, Torch-facing handle to a FIFO cache; this is the class exposed to Python
// as "FIFOCache".
// NOTE(review): the class holds a raw `void *` and declares a destructor, but copy
// construction/assignment are not deleted. If the destructor frees `fifo_cache`, an
// accidental copy would double-free -- confirm in the implementation file.
class FIFOCacheWrapper {
public:
// `t` is a sample tensor; presumably only its dtype is used, to select the value element
// type -- confirm in the implementation. `cfg` carries the cache configuration.
FIFOCacheWrapper(at::Tensor t, CacheConfig cfg);
~FIFOCacheWrapper();
// Looks up `num_query` keys. Returns a pair that callers unpack as (values, find_mask),
// where find_mask tells whether each key exists in the cache.
std::pair<torch::Tensor, torch::Tensor> Get(uint32_t num_query, const torch::Tensor queries);
// Stores `num_query` key-value pairs.
void Put(uint32_t num_query, const torch::Tensor keys, const torch::Tensor values);
// Read-only accessors; presumably forwarded from the configuration -- confirm.
CacheConfig::CacheEvictStrategy Strategy();
uint64_t Capacity();
uint32_t KeySize();
uint32_t ValueSize();
uint32_t MaxQueryNum();
// NOTE(review): returns uint64_t, whereas the device id passed to the CacheConfig
// constructor binding is int8_t -- confirm the widening is intended.
uint64_t DeviceId();
uint32_t Dim();
// Clears the cache.
void Clear();
private:
// Opaque pointer to the underlying cache object; presumably a FIFOCache<Key, Elem>
// instantiation chosen from kdtype/dtype -- confirm in the implementation.
void *fifo_cache;
c10::ScalarType dtype; // value dtype
c10::ScalarType kdtype; // key dtype
bool key_is_int32; // only support int32 and int64
CacheConfig cache_cfg;
};
// Factory for a heap-allocated wrapper; exposed to Python as NewFIFOCache.
std::unique_ptr<FIFOCacheWrapper> NewFIFOCache(at::Tensor t, CacheConfig cfg);
} // namespace fifocache
} // namespace gpucache
\ No newline at end of file
#pragma once #pragma once
#include <string> #include <cuda_runtime.h>
#include "murmurhash3.cuh"
constexpr uint32_t lruSeed = 0X12fb73ac; constexpr uint32_t lruSeed = 0X12fb73ac;
__device__ void MurmurHash3_x86_32 ( const void * key, int len,
uint32_t seed, void * out );
template<typename T> template<typename T>
__device__ size_t getHash(const T& obj){ __device__ size_t getHash(const T& obj){
size_t hash; size_t hash;
MurmurHash3_x86_32(reinterpret_cast<const void*>(&obj),sizeof(T),lruSeed,reinterpret_cast<void*>(&hash)); MurmurHash3_x86_32(reinterpret_cast<const void*>(&obj),sizeof(T),lruSeed,reinterpret_cast<void*>(&hash));
return hash; return hash;
} }
\ No newline at end of file
...@@ -11,7 +11,6 @@ ...@@ -11,7 +11,6 @@
// Platform-specific functions and macros // Platform-specific functions and macros
// Microsoft Visual Studio // Microsoft Visual Studio
#if defined(_MSC_VER) #if defined(_MSC_VER)
#define FORCE_INLINE __forceinline #define FORCE_INLINE __forceinline
......
#pragma once #pragma once
#include <torch/extension.h>
#include "common.cuh"
#include "cache.h" #include "cache.h"
namespace gpucache { namespace gpucache {
namespace lrucache {
// Type-erased, Torch-facing handle to an LRU cache; this is the class exposed to Python
// as "LRUCache".
// NOTE(review): holds a raw `void *` and declares a destructor without deleting the copy
// operations; if the destructor frees `lru_cache`, a copy would double-free -- confirm.
class LRUCacheWrapper {
public:
// `t` is a sample tensor; presumably only its dtype is used, to select the value element
// type -- confirm in the implementation. `cfg` carries the cache configuration.
LRUCacheWrapper(at::Tensor t, CacheConfig cfg);
~LRUCacheWrapper();
// Looks up `num_query` keys. Returns a pair that callers unpack as (values, find_mask),
// where find_mask tells whether each key exists in the cache.
std::pair<torch::Tensor, torch::Tensor> Get(uint32_t num_query, const torch::Tensor queries);
// Stores `num_query` key-value pairs.
void Put(uint32_t num_query, const torch::Tensor keys, const torch::Tensor values);
// Read-only accessors; presumably forwarded from the configuration -- confirm.
CacheConfig::CacheEvictStrategy Strategy();
uint64_t Capacity();
uint32_t KeySize();
uint32_t ValueSize();
uint32_t MaxQueryNum();
uint64_t DeviceId();
uint32_t Dim();
// Clears the cache.
void Clear();
private:
// Opaque pointer to the underlying cache object; presumably an LRUCache<Key, Elem>
// instantiation chosen from kdtype/dtype -- confirm in the implementation.
void *lru_cache;
c10::ScalarType dtype; // value dtype
c10::ScalarType kdtype; // key dtype
bool key_is_int32; // only support int32 and int64
CacheConfig cache_cfg;
};
// template<typename KeyType, typename ElemType> std::unique_ptr<LRUCacheWrapper> NewLRUCache(at::Tensor t, CacheConfig cfg);
// class LRUCache; } // namespace lrucache
// } // namespace gpucache
// template<typename KeyType, typename ElemType> \ No newline at end of file
// struct BucketView;
//
// struct ThreadCtx;
//
// template<typename KeyType, typename ElemType>
// __device__ __host__ BucketView<KeyType, ElemType>
// setBucketView(ThreadCtx ctx, KeyType *cache_keys, ElemType *cache_values,
// uint8_t *cache_timestamps, void *cache_mutexes,
// uint32_t num_elem_per_value, uint32_t bucket_id);
//
// template<typename KeyType, typename ElemType>
// class LRUCache : public Cache<KeyType, ElemType> {
//
// friend BucketView<KeyType, ElemType> __device__ __host__ setBucketView<KeyType, ElemType>(
// ThreadCtx ctx, KeyType *cache_keys, ElemType *cache_values,
// uint8_t *cache_timestamps, void *cache_mutexes,
// uint32_t num_elem_per_value, uint32_t bucket_id);
//
// public:
// explicit LRUCache(const CacheConfig &cfg);
//
//
// ~LRUCache();
//
// uint32_t KeySize() override;
//
// uint32_t ValueSize() override;
//
// uint64_t Capacity() override;
//
// uint32_t NumElemsPerValue() override;
//
// uint32_t MaxQueryNum();
//
// uint32_t NBucket();
//
// int8_t DeviceId() override;
//
// uint32_t Dim() override;
//
// // for test
// // void *Mutex() { return bucketMutexes; }
//
// CacheConfig::CacheEvictStrategy Strategy() override;
//
// void Clear() override;
//
// void Get(cudaStream_t stream, uint32_t num_query, KeyType *queries,
// ElemType *results, bool *find_mask) override;
//
// void Put(cudaStream_t stream, uint32_t num_query, KeyType *putkeys,
// ElemType *putvalues, uint32_t *n_evict = nullptr,
// KeyType *evict_keys = nullptr) override;
//
// private:
// KeyType *keys;
// ElemType *values;
// uint8_t *timestamps{};
// uint32_t nbucket; // 32 values for one bucket
// void *bucketMutexes{};
//
// // CacheConfig::CacheEvictStrategy strategy;
// uint64_t capacity;
// uint32_t keySize;
// uint32_t valueSize;
// uint32_t numElemPerValue; // embedding dim
// int8_t device_id;
// uint32_t dim;
//
// // store missing keys and indices for Evict
// KeyType *queryKeyBuffer{};
// uint32_t *queryIndiceBuffer{};
// uint32_t maxQueryNum;
// };
class LRUCacheWrapper {
public:
LRUCacheWrapper(at::Tensor t, CacheConfig cfg);
~LRUCacheWrapper();
std::pair<torch::Tensor, torch::Tensor> Get(uint32_t num_query, const torch::Tensor queries);
void Put(uint32_t num_query, const torch::Tensor keys, const torch::Tensor values);
CacheConfig::CacheEvictStrategy Strategy();
uint64_t Capacity();
uint32_t KeySize();
uint32_t ValueSize();
uint32_t MaxQueryNum();
uint64_t DeviceId();
uint32_t Dim();
void Clear();
// private:
void *lru_cache;
c10::ScalarType dtype;
c10::ScalarType kdtype;
bool key_is_int32;
CacheConfig cache_cfg;
};
std::unique_ptr<LRUCacheWrapper> NewLRUCache(at::Tensor t, CacheConfig cfg);
}
\ No newline at end of file
#include "utils.cuh"
namespace gpucache {
// Fills in the calling thread's warp coordinates from the built-in launch indices
// (x dimension only).
__device__ ThreadCtx::ThreadCtx() {
    const auto tid_in_block = threadIdx.x;
    const auto tid_in_grid = blockIdx.x * blockDim.x + tid_in_block;
    const auto threads_in_grid = blockDim.x * gridDim.x;

    // Warp index across the whole grid and within the block.
    global_warp_idx = tid_in_grid / warpsize;
    block_warp_idx = tid_in_block / warpsize;
    // Position of this thread inside its warp.
    lane_id = tid_in_block % warpsize;
    // Number of whole warps launched in the grid.
    num_warps = threads_in_grid / warpsize;
}
// A freshly constructed mutex starts with flag == 0 (not held).
__device__ WarpMutex::WarpMutex() {
    flag = 0;
}
// Acquires the mutex on behalf of the calling warp.
// Only lane 0 contends for the flag; all lanes then meet at the warp barrier below.
// `bucket_id` is not used by this body.
__device__ void WarpMutex::Lock(ThreadCtx &ctx, uint32_t bucket_id) {
if (ctx.lane_id == 0) {
// Spin until the compare-and-swap sees 0 and replaces it with 1.
while (atomicCAS(&flag, 0, 1) != 0) {}
}
// Memory fence followed by a warp-wide barrier; the remaining lanes wait here until
// lane 0 has left the spin loop. Presumably the fence orders later accesses after the
// acquire -- confirm before changing the order of these two calls.
__threadfence();
__syncwarp();
}
// Releases the mutex on behalf of the calling warp.
__device__ void WarpMutex::UnLock(ThreadCtx &ctx) {
// Warp barrier, then memory fence, before the flag is cleared. Presumably this makes
// every lane's writes visible before another warp can acquire -- confirm before reordering.
__syncwarp();
__threadfence();
if (ctx.lane_id == 0) {
// Only lane 0 writes the flag back to 0.
atomicExch(&flag, 0);
}
}
// Constructs one WarpMutex per bucket in the buffer pointed to by `bucketMutexes`.
// One thread handles one bucket; threads whose index is >= n_bucket do nothing.
__global__ void initLocks(uint32_t n_bucket, void *bucketMutexes) {
    const uint32_t slot = blockDim.x * blockIdx.x + threadIdx.x;
    if (slot >= n_bucket) {
        return;
    }
    WarpMutex *const mutex_array = reinterpret_cast<WarpMutex *>(bucketMutexes);
    // Placement-new runs the WarpMutex constructor in the existing storage.
    new (mutex_array + slot) WarpMutex();
}
// Debug kernel: reports every bucket mutex whose flag is neither 0 nor 1.
// One thread inspects one bucket; threads whose index is >= n_bucket do nothing.
__global__ void checkLocks(uint32_t n_bucket, void *bucketMutexes) {
    const uint32_t bucket = blockDim.x * blockIdx.x + threadIdx.x;
    if (bucket >= n_bucket) {
        return;
    }
    WarpMutex *const lock = reinterpret_cast<WarpMutex *>(bucketMutexes) + bucket;
    const bool flag_is_valid = (lock->flag == 0u) || (lock->flag == 1u);
    if (!flag_is_valid) {
        printf("bucket id %u not equal 0 or 1, is %u\n", bucket, lock->flag);
    }
}
}
#pragma once
#include <cuda_runtime.h>
namespace gpucache{
// 32-bit mask with every bit set; presumably the participant mask for *_sync warp
// intrinsics -- confirm at the use sites.
constexpr unsigned int warpFullMask = 0xFFFFFFFF;
// Default block size in x; presumably the threads-per-block used for kernel launches -- confirm.
constexpr unsigned int defaultBlockX = 256;
// Number of threads per warp used for all warp arithmetic in this library.
constexpr unsigned int warpsize = 32;
// 256 / 32 = 8 warps per default-sized block.
constexpr unsigned int defaultNumWarpsPerBlock = defaultBlockX / warpsize;
// bucket_id + key
// Byte counts for one (uint32 bucket id, key) pair per thread of a default-sized block:
// 2 * 4 * 256 = 2048 bytes for 32-bit keys, (8 + 4) * 256 = 3072 bytes for 64-bit keys.
constexpr unsigned int uint32SharedMemorySize = 2 * sizeof(uint32_t) * defaultNumWarpsPerBlock * warpsize;
constexpr unsigned int uint64SharedMemorySize =
(sizeof(uint64_t) + sizeof(uint32_t)) * defaultNumWarpsPerBlock * warpsize;
// Per-thread warp coordinates, computed by the device-side constructor from
// blockIdx.x / blockDim.x / threadIdx.x / gridDim.x.
struct ThreadCtx {
__device__ ThreadCtx();
// Index of this thread's warp across the whole grid (global thread id / warpsize).
uint32_t global_warp_idx;
// Index of this thread's warp inside its block (threadIdx.x / warpsize).
uint32_t block_warp_idx;
// Total warps in the grid (blockDim.x * gridDim.x / warpsize).
uint32_t num_warps;
// Position of this thread inside its warp (threadIdx.x % warpsize).
uint32_t lane_id;
};
// Spin lock taken by one warp at a time: lane 0 flips `flag` with atomics and the rest
// of the warp synchronizes around it (see Lock / UnLock).
struct WarpMutex {
public:
// Device-side constructor; initializes flag to 0.
__device__ WarpMutex();
~WarpMutex() = default;
// Neither copyable nor movable.
WarpMutex(const WarpMutex &) = delete;
WarpMutex &operator=(const WarpMutex &) = delete;
WarpMutex(WarpMutex &&) = delete;
WarpMutex &operator=(WarpMutex &&) = delete;
// Acquire / release on behalf of the calling warp. `bucket_id` is accepted by Lock
// but not used by its current implementation.
__device__ void Lock(ThreadCtx &ctx, uint32_t bucket_id);
__device__ void UnLock(ThreadCtx &ctx);
// private:
// 0 = free, 1 = held. Left public; the checkLocks kernel reads it directly.
uint32_t flag;
};
// Constructs a WarpMutex in each of the first `n_bucket` slots of `bucketMutexes`
// (one thread per slot).
__global__ void initLocks(uint32_t n_bucket, void *bucketMutexes);
// Debug helper: prints every slot whose flag is neither 0 nor 1 (one thread per slot).
__global__ void checkLocks(uint32_t n_bucket, void *bucketMutexes);
}
...@@ -16,8 +16,9 @@ ...@@ -16,8 +16,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# args: strtegy, capacity, key_size(only support int32 and int64), value_size(dim * sizeof(elem), elem is decided by the passing tensor dtype of NewLRUCache), device_id, dim\n", "# args: strtegy, capacity, keySize(only support int32 and int64), valueSize(dim * sizeof(elem), elem is decided by the passing tensor dtype of NewLRUCache), deviceId, dim\n",
"cfg = libgpucache.CacheConfig(libgpucache.CacheConfig.LRU,65536,4,128,4096,0,32)" "# cfg = libgpucache.CacheConfig(libgpucache.CacheConfig.LRU,65536,4,128,4096,0,32)\n",
"cfg = libgpucache.CacheConfig(libgpucache.CacheConfig.FIFO,65536,4,128,4096,0,32)"
] ]
}, },
{ {
...@@ -28,7 +29,7 @@ ...@@ -28,7 +29,7 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"(<CacheEvictStrategy.LRU: 1>, 65536, 4, 128, 4096, 0, 32)" "(<CacheEvictStrategy.FIFO: 0>, 65536, 4, 128, 4096, 0, 32)"
] ]
}, },
"execution_count": 3, "execution_count": 3,
...@@ -37,7 +38,7 @@ ...@@ -37,7 +38,7 @@
} }
], ],
"source": [ "source": [
"cfg.strategy, cfg.capacity, cfg.key_size, cfg.value_size, cfg.max_query_num, cfg.device_id, cfg.dim" "cfg.strategy, cfg.capacity, cfg.keySize, cfg.valueSize, cfg.maxQueryNum, cfg.deviceId, cfg.dim"
] ]
}, },
{ {
...@@ -49,13 +50,14 @@ ...@@ -49,13 +50,14 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"LRUCache: keySize: 4, valueSize: 128, dim: 32, capacity: 65536, maxQueryNum: 4096, deviceId: 0\n" "FIFOCache: keySize: 4, valueSize: 128, dim: 32, capacity: 65536, maxQueryNum: 4096, deviceId: 0\n"
] ]
} }
], ],
"source": [ "source": [
"t = torch.empty([1],dtype=torch.float32)\n", "t = torch.empty([1],dtype=torch.float32)\n",
"cache = libgpucache.NewLRUCache(t,cfg)" "# cache = libgpucache.NewLRUCache(t,cfg)\n",
"cache = libgpucache.NewFIFOCache(t,cfg)"
] ]
}, },
{ {
...@@ -88,11 +90,6 @@ ...@@ -88,11 +90,6 @@
] ]
}, },
{ {
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 6,
"metadata": {}, "metadata": {},
...@@ -135,6 +132,38 @@ ...@@ -135,6 +132,38 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(tensor([[ 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14.,\n",
" 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28.,\n",
" 29., 30., 31., 32.],\n",
" [32., 33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45.,\n",
" 46., 47., 48., 49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59.,\n",
" 60., 61., 62., 63.],\n",
" [64., 65., 66., 67., 68., 69., 70., 71., 72., 73., 74., 75., 76., 77.,\n",
" 78., 79., 80., 81., 82., 83., 84., 85., 86., 87., 88., 89., 90., 91.,\n",
" 92., 93., 94., 95.]], device='cuda:0'),\n",
" tensor([False, True, True], device='cuda:0'),\n",
" torch.Size([3, 32]))"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keys[0] = 999\n",
"values, find_mask = cache.Get(3,keys)\n",
"values, find_mask, values.shape"
]
},
{
"cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
......
...@@ -34,6 +34,7 @@ enable_testing() ...@@ -34,6 +34,7 @@ enable_testing()
add_executable( add_executable(
cache_test cache_test
cache_test.cu cache_test.cu
) )
target_include_directories(cache_test PRIVATE /home/wtx/miniconda3/envs/dgl/include/python3.10) # path to your Python.h target_include_directories(cache_test PRIVATE /home/wtx/miniconda3/envs/dgl/include/python3.10) # path to your Python.h
......
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "../src/hash/murmurhash3.cu"
#include "../src/utils.cuh"
#include "../src/utils.cu"
#include "../src/lru_cache.h" #include "../src/lru_cache.h"
#include "../src/lru_cache.cu" #include "../src/lru_cache.cu"
#include "../src/fifo_cache.h"
#include "../src/fifo_cache.cu"
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
...@@ -9,10 +14,23 @@ ...@@ -9,10 +14,23 @@
#include <algorithm> #include <algorithm>
namespace gpucache{ namespace gpucache{
// for test
// Test helper kernel: gathers the keys whose lookup missed.
//
// For every i in [0, num_query) with find_mask[i] == false, appends keys[i] to
// missing_keys and increments *n_missing. The output order is unspecified because
// slots are claimed with atomicAdd.
//
// Launch: 1-D grid of 1-D blocks; each warp walks the queries in chunks of `warpsize`,
// so any grid size covers the whole input.
// Preconditions: *n_missing is zeroed by the caller, and missing_keys has room for
// num_query entries.
template<typename KeyType>
__global__ void CollectMissingKeys(uint32_t num_query, KeyType *keys,
                                   bool *find_mask, uint32_t *n_missing, KeyType *missing_keys) {
    ThreadCtx ctx{};
    for (auto offset = ctx.global_warp_idx * warpsize; offset < num_query;
         offset += ctx.num_warps * warpsize) {
        auto idx = offset + ctx.lane_id;
        // The loop bound only guards the warp's base offset; when num_query is not a
        // multiple of warpsize the tail lanes would otherwise read past the end of
        // find_mask / keys and could report spurious misses.
        if (idx < num_query && !find_mask[idx]) {
            uint32_t base_missing_idx = atomicAdd(n_missing, 1);
            missing_keys[base_missing_idx] = keys[idx];
        }
    }
}
void TestCache(Cache<int32_t ,uint32_t>& cache, uint32_t num_elem_per_value){ void TestCache(Cache<int32_t ,uint32_t>& cache, uint32_t num_elem_per_value){
constexpr int warpsize = 32; constexpr int warpsize = 32;
std::unordered_set<uint32_t> in_cache; std::unordered_set<uint32_t> in_cache;
...@@ -114,20 +132,20 @@ namespace gpucache{ ...@@ -114,20 +132,20 @@ namespace gpucache{
dim3 block(256); dim3 block(256);
dim3 grid((n_keys + block.x - 1)/block.x); dim3 grid((n_keys + block.x - 1)/block.x);
// CollectMissingKeys<uint32_t><<<grid,block>>>(n_keys,d_keys,d_find_mask,d_n_missing,d_missing_keys); // CollectMissingKeys<uint32_t><<<grid,block>>>(n_keys,d_keys,d_find_mask,d_n_missing,d_missing_keys);
CollectMissingKeysNew<int32_t><<<grid,block>>>(n_keys,d_keys,d_find_mask,d_n_missing,d_missing_keys); CollectMissingKeys<int32_t><<<grid,block>>>(n_keys,d_keys,d_find_mask,d_n_missing,d_missing_keys);
CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaDeviceSynchronize());
CUDA_CHECK(cudaMemcpy(n_missing,d_n_missing, sizeof(uint32_t),cudaMemcpyDefault)); CUDA_CHECK(cudaMemcpy(n_missing,d_n_missing, sizeof(uint32_t),cudaMemcpyDefault));
CUDA_CHECK(cudaMemcpy(missing_keys,d_missing_keys,keys_size,cudaMemcpyDefault)); CUDA_CHECK(cudaMemcpy(missing_keys,d_missing_keys,keys_size,cudaMemcpyDefault));
ASSERT_EQ(expect_n_missing, *n_missing) << "expect_n_missing is " << expect_n_missing << " n_missing is " << *n_missing; ASSERT_EQ(expect_n_missing, *n_missing) << "expect_n_missing is " << expect_n_missing << " n_missing is " << *n_missing;
std::unordered_set<int32_t> missing_keys_set(missing_keys, missing_keys + *n_missing); std::unordered_set<int32_t> missing_keys_set(missing_keys, missing_keys + *n_missing);
ASSERT_EQ(missing_keys_set,expect_missing_keys_set); ASSERT_EQ(missing_keys_set,expect_missing_keys_set);
// check get value // check get value
CUDA_CHECK(cudaMemcpy(values,d_values,values_size,cudaMemcpyDefault)); CUDA_CHECK(cudaMemcpy(values,d_values,values_size,cudaMemcpyDefault));
for (size_t i = 0; i < n_keys ; i += 1) { for (size_t i = 0; i < n_keys ; i += 1) {
if(find_mask[i]){ if(find_mask[i]){
ASSERT_EQ(keys[i] + 123,values[i * num_elem_per_value]) << "key[" << i << "] = " << keys[i] << " doesn't get correct value should be " << keys[i] + 123 << " get " << values[i * num_elem_per_value]; ASSERT_EQ(keys[i] + 123,values[i * num_elem_per_value]) << "key[" << i << "] = " << keys[i] << " doesn't get correct value should be " << keys[i] + 123 << " get " << values[i * num_elem_per_value];
} }
} }
...@@ -187,30 +205,68 @@ namespace gpucache{ ...@@ -187,30 +205,68 @@ namespace gpucache{
cfg.deviceId = 0; cfg.deviceId = 0;
cfg.dim = 8; cfg.dim = 8;
LRUCache<int32_t,uint32_t> cache(cfg); lrucache::LRUCache<int32_t,uint32_t> cache(cfg);
TestCache(cache,cfg.dim);
}
TEST(GPUCACHE,FIFOCACHE){
CacheConfig cfg{};
cfg.strategy = CacheConfig::CacheEvictStrategy::FIFO;
cfg.valueSize = 32;
cfg.capacity = 4096 * 2;
cfg.keySize = 4;
cfg.maxQueryNum = 2048;
cfg.deviceId = 0;
cfg.dim = 8;
fifocache::FIFOCache<int32_t,uint32_t> cache(cfg);
TestCache(cache,cfg.dim); TestCache(cache,cfg.dim);
} }
// Round-trip test for the Torch-facing FIFO cache wrapper: put 5 int32 keys with
// 8-element int32 values, read them back, and check that every key is found and
// that the returned values match what was stored.
TEST(GPUCACHE, FIFOCACHEWRAPPER){
    CacheConfig cfg{CacheConfig::CacheEvictStrategy::FIFO,8192,4,32,2048,0,8};
    // Sample tensor handed to the factory together with the config.
    auto t = torch::empty({1},torch::dtype(torch::kInt32).device(torch::kCUDA,0));
    auto cache = fifocache::NewFIFOCache(t,cfg);
    auto keys = torch::arange(0,5,torch::dtype(torch::kInt32)).to(torch::kCUDA, 0);

    // Lookup before anything has been inserted; only exercised, not asserted on.
    auto [values, find_mask] = cache->Get(5,keys);

    torch::Tensor put_values = torch::reshape(torch::arange(0, 5 * 8,torch::dtype(torch::kInt32)),{5,8}).to(torch::kCUDA, 0);
    std::cout << "put_values: " << put_values << std::endl;
    cache->Put(5,keys,put_values);

    auto result = cache->Get(5,keys);
    values = result.first;
    find_mask = result.second;
    std::cout << " values: " << values << "find_mask: " << find_mask << std::endl;
    // auto fifoc = reinterpret_cast<fifocache::FIFOCache<int32_t,int32_t>*>(cache->fifo_cache);
    // std::cout <<"test keysize" << fifoc->KeySize() << std::endl;
    CUDA_CHECK(cudaDeviceSynchronize());

    // Previously this test only printed its results, so it could never fail.
    ASSERT_TRUE(find_mask.all().item<bool>()) << "every key that was just put must be found";
    ASSERT_TRUE(torch::equal(values.reshape({5, 8}), put_values))
        << "values read back from the cache differ from the values that were put";
}
TEST(GPUCACHE, LRUCACHEWRAPPER){ TEST(GPUCACHE, LRUCACHEWRAPPER){
CacheConfig cfg{CacheConfig::CacheEvictStrategy::LRU,8192,4,32,2048,0,8}; CacheConfig cfg{CacheConfig::CacheEvictStrategy::LRU,8192,4,32,2048,0,8};
auto t = torch::empty({1},torch::dtype(torch::kInt32).device(torch::kCUDA,0)); auto t = torch::empty({1},torch::dtype(torch::kInt32).device(torch::kCUDA,0));
auto cache = NewLRUCache(t,cfg); auto cache = lrucache::NewLRUCache(t,cfg);
auto keys = torch::arange(0,5,torch::dtype(torch::kInt32)).to(torch::kCUDA, 0); auto keys = torch::arange(0,5,torch::dtype(torch::kInt32)).to(torch::kCUDA, 0);
auto [values, find_mask] = cache.Get(5,keys); auto [values, find_mask] = cache->Get(5,keys);
torch::Tensor put_values = torch::reshape(torch::arange(0, 5 * 8,torch::dtype(torch::kInt32)),{5,8}).to(torch::kCUDA, 0); torch::Tensor put_values = torch::reshape(torch::arange(0, 5 * 8,torch::dtype(torch::kInt32)),{5,8}).to(torch::kCUDA, 0);
std::cout << "put_values: " << put_values << std::endl; std::cout << "put_values: " << put_values << std::endl;
cache.Put(5,keys,put_values); cache->Put(5,keys,put_values);
auto result = cache.Get(5,keys); auto result = cache->Get(5,keys);
values = result.first; values = result.first;
find_mask = result.second; find_mask = result.second;
std::cout << " values: " << values << "find_mask: " << find_mask << std::endl; std::cout << " values: " << values << "find_mask: " << find_mask << std::endl;
auto lruc = reinterpret_cast<LRUCache<int32_t,int32_t>*>(cache.lru_cache); // auto lruc = reinterpret_cast<lrucache::LRUCache<int32_t,int32_t>*>(cache->lru_cache);
std::cout <<"test keysize" << lruc->KeySize() << std::endl; // std::cout <<"test keysize" << lruc->KeySize() << std::endl;
CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaDeviceSynchronize());
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment