Commit 67b6e2a5 by tianxing wang

add debug info

parents dcc32e94 e50b9ce9
lru_cache
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project version="4"> <project version="4">
<<<<<<< HEAD
<component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$/test"> <component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$/test">
<contentRoot DIR="$PROJECT_DIR$" /> <contentRoot DIR="$PROJECT_DIR$" />
</component> </component>
=======
<component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$" />
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
</project> </project>
\ No newline at end of file
@@ -2,8 +2,11 @@ cmake_minimum_required(VERSION 3.16)
project(gpucache CXX CUDA)
<<<<<<< HEAD
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED on)
=======
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
file(GLOB SOURCE_FILES
# ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/*
@@ -13,12 +16,42 @@ file(GLOB SOURCE_FILES
)
message(STATUS "source files:" ${SOURCE_FILES})
<<<<<<< HEAD
#include_directories(${CMAKE_SOURCE_DIR}/include)
#link_directories(${CMAKE_SOURCE_DIR}/libs)
=======
include_directories(${CMAKE_SOURCE_DIR}/include)
link_directories(${CMAKE_SOURCE_DIR}/libs)
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
add_library(gpucache SHARED ${SOURCE_FILES})
set_target_properties(gpucache PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
CUDA_ARCHITECTURES "86"
)
<<<<<<< HEAD
# add_subdirectory(test)
=======
#include(FetchContent)
#FetchContent_Declare(
# googletest
# URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip
#)
## For Windows: Prevent overriding the parent project's compiler/linker settings
#set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
#FetchContent_MakeAvailable(googletest)
#
#enable_testing()
#
#add_executable(
# cache_test
# src/cache_test.cpp
#)
#target_link_libraries(
# cache_test
# GTest::gtest_main
#)
#
#include(GoogleTest)
#gtest_discover_tests(cache_test)
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
@@ -26,6 +26,7 @@ namespace gpucache {
virtual ~Cache() = default;
// Cache(const Cache &) = delete;
//
// Cache &operator=(const Cache &) = delete;
@@ -48,6 +49,7 @@ namespace gpucache {
virtual void Put(cudaStream_t *stream, uint32_t num_keys, KeyType *keys, ElemType *values, uint32_t *n_evict, KeyType *evict_keys) = 0;
virtual void* Mutex() = 0;
virtual void Clear() = 0;
@@ -57,6 +59,7 @@ namespace gpucache {
// TODO: add other kinds of cache
template<typename KeyType, typename ElemType>
std::unique_ptr<Cache<KeyType, ElemType>> NewCache(const CacheConfig &cfg) {
assert(cfg.keySize > 0);
@@ -65,6 +68,7 @@ namespace gpucache {
return nullptr;
}
// template<typename KeyType, typename ElemType>
// std::unique_ptr<>
}
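The NewCache factory above validates the config and, in the branch hidden by the hunk, falls back to returning nullptr. A minimal sketch of how such a factory typically dispatches on cfg.strategy, assuming the LRU case constructs the LRUCache shown later in this diff (the extra asserts and the idea of other strategies are illustrative assumptions, not code from this commit):

template<typename KeyType, typename ElemType>
std::unique_ptr<Cache<KeyType, ElemType>> NewCache(const CacheConfig &cfg) {
    assert(cfg.keySize > 0);
    assert(cfg.valueSize > 0);   // assumed check
    assert(cfg.capacity > 0);    // assumed check
    switch (cfg.strategy) {
        case CacheConfig::CacheEvictStrategy::LRU:
            return std::make_unique<LRUCache<KeyType, ElemType>>(cfg);
        // TODO: dispatch other eviction strategies here once they exist
        default:
            return nullptr;
    }
}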
@@ -8,6 +8,7 @@
#include <stdio.h>
#define CHECK(call) \
{ \
const cudaError_t error = call; \
...
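The body of the CHECK macro is cut off in this hunk. A minimal sketch of the usual completion of this CUDA error-checking pattern (the exact message text and exit behaviour in this repo may differ; exit() needs <stdlib.h>):

#define CHECK(call) \
{ \
    const cudaError_t error = call; \
    if (error != cudaSuccess) { \
        /* report file/line plus the CUDA error string, then abort */ \
        printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(error)); \
        exit(1); \
    } \
}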
@@ -18,9 +18,11 @@
#include <stdlib.h>
#define ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
#define ROTL64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
#define BIG_CONSTANT(x) (x)
// Other compilers
...
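These are the standard MurmurHash3 helper macros. As a sketch of how ROTL32 is used, here is a MurmurHash3-style 32-bit hash of a single 4-byte key, the kind of hash a cache can use to map a key to a bucket (the function name, qualifiers, and the assumption that this repo hashes keys this way are illustrative, not taken from the diff):

__host__ __device__ inline uint32_t murmur3_32(uint32_t key, uint32_t seed) {
    uint32_t h1 = seed;
    uint32_t k1 = key;
    k1 *= 0xcc9e2d51; k1 = ROTL32(k1, 15); k1 *= 0x1b873593;   // mix the 4-byte block
    h1 ^= k1; h1 = ROTL32(h1, 13); h1 = h1 * 5 + 0xe6546b64;   // fold it into the hash state
    h1 ^= 4;                                                   // total length in bytes
    h1 ^= h1 >> 16; h1 *= 0x85ebca6b; h1 ^= h1 >> 13;          // fmix32 finalizer
    h1 *= 0xc2b2ae35; h1 ^= h1 >> 16;
    return h1;
}

A bucket index would then be computed as, for example, murmur3_32(key, 0) % nbucket.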
@@ -39,6 +39,7 @@ namespace gpucache {
WarpMutex &operator=(WarpMutex &&) = delete;
__device__ void Lock(ThreadCtx &ctx, uint32_t bucket_id) {
if (ctx.lane_id == 0) {
// while (atomicCAS(&flag, 0, 1) != 0) {
@@ -53,6 +54,7 @@ namespace gpucache {
}
}
// printf("bucket id: Get Lock\n");
}
__threadfence();
__syncwarp();
@@ -80,6 +82,7 @@ namespace gpucache {
}
}
__global__ void checkLocks(uint32_t n_bucket, void *bucketMutexes) {
uint32_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
// printf("thread %u check lock\n",global_thread_idx);
@@ -91,12 +94,14 @@ namespace gpucache {
}
}
template<typename KeyType, typename ElemType>
class LRUCache;
template<typename KeyType, typename ElemType>
struct BucketView;
// template<typename KeyType, typename ElemType>
// __global__ void
// GetInternal(LRUCache<KeyType, ElemType> cache, uint32_t num_query, KeyType *queries,
@@ -141,7 +146,6 @@ namespace gpucache {
void *cache_mutexes, uint32_t num_elem_per_value, uint32_t bucket_id);
public:
explicit LRUCache(const CacheConfig &cfg) : keySize(cfg.keySize),
valueSize(cfg.valueSize),
capacity(cfg.capacity), maxQueryNum(cfg.maxQueryNum) {
@@ -175,6 +179,7 @@ namespace gpucache {
CHECK(cudaFree(queryIndiceBuffer))
}
uint32_t KeySize() override { return keySize; }
uint32_t ValueSize() override { return valueSize; }
@@ -199,6 +204,7 @@ namespace gpucache {
initLocks<<<grid, block>>>(nbucket, bucketMutexes);
}
void
Get(cudaStream_t *stream, uint32_t num_query, KeyType *queries, ElemType *results, bool *find_mask) override;
@@ -206,6 +212,7 @@ namespace gpucache {
KeyType *evict_keys) override;
private:
KeyType *keys;
ElemType *values;
@@ -236,7 +243,6 @@ namespace gpucache {
ts),
num_elems_per_value(
num_elems_per_value) {}
__device__ int Get(const ThreadCtx &ctx, const KeyType key) {
KeyType lane_key = bkeys[ctx.lane_id];
uint8_t ts = btimestamps[ctx.lane_id];
@@ -302,6 +308,7 @@ namespace gpucache {
if (ts > slot_ts) {
ts--;
} else if (ctx.lane_id == slot_num) {
*evict_key = lane_key;
bkeys[ctx.lane_id] = key;
ts = warpsize;
@@ -331,6 +338,7 @@ namespace gpucache {
};
template<typename KeyType, typename ElemType>
__device__ __host__ BucketView<KeyType, ElemType>
setBucketView(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
uint32_t num_elem_per_value, uint32_t bucket_id) {
@@ -347,7 +355,6 @@ namespace gpucache {
GetInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
uint32_t nbucket, uint32_t num_elem_per_value, uint32_t num_query, KeyType *queries,
ElemType *results, bool *find_mask) {
ThreadCtx ctx{};
__shared__ KeyType blockQueryKeys[defaultNumWarpsPerBlock][warpsize];
__shared__ uint32_t blockBucketIds[defaultNumWarpsPerBlock][warpsize];
@@ -366,15 +373,18 @@ namespace gpucache {
blockBucketIds[ctx.block_warp_idx][ctx.lane_id] = bucket_id;
}
__syncwarp();
// if (ctx.lane_id == 0){
// printf("warp %u hash collect query %u keys of data block %u\n", ctx.global_warp_idx, n_query, offset / warpsize);
// }
// each of the 32 threads compares its own slot with the key;
// if found, the value is written to the result in parallel
for (uint32_t i = 0; i < n_query; i++) {
uint32_t idx = offset + i;
KeyType key = blockQueryKeys[ctx.block_warp_idx][i];
uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
cache_mutexes, num_elem_per_value, bucket_id);
@@ -388,11 +398,13 @@ namespace gpucache {
if(ctx.global_warp_idx == 0 && ctx.lane_id == 0){
printf("thread %u get lock for bucket %u\n", ctx.lane_id, bucket_id);
}
int slot_num = bucket.Get(ctx, key);
if (slot_num != -1) {
bucket.ReadOneValue(ctx, slot_num, &results[idx]);
}
bucket.mutex->UnLock(ctx);
// if(ctx.global_warp_idx == 0 && ctx.lane_id == 0){
// printf("thread %u release lock for bucket %u\n", ctx.lane_id, bucket_id);
// }
@@ -415,6 +427,7 @@ namespace gpucache {
template<typename KeyType, typename ElemType>
__global__ void
PutWithoutEvictInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
uint32_t nbucket, uint32_t num_elem_per_value, uint32_t num_query, KeyType *put_keys,
ElemType *put_values, uint32_t *n_missing, KeyType *missing_keys,
@@ -445,6 +458,7 @@ namespace gpucache {
//ElemType* Value = &put_values[idx];
uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
cache_mutexes, num_elem_per_value, bucket_id);
bucket.mutex->Lock(ctx, bucket_id);
@@ -479,6 +493,7 @@ namespace gpucache {
template<typename KeyType, typename ElemType>
__global__ void
EvictInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
uint32_t num_elem_per_value, ElemType *put_values, uint32_t n_missing, KeyType *missing_keys,
uint32_t *missing_indices, uint32_t *num_evict, KeyType *evict_keys) {
@@ -497,6 +512,7 @@ namespace gpucache {
uint32_t idx = offset + i;
KeyType key = blockPutKeys[ctx.block_warp_idx][i];
uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
cache_mutexes, num_elem_per_value, bucket_id);
bucket.mutex->Lock(ctx, bucket_id);
@@ -513,7 +529,9 @@ namespace gpucache {
// TODO switch to cuda stream
template<typename KeyType, typename ElemType>
void LRUCache<KeyType, ElemType>::Put(cudaStream_t *stream, uint32_t num_query, KeyType *put_keys,
ElemType *put_values, uint32_t *n_evict, KeyType *evict_keys) {
assert(num_query <= maxQueryNum);
if (num_query == 0) {
return;
@@ -521,6 +539,7 @@ namespace gpucache {
dim3 block(defaultBlockX);
dim3 grid((num_query + defaultBlockX - 1) / defaultBlockX);
uint32_t n_missing = 0;
PutWithoutEvictInternal<KeyType, ElemType><<<grid, block>>>(keys, values, timestamps, bucketMutexes, nbucket,
numElemPerValue, num_query, put_keys, put_values,
&n_missing, queryKeyBuffer,
@@ -542,6 +561,7 @@ namespace gpucache {
uint32_t warp_n_missing = 0;
uint32_t base_missing_idx = 0;
uint32_t warp_missing_idx = 0;
if (ctx.lane_id < n_query) {
uint32_t idx = offset + ctx.lane_id;
uint32_t warp_missing_mask = __ballot_sync(0xFFFFFFFF, !find_mask[idx]);
@@ -589,6 +609,7 @@ namespace gpucache {
if (num_query == 0) { return; }
dim3 block(defaultBlockX);
dim3 grid((num_query + defaultBlockX - 1) / defaultBlockX);
GetInternal<KeyType, ElemType><<<grid, block>>>(keys, values, timestamps, bucketMutexes, nbucket,
numElemPerValue, num_query, queries, results, find_mask);
CHECK(cudaDeviceSynchronize());
...
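The Get path above assigns one warp per query and one lane per bucket slot. A condensed sketch of the slot-matching idiom that BucketView::Get relies on (names simplified and the occupancy test assumed; this is an illustration, not the exact repo code):

// Illustration only: each of the 32 lanes owns one slot of a bucket; all lanes
// compare their slot's key with the queried key and a warp ballot picks the hit.
__device__ int find_slot(uint32_t lane_id, const uint32_t *bucket_keys,
                         const uint8_t *bucket_timestamps, uint32_t query_key) {
    uint32_t lane_key = bucket_keys[lane_id];
    uint8_t ts = bucket_timestamps[lane_id];
    bool hit = (ts > 0) && (lane_key == query_key);   // slot occupied and key matches
    uint32_t hit_mask = __ballot_sync(0xFFFFFFFF, hit);
    if (hit_mask == 0) return -1;                     // miss in this bucket
    return __ffs(hit_mask) - 1;                       // lane/slot index of the hit
}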
@@ -9,12 +9,14 @@
#include <algorithm>
namespace gpucache{
void TestCache(Cache<uint32_t ,uint32_t>& cache, uint32_t num_elem_per_value){
std::unordered_set<uint32_t> in_cache;
const size_t n_iter = 32;
const uint32_t n_keys = 64;
uint32_t* d_keys;
uint32_t* keys;
uint32_t* d_values;
@@ -31,7 +33,9 @@ namespace gpucache{
bool* d_find_mask;
const size_t keys_size = n_keys * sizeof(uint32_t);
const size_t values_size = n_keys * num_elem_per_value * sizeof(uint32_t);
const size_t mask_size = n_keys * sizeof(bool);
@@ -47,18 +51,22 @@ namespace gpucache{
CHECK(cudaMallocHost(&values,values_size));
CHECK(cudaMallocHost(&n_missing,sizeof(uint32_t)));
CHECK(cudaMallocHost(&find_mask,mask_size));
CHECK(cudaMallocHost(&missing_keys,keys_size));
CHECK(cudaMallocHost(&evict_keys,keys_size));
CHECK(cudaMallocHost(&n_evict,sizeof(uint32_t)));
// std::cout << "get there" << std::endl;
std::vector<uint32_t> random_keys(n_keys * n_iter);
std::iota(random_keys.begin(),random_keys.end(),1);
std::random_device rd;
std::mt19937 g(rd());
for (size_t iter = 0; iter < n_iter; iter++){
std::cout << "iter " << iter << std::endl;
uint32_t expect_n_missing = 0;
CHECK(cudaMemset(d_n_missing,0,sizeof(uint32_t)));
@@ -70,6 +78,7 @@ namespace gpucache{
std::cout << *i << " ";
}
std::cout << std::endl;
std::unordered_set<uint32_t> expect_missing_keys_set;
std::unordered_set<uint32_t> keys_set; // store current iter keys
for (size_t i = 0; i < n_keys; ++i) {
@@ -81,6 +90,7 @@ namespace gpucache{
}
// test get
std::cout << "test get" << std::endl;
CHECK(cudaMemcpy(d_keys,keys,keys_size,cudaMemcpyHostToDevice));
cudaStream_t stream;
@@ -116,6 +126,7 @@ namespace gpucache{
cache.Put(&stream,n_keys,d_keys,d_values,d_n_evict,d_evict_keys);
CHECK(cudaMemcpy(n_evict,d_n_evict,sizeof(uint32_t),cudaMemcpyDefault));
CHECK(cudaMemcpy(evict_keys,d_evict_keys,*n_evict * num_elem_per_value * sizeof(uint32_t), cudaMemcpyDefault));
std::unordered_set<uint32_t> evict_keys_set(evict_keys, evict_keys + *n_evict);
CHECK(cudaDeviceSynchronize());
for(size_t i = 0; i < *n_evict; i++){
@@ -123,6 +134,7 @@ namespace gpucache{
}
for (size_t i = 0; i < n_keys; ++i){in_cache.emplace(keys[i]);}
for (size_t i = 0; i < *n_evict; ++i){in_cache.erase(evict_keys[i]);}
checkLocks<<<cgrid, cblock>>>((cache.Capacity() + warpsize - 1) / warpsize, cache.Mutex());
CHECK(cudaDeviceSynchronize());
printf("---------------------------------------------------\n");
@@ -143,11 +155,13 @@ namespace gpucache{
CHECK(cudaFreeHost(evict_keys));
CHECK(cudaFreeHost(n_evict));
}
TEST(GPUCACHE,LRUCACHE){
CacheConfig cfg{};
cfg.strategy = CacheConfig::CacheEvictStrategy::LRU;
cfg.valueSize = 32;
cfg.capacity = 4096;
cfg.keySize = 4;
...