Commit 67b6e2a5 by tianxing wang

add debug info

parents dcc32e94 e50b9ce9
lru_cache
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<<<<<<< HEAD
<component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$/test">
<contentRoot DIR="$PROJECT_DIR$" />
</component>
=======
<component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$" />
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
</project>
\ No newline at end of file
......@@ -2,8 +2,11 @@ cmake_minimum_required(VERSION 3.16)
project(gpucache CXX CUDA)
<<<<<<< HEAD
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED on)
=======
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
file(GLOB SOURCE_FILES
# ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/*
......@@ -13,12 +16,42 @@ file(GLOB SOURCE_FILES
)
message(STATUS "source files:" ${SOURCE_FILES})
<<<<<<< HEAD
#include_directories(${CMAKE_SOURCE_DIR}/include)
#link_directories(${CMAKE_SOURCE_DIR}/libs)
=======
include_directories(${CMAKE_SOURCE_DIR}/include)
link_directories(${CMAKE_SOURCE_DIR}/libs)
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
add_library(gpucache SHARED ${SOURCE_FILES})
set_target_properties(gpucache PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
CUDA_ARCHITECTURES "86"
)
<<<<<<< HEAD
# add_subdirectory(test)
=======
#include(FetchContent)
#FetchContent_Declare(
# googletest
# URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip
#)
## For Windows: Prevent overriding the parent project's compiler/linker settings
#set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
#FetchContent_MakeAvailable(googletest)
#
#enable_testing()
#
#add_executable(
# cache_test
# src/cache_test.cpp
#)
#target_link_libraries(
# cache_test
# GTest::gtest_main
#)
#
#include(GoogleTest)
#gtest_discover_tests(cache_test)
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
......@@ -26,6 +26,7 @@ namespace gpucache {
virtual ~Cache() = default;
// Cache(const Cache &) = delete;
//
// Cache &operator=(const Cache &) = delete;
......@@ -48,6 +49,7 @@ namespace gpucache {
virtual void Put(cudaStream_t *stream, uint32_t num_keys, KeyType *keys, ElemType *values, uint32_t *n_evict, KeyType* evict_keys) = 0;
virtual void* Mutex() = 0;
virtual void Clear() = 0;
......@@ -57,6 +59,7 @@ namespace gpucache {
// TODO: add other kinds of cache
template<typename KeyType, typename ElemType>
std::unique_ptr<Cache<KeyType, ElemType>> NewCache(const CacheConfig &cfg) {
assert(cfg.keySize > 0);
......@@ -65,6 +68,7 @@ namespace gpucache {
return nullptr;
}
// template<typename KeyType, typename ElemType>
// std::unique_ptr<>
}
......@@ -8,6 +8,7 @@
#include <stdio.h>
#define CHECK(call) \
{ \
const cudaError_t error = call; \
......
......@@ -18,9 +18,11 @@
#include <stdlib.h>
#define ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
#define ROTL64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
#define BIG_CONSTANT(x) (x)
// Other compilers
......
......@@ -39,6 +39,7 @@ namespace gpucache {
WarpMutex &operator=(WarpMutex &&) = delete;
__device__ void Lock(ThreadCtx &ctx, uint32_t bucket_id) {
if (ctx.lane_id == 0) {
// while (atomicCAS(&flag, 0, 1) != 0) {
......@@ -53,6 +54,7 @@ namespace gpucache {
}
}
// printf("bucket id: Get Lock\n");
}
__threadfence();
__syncwarp();
......@@ -80,6 +82,7 @@ namespace gpucache {
}
}
__global__ void checkLocks(uint32_t n_bucket, void *bucketMutexes) {
uint32_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
// printf("thread %u check lock\n",global_thread_idx);
......@@ -91,12 +94,14 @@ namespace gpucache {
}
}
template<typename KeyType, typename ElemType>
class LRUCache;
template<typename KeyType, typename ElemType>
struct BucketView;
// template<typename KeyType, typename ElemType>
// __global__ void
// GetInternal(LRUCache<KeyType, ElemType> cache, uint32_t num_query, KeyType *queries,
......@@ -141,7 +146,6 @@ namespace gpucache {
void *cache_mutexes, uint32_t num_elem_per_value, uint32_t bucket_id);
public:
explicit LRUCache(const CacheConfig &cfg) : keySize(cfg.keySize),
valueSize(cfg.valueSize),
capacity(cfg.capacity), maxQueryNum(cfg.maxQueryNum) {
......@@ -175,6 +179,7 @@ namespace gpucache {
CHECK(cudaFree(queryIndiceBuffer))
}
uint32_t KeySize() override { return keySize; }
uint32_t ValueSize() override { return valueSize; }
......@@ -199,6 +204,7 @@ namespace gpucache {
initLocks<<<grid, block>>>(nbucket, bucketMutexes);
}
void
Get(cudaStream_t *stream, uint32_t num_query, KeyType *queries, ElemType *results, bool *find_mask) override;
......@@ -206,6 +212,7 @@ namespace gpucache {
KeyType *evict_keys) override;
private:
KeyType *keys;
ElemType *values;
......@@ -236,7 +243,6 @@ namespace gpucache {
ts),
num_elems_per_value(
num_elems_per_value) {}
__device__ int Get(const ThreadCtx &ctx, const KeyType key) {
KeyType lane_key = bkeys[ctx.lane_id];
uint8_t ts = btimestamps[ctx.lane_id];
......@@ -302,6 +308,7 @@ namespace gpucache {
if (ts > slot_ts) {
ts--;
} else if (ctx.lane_id == slot_num) {
*evict_key = lane_key;
bkeys[ctx.lane_id] = key;
ts = warpsize;
......@@ -331,6 +338,7 @@ namespace gpucache {
};
template<typename KeyType, typename ElemType>
__device__ __host__ BucketView<KeyType, ElemType>
setBucketView(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
uint32_t num_elem_per_value, uint32_t bucket_id) {
......@@ -347,7 +355,6 @@ namespace gpucache {
GetInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
uint32_t nbucket, uint32_t num_elem_per_value, uint32_t num_query, KeyType *queries,
ElemType *results, bool *find_mask) {
ThreadCtx ctx{};
__shared__ KeyType blockQueryKeys[defaultNumWarpsPerBlock][warpsize];
__shared__ uint32_t blockBucketIds[defaultNumWarpsPerBlock][warpsize];
......@@ -366,15 +373,18 @@ namespace gpucache {
blockBucketIds[ctx.block_warp_idx][ctx.lane_id] = bucket_id;
}
__syncwarp();
// if (ctx.lane_id == 0){
// printf("warp %u hash collect query %u keys of data block %u\n", ctx.global_warp_idx, n_query, offset / warpsize);
// }
// each of the 32 lanes compares its own slot with the key
// on a hit, the matching value is written to the result in parallel
for (uint32_t i = 0; i < n_query; i++) {
uint32_t idx = offset + i;
KeyType key = blockQueryKeys[ctx.block_warp_idx][i];
uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
cache_mutexes, num_elem_per_value, bucket_id);
......@@ -388,11 +398,13 @@ namespace gpucache {
if(ctx.global_warp_idx == 0 && ctx.lane_id == 0){
printf("thread %u get lock for bucket %u\n", ctx.lane_id, bucket_id);
}
int slot_num = bucket.Get(ctx, key);
if (slot_num != -1) {
bucket.ReadOneValue(ctx, slot_num, &results[idx]);
}
bucket.mutex->UnLock(ctx);
// if(ctx.global_warp_idx == 0 && ctx.lane_id == 0){
// printf("thread %u release lock for bucket %u\n", ctx.lane_id, bucket_id);
// }
......@@ -415,6 +427,7 @@ namespace gpucache {
template<typename KeyType, typename ElemType>
__global__ void
PutWithoutEvictInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
uint32_t nbucket, uint32_t num_elem_per_value, uint32_t num_query, KeyType *put_keys,
ElemType *put_values, uint32_t *n_missing, KeyType *missing_keys,
......@@ -445,6 +458,7 @@ namespace gpucache {
//ElemType* Value = &put_values[idx];
uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
cache_mutexes, num_elem_per_value, bucket_id);
bucket.mutex->Lock(ctx, bucket_id);
......@@ -479,6 +493,7 @@ namespace gpucache {
template<typename KeyType, typename ElemType>
__global__ void
EvictInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
uint32_t num_elem_per_value, ElemType *put_values, uint32_t n_missing, KeyType *missing_keys,
uint32_t *missing_indices, uint32_t *num_evict, KeyType *evict_keys) {
......@@ -497,6 +512,7 @@ namespace gpucache {
uint32_t idx = offset + i;
KeyType key = blockPutKeys[ctx.block_warp_idx][i];
uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
cache_mutexes, num_elem_per_value, bucket_id);
bucket.mutex->Lock(ctx, bucket_id);
......@@ -513,7 +529,9 @@ namespace gpucache {
// TODO switch to cuda stream
template<typename KeyType, typename ElemType>
void LRUCache<KeyType, ElemType>::Put(cudaStream_t *stream, uint32_t num_query, KeyType *put_keys,
ElemType *put_values, uint32_t *n_evict, KeyType *evict_keys) {
assert(num_query <= maxQueryNum);
if (num_query == 0) {
return;
......@@ -521,6 +539,7 @@ namespace gpucache {
dim3 block(defaultBlockX);
dim3 grid((num_query + defaultBlockX - 1) / defaultBlockX);
uint32_t n_missing = 0;
PutWithoutEvictInternal<KeyType, ElemType><<<grid, block>>>(keys, values, timestamps, bucketMutexes, nbucket,
numElemPerValue, num_query, put_keys, put_values,
&n_missing, queryKeyBuffer,
......@@ -542,6 +561,7 @@ namespace gpucache {
uint32_t warp_n_missing = 0;
uint32_t base_missing_idx = 0;
uint32_t warp_missing_idx = 0;
if (ctx.lane_id < n_query) {
uint32_t idx = offset + ctx.lane_id;
uint32_t warp_missing_mask = __ballot_sync(0xFFFFFFFF, !find_mask[idx]);
......@@ -589,6 +609,7 @@ namespace gpucache {
if (num_query == 0) { return; }
dim3 block(defaultBlockX);
dim3 grid((num_query + defaultBlockX - 1) / defaultBlockX);
GetInternal<KeyType, ElemType><<<grid, block>>>(keys, values, timestamps, bucketMutexes, nbucket,
numElemPerValue, num_query, queries, results, find_mask);
CHECK(cudaDeviceSynchronize());
......
......@@ -9,12 +9,14 @@
#include <algorithm>
namespace gpucache{
void TestCache(Cache<uint32_t ,uint32_t>& cache, uint32_t num_elem_per_value){
std::unordered_set<uint32_t> in_cache;
const size_t n_iter = 32;
const uint32_t n_keys = 64;
uint32_t* d_keys;
uint32_t* keys;
uint32_t* d_values;
......@@ -31,7 +33,9 @@ namespace gpucache{
bool* d_find_mask;
const size_t keys_size = n_keys * sizeof(uint32_t);
const size_t values_size = n_keys * num_elem_per_value * sizeof(uint32_t);
const size_t mask_size = n_keys * sizeof(bool);
......@@ -47,18 +51,22 @@ namespace gpucache{
CHECK(cudaMallocHost(&values,values_size));
CHECK(cudaMallocHost(&n_missing,sizeof(uint32_t)));
CHECK(cudaMallocHost(&find_mask,mask_size));
CHECK(cudaMallocHost(&missing_keys,keys_size));
CHECK(cudaMallocHost(&evict_keys,keys_size));
CHECK(cudaMallocHost(&n_evict,sizeof(uint32_t)));
// std::cout << "get there" << std::endl;
std::vector<uint32_t> random_keys(n_keys * n_iter);
std::iota(random_keys.begin(),random_keys.end(),1);
std::random_device rd;
std::mt19937 g(rd());
for (size_t iter = 0; iter < n_iter; iter++){
std::cout << "iter " << iter << std::endl;
uint32_t expect_n_missing = 0;
CHECK(cudaMemset(d_n_missing,0,sizeof(uint32_t)));
......@@ -70,6 +78,7 @@ namespace gpucache{
std::cout << *i << " ";
}
std::cout << std::endl;
std::unordered_set<uint32_t> expect_missing_keys_set;
std::unordered_set<uint32_t> keys_set; // store current iter keys
for (size_t i = 0; i < n_keys; ++i) {
......@@ -81,6 +90,7 @@ namespace gpucache{
}
// test get
std::cout << "test get" << std::endl;
CHECK(cudaMemcpy(d_keys,keys,keys_size,cudaMemcpyHostToDevice));
cudaStream_t stream;
......@@ -116,6 +126,7 @@ namespace gpucache{
cache.Put(&stream,n_keys,d_keys,d_values,d_n_evict,d_evict_keys);
CHECK(cudaMemcpy(n_evict,d_n_evict,sizeof(uint32_t),cudaMemcpyDefault));
CHECK(cudaMemcpy(evict_keys,d_evict_keys,*n_evict * sizeof(uint32_t), cudaMemcpyDefault));
std::unordered_set<uint32_t> evict_keys_set(evict_keys, evict_keys + *n_evict);
CHECK(cudaDeviceSynchronize());
for(size_t i = 0; i < *n_evict; i++){
......@@ -123,6 +134,7 @@ namespace gpucache{
}
for (size_t i = 0; i < n_keys; ++i){in_cache.emplace(keys[i]);}
for (size_t i = 0; i < *n_evict; ++i){in_cache.erase(evict_keys[i]);}
checkLocks<<<cgrid, cblock>>>((cache.Capacity() + warpsize - 1) / warpsize, cache.Mutex());
CHECK(cudaDeviceSynchronize());
printf("---------------------------------------------------\n");
......@@ -143,11 +155,13 @@ namespace gpucache{
CHECK(cudaFreeHost(evict_keys));
CHECK(cudaFreeHost(n_evict));
}
TEST(GPUCACHE,LRUCACHE){
CacheConfig cfg{};
cfg.strategy = CacheConfig::CacheEvictStrategy::LRU;
cfg.valueSize = 32;
cfg.capacity = 4096;
cfg.keySize = 4;
......
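The TEST body above is cut off by the collapsed diff. As a hedged illustration only, not part of this commit, a complete fixture might wire the config into the NewCache factory and the TestCache helper shown earlier; cfg.maxQueryNum = 64, the assumption that valueSize is given in bytes, and the derived element count per value are all guesses:

TEST(GPUCACHE, LRUCACHE_Sketch) {
    CacheConfig cfg{};
    cfg.strategy = CacheConfig::CacheEvictStrategy::LRU;
    cfg.keySize = 4;                                  // sizeof(uint32_t)
    cfg.valueSize = 32;                               // assumed bytes => 8 uint32_t elements per value
    cfg.capacity = 4096;
    cfg.maxQueryNum = 64;                             // assumed; must cover the n_keys used by TestCache
    auto cache = NewCache<uint32_t, uint32_t>(cfg);   // factory shown in the cache.h hunk above
    ASSERT_NE(cache, nullptr);
    TestCache(*cache, cfg.valueSize / sizeof(uint32_t));
}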