tianxing wang / gpucache / Commits / 67b6e2a5

Commit 67b6e2a5, authored Jan 02, 2024 by tianxing wang
add debug info
Merge parents: dcc32e94, e50b9ce9
Showing 8 changed files with 85 additions and 8 deletions (+85, -8):
.idea/.name               +0  -2
.idea/misc.xml            +5  -0
CMakeLists.txt            +34 -2
src/cache.h               +4  -0
src/common.cuh            +1  -0
src/hash/murmurhash3.cuh  +2  -0
src/lru_cache.cu          +24 -3
test/cache_test.cu        +15 -1
.idea/.name  (deleted 100644 → 0, view file @ dcc32e94)

lru_cache
\ No newline at end of file
.idea/misc.xml  (view file @ 67b6e2a5)

<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<<<<<<< HEAD
  <component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$/test">
    <contentRoot DIR="$PROJECT_DIR$" />
  </component>
=======
  <component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$" />
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
</project>
\ No newline at end of file
CMakeLists.txt  (view file @ 67b6e2a5)
...
@@ -2,8 +2,11 @@ cmake_minimum_required(VERSION 3.16)
project(gpucache CXX CUDA)
<<<<<<< HEAD
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED on)
=======
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e

file(GLOB SOURCE_FILES
#        ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/*
...
@@ -13,12 +16,42 @@ file(GLOB SOURCE_FILES
)
message(STATUS "source files:" ${SOURCE_FILES})
<<<<<<< HEAD
#include_directories(${CMAKE_SOURCE_DIR}/include)
#link_directories(${CMAKE_SOURCE_DIR}/libs)
=======
include_directories(${CMAKE_SOURCE_DIR}/include)
link_directories(${CMAKE_SOURCE_DIR}/libs)
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e

add_library(gpucache SHARED ${SOURCE_FILES})
set_target_properties(gpucache PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON
        CUDA_ARCHITECTURES "86")

# add_subdirectory(test)
\ No newline at end of file
<<<<<<< HEAD
# add_subdirectory(test)
=======
#include(FetchContent)
#FetchContent_Declare(
#        googletest
#        URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip
#)
## For Windows: Prevent overriding the parent project's compiler/linker settings
#set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
#FetchContent_MakeAvailable(googletest)
#
#enable_testing()
#
#add_executable(
#        cache_test
#        src/cache_test.cpp
#)
#target_link_libraries(
#        cache_test
#        GTest::gtest_main
#)
#
#include(GoogleTest)
#gtest_discover_tests(cache_test)
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
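Aside: the conflicted tail above is a choice between keeping the test subdirectory commented out (HEAD) and a commented-out GoogleTest FetchContent setup (e50b9ce9). If the FetchContent route were enabled, a minimal sketch might look like the block below. This is a hypothetical resolution, not part of the commit: the committed block names src/cache_test.cpp while the test source in this repo is test/cache_test.cu, and linking against the gpucache target is an assumption.

include(FetchContent)
FetchContent_Declare(
        googletest
        URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip
)
# For Windows: prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

enable_testing()

add_executable(cache_test test/cache_test.cu)
target_link_libraries(cache_test PRIVATE gpucache GTest::gtest_main)

include(GoogleTest)
gtest_discover_tests(cache_test)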
src/cache.h  (view file @ 67b6e2a5)
...
@@ -26,6 +26,7 @@ namespace gpucache {
        virtual ~Cache() = default;

//        Cache(const Cache &) = delete;
//
//        Cache &operator=(const Cache &) = delete;
...
@@ -48,6 +49,7 @@ namespace gpucache {
        virtual void Put(cudaStream_t *stream, uint32_t num_keys, KeyType *keys, ElemType *values,
                         uint32_t *n_evict, KeyType *evict_keys) = 0;

        virtual void *Mutex() = 0;

        virtual void Clear() = 0;
...
@@ -57,6 +59,7 @@ namespace gpucache {
    // TODO: add other kinds of cache
    template<typename KeyType, typename ElemType>
    std::unique_ptr<Cache<KeyType, ElemType>> NewCache(const CacheConfig &cfg) {
        assert(cfg.keySize > 0);
...
@@ -65,6 +68,7 @@ namespace gpucache {
        return nullptr;
    }

//    template<typename KeyType, typename ElemType>
//    std::unique_ptr<>
}
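As a reading aid, a usage sketch of the NewCache factory (hypothetical; the CacheConfig field values mirror TEST(GPUCACHE, LRUCACHE) in test/cache_test.cu below, and the meaning of maxQueryNum as a per-call batch bound is taken from the assert in LRUCache::Put):

#include "cache.h"

gpucache::CacheConfig cfg{};
cfg.strategy = gpucache::CacheConfig::CacheEvictStrategy::LRU;
cfg.keySize = 4;        // bytes per key (uint32_t)
cfg.valueSize = 32;     // bytes per value
cfg.capacity = 4096;    // number of cached entries
cfg.maxQueryNum = 64;   // assumed: maximum keys per Get/Put batch
auto cache = gpucache::NewCache<uint32_t, uint32_t>(cfg);
// device buffers elided:
// cache->Get(&stream, num_query, d_queries, d_results, d_find_mask);
// cache->Put(&stream, num_keys, d_keys, d_values, d_n_evict, d_evict_keys);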
src/common.cuh  (view file @ 67b6e2a5)
...
@@ -8,6 +8,7 @@
#include <stdio.h>

#define CHECK(call)                                     \
{                                                       \
    const cudaError_t error = call;                     \
...
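The hunk cuts the macro off after its first line. A conventional completion of this CUDA error-checking pattern (an assumed reconstruction, not the author's exact text):

#define CHECK(call)                                                  \
{                                                                    \
    const cudaError_t error = call;                                  \
    if (error != cudaSuccess) {                                      \
        printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__,      \
               cudaGetErrorString(error));                           \
        exit(1);                                                     \
    }                                                                \
}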
src/hash/murmurhash3.cuh  (view file @ 67b6e2a5)
...
@@ -18,9 +18,11 @@
#include <stdlib.h>

#define ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
#define ROTL64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))

#define BIG_CONSTANT(x) (x)

// Other compilers
...
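ROTL32 and ROTL64 are plain bitwise left-rotations: bits shifted out at the top re-enter at the bottom. A quick illustration of what ROTL32 computes:

uint32_t r = ROTL32(0x80000001u, 1);  // the high bit wraps around to bit 0
assert(r == 0x00000003u);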
src/lru_cache.cu  (view file @ 67b6e2a5)
...
@@ -39,6 +39,7 @@ namespace gpucache {
        WarpMutex &operator=(WarpMutex &&) = delete;

        __device__ void Lock(ThreadCtx &ctx, uint32_t bucket_id) {
            if (ctx.lane_id == 0) {
//                while (atomicCAS(&flag, 0, 1) != 0) {
...
...
@@ -53,6 +54,7 @@ namespace gpucache {
                    }
                }
//                printf("bucket id: Get Lock\n");
            }
            __threadfence();
            __syncwarp();
...
...
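The commented-out atomicCAS line hints at the locking scheme. A minimal sketch of this warp-level spinlock pattern (assumed shape, not the committed code): lane 0 spins on a per-bucket flag, then the fence and warp sync publish the critical section to all 32 lanes.

struct WarpMutexSketch {
    uint32_t flag;  // 0 = unlocked, 1 = locked

    __device__ void Lock(ThreadCtx &ctx, uint32_t bucket_id) {
        if (ctx.lane_id == 0) {
            while (atomicCAS(&flag, 0u, 1u) != 0u) { /* spin */ }
        }
        __threadfence();  // order prior writes before entering the critical section
        __syncwarp();     // lanes 1..31 wait until lane 0 holds the lock
    }

    __device__ void UnLock(ThreadCtx &ctx) {
        __syncwarp();
        __threadfence();  // publish critical-section writes before release
        if (ctx.lane_id == 0) atomicExch(&flag, 0u);
    }
};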
@@ -80,6 +82,7 @@ namespace gpucache {
        }
    }

    __global__ void checkLocks(uint32_t n_bucket, void *bucketMutexes) {
        uint32_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
//        printf("thread %u check lock\n", global_thread_idx);
...
...
@@ -91,12 +94,14 @@ namespace gpucache {
        }
    }

    template<typename KeyType, typename ElemType>
    class LRUCache;

    template<typename KeyType, typename ElemType>
    struct BucketView;

//    template<typename KeyType, typename ElemType>
//    __global__ void
//    GetInternal(LRUCache<KeyType, ElemType> cache, uint32_t num_query, KeyType *queries,
...
...
@@ -141,7 +146,6 @@ namespace gpucache {
                    void *cache_mutexes, uint32_t num_elem_per_value, uint32_t bucket_id);

    public:
        explicit LRUCache(const CacheConfig &cfg) : keySize(cfg.keySize),
                                                    valueSize(cfg.valueSize),
                                                    capacity(cfg.capacity),
                                                    maxQueryNum(cfg.maxQueryNum) {
...
...
@@ -175,6 +179,7 @@ namespace gpucache {
            CHECK(cudaFree(queryIndiceBuffer))
        }

        uint32_t KeySize() override { return keySize; }

        uint32_t ValueSize() override { return valueSize; }
...
...
@@ -199,6 +204,7 @@ namespace gpucache {
            initLocks<<<grid, block>>>(nbucket, bucketMutexes);
        }

        void Get(cudaStream_t *stream, uint32_t num_query, KeyType *queries, ElemType *results,
                 bool *find_mask) override;
...
...
@@ -206,6 +212,7 @@ namespace gpucache {
                 KeyType *evict_keys) override;

    private:
        KeyType *keys;
        ElemType *values;
...
...
@@ -236,7 +243,6 @@ namespace gpucache {
                                                      ts),
                                                  num_elems_per_value(num_elems_per_value) {}

        __device__ int Get(const ThreadCtx &ctx, const KeyType key) {
            KeyType lane_key = bkeys[ctx.lane_id];
            uint8_t ts = btimestamps[ctx.lane_id];
...
...
@@ -302,6 +308,7 @@ namespace gpucache {
            if (ts > slot_ts) {
                ts--;
            } else if (ctx.lane_id == slot_num) {
                *evict_key = lane_key;
                bkeys[ctx.lane_id] = key;
                ts = warpsize;
...
...
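The timestamp arithmetic above is the warp-local LRU bookkeeping: each lane owns one bucket slot, and the 32 timestamps stay a permutation of 1..warpsize with larger meaning more recently used. The same logic restated with explanatory comments (a sketch; the final write-back line is an assumption):

if (ts > slot_ts) {
    ts--;                       // every slot newer than the victim ages by one
} else if (ctx.lane_id == slot_num) {
    *evict_key = lane_key;      // report the evicted key to the caller
    bkeys[ctx.lane_id] = key;   // install the new key in the victim slot
    ts = warpsize;              // the new entry becomes the most recent
}
btimestamps[ctx.lane_id] = ts;  // assumed: each lane writes back its timestamp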
@@ -331,6 +338,7 @@ namespace gpucache {
    };

    template<typename KeyType, typename ElemType>
    __device__ __host__ BucketView<KeyType, ElemType>
    setBucketView(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
                  uint32_t num_elem_per_value, uint32_t bucket_id) {
...
...
@@ -347,7 +355,6 @@ namespace gpucache {
    GetInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
                uint32_t nbucket, uint32_t num_elem_per_value, uint32_t num_query, KeyType *queries,
                ElemType *results, bool *find_mask) {
        ThreadCtx ctx{};
        __shared__ KeyType blockQueryKeys[defaultNumWarpsPerBlock][warpsize];
        __shared__ uint32_t blockBucketIds[defaultNumWarpsPerBlock][warpsize];
...
...
@@ -366,15 +373,18 @@ namespace gpucache {
                blockBucketIds[ctx.block_warp_idx][ctx.lane_id] = bucket_id;
            }
            __syncwarp();
//            if (ctx.lane_id == 0){
//                printf("warp %u hash collect query %u keys of data block %u\n", ctx.global_warp_idx, n_query, offset / warpsize);
//            }
            // each of the 32 threads compares its own slot against the key;
            // on a hit, the warp writes the value to the result in parallel
            for (uint32_t i = 0; i < n_query; i++) {
                uint32_t idx = offset + i;
                KeyType key = blockQueryKeys[ctx.block_warp_idx][i];
                uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
                auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
                                                               cache_mutexes, num_elem_per_value, bucket_id);
...
...
@@ -388,11 +398,13 @@ namespace gpucache {
                if (ctx.global_warp_idx == 0 && ctx.lane_id == 0) {
                    printf("thread %u get lock for bucket %u\n", ctx.lane_id, bucket_id);
                }
                int slot_num = bucket.Get(ctx, key);
                if (slot_num != -1) {
                    bucket.ReadOneValue(ctx, slot_num, &results[idx]);
                }
                bucket.mutex->UnLock(ctx);
//                if (ctx.global_warp_idx == 0 && ctx.lane_id == 0) {
//                    printf("thread %u release lock for bucket %u\n", ctx.lane_id, bucket_id);
//                }
...
...
@@ -415,6 +427,7 @@ namespace gpucache {
    template<typename KeyType, typename ElemType>
    __global__ void
    PutWithoutEvictInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
                            uint32_t nbucket, uint32_t num_elem_per_value, uint32_t num_query, KeyType *put_keys,
                            ElemType *put_values, uint32_t *n_missing, KeyType *missing_keys,
...
...
@@ -445,6 +458,7 @@ namespace gpucache {
                //ElemType* Value = &put_values[idx];
                uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
                auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
                                                               cache_mutexes, num_elem_per_value, bucket_id);
                bucket.mutex->Lock(ctx, bucket_id);
...
...
@@ -479,6 +493,7 @@ namespace gpucache {
    template<typename KeyType, typename ElemType>
    __global__ void
    EvictInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
                  uint32_t num_elem_per_value, ElemType *put_values, uint32_t n_missing, KeyType *missing_keys,
                  uint32_t *missing_indices, uint32_t *num_evict, KeyType *evict_keys) {
...
...
@@ -497,6 +512,7 @@ namespace gpucache {
                uint32_t idx = offset + i;
                KeyType key = blockPutKeys[ctx.block_warp_idx][i];
                uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
                auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
                                                               cache_mutexes, num_elem_per_value, bucket_id);
                bucket.mutex->Lock(ctx, bucket_id);
...
...
@@ -513,7 +529,9 @@ namespace gpucache {
    // TODO switch to cuda stream
    template<typename KeyType, typename ElemType>
    void LRUCache<KeyType, ElemType>::Put(cudaStream_t *stream, uint32_t num_query, KeyType *put_keys,
                                          ElemType *put_values, uint32_t *n_evict, KeyType *evict_keys) {
        assert(num_query <= maxQueryNum);
        if (num_query == 0) {
            return;
...
...
@@ -521,6 +539,7 @@ namespace gpucache {
        dim3 block(defaultBlockX);
        dim3 grid((num_query + defaultBlockX - 1) / defaultBlockX);
        uint32_t n_missing = 0;
        PutWithoutEvictInternal<KeyType, ElemType><<<grid, block>>>(keys, values, timestamps, bucketMutexes, nbucket,
                                                                    numElemPerValue, num_query, put_keys, put_values,
                                                                    &n_missing, queryKeyBuffer,
...
...
@@ -542,6 +561,7 @@ namespace gpucache {
        uint32_t warp_n_missing = 0;
        uint32_t base_missing_idx = 0;
        uint32_t warp_missing_idx = 0;

        if (ctx.lane_id < n_query) {
            uint32_t idx = offset + ctx.lane_id;
            uint32_t warp_missing_mask = __ballot_sync(0xFFFFFFFF, !find_mask[idx]);
...
...
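The __ballot_sync here is the usual warp-aggregated counting idiom: every lane votes on whether its key missed, and population counts over the ballot give both the warp's miss total and each missing lane's compacted output slot. A sketch consistent with the warp_n_missing / warp_missing_idx variables above (the __popc lines are assumed, and the sketch assumes a full warp of valid queries):

uint32_t idx = offset + ctx.lane_id;
uint32_t warp_missing_mask = __ballot_sync(0xFFFFFFFF, !find_mask[idx]);
// total misses in this warp
uint32_t warp_n_missing = __popc(warp_missing_mask);
// this lane's slot among the misses = number of missing lanes below it
uint32_t warp_missing_idx = __popc(warp_missing_mask & ((1u << ctx.lane_id) - 1u));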
@@ -589,6 +609,7 @@ namespace gpucache {
        if (num_query == 0) { return; }
        dim3 block(defaultBlockX);
        dim3 grid((num_query + defaultBlockX - 1) / defaultBlockX);
        GetInternal<KeyType, ElemType><<<grid, block>>>(keys, values, timestamps, bucketMutexes, nbucket,
                                                        numElemPerValue, num_query, queries, results, find_mask);
        CHECK(cudaDeviceSynchronize());
...
...
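The launch shape is the standard ceiling division, one thread per query key; for example (defaultBlockX's actual value is not shown in the diff):

dim3 block(defaultBlockX);
dim3 grid((num_query + defaultBlockX - 1) / defaultBlockX);  // ceil(num_query / defaultBlockX)
// e.g. num_query = 64 with defaultBlockX = 256 (assumed value) gives grid = 1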
test/cache_test.cu  (view file @ 67b6e2a5)
...
@@ -9,12 +9,14 @@
#include <algorithm>

namespace gpucache{
    void TestCache(Cache<uint32_t ,uint32_t>& cache, uint32_t num_elem_per_value){
        std::unordered_set<uint32_t> in_cache;
        const size_t n_iter = 32;
        const uint32_t n_keys = 64;

        uint32_t* d_keys;
        uint32_t* keys;
        uint32_t* d_values;
...
...
@@ -31,7 +33,9 @@ namespace gpucache{
        bool* d_find_mask;

        const size_t keys_size = n_keys * sizeof(uint32_t);
        const size_t values_size = n_keys * num_elem_per_value * sizeof(uint32_t);
        const size_t mask_size = n_keys * sizeof(bool);
...
...
@@ -47,18 +51,22 @@ namespace gpucache{
        CHECK(cudaMallocHost(&values,values_size));
        CHECK(cudaMallocHost(&n_missing,sizeof(uint32_t)));
        CHECK(cudaMallocHost(&find_mask,mask_size));
        CHECK(cudaMallocHost(&missing_keys,keys_size));
        CHECK(cudaMallocHost(&evict_keys,keys_size));
        CHECK(cudaMallocHost(&n_evict,sizeof(uint32_t)));
//        std::cout << "get there" << std::endl;

        std::vector<uint32_t> random_keys(n_keys * n_iter);
        std::iota(random_keys.begin(),random_keys.end(),1);
        std::random_device rd;
        std::mt19937 g(rd());

        for (size_t iter = 0; iter < n_iter; iter++){
            std::cout << "iter " << iter << std::endl;
            uint32_t expect_n_missing = 0;
            CHECK(cudaMemset(d_n_missing,0,sizeof(uint32_t)));
...
...
@@ -70,6 +78,7 @@ namespace gpucache{
                std::cout << *i << " ";
            }
            std::cout << std::endl;

            std::unordered_set<uint32_t> expect_missing_keys_set;
            std::unordered_set<uint32_t> keys_set; // store current iter keys
            for (size_t i = 0; i < n_keys; ++i) {
...
...
@@ -81,6 +90,7 @@ namespace gpucache{
            }

            // test get
            std::cout << "test get" << std::endl;
            CHECK(cudaMemcpy(d_keys,keys,keys_size,cudaMemcpyHostToDevice));
            cudaStream_t stream;
...
...
@@ -116,6 +126,7 @@ namespace gpucache{
            cache.Put(&stream,n_keys,d_keys,d_values,d_n_evict,d_evict_keys);
            CHECK(cudaMemcpy(n_evict,d_n_evict,sizeof(uint32_t),cudaMemcpyDefault));
            CHECK(cudaMemcpy(evict_keys,d_evict_keys,*n_evict * sizeof(uint32_t), cudaMemcpyDefault));

            std::unordered_set<uint32_t> evict_keys_set(evict_keys, evict_keys + *n_evict);
            CHECK(cudaDeviceSynchronize());
            for(size_t i = 0; i < *n_evict; i++){
...
...
@@ -123,6 +134,7 @@ namespace gpucache{
            }
            for (size_t i = 0; i < n_keys; ++i){in_cache.emplace(keys[i]);}
            for (size_t i = 0; i < *n_evict; ++i){in_cache.erase(evict_keys[i]);}

            checkLocks<<<cgrid, cblock>>>((cache.Capacity() + warpsize - 1) / warpsize, cache.Mutex());
            CHECK(cudaDeviceSynchronize());
            printf("---------------------------------------------------\n");
...
...
@@ -143,11 +155,13 @@ namespace gpucache{
        CHECK(cudaFreeHost(evict_keys));
        CHECK(cudaFreeHost(n_evict));
    }

    TEST(GPUCACHE,LRUCACHE){
        CacheConfig cfg{};
        cfg.strategy = CacheConfig::CacheEvictStrategy::LRU;
        cfg.valueSize = 32;
        cfg.capacity = 4096;
        cfg.keySize = 4;
...
...
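The TEST above needs a GoogleTest runner. With the GTest::gtest_main link from the commented CMake block, no hand-written main is required; spelled out, the stock entry point would be (standard GoogleTest boilerplate, not part of this commit):

#include <gtest/gtest.h>

int main(int argc, char **argv) {
    ::testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}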