tianxing wang / gpucache · Commits · 67b6e2a5

Commit 67b6e2a5 authored Jan 02, 2024 by tianxing wang
add debug info

Merge parents: dcc32e94, e50b9ce9
Showing 8 changed files with 85 additions and 8 deletions (+85 -8):

.idea/.name                 +0  -2
.idea/misc.xml              +5  -0
CMakeLists.txt              +34 -2
src/cache.h                 +4  -0
src/common.cuh              +1  -0
src/hash/murmurhash3.cuh    +2  -0
src/lru_cache.cu            +24 -3
test/cache_test.cu          +15 -1
.idea/.name  (deleted 100644 → 0, view file @ dcc32e94)
lru_cache
\ No newline at end of file
.idea/misc.xml  (view file @ 67b6e2a5)
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<<<<<<< HEAD
  <component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$/test">
    <contentRoot DIR="$PROJECT_DIR$" />
  </component>
=======
  <component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$" />
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
</project>
\ No newline at end of file
CMakeLists.txt  (view file @ 67b6e2a5)
...
@@ -2,8 +2,11 @@ cmake_minimum_required(VERSION 3.16)
project(gpucache CXX CUDA)
<<<<<<< HEAD
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED on)
=======
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
file(GLOB SOURCE_FILES
#        ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/*
...
@@ -13,12 +16,42 @@ file(GLOB SOURCE_FILES
)
message(STATUS "source files:" ${SOURCE_FILES})
<<<<<<< HEAD
#include_directories(${CMAKE_SOURCE_DIR}/include)
#link_directories(${CMAKE_SOURCE_DIR}/libs)
=======
include_directories(${CMAKE_SOURCE_DIR}/include)
link_directories(${CMAKE_SOURCE_DIR}/libs)
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
add_library(gpucache SHARED ${SOURCE_FILES})
set_target_properties(gpucache PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON
        CUDA_ARCHITECTURES "86"
)
# add_subdirectory(test)
<<<<<<< HEAD
\ No newline at end of file
# add_subdirectory(test)
=======
#include(FetchContent)
#FetchContent_Declare(
#        googletest
#        URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip
#)
## For Windows: Prevent overriding the parent project's compiler/linker settings
#set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
#FetchContent_MakeAvailable(googletest)
#
#enable_testing()
#
#add_executable(
#        cache_test
#        src/cache_test.cpp
#)
#target_link_libraries(
#        cache_test
#        GTest::gtest_main
#)
#
#include(GoogleTest)
#gtest_discover_tests(cache_test)
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
src/cache.h  (view file @ 67b6e2a5)
...
@@ -26,6 +26,7 @@ namespace gpucache {
        virtual ~Cache() = default;
//        Cache(const Cache &) = delete;
//
//        Cache &operator=(const Cache &) = delete;
...
@@ -48,6 +49,7 @@ namespace gpucache {
        virtual void Put(cudaStream_t *stream, uint32_t num_keys, KeyType *keys, ElemType *values,
                         uint32_t *n_evict, KeyType *evict_keys) = 0;
        virtual void *Mutex() = 0;
        virtual void Clear() = 0;
...
@@ -57,6 +59,7 @@ namespace gpucache {
    // TODO: add other kinds of cache
    template<typename KeyType, typename ElemType>
    std::unique_ptr<Cache<KeyType, ElemType>> NewCache(const CacheConfig &cfg) {
        assert(cfg.keySize > 0);
...
@@ -65,6 +68,7 @@ namespace gpucache {
        return nullptr;
    }
//    template<typename KeyType, typename ElemType>
//    std::unique_ptr<>
}
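For context, a minimal host-side sketch of how this interface is driven, based only on the CacheConfig fields and the Get/Put signatures visible in this commit and in test/cache_test.cu. The wrapper function name and its parameters are hypothetical; the device buffers are assumed to be allocated by the caller.

#include <cstdint>
#include <cuda_runtime.h>
#include "cache.h"   // src/cache.h from this repository

// Sketch only: build an LRU cache via the NewCache factory and issue one
// batched Get followed by one batched Put.
void run_batch(cudaStream_t stream, uint32_t n,
               uint32_t *d_keys, uint32_t *d_values, bool *d_find_mask,
               uint32_t *d_n_evict, uint32_t *d_evict_keys) {
    gpucache::CacheConfig cfg{};
    cfg.strategy = gpucache::CacheConfig::CacheEvictStrategy::LRU;
    cfg.keySize = 4;        // bytes per key (uint32_t), as in the test
    cfg.valueSize = 32;     // bytes per value, as in the test
    cfg.capacity = 4096;    // number of cached entries
    cfg.maxQueryNum = 64;   // largest batch handed to Get/Put

    auto cache = gpucache::NewCache<uint32_t, uint32_t>(cfg);
    if (!cache) { return; }  // NewCache may return nullptr (see diff above)

    // Batched lookup: find_mask[i] reports whether keys[i] was resident.
    cache->Get(&stream, n, d_keys, d_values, d_find_mask);
    // Batched insert: evicted keys (if any) come back through n_evict/evict_keys.
    cache->Put(&stream, n, d_keys, d_values, d_n_evict, d_evict_keys);
}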
src/common.cuh  (view file @ 67b6e2a5)
...
@@ -8,6 +8,7 @@
#include <stdio.h>
#define CHECK(call) \
{ \
    const cudaError_t error = call; \
...
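The body of CHECK is cut off in this view. A typical completion of this pattern, shown here as an assumption rather than the file's exact text, tests the returned cudaError_t and reports the failing file/line with cudaGetErrorString:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Assumed sketch of the usual CHECK idiom; the actual macro body in
// src/common.cuh is truncated in this diff.
#define CHECK(call)                                               \
{                                                                 \
    const cudaError_t error = call;                               \
    if (error != cudaSuccess) {                                   \
        printf("CUDA error %s:%d: %s\n", __FILE__, __LINE__,      \
               cudaGetErrorString(error));                        \
        exit(1);                                                  \
    }                                                             \
}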
src/hash/murmurhash3.cuh  (view file @ 67b6e2a5)
...
@@ -18,9 +18,11 @@
#include <stdlib.h>
#define ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
#define ROTL64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
#define BIG_CONSTANT(x) (x)
// Other compilers
...
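ROTL32/ROTL64 are plain rotate-left helpers used by the MurmurHash3 mixing steps. A quick host-side check of what the 32-bit form computes (illustration only, not part of the commit):

#include <cassert>

#define ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))

int main() {
    // Rotating 0x80000001 left by 1 carries the top bit around to bit 0.
    assert(ROTL32(0x80000001u, 1) == 0x00000003u);
    return 0;
}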
src/lru_cache.cu  (view file @ 67b6e2a5)
...
@@ -39,6 +39,7 @@ namespace gpucache {
        WarpMutex &operator=(WarpMutex &&) = delete;
        __device__ void Lock(ThreadCtx &ctx, uint32_t bucket_id) {
            if (ctx.lane_id == 0) {
//                while (atomicCAS(&flag, 0, 1) != 0) {
...
@@ -53,6 +54,7 @@ namespace gpucache {
                }
            }
//            printf("bucket id: Get Lock\n");
            }
            __threadfence();
            __syncwarp();
...
@@ -80,6 +82,7 @@ namespace gpucache {
            }
        }
        __global__ void checkLocks(uint32_t n_bucket, void *bucketMutexes) {
            uint32_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
//            printf("thread %u check lock\n", global_thread_idx);
...
...
@@ -91,12 +94,14 @@ namespace gpucache {
            }
        }
        template<typename KeyType, typename ElemType>
        class LRUCache;
        template<typename KeyType, typename ElemType>
        struct BucketView;
//        template<typename KeyType, typename ElemType>
//        __global__ void
//        GetInternal(LRUCache<KeyType, ElemType> cache, uint32_t num_query, KeyType *queries,
...
@@ -141,7 +146,6 @@ namespace gpucache {
                      void *cache_mutexes, uint32_t num_elem_per_value, uint32_t bucket_id);
    public:
        explicit LRUCache(const CacheConfig &cfg) : keySize(cfg.keySize),
                                                    valueSize(cfg.valueSize),
                                                    capacity(cfg.capacity), maxQueryNum(cfg.maxQueryNum) {
...
@@ -175,6 +179,7 @@ namespace gpucache {
            CHECK(cudaFree(queryIndiceBuffer))
        }
        uint32_t KeySize() override { return keySize; }
        uint32_t ValueSize() override { return valueSize; }
...
@@ -199,6 +204,7 @@ namespace gpucache {
            initLocks<<<grid, block>>>(nbucket, bucketMutexes);
        }
        void
        Get(cudaStream_t *stream, uint32_t num_query, KeyType *queries, ElemType *results, bool *find_mask) override;
...
@@ -206,6 +212,7 @@ namespace gpucache {
            KeyType *evict_keys) override;
    private:
        KeyType *keys;
        ElemType *values;
...
@@ -236,7 +243,6 @@ namespace gpucache {
                ts),
                num_elems_per_value(num_elems_per_value) {}
        __device__ int Get(const ThreadCtx &ctx, const KeyType key) {
            KeyType lane_key = bkeys[ctx.lane_id];
            uint8_t ts = btimestamps[ctx.lane_id];
...
@@ -302,6 +308,7 @@ namespace gpucache {
            if (ts > slot_ts) {
                ts--;
            } else if (ctx.lane_id == slot_num) {
                *evict_key = lane_key;
                bkeys[ctx.lane_id] = key;
                ts = warpsize;
...
@@ -331,6 +338,7 @@ namespace gpucache {
        };
        template<typename KeyType, typename ElemType>
        __device__ __host__ BucketView<KeyType, ElemType>
        setBucketView(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
                      uint32_t num_elem_per_value, uint32_t bucket_id) {
...
@@ -347,7 +355,6 @@ namespace gpucache {
        GetInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
                    uint32_t nbucket, uint32_t num_elem_per_value, uint32_t num_query, KeyType *queries,
                    ElemType *results, bool *find_mask) {
            ThreadCtx ctx{};
            __shared__ KeyType blockQueryKeys[defaultNumWarpsPerBlock][warpsize];
            __shared__ uint32_t blockBucketIds[defaultNumWarpsPerBlock][warpsize];
...
@@ -366,15 +373,18 @@ namespace gpucache {
                blockBucketIds[ctx.block_warp_idx][ctx.lane_id] = bucket_id;
            }
            __syncwarp();
//            if (ctx.lane_id == 0){
//                printf("warp %u hash collect query %u keys of data block %u\n", ctx.global_warp_idx, n_query, offset / warpsize);
//            }
            // 32 threads each compare their own slot with the key;
            // on a hit, the value is written to the result in parallel
            for (uint32_t i = 0; i < n_query; i++) {
                uint32_t idx = offset + i;
                KeyType key = blockQueryKeys[ctx.block_warp_idx][i];
                uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
                auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
                                                               cache_timestamps, num_elem_per_value, bucket_id);
...
@@ -388,11 +398,13 @@ namespace gpucache {
                if (ctx.global_warp_idx == 0 && ctx.lane_id == 0) {
                    printf("thread %u get lock for bucket %u\n", ctx.lane_id, bucket_id);
                }
                int slot_num = bucket.Get(ctx, key);
                if (slot_num != -1) {
                    bucket.ReadOneValue(ctx, slot_num, &results[idx]);
                }
                bucket.mutex->UnLock(ctx);
//                if (ctx.global_warp_idx == 0 && ctx.lane_id == 0){
//                    printf("thread %u release lock for bucket %u\n", ctx.lane_id, bucket_id);
//                }
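The lookup comment above describes a warp-cooperative probe: each of the 32 lanes owns one slot of the bucket, and the warp votes on a match. A minimal sketch of that idiom (BucketView's real members are not all visible in this hunk, so the function and parameter names here are illustrative):

// Illustrative warp-cooperative bucket probe: every lane compares its own
// slot's key against the query and the warp votes with __ballot_sync.
__device__ int FindSlot(const unsigned int *bucket_keys, unsigned int key,
                        unsigned int lane_id) {
    bool hit = (bucket_keys[lane_id] == key);
    unsigned int hit_mask = __ballot_sync(0xFFFFFFFF, hit);
    // __ffs returns the 1-based position of the lowest set bit, 0 if none.
    return hit_mask ? (int)(__ffs(hit_mask) - 1) : -1;  // slot index or -1 (miss)
}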
...
@@ -415,6 +427,7 @@ namespace gpucache {
        template<typename KeyType, typename ElemType>
        __global__ void
        PutWithoutEvictInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
                                uint32_t nbucket, uint32_t num_elem_per_value, uint32_t num_query, KeyType *put_keys,
                                ElemType *put_values, uint32_t *n_missing, KeyType *missing_keys,
...
@@ -445,6 +458,7 @@ namespace gpucache {
                //ElemType* Value = &put_values[idx];
                uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
                auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
                                                               cache_mutexes, num_elem_per_value, bucket_id);
                bucket.mutex->Lock(ctx, bucket_id);
...
@@ -479,6 +493,7 @@ namespace gpucache {
        template<typename KeyType, typename ElemType>
        __global__ void
        EvictInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
                      uint32_t num_elem_per_value, ElemType *put_values, uint32_t n_missing, KeyType *missing_keys,
                      uint32_t *missing_indices, uint32_t *num_evict, KeyType *evict_keys) {
...
@@ -497,6 +512,7 @@ namespace gpucache {
                uint32_t idx = offset + i;
                KeyType key = blockPutKeys[ctx.block_warp_idx][i];
                uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
                auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
                                                               cache_mutexes, num_elem_per_value, bucket_id);
                bucket.mutex->Lock(ctx, bucket_id);
...
@@ -513,7 +529,9 @@ namespace gpucache {
        // TODO switch to cuda stream
        template<typename KeyType, typename ElemType>
        void LRUCache<KeyType, ElemType>::Put(cudaStream_t *stream, uint32_t num_query, KeyType *put_keys,
                                              ElemType *put_values, uint32_t *n_evict, KeyType *evict_keys) {
            assert(num_query <= maxQueryNum);
            if (num_query == 0) {
                return;
...
@@ -521,6 +539,7 @@ namespace gpucache {
            dim3 block(defaultBlockX);
            dim3 grid((num_query + defaultBlockX - 1) / defaultBlockX);
            uint32_t n_missing = 0;
            PutWithoutEvictInternal<KeyType, ElemType><<<grid, block>>>(keys, values, timestamps, bucketMutexes, nbucket,
                                                                        numElemPerValue, num_query, put_keys, put_values,
                                                                        &n_missing, queryKeyBuffer,
...
@@ -542,6 +561,7 @@ namespace gpucache {
            uint32_t warp_n_missing = 0;
            uint32_t base_missing_idx = 0;
            uint32_t warp_missing_idx = 0;
            if (ctx.lane_id < n_query) {
                uint32_t idx = offset + ctx.lane_id;
                uint32_t warp_missing_mask = __ballot_sync(0xFFFFFFFF, !find_mask[idx]);
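Here __ballot_sync over !find_mask gives each warp a bitmask of the lanes whose keys missed; from that mask the warp can count its misses and compute per-lane compaction offsets. A hedged sketch of that bookkeeping (variable names mirror the hunk, but the atomicAdd reservation and shuffle broadcast are assumptions about how base_missing_idx is obtained, not this file's exact code):

// Sketch: per-warp compaction of missing keys into a global buffer.
__device__ void CollectMissing(bool found, unsigned int key, unsigned int lane_id,
                               unsigned int *n_missing, unsigned int *missing_keys) {
    unsigned int warp_missing_mask = __ballot_sync(0xFFFFFFFF, !found);
    unsigned int warp_n_missing = __popc(warp_missing_mask);         // misses in this warp
    unsigned int base_missing_idx = 0;
    if (lane_id == 0 && warp_n_missing > 0) {
        base_missing_idx = atomicAdd(n_missing, warp_n_missing);     // reserve a slice
    }
    base_missing_idx = __shfl_sync(0xFFFFFFFF, base_missing_idx, 0); // broadcast the base
    if (!found) {
        // this lane's offset = number of missing lanes with a lower lane id
        unsigned int warp_missing_idx = __popc(warp_missing_mask & ((1u << lane_id) - 1u));
        missing_keys[base_missing_idx + warp_missing_idx] = key;
    }
}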
...
@@ -589,6 +609,7 @@ namespace gpucache {
            if (num_query == 0) { return; }
            dim3 block(defaultBlockX);
            dim3 grid((num_query + defaultBlockX - 1) / defaultBlockX);
            GetInternal<KeyType, ElemType><<<grid, block>>>(keys, values, timestamps, bucketMutexes, nbucket,
                                                            numElemPerValue, num_query, queries, results, find_mask);
            CHECK(cudaDeviceSynchronize());
...
test/cache_test.cu  (view file @ 67b6e2a5)
...
@@ -9,12 +9,14 @@
#include <algorithm>
namespace gpucache{
    void TestCache(Cache<uint32_t, uint32_t>& cache, uint32_t num_elem_per_value){
        std::unordered_set<uint32_t> in_cache;
        const size_t n_iter = 32;
        const uint32_t n_keys = 64;
        uint32_t* d_keys;
        uint32_t* keys;
        uint32_t* d_values;
...
@@ -31,7 +33,9 @@ namespace gpucache{
        bool* d_find_mask;
        const size_t keys_size = n_keys * sizeof(uint32_t);
        const size_t values_size = n_keys * num_elem_per_value * sizeof(uint32_t);
        const size_t mask_size = n_keys * sizeof(bool);
...
@@ -47,18 +51,22 @@ namespace gpucache{
        CHECK(cudaMallocHost(&values, values_size));
        CHECK(cudaMallocHost(&n_missing, sizeof(uint32_t)));
        CHECK(cudaMallocHost(&find_mask, mask_size));
        CHECK(cudaMallocHost(&missing_keys, keys_size));
        CHECK(cudaMallocHost(&evict_keys, keys_size));
        CHECK(cudaMallocHost(&n_evict, sizeof(uint32_t)));
//        std::cout << "get there" << std::endl;
        std::vector<uint32_t> random_keys(n_keys * n_iter);
        std::iota(random_keys.begin(), random_keys.end(), 1);
        std::random_device rd;
        std::mt19937 g(rd());
        for (size_t iter = 0; iter < n_iter; iter++){
            std::cout << "iter " << iter << std::endl;
            uint32_t expect_n_missing = 0;
            CHECK(cudaMemset(d_n_missing, 0, sizeof(uint32_t)));
...
@@ -70,6 +78,7 @@ namespace gpucache{
                std::cout << *i << " ";
            }
            std::cout << std::endl;
            std::unordered_set<uint32_t> expect_missing_keys_set;
            std::unordered_set<uint32_t> keys_set; // store current iter keys
            for (size_t i = 0; i < n_keys; ++i) {
...
@@ -81,6 +90,7 @@ namespace gpucache{
            }
            // test get
            std::cout << "test get" << std::endl;
            CHECK(cudaMemcpy(d_keys, keys, keys_size, cudaMemcpyHostToDevice));
            cudaStream_t stream;
...
@@ -116,6 +126,7 @@ namespace gpucache{
            cache.Put(&stream, n_keys, d_keys, d_values, d_n_evict, d_evict_keys);
            CHECK(cudaMemcpy(n_evict, d_n_evict, sizeof(uint32_t), cudaMemcpyDefault));
            CHECK(cudaMemcpy(evict_keys, d_evict_keys, *n_evict * num_elem_per_value * sizeof(uint32_t), cudaMemcpyDefault));
            std::unordered_set<uint32_t> evict_keys_set(evict_keys, evict_keys + *n_evict);
            CHECK(cudaDeviceSynchronize());
            for (size_t i = 0; i < *n_evict; i++){
...
@@ -123,6 +134,7 @@ namespace gpucache{
            }
            for (size_t i = 0; i < n_keys; ++i){ in_cache.emplace(keys[i]); }
            for (size_t i = 0; i < *n_evict; ++i){ in_cache.erase(evict_keys[i]); }
            checkLocks<<<cgrid, cblock>>>((cache.Capacity() + warpsize - 1) / warpsize, cache.Mutex());
            CHECK(cudaDeviceSynchronize());
            printf("---------------------------------------------------\n");
...
@@ -143,11 +155,13 @@ namespace gpucache{
        CHECK(cudaFreeHost(evict_keys));
        CHECK(cudaFreeHost(n_evict));
    }
    TEST(GPUCACHE, LRUCACHE){
        CacheConfig cfg{};
        cfg.strategy = CacheConfig::CacheEvictStrategy::LRU;
        cfg.valueSize = 32;
        cfg.capacity = 4096;
        cfg.keySize = 4;
...
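The test keeps a host-side model of the cache contents: every key just Put is added to in_cache, every reported eviction is erased, and the model predicts how many of the next batch's keys must miss. A condensed, standalone sketch of that bookkeeping (the helper name and signature are hypothetical; only the set operations mirror cache_test.cu):

#include <cstdint>
#include <unordered_set>

// Host-side reference model used to predict misses, mirroring cache_test.cu.
void UpdateModel(std::unordered_set<uint32_t> &in_cache,
                 const uint32_t *keys, uint32_t n_keys,
                 const uint32_t *evict_keys, uint32_t n_evict,
                 const uint32_t *next_keys, uint32_t n_next,
                 uint32_t &expect_n_missing) {
    for (uint32_t i = 0; i < n_keys; ++i)  in_cache.emplace(keys[i]);      // keys just Put
    for (uint32_t i = 0; i < n_evict; ++i) in_cache.erase(evict_keys[i]);  // reported evictions
    expect_n_missing = 0;
    for (uint32_t i = 0; i < n_next; ++i)                                   // predict the next Get
        if (in_cache.find(next_keys[i]) == in_cache.end()) ++expect_n_missing;
}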