tianxing wang / gpucache / Commits / 67b6e2a5

Commit 67b6e2a5, authored Jan 02, 2024 by tianxing wang
add debug info
Merge parents: dcc32e94, e50b9ce9
Showing 8 changed files with 85 additions and 8 deletions (+85, -8):
.idea/.name               +0  -2
.idea/misc.xml            +5  -0
CMakeLists.txt            +34 -2
src/cache.h               +4  -0
src/common.cuh            +1  -0
src/hash/murmurhash3.cuh  +2  -0
src/lru_cache.cu          +24 -3
test/cache_test.cu        +15 -1
.idea/.name  (deleted 100644 → 0, view file @ dcc32e94)

lru_cache
\ No newline at end of file
.idea/misc.xml  (view file @ 67b6e2a5)

<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<<<<<<< HEAD
  <component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$/test">
    <contentRoot DIR="$PROJECT_DIR$" />
  </component>
=======
  <component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$" />
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
</project>
\ No newline at end of file
CMakeLists.txt  (view file @ 67b6e2a5)
...
@@ -2,8 +2,11 @@ cmake_minimum_required(VERSION 3.16)
project(gpucache CXX CUDA)
<<<<<<< HEAD
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED on)
=======
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e

file(GLOB SOURCE_FILES
#        ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/*
...
@@ -13,12 +16,42 @@ file(GLOB SOURCE_FILES
)
message(STATUS "source files:" ${SOURCE_FILES})
<<<<<<< HEAD
#include_directories(${CMAKE_SOURCE_DIR}/include)
#link_directories(${CMAKE_SOURCE_DIR}/libs)
=======
include_directories(${CMAKE_SOURCE_DIR}/include)
link_directories(${CMAKE_SOURCE_DIR}/libs)
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e

add_library(gpucache SHARED ${SOURCE_FILES})
set_target_properties(gpucache PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON
        CUDA_ARCHITECTURES "86")

# add_subdirectory(test)
\ No newline at end of file
<<<<<<< HEAD
# add_subdirectory(test)
=======
#include(FetchContent)
#FetchContent_Declare(
#        googletest
#        URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip
#)
## For Windows: Prevent overriding the parent project's compiler/linker settings
#set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
#FetchContent_MakeAvailable(googletest)
#
#enable_testing()
#
#add_executable(
#        cache_test
#        src/cache_test.cpp
#)
#target_link_libraries(
#        cache_test
#        GTest::gtest_main
#)
#
#include(GoogleTest)
#gtest_discover_tests(cache_test)
>>>>>>> e50b9ce9f75b7f0d73eff96d54906b351df6985e
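Aside: the conflicted tail above is a choice between keeping the test subdirectory commented out (HEAD) and a commented-out GoogleTest FetchContent setup (e50b9ce9). If the FetchContent route were enabled, a minimal sketch might look like the block below. This is a hypothetical resolution, not part of the commit: the committed block names src/cache_test.cpp while the test source in this repo is test/cache_test.cu, and linking against the gpucache target is an assumption.

include(FetchContent)
FetchContent_Declare(
        googletest
        URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip
)
# For Windows: prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

enable_testing()

add_executable(cache_test test/cache_test.cu)
target_link_libraries(cache_test PRIVATE gpucache GTest::gtest_main)

include(GoogleTest)
gtest_discover_tests(cache_test)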
src/cache.h  (view file @ 67b6e2a5)
...
@@ -26,6 +26,7 @@ namespace gpucache {
        virtual ~Cache() = default;

//        Cache(const Cache &) = delete;
//
//        Cache &operator=(const Cache &) = delete;
...
@@ -48,6 +49,7 @@ namespace gpucache {
        virtual void Put(cudaStream_t *stream, uint32_t num_keys, KeyType *keys, ElemType *values,
                         uint32_t *n_evict, KeyType *evict_keys) = 0;

        virtual void *Mutex() = 0;

        virtual void Clear() = 0;
...
@@ -57,6 +59,7 @@ namespace gpucache {
    // TODO: add other kinds of cache
    template<typename KeyType, typename ElemType>
    std::unique_ptr<Cache<KeyType, ElemType>> NewCache(const CacheConfig &cfg) {
        assert(cfg.keySize > 0);
...
@@ -65,6 +68,7 @@ namespace gpucache {
        return nullptr;
    }

//    template<typename KeyType, typename ElemType>
//    std::unique_ptr<>
}
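As a reading aid, a usage sketch of the NewCache factory (hypothetical; the CacheConfig field values mirror TEST(GPUCACHE, LRUCACHE) in test/cache_test.cu below, and the meaning of maxQueryNum as a per-call batch bound is taken from the assert in LRUCache::Put):

#include "cache.h"

gpucache::CacheConfig cfg{};
cfg.strategy = gpucache::CacheConfig::CacheEvictStrategy::LRU;
cfg.keySize = 4;        // bytes per key (uint32_t)
cfg.valueSize = 32;     // bytes per value
cfg.capacity = 4096;    // number of cached entries
cfg.maxQueryNum = 64;   // assumed: maximum keys per Get/Put batch
auto cache = gpucache::NewCache<uint32_t, uint32_t>(cfg);
// device buffers elided:
// cache->Get(&stream, num_query, d_queries, d_results, d_find_mask);
// cache->Put(&stream, num_keys, d_keys, d_values, d_n_evict, d_evict_keys);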
src/common.cuh  (view file @ 67b6e2a5)
...
@@ -8,6 +8,7 @@
#include <stdio.h>

#define CHECK(call)                                     \
{                                                       \
    const cudaError_t error = call;                     \
...
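The hunk cuts the macro off after its first line. A conventional completion of this CUDA error-checking pattern (an assumed reconstruction, not the author's exact text):

#define CHECK(call)                                                  \
{                                                                    \
    const cudaError_t error = call;                                  \
    if (error != cudaSuccess) {                                      \
        printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__,      \
               cudaGetErrorString(error));                           \
        exit(1);                                                     \
    }                                                                \
}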
src/hash/murmurhash3.cuh  (view file @ 67b6e2a5)
...
@@ -18,9 +18,11 @@
#include <stdlib.h>

#define ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
#define ROTL64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))

#define BIG_CONSTANT(x) (x)

// Other compilers
...
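ROTL32 and ROTL64 are plain bitwise left-rotations: bits shifted out at the top re-enter at the bottom. A quick illustration of what ROTL32 computes:

uint32_t r = ROTL32(0x80000001u, 1);  // the high bit wraps around to bit 0
assert(r == 0x00000003u);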
src/lru_cache.cu  (view file @ 67b6e2a5)
...
@@ -39,6 +39,7 @@ namespace gpucache {
        WarpMutex &operator=(WarpMutex &&) = delete;

        __device__ void Lock(ThreadCtx &ctx, uint32_t bucket_id) {
            if (ctx.lane_id == 0) {
//                while (atomicCAS(&flag, 0, 1) != 0) {
...
...
@@ -53,6 +54,7 @@ namespace gpucache {
                    }
                }
//                printf("bucket id: Get Lock\n");
            }
            __threadfence();
            __syncwarp();
...
...
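The commented-out atomicCAS line hints at the locking scheme. A minimal sketch of this warp-level spinlock pattern (assumed shape, not the committed code): lane 0 spins on a per-bucket flag, then the fence and warp sync publish the critical section to all 32 lanes.

struct WarpMutexSketch {
    uint32_t flag;  // 0 = unlocked, 1 = locked

    __device__ void Lock(ThreadCtx &ctx, uint32_t bucket_id) {
        if (ctx.lane_id == 0) {
            while (atomicCAS(&flag, 0u, 1u) != 0u) { /* spin */ }
        }
        __threadfence();  // order prior writes before entering the critical section
        __syncwarp();     // lanes 1..31 wait until lane 0 holds the lock
    }

    __device__ void UnLock(ThreadCtx &ctx) {
        __syncwarp();
        __threadfence();  // publish critical-section writes before release
        if (ctx.lane_id == 0) atomicExch(&flag, 0u);
    }
};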
@@ -80,6 +82,7 @@ namespace gpucache {
        }
    }

    __global__ void checkLocks(uint32_t n_bucket, void *bucketMutexes) {
        uint32_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
//        printf("thread %u check lock\n", global_thread_idx);
...
...
@@ -91,12 +94,14 @@ namespace gpucache {
        }
    }

    template<typename KeyType, typename ElemType>
    class LRUCache;

    template<typename KeyType, typename ElemType>
    struct BucketView;

//    template<typename KeyType, typename ElemType>
//    __global__ void
//    GetInternal(LRUCache<KeyType, ElemType> cache, uint32_t num_query, KeyType *queries,
...
...
@@ -141,7 +146,6 @@ namespace gpucache {
                    void *cache_mutexes, uint32_t num_elem_per_value, uint32_t bucket_id);

    public:
        explicit LRUCache(const CacheConfig &cfg) : keySize(cfg.keySize),
                                                    valueSize(cfg.valueSize),
                                                    capacity(cfg.capacity),
                                                    maxQueryNum(cfg.maxQueryNum) {
...
...
@@ -175,6 +179,7 @@ namespace gpucache {
            CHECK(cudaFree(queryIndiceBuffer))
        }

        uint32_t KeySize() override { return keySize; }

        uint32_t ValueSize() override { return valueSize; }
...
...
@@ -199,6 +204,7 @@ namespace gpucache {
            initLocks<<<grid, block>>>(nbucket, bucketMutexes);
        }

        void Get(cudaStream_t *stream, uint32_t num_query, KeyType *queries, ElemType *results,
                 bool *find_mask) override;
...
...
@@ -206,6 +212,7 @@ namespace gpucache {
                 KeyType *evict_keys) override;

    private:
        KeyType *keys;
        ElemType *values;
...
...
@@ -236,7 +243,6 @@ namespace gpucache {
                                                      ts),
                                                  num_elems_per_value(num_elems_per_value) {}

        __device__ int Get(const ThreadCtx &ctx, const KeyType key) {
            KeyType lane_key = bkeys[ctx.lane_id];
            uint8_t ts = btimestamps[ctx.lane_id];
...
...
@@ -302,6 +308,7 @@ namespace gpucache {
            if (ts > slot_ts) {
                ts--;
            } else if (ctx.lane_id == slot_num) {
                *evict_key = lane_key;
                bkeys[ctx.lane_id] = key;
                ts = warpsize;
...
...
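The timestamp arithmetic above is the warp-local LRU bookkeeping: each lane owns one bucket slot, and the 32 timestamps stay a permutation of 1..warpsize with larger meaning more recently used. The same logic restated with explanatory comments (a sketch; the final write-back line is an assumption):

if (ts > slot_ts) {
    ts--;                       // every slot newer than the victim ages by one
} else if (ctx.lane_id == slot_num) {
    *evict_key = lane_key;      // report the evicted key to the caller
    bkeys[ctx.lane_id] = key;   // install the new key in the victim slot
    ts = warpsize;              // the new entry becomes the most recent
}
btimestamps[ctx.lane_id] = ts;  // assumed: each lane writes back its timestamp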
@@ -331,6 +338,7 @@ namespace gpucache {
    };

    template<typename KeyType, typename ElemType>
    __device__ __host__ BucketView<KeyType, ElemType>
    setBucketView(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
                  uint32_t num_elem_per_value, uint32_t bucket_id) {
...
...
@@ -347,7 +355,6 @@ namespace gpucache {
    GetInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
                uint32_t nbucket, uint32_t num_elem_per_value, uint32_t num_query, KeyType *queries,
                ElemType *results, bool *find_mask) {
        ThreadCtx ctx{};
        __shared__ KeyType blockQueryKeys[defaultNumWarpsPerBlock][warpsize];
        __shared__ uint32_t blockBucketIds[defaultNumWarpsPerBlock][warpsize];
...
...
@@ -366,15 +373,18 @@ namespace gpucache {
                blockBucketIds[ctx.block_warp_idx][ctx.lane_id] = bucket_id;
            }
            __syncwarp();
//            if (ctx.lane_id == 0){
//                printf("warp %u hash collect query %u keys of data block %u\n", ctx.global_warp_idx, n_query, offset / warpsize);
//            }
            // each of the 32 threads compares its own slot against the key;
            // on a hit, the warp writes the value to the result in parallel
            for (uint32_t i = 0; i < n_query; i++) {
                uint32_t idx = offset + i;
                KeyType key = blockQueryKeys[ctx.block_warp_idx][i];
                uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
                auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
                                                               cache_mutexes, num_elem_per_value, bucket_id);
...
...
@@ -388,11 +398,13 @@ namespace gpucache {
                if (ctx.global_warp_idx == 0 && ctx.lane_id == 0) {
                    printf("thread %u get lock for bucket %u\n", ctx.lane_id, bucket_id);
                }
                int slot_num = bucket.Get(ctx, key);
                if (slot_num != -1) {
                    bucket.ReadOneValue(ctx, slot_num, &results[idx]);
                }
                bucket.mutex->UnLock(ctx);
//                if (ctx.global_warp_idx == 0 && ctx.lane_id == 0) {
//                    printf("thread %u release lock for bucket %u\n", ctx.lane_id, bucket_id);
//                }
...
...
@@ -415,6 +427,7 @@ namespace gpucache {
    template<typename KeyType, typename ElemType>
    __global__ void
    PutWithoutEvictInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
                            uint32_t nbucket, uint32_t num_elem_per_value, uint32_t num_query, KeyType *put_keys,
                            ElemType *put_values, uint32_t *n_missing, KeyType *missing_keys,
...
...
@@ -445,6 +458,7 @@ namespace gpucache {
                //ElemType* Value = &put_values[idx];
                uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
                auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
                                                               cache_mutexes, num_elem_per_value, bucket_id);
                bucket.mutex->Lock(ctx, bucket_id);
...
...
@@ -479,6 +493,7 @@ namespace gpucache {
    template<typename KeyType, typename ElemType>
    __global__ void
    EvictInternal(KeyType *cache_keys, ElemType *cache_values, uint8_t *cache_timestamps, void *cache_mutexes,
                  uint32_t num_elem_per_value, ElemType *put_values, uint32_t n_missing, KeyType *missing_keys,
                  uint32_t *missing_indices, uint32_t *num_evict, KeyType *evict_keys) {
...
...
@@ -497,6 +512,7 @@ namespace gpucache {
                uint32_t idx = offset + i;
                KeyType key = blockPutKeys[ctx.block_warp_idx][i];
                uint32_t bucket_id = blockBucketIds[ctx.block_warp_idx][i];
                auto bucket = setBucketView<KeyType, ElemType>(cache_keys, cache_values, cache_timestamps,
                                                               cache_mutexes, num_elem_per_value, bucket_id);
                bucket.mutex->Lock(ctx, bucket_id);
...
...
@@ -513,7 +529,9 @@ namespace gpucache {
    // TODO switch to cuda stream
    template<typename KeyType, typename ElemType>
    void LRUCache<KeyType, ElemType>::Put(cudaStream_t *stream, uint32_t num_query, KeyType *put_keys,
                                          ElemType *put_values, uint32_t *n_evict, KeyType *evict_keys) {
        assert(num_query <= maxQueryNum);
        if (num_query == 0) {
            return;
...
...
@@ -521,6 +539,7 @@ namespace gpucache {
        dim3 block(defaultBlockX);
        dim3 grid((num_query + defaultBlockX - 1) / defaultBlockX);
        uint32_t n_missing = 0;
        PutWithoutEvictInternal<KeyType, ElemType><<<grid, block>>>(keys, values, timestamps, bucketMutexes, nbucket,
                                                                    numElemPerValue, num_query, put_keys, put_values,
                                                                    &n_missing, queryKeyBuffer,
...
...
@@ -542,6 +561,7 @@ namespace gpucache {
        uint32_t warp_n_missing = 0;
        uint32_t base_missing_idx = 0;
        uint32_t warp_missing_idx = 0;

        if (ctx.lane_id < n_query) {
            uint32_t idx = offset + ctx.lane_id;
            uint32_t warp_missing_mask = __ballot_sync(0xFFFFFFFF, !find_mask[idx]);
...
...
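The __ballot_sync here is the usual warp-aggregated counting idiom: every lane votes on whether its key missed, and population counts over the ballot give both the warp's miss total and each missing lane's compacted output slot. A sketch consistent with the warp_n_missing / warp_missing_idx variables above (the __popc lines are assumed, and the sketch assumes a full warp of valid queries):

uint32_t idx = offset + ctx.lane_id;
uint32_t warp_missing_mask = __ballot_sync(0xFFFFFFFF, !find_mask[idx]);
// total misses in this warp
uint32_t warp_n_missing = __popc(warp_missing_mask);
// this lane's slot among the misses = number of missing lanes below it
uint32_t warp_missing_idx = __popc(warp_missing_mask & ((1u << ctx.lane_id) - 1u));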
@@ -589,6 +609,7 @@ namespace gpucache {
        if (num_query == 0) { return; }
        dim3 block(defaultBlockX);
        dim3 grid((num_query + defaultBlockX - 1) / defaultBlockX);
        GetInternal<KeyType, ElemType><<<grid, block>>>(keys, values, timestamps, bucketMutexes, nbucket,
                                                        numElemPerValue, num_query, queries, results, find_mask);
        CHECK(cudaDeviceSynchronize());
...
...
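The launch shape is the standard ceiling division, one thread per query key; for example (defaultBlockX's actual value is not shown in the diff):

dim3 block(defaultBlockX);
dim3 grid((num_query + defaultBlockX - 1) / defaultBlockX);  // ceil(num_query / defaultBlockX)
// e.g. num_query = 64 with defaultBlockX = 256 (assumed value) gives grid = 1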
test/cache_test.cu  (view file @ 67b6e2a5)
...
@@ -9,12 +9,14 @@
#include <algorithm>

namespace gpucache{
    void TestCache(Cache<uint32_t ,uint32_t>& cache, uint32_t num_elem_per_value){
        std::unordered_set<uint32_t> in_cache;
        const size_t n_iter = 32;
        const uint32_t n_keys = 64;

        uint32_t* d_keys;
        uint32_t* keys;
        uint32_t* d_values;
...
...
@@ -31,7 +33,9 @@ namespace gpucache{
        bool* d_find_mask;

        const size_t keys_size = n_keys * sizeof(uint32_t);
        const size_t values_size = n_keys * num_elem_per_value * sizeof(uint32_t);
        const size_t mask_size = n_keys * sizeof(bool);
...
...
@@ -47,18 +51,22 @@ namespace gpucache{
        CHECK(cudaMallocHost(&values,values_size));
        CHECK(cudaMallocHost(&n_missing,sizeof(uint32_t)));
        CHECK(cudaMallocHost(&find_mask,mask_size));
        CHECK(cudaMallocHost(&missing_keys,keys_size));
        CHECK(cudaMallocHost(&evict_keys,keys_size));
        CHECK(cudaMallocHost(&n_evict,sizeof(uint32_t)));
//        std::cout << "get there" << std::endl;

        std::vector<uint32_t> random_keys(n_keys * n_iter);
        std::iota(random_keys.begin(),random_keys.end(),1);
        std::random_device rd;
        std::mt19937 g(rd());

        for (size_t iter = 0; iter < n_iter; iter++){
            std::cout << "iter " << iter << std::endl;
            uint32_t expect_n_missing = 0;
            CHECK(cudaMemset(d_n_missing,0,sizeof(uint32_t)));
...
...
@@ -70,6 +78,7 @@ namespace gpucache{
                std::cout << *i << " ";
            }
            std::cout << std::endl;

            std::unordered_set<uint32_t> expect_missing_keys_set;
            std::unordered_set<uint32_t> keys_set; // store current iter keys
            for (size_t i = 0; i < n_keys; ++i) {
...
...
@@ -81,6 +90,7 @@ namespace gpucache{
            }

            // test get
            std::cout << "test get" << std::endl;
            CHECK(cudaMemcpy(d_keys,keys,keys_size,cudaMemcpyHostToDevice));
            cudaStream_t stream;
...
...
@@ -116,6 +126,7 @@ namespace gpucache{
            cache.Put(&stream,n_keys,d_keys,d_values,d_n_evict,d_evict_keys);
            CHECK(cudaMemcpy(n_evict,d_n_evict,sizeof(uint32_t),cudaMemcpyDefault));
            CHECK(cudaMemcpy(evict_keys,d_evict_keys,*n_evict * sizeof(uint32_t), cudaMemcpyDefault));

            std::unordered_set<uint32_t> evict_keys_set(evict_keys, evict_keys + *n_evict);
            CHECK(cudaDeviceSynchronize());
            for(size_t i = 0; i < *n_evict; i++){
...
...
@@ -123,6 +134,7 @@ namespace gpucache{
            }
            for (size_t i = 0; i < n_keys; ++i){in_cache.emplace(keys[i]);}
            for (size_t i = 0; i < *n_evict; ++i){in_cache.erase(evict_keys[i]);}

            checkLocks<<<cgrid, cblock>>>((cache.Capacity() + warpsize - 1) / warpsize, cache.Mutex());
            CHECK(cudaDeviceSynchronize());
            printf("---------------------------------------------------\n");
...
...
@@ -143,11 +155,13 @@ namespace gpucache{
        CHECK(cudaFreeHost(evict_keys));
        CHECK(cudaFreeHost(n_evict));
    }

    TEST(GPUCACHE,LRUCACHE){
        CacheConfig cfg{};
        cfg.strategy = CacheConfig::CacheEvictStrategy::LRU;
        cfg.valueSize = 32;
        cfg.capacity = 4096;
        cfg.keySize = 4;
...
...
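The TEST above needs a GoogleTest runner. With the GTest::gtest_main link from the commented CMake block, no hand-written main is required; spelled out, the stock entry point would be (standard GoogleTest boilerplate, not part of this commit):

#include <gtest/gtest.h>

int main(int argc, char **argv) {
    ::testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}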