Commit 30d343ec by tianxing wang

fix Get bug: find_mask was not zero-initialized

parent ab08d21b
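The root cause: Get allocated its hit mask with torch::empty, which leaves device memory uninitialized, while the lookup kernel only writes true for keys it actually finds. Miss slots therefore returned whatever bytes happened to be in the allocation. A minimal sketch of the failure mode and the fix, with a hypothetical mark_hits standing in for the real lookup kernel (illustrative only, not this repository's code):

    #include <torch/torch.h>
    #include <iostream>
    #include <vector>

    // Hypothetical stand-in for the lookup kernel: only hit slots are ever written.
    void mark_hits(torch::Tensor &mask, const std::vector<int64_t> &hit_idx) {
        for (auto i : hit_idx) mask[i].fill_(true);
    }

    int main() {
        // Buggy: torch::empty leaves the buffer uninitialized, so slots the
        // lookup never touches hold arbitrary bytes (kept here only for contrast).
        auto bad_mask = torch::empty({8}, torch::dtype(torch::kBool));
        // Fixed: torch::zeros guarantees every slot starts out false ("miss").
        auto mask = torch::zeros({8}, torch::dtype(torch::kBool));
        mark_hits(mask, {2, 5});        // only the hit slots flip to true
        std::cout << mask << std::endl; // 0 0 1 0 0 1 0 0 (printed layout may differ)
    }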
......@@ -11,9 +11,9 @@ namespace gpucache {
CacheStorage(CacheConfig &cfg) : key_size(cfg.key_size), value_size(cfg.value_size), capacity(cfg.capacity),
max_query_num(cfg.max_query_num), device_id(cfg.device_id), dim(cfg.dim) {
nbucket = (capacity + warpSize - 1) / warpSize;
printf("Cache: keySize: %lu, valueSize: %u, dim: %u, capacity: %lu, "
"maxQueryNum: %u, deviceId: %u\n",
sizeof(KeyType), value_size, dim, capacity, max_query_num, device_id);
// printf("Cache: keySize: %lu, valueSize: %u, dim: %u, capacity: %lu, "
// "maxQueryNum: %u, deviceId: %u\n",
// sizeof(KeyType), value_size, dim, capacity, max_query_num, device_id);
CUDA_CHECK(cudaMalloc((void **) &keys, capacity * key_size));
CUDA_CHECK(cudaMalloc((void **) &values, capacity * value_size));
CUDA_CHECK(cudaMalloc((void **) &timestamps, capacity * sizeof(uint8_t)));
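A note on the bucket math above: nbucket = (capacity + warpSize - 1) / warpSize is integer ceiling division, so the key slots are carved into warp-sized (32-slot) buckets and a capacity that is not a multiple of 32 still gets a final, partially-used bucket; e.g. capacity = 8192 yields 256 buckets, while 8200 would round up to 257.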
......@@ -55,22 +55,6 @@ namespace gpucache {
uint32_t max_query_num;
};
-// template<typename KeyType, typename ElemType>
-// struct CacheManager {
-// __device__ CacheManager() = default;
-// __device__ virtual BucketView<KeyType, ElemType> *
-// SetBucketView(ThreadCtx ctx, KeyType *cache_keys, ElemType *cache_values,
-// uint8_t *cache_timestamps, void *cache_mutexes,
-// uint32_t num_elem_per_value, uint32_t bucket_id) = 0;
-//
-// __device__ __host__ virtual void Type(){
-// printf("CacheManager\n");
-// };
-//
-// __device__ virtual void DestroyBucketView(BucketView<KeyType, ElemType> *bucket) = 0;
-//
-// };
template<typename KeyType, typename ElemType, CacheConfig::CacheEvictStrategy Strategy>
struct BucketView;
......@@ -86,34 +70,7 @@ namespace gpucache {
reinterpret_cast<WarpMutex *>(cache_mutexes) + bucket_id,
num_elem_per_value);
}
-/*
-{
-__device__ BucketView(KeyType* k, ElemType* v, uint8_t* ts, WarpMutex* m, uint32_t num_elems_per_value): bkeys(k), bvalues(v),btimestamps(ts),mutex(m),num_elems_per_value(num_elems_per_value){}
-__device__ ~BucketView()= default;
-__device__ int Get(const ThreadCtx &ctx, KeyType key){};
-__device__ int TryPut(const ThreadCtx &ctx, KeyType key){};
-__device__ int Evict(const ThreadCtx &ctx, KeyType key, KeyType *evict_key){};
-__device__ void ReadOneValue(const ThreadCtx &ctx, uint8_t slot_num,
-ElemType *out) {
-for (size_t i = ctx.lane_id; i < num_elems_per_value; i += warpSize) {
-out[i] = bvalues[slot_num * num_elems_per_value + i];
-}
-}
-__device__ void WriteOneValue(const ThreadCtx &ctx, uint8_t slot_num,
-ElemType *v) {
-for (size_t i = ctx.lane_id; i < num_elems_per_value; i += warpSize) {
-bvalues[slot_num * num_elems_per_value + i] = v[i];
-}
-}
-WarpMutex *mutex;
-KeyType *bkeys;
-ElemType *bvalues;
-uint8_t *btimestamps;
-uint32_t num_elems_per_value;
-};
-*/
// Cache Interface
......@@ -200,6 +157,7 @@ namespace gpucache {
int slot_num = bucket.Get(ctx, key);
if (slot_num != -1) {
// printf("warp_id:%d,lane_id:%d find key %d\n",ctx.global_warp_idx,ctx.lane_id,key);
find_mask[idx] = true;
bucket.ReadOneValue(ctx, slot_num, &results[idx * num_elem_per_value]);
}
......
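The contract between this kernel and the host-side zero-initialization introduced below is worth spelling out: the kernel only ever writes true on the hit path and deliberately writes nothing on a miss, so every false in the output exists only because the host pre-zeroed the mask. A self-contained sketch of that pattern, with a hypothetical brute-force lookup in place of the real bucketed probe:

    #include <cstdio>
    #include <cstdint>
    #include <cuda_runtime.h>

    // Hypothetical brute-force lookup standing in for the bucketed probe: the
    // kernel writes find_mask[idx] only on a hit; a miss writes nothing at all.
    __global__ void lookup(const int32_t *cache_keys, int capacity,
                           const int32_t *queries, int n, bool *find_mask) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= n) return;
        for (int s = 0; s < capacity; s++) {
            if (cache_keys[s] == queries[idx]) {
                find_mask[idx] = true;  // hit path: the only write to the mask
                return;
            }
        }
        // miss path: intentionally silent -- correctness relies on a pre-zeroed mask
    }

    int main() {
        const int capacity = 4, n = 3;
        int32_t h_cache[capacity] = {10, 20, 30, 40};
        int32_t h_queries[n] = {20, 99, 40};
        int32_t *d_cache, *d_queries;
        bool *d_mask;
        cudaMalloc(&d_cache, sizeof(h_cache));
        cudaMalloc(&d_queries, sizeof(h_queries));
        cudaMalloc(&d_mask, n * sizeof(bool));
        cudaMemcpy(d_cache, h_cache, sizeof(h_cache), cudaMemcpyHostToDevice);
        cudaMemcpy(d_queries, h_queries, sizeof(h_queries), cudaMemcpyHostToDevice);
        cudaMemset(d_mask, 0, n * sizeof(bool));  // the host-side zeroing this commit adds
        lookup<<<1, 32>>>(d_cache, capacity, d_queries, n, d_mask);
        bool h_mask[n];
        cudaMemcpy(h_mask, d_mask, n * sizeof(bool), cudaMemcpyDeviceToHost);
        printf("%d %d %d\n", h_mask[0], h_mask[1], h_mask[2]);  // expect: 1 0 1
        return 0;
    }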
......@@ -91,14 +91,13 @@ namespace gpucache {
assert(num_query <= cache_cfg.max_query_num && "num_query must not exceed max_query_num");
assert(queries.dtype().toScalarType() == ktype && "key type doesn't match");
//std::cout << "Get queries: " << queries << std::endl;
// printf("hello!!!\n");
//std::cout << "Get num_query: " << num_query <<"\n";
//std::cout << "Get queries.sizes() " << queries.sizes() << "\n";
auto keys = queries.data_ptr();
auto stream = at::cuda::getCurrentCUDAStream(cache_cfg.device_id).stream();
auto result = torch::empty({num_query, cache_cfg.dim},
torch::dtype(vtype).device(torch::kCUDA, cache_cfg.device_id));
-auto find_mask = torch::empty({num_query},
+auto find_mask = torch::zeros({num_query},
torch::dtype(torch::kBool).device(torch::kCUDA, cache_cfg.device_id));
if (key_is_int32) {
......@@ -121,6 +120,7 @@ namespace gpucache {
reinterpret_cast<bool *>(find_mask.data_ptr()));
});
}
// std::cout << "find_mask:" << find_mask << std::endl;
return std::make_pair(result, find_mask);
}
......
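With the mask now trustworthy, the returned pair can be consumed directly; e.g. a caller could gather the keys that still need to be fetched from backing storage (hypothetical caller code, not part of this commit):

    auto [values, find_mask] = cache->Get(num_query, keys);
    auto missing_keys = keys.masked_select(~find_mask); // false is guaranteed wherever the kernel wrote nothing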
......@@ -45,6 +45,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
.def("Device",&gpucache::CacheWrapper::DeviceId,"return device id")
.def("Dim",&gpucache::CacheWrapper::Dim,"return value dim");
m.def("NewCache", &gpucache::NewCache, "create a lru cache",py::return_value_policy::reference);
m.def("NewCache", &gpucache::NewCache, "create a cache",py::return_value_policy::reference);
/*-----------------------------------------------------------------------------------------------------------------------------------*/
}
\ No newline at end of file
......@@ -48,7 +48,7 @@ namespace gpucache{
constexpr int warpsize = 32;
std::unordered_set<uint32_t> in_cache;
const size_t n_iter = 32;
-const uint32_t n_keys = 1024;
+const uint32_t n_keys = 2048;
int32_t* d_keys;
......@@ -165,7 +165,7 @@ namespace gpucache{
//test put
std::cout << "test put" << std::endl;
memset(values, 0,values_size);
-for (size_t i = 0; i < n_keys; i++){
+for (size_t i = 0; i < n_keys; i++){ // for key i, the first element of embedding[key] should be keys[i] + 123
values[i * num_elem_per_value] = keys[i] + 123;
}
CUDA_CHECK(cudaMemcpy(d_values,values,values_size,cudaMemcpyDefault));
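Given that seeding, a read-back check after Get only needs to inspect the first element of each fetched embedding, roughly as follows (hedged sketch; h_results and h_find_mask are assumed host-side copies of the Get outputs, not names from this file):

    for (size_t i = 0; i < n_keys; i++) {
        if (!h_find_mask[i]) continue; // key not cached (never put, or evicted)
        // first element must round-trip as keys[i] + 123
        ASSERT_EQ(h_results[i * num_elem_per_value], static_cast<uint32_t>(keys[i] + 123));
    }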
......@@ -209,12 +209,12 @@ namespace gpucache{
CacheConfig cfg{};
cfg.strategy = CacheConfig::CacheEvictStrategy::LRU;
-cfg.value_size = 32;
+cfg.value_size = 172*4;
cfg.capacity = 4096 * 2;
cfg.key_size = 4;
cfg.max_query_num = 2048;
cfg.device_id = 0;
-cfg.dim = 8;
+cfg.dim = 172;
LRUCache<int32_t,uint32_t> *cache = NewLRUCache<int32_t,uint32_t>(cfg);
TestCache(*cache,cfg.dim);
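These paired edits keep the invariant value_size = dim * sizeof(ElemType): with ElemType = uint32_t, dim = 172 requires value_size = 172*4 = 688 bytes, just as the old pair (dim = 8, value_size = 32) satisfied 8*4 = 32.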
......@@ -225,12 +225,12 @@ namespace gpucache{
CacheConfig cfg{};
cfg.strategy = CacheConfig::CacheEvictStrategy::FIFO;
-cfg.value_size = 32;
+cfg.value_size = 172*4;
cfg.capacity = 4096 * 2;
cfg.key_size = 4;
cfg.max_query_num = 2048;
cfg.device_id = 0;
-cfg.dim = 8;
+cfg.dim = 172;
FIFOCache<int32_t,uint32_t> *cache = NewFIFOCache<int32_t,uint32_t>(cfg);
TestCache(*cache,cfg.dim);
......@@ -241,17 +241,32 @@ namespace gpucache{
CacheConfig cfg{};
cfg.strategy = CacheConfig::CacheEvictStrategy::LFU;
-cfg.value_size = 32;
+cfg.value_size = 172*4;
cfg.capacity = 4096 * 2;
cfg.key_size = 4;
cfg.max_query_num = 2048;
cfg.device_id = 0;
-cfg.dim = 8;
+cfg.dim = 172;
LFUCache<int32_t,uint32_t> *cache = NewLFUCache<int32_t,uint32_t>(cfg);
TestCache(*cache,cfg.dim);
}
+TEST(GPUCACHE, LRUCACHEGET){
+CacheConfig cfg{CacheConfig::CacheEvictStrategy::LRU,8192,4,4*172,8192,0,172};
+auto t = torch::empty({1},torch::dtype(torch::kInt32).device(torch::kCUDA,0));
+auto cache = NewCache(t,cfg);
+auto size = std::vector<int64_t>{6483};
+auto keys = torch::randint(0,12481924,size,torch::dtype(torch::kInt32)).to(torch::kCUDA,0);
+// auto keys = torch::arange(0,5,torch::dtype(torch::kInt32)).to(torch::kCUDA, 0);
+auto [values, find_mask] = cache->Get(6483,keys);
+for(int i = 0; i < find_mask.size(0);i++){
+ASSERT_FALSE(find_mask[i].item<bool>()) << "should be false";
+}
+CUDA_CHECK(cudaDeviceSynchronize());
+}
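This new test is the regression check for the commit: it queries 6483 random keys drawn from [0, 12481924) against a freshly constructed cache, so every lookup must miss and every mask entry must come back false. With the old torch::empty allocation the untouched entries held arbitrary bytes and these assertions could fail nondeterministically; with torch::zeros they are guaranteed to pass.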
TEST(GPUCACHE, FIFOCACHEWRAPPER){
CacheConfig cfg{CacheConfig::CacheEvictStrategy::FIFO,8192,4,32,2048,0,8};
......