finish fifo cache

f081fb39 · tianxing wang · 52df1e52 · f081fb39 · f081fb39 · f081fb39
Commit f081fb39 authored Jan 25, 2024 by tianxing wang
18 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,7 +10,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS on)
 # gpucache
 file(GLOB SOURCE_FILES
        # ${CMAKE_CURRENT_SOURCE_DIR}/src/cuda/*
-        # ${CMAKE_CURRENT_SOURCE_DIR}/src/hash/*.cuh
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/hash/*.cu
        ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp
        ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu
 )
@@ -35,8 +35,9 @@ target_compile_options(${cache_lib_name} PRIVATE ${TORCH_CXX_FLAGS} -O3)
 target_link_libraries(${cache_lib_name} PRIVATE ${TORCH_LIBRARIES})
 target_compile_definitions(${cache_lib_name} PRIVATE -DTORCH_EXTENSION_NAME=lib${cache_lib_name})
-find_library(TORCH_PYTHON_LIBRARY torch_python "${TORCH_INSTALL_PREFIX}/lib" REQUIRED)
+find_library(TORCH_PYTHON_LIBRARY torch_python "${TORCH_INSTALL_PREFIX}/lib")
 target_link_libraries(${cache_lib_name} PRIVATE ${TORCH_PYTHON_LIBRARY})
+message(STATUS "TORCH_PYTHON_LIBRARY: " ${TORCH_PYTHON_LIBRARY} )

--- a/libgpucache.so
+++ b/libgpucache.so
--- a/readme.md
+++ b/readme.md
@@ -2,7 +2,7 @@ TODO
 [√] LRU
-[] FIFO
+[√] FIFO
 [] LFU

--- a/src/cache.h
+++ b/src/cache.h
 #pragma once 
 #include <cstdint>
 namespace gpucache {
    struct CacheConfig {
@@ -48,9 +48,6 @@ namespace gpucache {
        virtual void Put(cudaStream_t stream, uint32_t num_keys, KeyType *keys, ElemType *values,uint32_t *n_evict, KeyType* evict_keys) = 0;
-        // virtual void* Mutex() = 0;
        virtual void Clear() = 0;
        virtual uint32_t MaxQueryNum() = 0;

--- a/src/common.cuh
+++ b/src/common.cuh
 #pragma once
 #include <c10/cuda/CUDAStream.h>
 #include <torch/extension.h>
 #include <cstdint>
 #include <assert.h>
 #include <stdio.h>
 #include <cuda_runtime.h>
-#include <vector>
+#include <memory>
+#include "utils.cuh"
+#include "cache.h"
+#include "hash/hash_function.cuh"
-#define CUDA_CHECK(call)                                                            \
+#define CUDA_CHECK(call)                                                       \
 {                                                                              \
    const cudaError_t error = call;                                            \
    if (error != cudaSuccess)                                                  \
@@ -20,3 +22,5 @@
                cudaGetErrorString(error));                                    \
    }                                                                          \
 }
--- a/src/export.cpp
+++ b/src/export.cpp
-#include "common.cuh"
-#include "lru_cache.h"
-#include <pybind11/pybind11.h>
-namespace py = pybind11;
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
-    py::class_<gpucache::CacheConfig> cfg(m,"CacheConfig");
-    cfg.def(py::init<>())
-       .def(py::init<gpucache::CacheConfig::CacheEvictStrategy, uint64_t, uint32_t, uint32_t, uint32_t, int8_t,uint32_t>())
-       .def_readwrite("strategy", &gpucache::CacheConfig::strategy)
-       .def_readwrite("capacity",&gpucache::CacheConfig::capacity)
-       .def_readwrite("key_size", &gpucache::CacheConfig::keySize)
-       .def_readwrite("value_size",&gpucache::CacheConfig::valueSize)
-       .def_readwrite("max_query_num", &gpucache::CacheConfig::maxQueryNum)
-       .def_readwrite("device_id",&gpucache::CacheConfig::deviceId)
-       .def_readwrite("dim",&gpucache::CacheConfig::dim);
-    py::enum_<gpucache::CacheConfig::CacheEvictStrategy>(cfg,"CacheEvictStrategy")
-        .value("LRU",gpucache::CacheConfig::CacheEvictStrategy::LRU)
-        .value("LFU",gpucache::CacheConfig::CacheEvictStrategy::LFU)
-        .value("FIFO",gpucache::CacheConfig::CacheEvictStrategy::FIFO)
-        .export_values();
-    py::class_<gpucache::LRUCacheWrapper> lru_cache(m, "LRUCache");
-    lru_cache
-        .def(py::init<at::Tensor, gpucache::CacheConfig>())
-        .def("Get",&gpucache::LRUCacheWrapper::Get,"get values for keys,  find_mask return whether each key exists in cache")
-        .def("Put",&gpucache::LRUCacheWrapper::Put,"put key-value pairs")
-        .def("Strategy",&gpucache::LRUCacheWrapper::Strategy,"get evict strategy")
-        .def("Capacity",&gpucache::LRUCacheWrapper::Capacity,"return cache capacity")
-        .def("KeySize",&gpucache::LRUCacheWrapper::KeySize,"return key size")
-        .def("ValueSize",&gpucache::LRUCacheWrapper::ValueSize,"return value size")
-        .def("MaxQueryNum",&gpucache::LRUCacheWrapper::MaxQueryNum,"return max number of keys to get or key-values to put once")
-        .def("Clear",&gpucache::LRUCacheWrapper::Clear,"clear cache")
-        .def("Device",&gpucache::LRUCacheWrapper::DeviceId,"return device id")
-        .def("Dim",&gpucache::LRUCacheWrapper::Dim,"return value dim");
-    m.def("NewLRUCache", &gpucache::NewLRUCache, "create a lru cache",py::return_value_policy::reference);
-}
--- a/src/export.cu
+++ b/src/export.cu
+#include "lru_cache.h"
+#include "fifo_cache.h"
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
+    /* CacheConfig */
+    /*-----------------------------------------------------------------------------------------------------------------------------------*/
+    py::class_<gpucache::CacheConfig> cfg(m,"CacheConfig");
+    cfg.def(py::init<>())
+       .def(py::init<gpucache::CacheConfig::CacheEvictStrategy, uint64_t, uint32_t, uint32_t, uint32_t, int8_t,uint32_t>())
+       .def_readwrite("strategy", &gpucache::CacheConfig::strategy)
+       .def_readwrite("capacity",&gpucache::CacheConfig::capacity)
+       .def_readwrite("keySize", &gpucache::CacheConfig::keySize)
+       .def_readwrite("valueSize",&gpucache::CacheConfig::valueSize)
+       .def_readwrite("maxQueryNum", &gpucache::CacheConfig::maxQueryNum)
+       .def_readwrite("deviceId",&gpucache::CacheConfig::deviceId)
+       .def_readwrite("dim",&gpucache::CacheConfig::dim);
+    py::enum_<gpucache::CacheConfig::CacheEvictStrategy>(cfg,"CacheEvictStrategy")
+        .value("LRU",gpucache::CacheConfig::CacheEvictStrategy::LRU)
+        .value("LFU",gpucache::CacheConfig::CacheEvictStrategy::LFU)
+        .value("FIFO",gpucache::CacheConfig::CacheEvictStrategy::FIFO)
+        .export_values();
+    /*-----------------------------------------------------------------------------------------------------------------------------------*/
+    /* lrucache */
+    /*-----------------------------------------------------------------------------------------------------------------------------------*/
+    py::class_<gpucache::lrucache::LRUCacheWrapper> lru_cache(m, "LRUCache");
+    lru_cache
+        .def(py::init<at::Tensor, gpucache::CacheConfig>())
+        .def("Get",&gpucache::lrucache::LRUCacheWrapper::Get,"get values for keys,  find_mask return whether each key exists in cache")
+        .def("Put",&gpucache::lrucache::LRUCacheWrapper::Put,"put key-value pairs")
+        .def("Strategy",&gpucache::lrucache::LRUCacheWrapper::Strategy,"get evict strategy")
+        .def("Capacity",&gpucache::lrucache::LRUCacheWrapper::Capacity,"return cache capacity")
+        .def("KeySize",&gpucache::lrucache::LRUCacheWrapper::KeySize,"return key size")
+        .def("ValueSize",&gpucache::lrucache::LRUCacheWrapper::ValueSize,"return value size")
+        .def("MaxQueryNum",&gpucache::lrucache::LRUCacheWrapper::MaxQueryNum,"return max number of keys to get or key-values to put once")
+        .def("Clear",&gpucache::lrucache::LRUCacheWrapper::Clear,"clear cache")
+        .def("Device",&gpucache::lrucache::LRUCacheWrapper::DeviceId,"return device id")
+        .def("Dim",&gpucache::lrucache::LRUCacheWrapper::Dim,"return value dim");
+    m.def("NewLRUCache", &gpucache::lrucache::NewLRUCache, "create a lru cache",py::return_value_policy::reference);
+    /*-----------------------------------------------------------------------------------------------------------------------------------*/
+    /* fifocache */
+    /*-----------------------------------------------------------------------------------------------------------------------------------*/
+    py::class_<gpucache::fifocache::FIFOCacheWrapper> fifo_cache(m, "FIFOCache");
+    fifo_cache
+            .def(py::init<at::Tensor, gpucache::CacheConfig>())
+            .def("Get",&gpucache::fifocache::FIFOCacheWrapper::Get,"get values for keys,  find_mask return whether each key exists in cache")
+            .def("Put",&gpucache::fifocache::FIFOCacheWrapper::Put,"put key-value pairs")
+            .def("Strategy",&gpucache::fifocache::FIFOCacheWrapper::Strategy,"get evict strategy")
+            .def("Capacity",&gpucache::fifocache::FIFOCacheWrapper::Capacity,"return cache capacity")
+            .def("KeySize",&gpucache::fifocache::FIFOCacheWrapper::KeySize,"return key size")
+            .def("ValueSize",&gpucache::fifocache::FIFOCacheWrapper::ValueSize,"return value size")
+            .def("MaxQueryNum",&gpucache::fifocache::FIFOCacheWrapper::MaxQueryNum,"return max number of keys to get or key-values to put once")
+            .def("Clear",&gpucache::fifocache::FIFOCacheWrapper::Clear,"clear cache")
+            .def("Device",&gpucache::fifocache::FIFOCacheWrapper::DeviceId,"return device id")
+            .def("Dim",&gpucache::fifocache::FIFOCacheWrapper::Dim,"return value dim");
+    m.def("NewFIFOCache", &gpucache::fifocache::NewFIFOCache, "create a fifo cache",py::return_value_policy::reference);
+    /*-----------------------------------------------------------------------------------------------------------------------------------*/
+}
--- a/src/fifo_cache.cu
+++ b/src/fifo_cache.cu
--- a/src/fifo_cache.h
+++ b/src/fifo_cache.h
+#pragma once
+#include <torch/extension.h>
+#include "cache.h"
+namespace gpucache {
+    namespace fifocache {
+        class FIFOCacheWrapper { // tensor-facing wrapper bound to Python as "FIFOCache" in export.cu
+        public:
+            FIFOCacheWrapper(at::Tensor t, CacheConfig cfg); // NOTE(review): t presumably only supplies the value dtype (tests pass a 1-element tensor) — confirm in fifo_cache.cu
+            ~FIFOCacheWrapper();
+            std::pair<torch::Tensor, torch::Tensor> Get(uint32_t num_query, const torch::Tensor queries); // callers unpack the pair as (values, find_mask); find_mask tells whether each key was found
+            void Put(uint32_t num_query, const torch::Tensor keys, const torch::Tensor values); // put key-value pairs
+            CacheConfig::CacheEvictStrategy Strategy(); // get evict strategy
+            uint64_t Capacity(); // return cache capacity
+            uint32_t KeySize(); // return key size
+            uint32_t ValueSize(); // return value size
+            uint32_t MaxQueryNum(); // return max number of keys to get or key-values to put at once
+            uint64_t DeviceId(); // return device id
+            uint32_t Dim(); // return value dim
+            void Clear(); // clear cache
+        private:
+            void *fifo_cache; // type-erased handle; presumably a FIFOCache<KeyType, ElemType>* chosen from kdtype/dtype — confirm in fifo_cache.cu
+            c10::ScalarType dtype; // value dtype
+            c10::ScalarType kdtype; // key dtype
+            bool key_is_int32; // only support int32 and int64
+            CacheConfig cache_cfg; // configuration stored by value
+        };
+        std::unique_ptr<FIFOCacheWrapper> NewFIFOCache(at::Tensor t, CacheConfig cfg); // factory exposed to Python as "NewFIFOCache"
+    } // namespace fifocache
+} // namespace gpucache
\ No newline at end of file
--- a/src/hash/hash_fucntion.cuh
+++ b/src/hash/hash_fucntion.cuh
 #pragma once
-#include <string>
+#include <cuda_runtime.h>
-#include "murmurhash3.cuh"
 constexpr uint32_t lruSeed = 0X12fb73ac;
+__device__ void MurmurHash3_x86_32 ( const void * key, int len,
+                                     uint32_t seed, void * out );
 // Hashes the raw bytes of `obj` (sizeof(T) of them) with MurmurHash3_x86_32,
 // seeded by lruSeed, and returns the digest widened to size_t.
 //
 // MurmurHash3_x86_32 emits a 32-bit digest through `out`. The digest is
 // therefore collected in a zero-initialized uint32_t: pointing `out` at an
 // uninitialized size_t would leave its upper bytes indeterminate wherever
 // size_t is wider than 32 bits, making the returned hash unpredictable.
 template<typename T>
 __device__ size_t getHash(const T& obj){
    uint32_t hash = 0;
    MurmurHash3_x86_32(reinterpret_cast<const void*>(&obj),sizeof(T),lruSeed,reinterpret_cast<void*>(&hash));
    return static_cast<size_t>(hash);
 }
\ No newline at end of file
--- a/src/hash/murmurhash3.cuh
+++ b/src/hash/murmurhash3.cuh
@@ -11,7 +11,6 @@
 // Platform-specific functions and macros
 // Microsoft Visual Studio
 #if defined(_MSC_VER)
 #define FORCE_INLINE	__forceinline

--- a/src/lru_cache.cu
+++ b/src/lru_cache.cu
--- a/src/lru_cache.h
+++ b/src/lru_cache.h
 #pragma once
+#include <torch/extension.h>
-#include "common.cuh"
 #include "cache.h"
 namespace gpucache {
+    namespace lrucache {
+        class LRUCacheWrapper {
+        public:
+            LRUCacheWrapper(at::Tensor t, CacheConfig cfg);
+            ~LRUCacheWrapper();
+            std::pair<torch::Tensor, torch::Tensor> Get(uint32_t num_query, const torch::Tensor queries);
+            void Put(uint32_t num_query, const torch::Tensor keys, const torch::Tensor values);
+            CacheConfig::CacheEvictStrategy Strategy();
+            uint64_t Capacity();
+            uint32_t KeySize();
+            uint32_t ValueSize();
+            uint32_t MaxQueryNum();
+            uint64_t DeviceId();
+            uint32_t Dim();
+            void Clear();
+        private:
+            void *lru_cache;
+            c10::ScalarType dtype; // value dtype
+            c10::ScalarType kdtype; // key dtype
+            bool key_is_int32; // only support int32 and int64
+            CacheConfig cache_cfg;
+        };
-//    template<typename KeyType, typename ElemType>
+        std::unique_ptr<LRUCacheWrapper> NewLRUCache(at::Tensor t, CacheConfig cfg);
-//    class LRUCache;
+    } // namespace lrucache
-//
+} // namespace gpucache
-//    template<typename KeyType, typename ElemType>
\ No newline at end of file
-//    struct BucketView;
-//
-//    struct ThreadCtx;
-//
-//    template<typename KeyType, typename ElemType>
-//    __device__ __host__ BucketView<KeyType, ElemType>
-//    setBucketView(ThreadCtx ctx, KeyType *cache_keys, ElemType *cache_values,
-//                  uint8_t *cache_timestamps, void *cache_mutexes,
-//                  uint32_t num_elem_per_value, uint32_t bucket_id);
-//
-//    template<typename KeyType, typename ElemType>
-//    class LRUCache : public Cache<KeyType, ElemType> {
-//
-//        friend BucketView<KeyType, ElemType> __device__ __host__ setBucketView<KeyType, ElemType>(
-//                ThreadCtx ctx, KeyType *cache_keys, ElemType *cache_values,
-//                uint8_t *cache_timestamps, void *cache_mutexes,
-//                uint32_t num_elem_per_value, uint32_t bucket_id);
-//
-//    public:
-//        explicit LRUCache(const CacheConfig &cfg);
-//
-//
-//        ~LRUCache();
-//
-//        uint32_t KeySize() override;
-//
-//        uint32_t ValueSize() override;
-//
-//        uint64_t Capacity() override;
-//
-//        uint32_t NumElemsPerValue() override;
-//
-//        uint32_t MaxQueryNum();
-//
-//        uint32_t NBucket();
-//
-//        int8_t DeviceId() override;
-//
-//        uint32_t Dim() override;
-//
-//        // for test
-//        // void *Mutex() { return bucketMutexes; }
-//
-//        CacheConfig::CacheEvictStrategy Strategy() override;
-//
-//        void Clear() override;
-//
-//        void Get(cudaStream_t stream, uint32_t num_query, KeyType *queries,
-//                 ElemType *results, bool *find_mask) override;
-//
-//        void Put(cudaStream_t stream, uint32_t num_query, KeyType *putkeys,
-//                 ElemType *putvalues, uint32_t *n_evict = nullptr,
-//                 KeyType *evict_keys = nullptr) override;
-//
-//    private:
-//        KeyType *keys;
-//        ElemType *values;
-//        uint8_t *timestamps{};
-//        uint32_t nbucket; // 32 values for one bucket
-//        void *bucketMutexes{};
-//
-//        // CacheConfig::CacheEvictStrategy strategy;
-//        uint64_t capacity;
-//        uint32_t keySize;
-//        uint32_t valueSize;
-//        uint32_t numElemPerValue; // embedding dim
-//        int8_t device_id;
-//        uint32_t dim;
-//
-//        // store missing keys and indices for Evict
-//        KeyType *queryKeyBuffer{};
-//        uint32_t *queryIndiceBuffer{};
-//        uint32_t maxQueryNum;
-//    };
-    class LRUCacheWrapper {
-    public:
-        LRUCacheWrapper(at::Tensor t, CacheConfig cfg);
-        ~LRUCacheWrapper();
-        std::pair<torch::Tensor, torch::Tensor> Get(uint32_t num_query, const torch::Tensor queries);
-        void Put(uint32_t num_query, const torch::Tensor keys, const torch::Tensor values);
-        CacheConfig::CacheEvictStrategy Strategy();
-        uint64_t Capacity();
-        uint32_t KeySize();
-        uint32_t ValueSize();
-        uint32_t MaxQueryNum();
-        uint64_t DeviceId();
-        uint32_t Dim();
-        void Clear();
-//    private:
-        void *lru_cache;
-        c10::ScalarType dtype;
-        c10::ScalarType kdtype;
-        bool key_is_int32;
-        CacheConfig cache_cfg;
-    };
-    std::unique_ptr<LRUCacheWrapper> NewLRUCache(at::Tensor t, CacheConfig cfg);
-}
\ No newline at end of file
--- a/src/utils.cu
+++ b/src/utils.cu
+#include "utils.cuh"
+namespace gpucache { // definitions for the ThreadCtx / WarpMutex helpers and the lock kernels declared in utils.cuh
+    __device__ ThreadCtx::ThreadCtx() { // derives this thread's warp coordinates from the x-dimension launch indices only
+        auto global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; // flat thread index across the grid (x only)
+        global_warp_idx = global_thread_id / warpsize; // index of this thread's warp across the whole grid
+        block_warp_idx = threadIdx.x / warpsize; // index of this thread's warp inside its block
+        lane_id = threadIdx.x % warpsize; // position of this thread inside its warp
+        num_warps = blockDim.x * gridDim.x / warpsize; // launched threads divided by warpsize (integer division)
+    }
+    __device__ WarpMutex::WarpMutex() : flag(0) {} // flag starts at 0, the value Lock() waits to swap out
+    __device__ void WarpMutex::Lock(ThreadCtx &ctx, uint32_t bucket_id) { // bucket_id is not read in this body
+        if (ctx.lane_id == 0) { // only lane 0 contends for the flag
+            while (atomicCAS(&flag, 0, 1) != 0) {} // spin until this lane swaps flag from 0 to 1
+        }
+        __threadfence(); // memory fence issued after the flag has been taken
+        __syncwarp(); // warp barrier (default mask); lanes other than 0 reach it without touching flag
+    }
+    __device__ void WarpMutex::UnLock(ThreadCtx &ctx) {
+        __syncwarp(); // warp barrier before the flag is released
+        __threadfence(); // memory fence issued before the flag is reset
+        if (ctx.lane_id == 0) { // only lane 0 writes the flag
+            atomicExch(&flag, 0); // reset flag to 0
+        }
+    }
+    __global__ void initLocks(uint32_t n_bucket, void *bucketMutexes) { // one thread per mutex slot
+        uint32_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
+        if (global_thread_idx < n_bucket) { // threads past the last slot do nothing
+            new(reinterpret_cast<WarpMutex *>(bucketMutexes) + global_thread_idx)
+                    WarpMutex(); // placement-new: constructs a WarpMutex (flag = 0) in slot global_thread_idx of bucketMutexes
+        }
+    }
+    __global__ void checkLocks(uint32_t n_bucket, void *bucketMutexes) { // debug kernel: reports any slot whose flag is neither 0 nor 1
+        uint32_t global_thread_idx = blockDim.x * blockIdx.x + threadIdx.x;
+        // printf("thread %u CUDA_CHECK lock\n",global_thread_idx);
+        if (global_thread_idx < n_bucket) { // threads past the last slot do nothing
+            auto mutex =
+                    reinterpret_cast<WarpMutex *>(bucketMutexes) + global_thread_idx;
+            if (mutex->flag != 0u && mutex->flag != 1u) { // only 0 and 1 are ever written by WarpMutex itself
+                printf("bucket id %u not equal 0 or 1, is %u\n", global_thread_idx,
+                       mutex->flag);
+            }
+        }
+    }
+}
--- a/src/utils.cuh
+++ b/src/utils.cuh
+#pragma once
+#include <cuda_runtime.h>
+namespace gpucache{ // launch-geometry constants plus the ThreadCtx / WarpMutex helpers
+    constexpr unsigned int warpFullMask = 0xFFFFFFFF; // all 32 bits set
+    constexpr unsigned int defaultBlockX = 256; // presumably the default threads-per-block for launches — confirm at the launch sites
+    constexpr unsigned int warpsize = 32; // threads per warp assumed throughout this code
+    constexpr unsigned int defaultNumWarpsPerBlock = defaultBlockX / warpsize; // 256 / 32 = 8
+    // bucket_id + key: byte counts sized as one (bucket id, key) pair per thread of a default-sized block; the names suggest a shared-memory budget — confirm at the launch sites
+    constexpr unsigned int uint32SharedMemorySize = 2 * sizeof(uint32_t) * defaultNumWarpsPerBlock * warpsize; // two uint32_t per thread
+    constexpr unsigned int uint64SharedMemorySize =
+            (sizeof(uint64_t) + sizeof(uint32_t)) * defaultNumWarpsPerBlock * warpsize; // one uint64_t plus one uint32_t per thread
+    struct ThreadCtx { // per-thread warp coordinates, filled in by the device-side constructor
+        __device__ ThreadCtx();
+        uint32_t global_warp_idx; // index of this thread's warp across the grid
+        uint32_t block_warp_idx; // index of this thread's warp inside its block
+        uint32_t num_warps; // number of warps in the launch
+        uint32_t lane_id; // position of this thread inside its warp
+    };
+    struct WarpMutex { // spin lock built on a single uint32_t flag; taken and released by lane 0 of the calling warp
+    public:
+        __device__ WarpMutex();
+        ~WarpMutex() = default;
+        WarpMutex(const WarpMutex &) = delete; // non-copyable
+        WarpMutex &operator=(const WarpMutex &) = delete;
+        WarpMutex(WarpMutex &&) = delete; // non-movable
+        WarpMutex &operator=(WarpMutex &&) = delete;
+        __device__ void Lock(ThreadCtx &ctx, uint32_t bucket_id); // bucket_id is accepted but unused by the definition in utils.cu
+        __device__ void UnLock(ThreadCtx &ctx);
+        //    private:
+        uint32_t flag; // 0 = free, 1 = held; left public, and read directly by checkLocks
+    };
+    __global__ void initLocks(uint32_t n_bucket, void *bucketMutexes); // constructs n_bucket WarpMutex objects in bucketMutexes
+    __global__ void checkLocks(uint32_t n_bucket, void *bucketMutexes); // prints every slot whose flag is neither 0 nor 1
+}
--- a/test.ipynb
+++ b/test.ipynb
@@ -16,8 +16,9 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# args: strtegy, capacity, key_size(only support int32 and int64), value_size(dim * sizeof(elem), elem is decided by the passing tensor dtype of NewLRUCache), device_id, dim\n",
+    "# args: strategy, capacity, keySize(only support int32 and int64), valueSize(dim * sizeof(elem), elem is determined by the dtype of the tensor passed to NewLRUCache/NewFIFOCache), maxQueryNum, deviceId, dim\n",
-    "cfg = libgpucache.CacheConfig(libgpucache.CacheConfig.LRU,65536,4,128,4096,0,32)"
+    "# cfg = libgpucache.CacheConfig(libgpucache.CacheConfig.LRU,65536,4,128,4096,0,32)\n",
+    "cfg = libgpucache.CacheConfig(libgpucache.CacheConfig.FIFO,65536,4,128,4096,0,32)"
   ]
  },
  {
@@ -28,7 +29,7 @@
    {
     "data": {
      "text/plain": [
-       "(<CacheEvictStrategy.LRU: 1>, 65536, 4, 128, 4096, 0, 32)"
+       "(<CacheEvictStrategy.FIFO: 0>, 65536, 4, 128, 4096, 0, 32)"
      ]
     },
     "execution_count": 3,
@@ -37,7 +38,7 @@
    }
   ],
   "source": [
-    "cfg.strategy, cfg.capacity, cfg.key_size, cfg.value_size, cfg.max_query_num, cfg.device_id, cfg.dim"
+    "cfg.strategy, cfg.capacity, cfg.keySize, cfg.valueSize, cfg.maxQueryNum, cfg.deviceId, cfg.dim"
   ]
  },
  {
@@ -49,13 +50,14 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "LRUCache: keySize: 4, valueSize: 128, dim: 32, capacity: 65536, maxQueryNum: 4096, deviceId: 0\n"
+      "FIFOCache: keySize: 4, valueSize: 128, dim: 32, capacity: 65536, maxQueryNum: 4096, deviceId: 0\n"
     ]
    }
   ],
   "source": [
    "t = torch.empty([1],dtype=torch.float32)\n",
-    "cache = libgpucache.NewLRUCache(t,cfg)"
+    "# cache = libgpucache.NewLRUCache(t,cfg)\n",
+    "cache = libgpucache.NewFIFOCache(t,cfg)"
   ]
  },
  {
@@ -88,11 +90,6 @@
   ]
  },
  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
-  },
-  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
@@ -135,6 +132,38 @@
  },
  {
   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(tensor([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14.,\n",
+       "          15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28.,\n",
+       "          29., 30., 31., 32.],\n",
+       "         [32., 33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45.,\n",
+       "          46., 47., 48., 49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59.,\n",
+       "          60., 61., 62., 63.],\n",
+       "         [64., 65., 66., 67., 68., 69., 70., 71., 72., 73., 74., 75., 76., 77.,\n",
+       "          78., 79., 80., 81., 82., 83., 84., 85., 86., 87., 88., 89., 90., 91.,\n",
+       "          92., 93., 94., 95.]], device='cuda:0'),\n",
+       " tensor([False,  True,  True], device='cuda:0'),\n",
+       " torch.Size([3, 32]))"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "keys[0] = 999\n",
+    "values, find_mask = cache.Get(3,keys)\n",
+    "values, find_mask, values.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -34,6 +34,7 @@ enable_testing()
 add_executable(
        cache_test
        cache_test.cu
 )
 target_include_directories(cache_test PRIVATE /home/wtx/miniconda3/envs/dgl/include/python3.10) # path to your Python.h

--- a/test/cache_test.cu
+++ b/test/cache_test.cu
 #include "gtest/gtest.h"
+#include "../src/hash/murmurhash3.cu"
+#include "../src/utils.cuh"
+#include "../src/utils.cu"
 #include "../src/lru_cache.h"
 #include "../src/lru_cache.cu"
+#include "../src/fifo_cache.h"
+#include "../src/fifo_cache.cu"
 #include <cuda_runtime.h>
 #include <unordered_set>
 #include <vector>
@@ -9,10 +14,23 @@
 #include <algorithm>
 namespace gpucache{
+    // for test
    template<typename KeyType>
-    __global__ void CollectMissingKeysNew(uint32_t num_query, KeyType *keys,
+    __global__ void CollectMissingKeys(uint32_t num_query, KeyType *keys, // test helper: gathers every key whose find_mask entry is false
-                                          bool *find_mask, uint32_t *n_missing,
+        bool *find_mask, uint32_t *n_missing,KeyType *missing_keys) { // *n_missing is incremented per miss and must be zeroed by the caller; missing_keys needs room for num_query keys
-                                          KeyType *missing_keys);
+        ThreadCtx ctx{};
+        for (auto offset = ctx.global_warp_idx * warpsize; offset < num_query; // each warp walks the queries in chunks of warpsize
+            offset += ctx.num_warps * warpsize) {
+                auto idx = offset + ctx.lane_id;
+                if (idx < num_query && !find_mask[idx]) { // bounds guard: the last chunk may extend past num_query when it is not a multiple of warpsize
+                    uint32_t base_missing_idx = atomicAdd(n_missing, 1); // reserve one output slot; output order is not deterministic
+                    missing_keys[base_missing_idx] = keys[idx];
+                }
+        }
+    }
    void TestCache(Cache<int32_t ,uint32_t>& cache, uint32_t num_elem_per_value){
        constexpr int warpsize = 32;
        std::unordered_set<uint32_t> in_cache;
@@ -114,20 +132,20 @@ namespace gpucache{
            dim3 block(256);
            dim3 grid((n_keys + block.x - 1)/block.x);
            // CollectMissingKeys<uint32_t><<<grid,block>>>(n_keys,d_keys,d_find_mask,d_n_missing,d_missing_keys);
-            CollectMissingKeysNew<int32_t><<<grid,block>>>(n_keys,d_keys,d_find_mask,d_n_missing,d_missing_keys);
+            CollectMissingKeys<int32_t><<<grid,block>>>(n_keys,d_keys,d_find_mask,d_n_missing,d_missing_keys);
            CUDA_CHECK(cudaDeviceSynchronize());
            CUDA_CHECK(cudaMemcpy(n_missing,d_n_missing, sizeof(uint32_t),cudaMemcpyDefault));
            CUDA_CHECK(cudaMemcpy(missing_keys,d_missing_keys,keys_size,cudaMemcpyDefault));
            ASSERT_EQ(expect_n_missing, *n_missing) << "expect_n_missing is " << expect_n_missing << " n_missing is " << *n_missing;
            std::unordered_set<int32_t> missing_keys_set(missing_keys, missing_keys + *n_missing);
            ASSERT_EQ(missing_keys_set,expect_missing_keys_set);
-            // check get value 
+            // check get value
            CUDA_CHECK(cudaMemcpy(values,d_values,values_size,cudaMemcpyDefault));
            for (size_t i = 0; i < n_keys ; i += 1) {
                if(find_mask[i]){
                    ASSERT_EQ(keys[i] + 123,values[i * num_elem_per_value]) << "key[" << i << "] = " << keys[i] <<  " doesn't get correct value should be " << keys[i] + 123 << " get " << values[i * num_elem_per_value];
                }
            }
@@ -187,30 +205,68 @@ namespace gpucache{
        cfg.deviceId = 0;
        cfg.dim = 8;
-        LRUCache<int32_t,uint32_t> cache(cfg);
+        lrucache::LRUCache<int32_t,uint32_t> cache(cfg);
+        TestCache(cache,cfg.dim);
+    }
+    TEST(GPUCACHE,FIFOCACHE){
+        CacheConfig cfg{};
+        cfg.strategy = CacheConfig::CacheEvictStrategy::FIFO;
+        cfg.valueSize = 32;
+        cfg.capacity = 4096 * 2;
+        cfg.keySize = 4;
+        cfg.maxQueryNum = 2048;
+        cfg.deviceId = 0;
+        cfg.dim = 8;
+        fifocache::FIFOCache<int32_t,uint32_t> cache(cfg);
        TestCache(cache,cfg.dim);
    }
+    TEST(GPUCACHE, FIFOCACHEWRAPPER){ // smoke test: Get, Put, Get through the FIFO wrapper; results are printed, nothing is asserted
+        CacheConfig cfg{CacheConfig::CacheEvictStrategy::FIFO,8192,4,32,2048,0,8}; // presumably strategy, capacity, keySize, valueSize, maxQueryNum, deviceId, dim (order of the pybind init signature) — confirm against cache.h
+        auto t = torch::empty({1},torch::dtype(torch::kInt32).device(torch::kCUDA,0)); // 1-element int32 CUDA tensor handed to NewFIFOCache together with cfg
+        auto cache = fifocache::NewFIFOCache(t,cfg);
+        auto keys = torch::arange(0,5,torch::dtype(torch::kInt32)).to(torch::kCUDA, 0); // keys 0..4 on device 0
+        auto [values, find_mask] = cache->Get(5,keys); // first lookup, issued before anything has been Put
+        torch::Tensor put_values = torch::reshape(torch::arange(0, 5 * 8,torch::dtype(torch::kInt32)),{5,8}).to(torch::kCUDA, 0); // 5 rows of 8 int32 values, 0..39
+        std::cout << "put_values: " << put_values << std::endl;
+        cache->Put(5,keys,put_values);
+        auto result = cache->Get(5,keys); // second lookup, after the Put
+        values = result.first;
+        find_mask = result.second;
+        std::cout << " values: " << values << "find_mask: " << find_mask << std::endl;
+//        auto fifoc = reinterpret_cast<fifocache::FIFOCache<int32_t,int32_t>*>(cache->fifo_cache);
+//        std::cout <<"test keysize" <<  fifoc->KeySize() << std::endl;
+        CUDA_CHECK(cudaDeviceSynchronize()); // surfaces any asynchronous CUDA error before the test returns
+    }
    TEST(GPUCACHE, LRUCACHEWRAPPER){
        CacheConfig cfg{CacheConfig::CacheEvictStrategy::LRU,8192,4,32,2048,0,8};
        auto t = torch::empty({1},torch::dtype(torch::kInt32).device(torch::kCUDA,0));
-        auto cache = NewLRUCache(t,cfg);
+        auto cache = lrucache::NewLRUCache(t,cfg);
        auto keys = torch::arange(0,5,torch::dtype(torch::kInt32)).to(torch::kCUDA, 0);
-        auto [values, find_mask] = cache.Get(5,keys);
+        auto [values, find_mask] = cache->Get(5,keys);
        torch::Tensor put_values = torch::reshape(torch::arange(0, 5 * 8,torch::dtype(torch::kInt32)),{5,8}).to(torch::kCUDA, 0);
        std::cout << "put_values: " << put_values << std::endl;
-        cache.Put(5,keys,put_values);
+        cache->Put(5,keys,put_values);
-        auto result = cache.Get(5,keys);
+        auto result = cache->Get(5,keys);
        values = result.first;
        find_mask = result.second;
        std::cout << " values: " << values << "find_mask: " << find_mask << std::endl;
-        auto lruc = reinterpret_cast<LRUCache<int32_t,int32_t>*>(cache.lru_cache);
+//        auto lruc = reinterpret_cast<lrucache::LRUCache<int32_t,int32_t>*>(cache->lru_cache);
-        std::cout <<"test keysize" <<  lruc->KeySize() << std::endl;
+//        std::cout <<"test keysize" <<  lruc->KeySize() << std::endl;
        CUDA_CHECK(cudaDeviceSynchronize());
    }
 }