Commit 30d343ec by tianxing wang

fix Get bug: find_mask was not zero-initialized

parent ab08d21b
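The root cause: Get allocated its hit mask with torch::empty, which leaves device memory uninitialized, while the lookup kernel only writes true for keys it actually finds. Miss slots therefore returned whatever bytes happened to be in the allocation. A minimal sketch of the failure mode and the fix, with a hypothetical mark_hits standing in for the real lookup kernel (illustrative only, not this repository's code):

    #include <torch/torch.h>
    #include <iostream>
    #include <vector>

    // Hypothetical stand-in for the lookup kernel: only hit slots are ever written.
    void mark_hits(torch::Tensor &mask, const std::vector<int64_t> &hit_idx) {
        for (auto i : hit_idx) mask[i].fill_(true);
    }

    int main() {
        // Buggy: torch::empty leaves the buffer uninitialized, so slots the
        // lookup never touches hold arbitrary bytes (kept here only for contrast).
        auto bad_mask = torch::empty({8}, torch::dtype(torch::kBool));
        // Fixed: torch::zeros guarantees every slot starts out false ("miss").
        auto mask = torch::zeros({8}, torch::dtype(torch::kBool));
        mark_hits(mask, {2, 5});        // only the hit slots flip to true
        std::cout << mask << std::endl; // 0 0 1 0 0 1 0 0 (printed layout may differ)
    }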
......@@ -11,9 +11,9 @@ namespace gpucache {
CacheStorage(CacheConfig &cfg) : key_size(cfg.key_size), value_size(cfg.value_size), capacity(cfg.capacity),
max_query_num(cfg.max_query_num), device_id(cfg.device_id), dim(cfg.dim) {
nbucket = (capacity + warpSize - 1) / warpSize;
printf("Cache: keySize: %lu, valueSize: %u, dim: %u, capacity: %lu, "
"maxQueryNum: %u, deviceId: %u\n",
sizeof(KeyType), value_size, dim, capacity, max_query_num, device_id);
// printf("Cache: keySize: %lu, valueSize: %u, dim: %u, capacity: %lu, "
// "maxQueryNum: %u, deviceId: %u\n",
// sizeof(KeyType), value_size, dim, capacity, max_query_num, device_id);
CUDA_CHECK(cudaMalloc((void **) &keys, capacity * key_size));
CUDA_CHECK(cudaMalloc((void **) &values, capacity * value_size));
CUDA_CHECK(cudaMalloc((void **) &timestamps, capacity * sizeof(uint8_t)));
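A note on the bucket math above: nbucket = (capacity + warpSize - 1) / warpSize is integer ceiling division, so the key slots are carved into warp-sized (32-slot) buckets and a capacity that is not a multiple of 32 still gets a final, partially-used bucket; e.g. capacity = 8192 yields 256 buckets, while 8200 would round up to 257.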
......@@ -55,22 +55,6 @@ namespace gpucache {
uint32_t max_query_num;
};
-// template<typename KeyType, typename ElemType>
-// struct CacheManager {
-// __device__ CacheManager() = default;
-// __device__ virtual BucketView<KeyType, ElemType> *
-// SetBucketView(ThreadCtx ctx, KeyType *cache_keys, ElemType *cache_values,
-// uint8_t *cache_timestamps, void *cache_mutexes,
-// uint32_t num_elem_per_value, uint32_t bucket_id) = 0;
-//
-// __device__ __host__ virtual void Type(){
-// printf("CacheManager\n");
-// };
-//
-// __device__ virtual void DestroyBucketView(BucketView<KeyType, ElemType> *bucket) = 0;
-//
-// };
template<typename KeyType, typename ElemType, CacheConfig::CacheEvictStrategy Strategy>
struct BucketView;
......@@ -86,34 +70,7 @@ namespace gpucache {
reinterpret_cast<WarpMutex *>(cache_mutexes) + bucket_id,
num_elem_per_value);
}
-/*
-{
-__device__ BucketView(KeyType* k, ElemType* v, uint8_t* ts, WarpMutex* m, uint32_t num_elems_per_value): bkeys(k), bvalues(v),btimestamps(ts),mutex(m),num_elems_per_value(num_elems_per_value){}
-__device__ ~BucketView()= default;
-__device__ int Get(const ThreadCtx &ctx, KeyType key){};
-__device__ int TryPut(const ThreadCtx &ctx, KeyType key){};
-__device__ int Evict(const ThreadCtx &ctx, KeyType key, KeyType *evict_key){};
-__device__ void ReadOneValue(const ThreadCtx &ctx, uint8_t slot_num,
-ElemType *out) {
-for (size_t i = ctx.lane_id; i < num_elems_per_value; i += warpSize) {
-out[i] = bvalues[slot_num * num_elems_per_value + i];
-}
-}
-__device__ void WriteOneValue(const ThreadCtx &ctx, uint8_t slot_num,
-ElemType *v) {
-for (size_t i = ctx.lane_id; i < num_elems_per_value; i += warpSize) {
-bvalues[slot_num * num_elems_per_value + i] = v[i];
-}
-}
-WarpMutex *mutex;
-KeyType *bkeys;
-ElemType *bvalues;
-uint8_t *btimestamps;
-uint32_t num_elems_per_value;
-};
-*/
// Cache Interface
......@@ -200,6 +157,7 @@ namespace gpucache {
int slot_num = bucket.Get(ctx, key);
if (slot_num != -1) {
// printf("warp_id:%d,lane_id:%d find key %d\n",ctx.global_warp_idx,ctx.lane_id,key);
find_mask[idx] = true;
bucket.ReadOneValue(ctx, slot_num, &results[idx * num_elem_per_value]);
}
......
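The contract between this kernel and the host-side zero-initialization introduced below is worth spelling out: the kernel only ever writes true on the hit path and deliberately writes nothing on a miss, so every false in the output exists only because the host pre-zeroed the mask. A self-contained sketch of that pattern, with a hypothetical brute-force lookup in place of the real bucketed probe:

    #include <cstdio>
    #include <cstdint>
    #include <cuda_runtime.h>

    // Hypothetical brute-force lookup standing in for the bucketed probe: the
    // kernel writes find_mask[idx] only on a hit; a miss writes nothing at all.
    __global__ void lookup(const int32_t *cache_keys, int capacity,
                           const int32_t *queries, int n, bool *find_mask) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx >= n) return;
        for (int s = 0; s < capacity; s++) {
            if (cache_keys[s] == queries[idx]) {
                find_mask[idx] = true;  // hit path: the only write to the mask
                return;
            }
        }
        // miss path: intentionally silent -- correctness relies on a pre-zeroed mask
    }

    int main() {
        const int capacity = 4, n = 3;
        int32_t h_cache[capacity] = {10, 20, 30, 40};
        int32_t h_queries[n] = {20, 99, 40};
        int32_t *d_cache, *d_queries;
        bool *d_mask;
        cudaMalloc(&d_cache, sizeof(h_cache));
        cudaMalloc(&d_queries, sizeof(h_queries));
        cudaMalloc(&d_mask, n * sizeof(bool));
        cudaMemcpy(d_cache, h_cache, sizeof(h_cache), cudaMemcpyHostToDevice);
        cudaMemcpy(d_queries, h_queries, sizeof(h_queries), cudaMemcpyHostToDevice);
        cudaMemset(d_mask, 0, n * sizeof(bool));  // the host-side zeroing this commit adds
        lookup<<<1, 32>>>(d_cache, capacity, d_queries, n, d_mask);
        bool h_mask[n];
        cudaMemcpy(h_mask, d_mask, n * sizeof(bool), cudaMemcpyDeviceToHost);
        printf("%d %d %d\n", h_mask[0], h_mask[1], h_mask[2]);  // expect: 1 0 1
        return 0;
    }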
......@@ -91,14 +91,13 @@ namespace gpucache {
assert(num_query <= cache_cfg.max_query_num && "num_query must not exceed max_query_num");
assert(queries.dtype().toScalarType() == ktype && "key type doesn't match");
//std::cout << "Get queries: " << queries << std::endl;
// printf("hello!!!\n");
//std::cout << "Get num_query: " << num_query <<"\n";
//std::cout << "Get queries.sizes() " << queries.sizes() << "\n";
auto keys = queries.data_ptr();
auto stream = at::cuda::getCurrentCUDAStream(cache_cfg.device_id).stream();
auto result = torch::empty({num_query, cache_cfg.dim},
torch::dtype(vtype).device(torch::kCUDA, cache_cfg.device_id));
-auto find_mask = torch::empty({num_query},
+auto find_mask = torch::zeros({num_query},
torch::dtype(torch::kBool).device(torch::kCUDA, cache_cfg.device_id));
if (key_is_int32) {
......@@ -121,6 +120,7 @@ namespace gpucache {
reinterpret_cast<bool *>(find_mask.data_ptr()));
});
}
// std::cout << "find_mask:" << find_mask << std::endl;
return std::make_pair(result, find_mask);
}
......
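With the mask now trustworthy, the returned pair can be consumed directly; e.g. a caller could gather the keys that still need to be fetched from backing storage (hypothetical caller code, not part of this commit):

    auto [values, find_mask] = cache->Get(num_query, keys);
    auto missing_keys = keys.masked_select(~find_mask); // false is guaranteed wherever the kernel wrote nothing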
......@@ -45,6 +45,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
.def("Device",&gpucache::CacheWrapper::DeviceId,"return device id")
.def("Dim",&gpucache::CacheWrapper::Dim,"return value dim");
m.def("NewCache", &gpucache::NewCache, "create a lru cache",py::return_value_policy::reference);
m.def("NewCache", &gpucache::NewCache, "create a cache",py::return_value_policy::reference);
/*-----------------------------------------------------------------------------------------------------------------------------------*/
}
\ No newline at end of file
......@@ -48,7 +48,7 @@ namespace gpucache{
constexpr int warpsize = 32;
std::unordered_set<uint32_t> in_cache;
const size_t n_iter = 32;
-const uint32_t n_keys = 1024;
+const uint32_t n_keys = 2048;
int32_t* d_keys;
......@@ -165,7 +165,7 @@ namespace gpucache{
//test put
std::cout << "test put" << std::endl;
memset(values, 0,values_size);
-for (size_t i = 0; i < n_keys; i++){
+for (size_t i = 0; i < n_keys; i++){ // for key i, the first element of embedding[key] should be keys[i] + 123
values[i * num_elem_per_value] = keys[i] + 123;
}
CUDA_CHECK(cudaMemcpy(d_values,values,values_size,cudaMemcpyDefault));
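Given that seeding, a read-back check after Get only needs to inspect the first element of each fetched embedding, roughly as follows (hedged sketch; h_results and h_find_mask are assumed host-side copies of the Get outputs, not names from this file):

    for (size_t i = 0; i < n_keys; i++) {
        if (!h_find_mask[i]) continue; // key not cached (never put, or evicted)
        // first element must round-trip as keys[i] + 123
        ASSERT_EQ(h_results[i * num_elem_per_value], static_cast<uint32_t>(keys[i] + 123));
    }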
......@@ -209,12 +209,12 @@ namespace gpucache{
CacheConfig cfg{};
cfg.strategy = CacheConfig::CacheEvictStrategy::LRU;
-cfg.value_size = 32;
+cfg.value_size = 172*4;
cfg.capacity = 4096 * 2;
cfg.key_size = 4;
cfg.max_query_num = 2048;
cfg.device_id = 0;
-cfg.dim = 8;
+cfg.dim = 172;
LRUCache<int32_t,uint32_t> *cache = NewLRUCache<int32_t,uint32_t>(cfg);
TestCache(*cache,cfg.dim);
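These paired edits keep the invariant value_size = dim * sizeof(ElemType): with ElemType = uint32_t, dim = 172 requires value_size = 172*4 = 688 bytes, just as the old pair (dim = 8, value_size = 32) satisfied 8*4 = 32.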
......@@ -225,12 +225,12 @@ namespace gpucache{
CacheConfig cfg{};
cfg.strategy = CacheConfig::CacheEvictStrategy::FIFO;
-cfg.value_size = 32;
+cfg.value_size = 172*4;
cfg.capacity = 4096 * 2;
cfg.key_size = 4;
cfg.max_query_num = 2048;
cfg.device_id = 0;
-cfg.dim = 8;
+cfg.dim = 172;
FIFOCache<int32_t,uint32_t> *cache = NewFIFOCache<int32_t,uint32_t>(cfg);
TestCache(*cache,cfg.dim);
......@@ -241,17 +241,32 @@ namespace gpucache{
CacheConfig cfg{};
cfg.strategy = CacheConfig::CacheEvictStrategy::LFU;
-cfg.value_size = 32;
+cfg.value_size = 172*4;
cfg.capacity = 4096 * 2;
cfg.key_size = 4;
cfg.max_query_num = 2048;
cfg.device_id = 0;
-cfg.dim = 8;
+cfg.dim = 172;
LFUCache<int32_t,uint32_t> *cache = NewLFUCache<int32_t,uint32_t>(cfg);
TestCache(*cache,cfg.dim);
}
+TEST(GPUCACHE, LRUCACHEGET){
+CacheConfig cfg{CacheConfig::CacheEvictStrategy::LRU,8192,4,4*172,8192,0,172};
+auto t = torch::empty({1},torch::dtype(torch::kInt32).device(torch::kCUDA,0));
+auto cache = NewCache(t,cfg);
+auto size = std::vector<int64_t>{6483};
+auto keys = torch::randint(0,12481924,size,torch::dtype(torch::kInt32)).to(torch::kCUDA,0);
+// auto keys = torch::arange(0,5,torch::dtype(torch::kInt32)).to(torch::kCUDA, 0);
+auto [values, find_mask] = cache->Get(6483,keys);
+for(int i = 0; i < find_mask.size(0);i++){
+ASSERT_FALSE(find_mask[i].item<bool>()) << "should be false";
+}
+CUDA_CHECK(cudaDeviceSynchronize());
+}
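This new test is the regression check for the commit: it queries 6483 random keys drawn from [0, 12481924) against a freshly constructed cache, so every lookup must miss and every mask entry must come back false. With the old torch::empty allocation the untouched entries held arbitrary bytes and these assertions could fail nondeterministically; with torch::zeros they are guaranteed to pass.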
TEST(GPUCACHE, FIFOCACHEWRAPPER){
CacheConfig cfg{CacheConfig::CacheEvictStrategy::FIFO,8192,4,32,2048,0,8};
......