Commit 54424543 by zlj

change the torch_utils dependency path

parent 8768fe6f
set(CMAKE_SOURCE_DIR "/mnt/data/")
cmake_minimum_required(VERSION 3.15)
project(starrygl_ops VERSION 0.1)
@@ -45,7 +46,7 @@ if(WITH_METIS)
add_definitions(-DWITH_METIS)
set(GKLIB_DIR "${CMAKE_SOURCE_DIR}/third_party/GKlib")
set(METIS_DIR "${CMAKE_SOURCE_DIR}/third_party/METIS")
message(${METIS_DIR})
set(GKLIB_INCLUDE_DIRS "${GKLIB_DIR}/include")
file(GLOB_RECURSE GKLIB_LIBRARIES "${GKLIB_DIR}/lib/lib*.a")
@@ -104,10 +105,10 @@ endif()
# add libsampler.so
set(SAMLPER_NAME "${PROJECT_NAME}_sampler")
set(BOOST_INCLUDE_DIRS "${CMAKE_SOURCE_DIR}/third_party/boost_1_83_0")
include_directories(${BOOST_INCLUDE_DIRS})
file(GLOB_RECURSE SAMPLER_SRCS "csrc/sampler/*.cpp")
add_library(${SAMLPER_NAME} SHARED ${SAMPLER_SRCS})
target_include_directories(${SAMLPER_NAME} PRIVATE "csrc/sampler/include")
target_compile_options(${SAMLPER_NAME} PRIVATE -O3)
......
# CMAKE generated file: DO NOT EDIT!
# Generated by "Unix Makefiles" Generator, CMake Version 3.16
# Default target executed when no arguments are given to make.
default_target: all
.PHONY : default_target
# Allow only one "make -f Makefile2" at a time, but pass parallelism.
.NOTPARALLEL:
#=============================================================================
# Special targets provided by cmake.
# Disable implicit rules so canonical targets will work.
.SUFFIXES:
# Remove some rules from gmake that .SUFFIXES does not remove.
SUFFIXES =
.SUFFIXES: .hpux_make_needs_suffix_list
# Suppress display of executed commands.
$(VERBOSE).SILENT:
# A target that is always out of date.
cmake_force:
.PHONY : cmake_force
#=============================================================================
# Set environment variables for the build.
# The shell in which to execute make rules.
SHELL = /bin/sh
# The CMake executable.
CMAKE_COMMAND = /usr/bin/cmake
# The command to remove a file.
RM = /usr/bin/cmake -E remove -f
# Escaping for special characters.
EQUALS = =
# The top-level source directory on which CMake was run.
CMAKE_SOURCE_DIR = /home/zlj/starrygl
# The top-level build directory on which CMake was run.
CMAKE_BINARY_DIR = /home/zlj/starrygl
#=============================================================================
# Targets provided globally by CMake.
# Special rule for the target rebuild_cache
rebuild_cache:
@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
/usr/bin/cmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
.PHONY : rebuild_cache
# Special rule for the target rebuild_cache
rebuild_cache/fast: rebuild_cache
.PHONY : rebuild_cache/fast
# Special rule for the target edit_cache
edit_cache:
@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
.PHONY : edit_cache
# Special rule for the target edit_cache
edit_cache/fast: edit_cache
.PHONY : edit_cache/fast
# The main all target
all: cmake_check_build_system
$(CMAKE_COMMAND) -E cmake_progress_start /home/zlj/starrygl/CMakeFiles /home/zlj/starrygl/CMakeFiles/progress.marks
$(MAKE) -f CMakeFiles/Makefile2 all
$(CMAKE_COMMAND) -E cmake_progress_start /home/zlj/starrygl/CMakeFiles 0
.PHONY : all
# The main clean target
clean:
$(MAKE) -f CMakeFiles/Makefile2 clean
.PHONY : clean
# The main clean target
clean/fast: clean
.PHONY : clean/fast
# Prepare targets for installation.
preinstall: all
$(MAKE) -f CMakeFiles/Makefile2 preinstall
.PHONY : preinstall
# Prepare targets for installation.
preinstall/fast:
$(MAKE) -f CMakeFiles/Makefile2 preinstall
.PHONY : preinstall/fast
# clear depends
depend:
$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
.PHONY : depend
#=============================================================================
# Target rules for targets named starrygl_ops
# Build rule for target.
starrygl_ops: cmake_check_build_system
$(MAKE) -f CMakeFiles/Makefile2 starrygl_ops
.PHONY : starrygl_ops
# fast build rule for target.
starrygl_ops/fast:
$(MAKE) -f CMakeFiles/starrygl_ops.dir/build.make CMakeFiles/starrygl_ops.dir/build
.PHONY : starrygl_ops/fast
#=============================================================================
# Target rules for targets named mtmetis
# Build rule for target.
mtmetis: cmake_check_build_system
$(MAKE) -f CMakeFiles/Makefile2 mtmetis
.PHONY : mtmetis
# fast build rule for target.
mtmetis/fast:
$(MAKE) -f CMakeFiles/mtmetis.dir/build.make CMakeFiles/mtmetis.dir/build
.PHONY : mtmetis/fast
#=============================================================================
# Target rules for targets named metis
# Build rule for target.
metis: cmake_check_build_system
$(MAKE) -f CMakeFiles/Makefile2 metis
.PHONY : metis
# fast build rule for target.
metis/fast:
$(MAKE) -f CMakeFiles/metis.dir/build.make CMakeFiles/metis.dir/build
.PHONY : metis/fast
#=============================================================================
# Target rules for targets named starrygl_ops_sampler
# Build rule for target.
starrygl_ops_sampler: cmake_check_build_system
$(MAKE) -f CMakeFiles/Makefile2 starrygl_ops_sampler
.PHONY : starrygl_ops_sampler
# fast build rule for target.
starrygl_ops_sampler/fast:
$(MAKE) -f CMakeFiles/starrygl_ops_sampler.dir/build.make CMakeFiles/starrygl_ops_sampler.dir/build
.PHONY : starrygl_ops_sampler/fast
#=============================================================================
# Target rules for targets named uvm
# Build rule for target.
uvm: cmake_check_build_system
$(MAKE) -f CMakeFiles/Makefile2 uvm
.PHONY : uvm
# fast build rule for target.
uvm/fast:
$(MAKE) -f CMakeFiles/uvm.dir/build.make CMakeFiles/uvm.dir/build
.PHONY : uvm/fast
csrc/export.o: csrc/export.cpp.o
.PHONY : csrc/export.o
# target to build an object file
csrc/export.cpp.o:
$(MAKE) -f CMakeFiles/starrygl_ops.dir/build.make CMakeFiles/starrygl_ops.dir/csrc/export.cpp.o
.PHONY : csrc/export.cpp.o
csrc/export.i: csrc/export.cpp.i
.PHONY : csrc/export.i
# target to preprocess a source file
csrc/export.cpp.i:
$(MAKE) -f CMakeFiles/starrygl_ops.dir/build.make CMakeFiles/starrygl_ops.dir/csrc/export.cpp.i
.PHONY : csrc/export.cpp.i
csrc/export.s: csrc/export.cpp.s
.PHONY : csrc/export.s
# target to generate assembly for a file
csrc/export.cpp.s:
$(MAKE) -f CMakeFiles/starrygl_ops.dir/build.make CMakeFiles/starrygl_ops.dir/csrc/export.cpp.s
.PHONY : csrc/export.cpp.s
csrc/partition/metis.o: csrc/partition/metis.cpp.o
.PHONY : csrc/partition/metis.o
# target to build an object file
csrc/partition/metis.cpp.o:
$(MAKE) -f CMakeFiles/metis.dir/build.make CMakeFiles/metis.dir/csrc/partition/metis.cpp.o
.PHONY : csrc/partition/metis.cpp.o
csrc/partition/metis.i: csrc/partition/metis.cpp.i
.PHONY : csrc/partition/metis.i
# target to preprocess a source file
csrc/partition/metis.cpp.i:
$(MAKE) -f CMakeFiles/metis.dir/build.make CMakeFiles/metis.dir/csrc/partition/metis.cpp.i
.PHONY : csrc/partition/metis.cpp.i
csrc/partition/metis.s: csrc/partition/metis.cpp.s
.PHONY : csrc/partition/metis.s
# target to generate assembly for a file
csrc/partition/metis.cpp.s:
$(MAKE) -f CMakeFiles/metis.dir/build.make CMakeFiles/metis.dir/csrc/partition/metis.cpp.s
.PHONY : csrc/partition/metis.cpp.s
csrc/partition/mtmetis.o: csrc/partition/mtmetis.cpp.o
.PHONY : csrc/partition/mtmetis.o
# target to build an object file
csrc/partition/mtmetis.cpp.o:
$(MAKE) -f CMakeFiles/mtmetis.dir/build.make CMakeFiles/mtmetis.dir/csrc/partition/mtmetis.cpp.o
.PHONY : csrc/partition/mtmetis.cpp.o
csrc/partition/mtmetis.i: csrc/partition/mtmetis.cpp.i
.PHONY : csrc/partition/mtmetis.i
# target to preprocess a source file
csrc/partition/mtmetis.cpp.i:
$(MAKE) -f CMakeFiles/mtmetis.dir/build.make CMakeFiles/mtmetis.dir/csrc/partition/mtmetis.cpp.i
.PHONY : csrc/partition/mtmetis.cpp.i
csrc/partition/mtmetis.s: csrc/partition/mtmetis.cpp.s
.PHONY : csrc/partition/mtmetis.s
# target to generate assembly for a file
csrc/partition/mtmetis.cpp.s:
$(MAKE) -f CMakeFiles/mtmetis.dir/build.make CMakeFiles/mtmetis.dir/csrc/partition/mtmetis.cpp.s
.PHONY : csrc/partition/mtmetis.cpp.s
csrc/sampler/export.o: csrc/sampler/export.cpp.o
.PHONY : csrc/sampler/export.o
# target to build an object file
csrc/sampler/export.cpp.o:
$(MAKE) -f CMakeFiles/starrygl_ops_sampler.dir/build.make CMakeFiles/starrygl_ops_sampler.dir/csrc/sampler/export.cpp.o
.PHONY : csrc/sampler/export.cpp.o
csrc/sampler/export.i: csrc/sampler/export.cpp.i
.PHONY : csrc/sampler/export.i
# target to preprocess a source file
csrc/sampler/export.cpp.i:
$(MAKE) -f CMakeFiles/starrygl_ops_sampler.dir/build.make CMakeFiles/starrygl_ops_sampler.dir/csrc/sampler/export.cpp.i
.PHONY : csrc/sampler/export.cpp.i
csrc/sampler/export.s: csrc/sampler/export.cpp.s
.PHONY : csrc/sampler/export.s
# target to generate assembly for a file
csrc/sampler/export.cpp.s:
$(MAKE) -f CMakeFiles/starrygl_ops_sampler.dir/build.make CMakeFiles/starrygl_ops_sampler.dir/csrc/sampler/export.cpp.s
.PHONY : csrc/sampler/export.cpp.s
csrc/uvm/uvm.o: csrc/uvm/uvm.cpp.o
.PHONY : csrc/uvm/uvm.o
# target to build an object file
csrc/uvm/uvm.cpp.o:
$(MAKE) -f CMakeFiles/uvm.dir/build.make CMakeFiles/uvm.dir/csrc/uvm/uvm.cpp.o
.PHONY : csrc/uvm/uvm.cpp.o
csrc/uvm/uvm.i: csrc/uvm/uvm.cpp.i
.PHONY : csrc/uvm/uvm.i
# target to preprocess a source file
csrc/uvm/uvm.cpp.i:
$(MAKE) -f CMakeFiles/uvm.dir/build.make CMakeFiles/uvm.dir/csrc/uvm/uvm.cpp.i
.PHONY : csrc/uvm/uvm.cpp.i
csrc/uvm/uvm.s: csrc/uvm/uvm.cpp.s
.PHONY : csrc/uvm/uvm.s
# target to generate assembly for a file
csrc/uvm/uvm.cpp.s:
$(MAKE) -f CMakeFiles/uvm.dir/build.make CMakeFiles/uvm.dir/csrc/uvm/uvm.cpp.s
.PHONY : csrc/uvm/uvm.cpp.s
# Help Target
help:
@echo "The following are some of the valid targets for this Makefile:"
@echo "... all (the default if no target is provided)"
@echo "... clean"
@echo "... depend"
@echo "... rebuild_cache"
@echo "... edit_cache"
@echo "... starrygl_ops"
@echo "... mtmetis"
@echo "... metis"
@echo "... starrygl_ops_sampler"
@echo "... uvm"
@echo "... csrc/export.o"
@echo "... csrc/export.i"
@echo "... csrc/export.s"
@echo "... csrc/partition/metis.o"
@echo "... csrc/partition/metis.i"
@echo "... csrc/partition/metis.s"
@echo "... csrc/partition/mtmetis.o"
@echo "... csrc/partition/mtmetis.i"
@echo "... csrc/partition/mtmetis.s"
@echo "... csrc/sampler/export.o"
@echo "... csrc/sampler/export.i"
@echo "... csrc/sampler/export.s"
@echo "... csrc/uvm/uvm.o"
@echo "... csrc/uvm/uvm.i"
@echo "... csrc/uvm/uvm.s"
.PHONY : help
#=============================================================================
# Special targets to cleanup operation of make.
# Special rule to run CMake to check the build system integrity.
# No rule that depends on this can have commands that come from listfiles
# because they might be regenerated.
cmake_check_build_system:
$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
.PHONY : cmake_check_build_system
@@ -23,7 +23,7 @@ gnn:
dim_time: 100
dim_out: 100
train:
-- epoch: 10
+- epoch: 5
#batch_size: 100
# reorder: 16
lr: 0.0001
......
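For reference, a minimal sketch of how this value is consumed, assuming this YAML is the ./config/TGN.yml that the test script later in this commit reads via parse_config (that pairing is an assumption):

# Hedged sketch: parse_config returns per-section dicts; after this change
# train_param['epoch'] is 5 and train_param['lr'] stays 0.0001.
from starrygl.module.utils import parse_config

sample_param, memory_param, gnn_param, train_param = parse_config('./config/TGN.yml')
print(train_param['epoch'], train_param['lr'])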
@@ -2,6 +2,7 @@
#include <sampler.h>
#include <output.h>
#include <neighbors.h>
+#include <temporal_utils.h>
/*------------Python Bind--------------------------------------------------------------*/
@@ -16,7 +17,14 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
py::return_value_policy::reference)
.def("divide_nodes_to_part",
&divide_nodes_to_part,
-py::return_value_policy::reference);
+py::return_value_policy::reference)
+.def("sparse_get_index",
+&sparse_get_index,
+py::return_value_policy::reference)
+.def("get_norm_temporal",
+&get_norm_temporal,
+py::return_value_policy::reference
+);
py::class_<TemporalGraphBlock>(m, "TemporalGraphBlock")
.def(py::init<vector<NodeIDType> &, vector<NodeIDType> &,
@@ -79,4 +87,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("neighbor_sample_from_nodes", &ParallelSampler::neighbor_sample_from_nodes)
.def("reset", &ParallelSampler::reset)
.def("get_ret", [](const ParallelSampler &ps) { return ps.ret; });
}
\ No newline at end of file
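A minimal smoke test for the two new bindings, as a hedged sketch: the module import path below is an assumption, and the exact signatures of sparse_get_index and get_norm_temporal live in temporal_utils.h, which is not part of this diff.

# Hedged sketch: confirm the freshly built extension exposes the new symbols.
import importlib

ops = importlib.import_module("starrygl.lib.starrygl_ops_sampler")  # assumed module path
for name in ("divide_nodes_to_part", "sparse_get_index", "get_norm_temporal"):
    print(name, "bound:", hasattr(ops, name))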
-#include <pybind11/pybind11.h>
+#pragma once
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <torch/extension.h>
#include <parallel_hashmap/phmap.h>
#include <cstring>
......
@@ -44,14 +44,14 @@ def load_feat(d, rand_de=0, rand_dn=0):
data_name = args.data_name
-g = np.load('../tgl_main/DATA/'+data_name+'/ext_full.npz')
+g = np.load('/mnt/nfs/fzz/TGL-DATA/'+data_name+'/ext_full.npz')
-df = pd.read_csv('../tgl_main/DATA/'+data_name+'/edges.csv')
+df = pd.read_csv('/mnt/nfs/fzz/TGL-DATA/'+data_name+'/edges.csv')
-if os.path.exists('../tgl_main/DATA/'+data_name+'/node_features.pt'):
+if os.path.exists('/mnt/nfs/fzz/TGL-DATA/'+data_name+'/node_features.pt'):
-n_feat = torch.load('../tgl_main/DATA/'+data_name+'/node_features.pt')
+n_feat = torch.load('/mnt/nfs/fzz/TGL-DATA/'+data_name+'/node_features.pt')
else:
n_feat = None
-if os.path.exists('../tgl_main/DATA/'+data_name+'/edge_features.pt'):
+if os.path.exists('/mnt/nfs/fzz/TGL-DATA/'+data_name+'/edge_features.pt'):
-e_feat = torch.load('../tgl_main/DATA/'+data_name+'/edge_features.pt')
+e_feat = torch.load('/mnt/nfs/fzz/TGL-DATA/'+data_name+'/edge_features.pt')
else:
e_feat = None
......
#include <cuda_runtime.h>
#include <cstdio>
int main()
{
int count = 0;
if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;
if (count == 0) return -1;
for (int device = 0; device < count; ++device)
{
cudaDeviceProp prop;
if (cudaSuccess == cudaGetDeviceProperties(&prop, device))
std::printf("%d.%d ", prop.major, prop.minor);
}
return 0;
}
#include <cuda.h>
#include <cstdio>
int main() {
printf("%d.%d", CUDA_VERSION / 1000, (CUDA_VERSION / 10) % 100);
return 0;
}
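As an aside (not part of the commit), the same facts these two probe programs print are available from PyTorch at runtime, which is handy for checking that the interpreter matches the toolkit the extension was built against:

import torch

print(torch.version.cuda)                       # CUDA version PyTorch was built with
if torch.cuda.is_available():
    print(torch.cuda.get_device_capability(0))  # (major, minor) compute capability of GPU 0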
@@ -3,9 +3,9 @@
mkdir -p build && cd build
cmake .. \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
--DCMAKE_PREFIX_PATH="/home/hwj/.miniconda3/envs/sgl/lib/python3.10/site-packages" \
+-DCMAKE_PREFIX_PATH="/home/zlj/.miniconda3/envs/dgnn/lib/python3.10/site-packages" \
--DPython3_ROOT_DIR="/home/hwj/.miniconda3/envs/sgl" \
+-DPython3_ROOT_DIR="/home/zlj/.miniconda3/envs/dgnn" \
--DCUDA_TOOLKIT_ROOT_DIR="/home/hwj/.local/cuda-11.8" \
+-DCUDA_TOOLKIT_ROOT_DIR="/home/zlj/local/cuda-12.2" \
&& make -j32 \
&& rm -rf ../starrygl/lib \
&& mkdir ../starrygl/lib \
......
import argparse
import os
import sys
from os.path import abspath, join, dirname
from starrygl.distributed.context import DistributedContext
from starrygl.distributed.utils import DistIndex
from starrygl.module.modules import GeneralModel
from starrygl.module.utils import parse_config
from starrygl.sample.graph_core import DataSet, GraphData, TemporalNeighborSampleGraph
from starrygl.sample.memory.shared_mailbox import SharedMailBox
from starrygl.sample.sample_core.base import NegativeSampling
from starrygl.sample.sample_core.neighbor_sampler import NeighborSampler
from starrygl.sample.part_utils.partition_tgnn import partition_load
import torch
import time
import torch
import torch.nn.functional as F
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os
from starrygl.sample.data_loader import DistributedDataLoader
from starrygl.sample.batch_data import SAMPLE_TYPE
"""
test command
python test.py --world_size 2 --rank 0
parser.add_argument('--world_size', default=4, type=int, metavar='W',
help='number of workers')
parser.add_argument('--rank', default=0, type=int, metavar='W',
help='rank of the worker')
parser.add_argument('--log_interval', type=int, default=10, metavar='N',
help='interval between training status logs')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
help='how much to value future rewards')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed for reproducibility')
parser.add_argument('--num_sampler', type=int, default=10, metavar='S',
help='number of samplers')
parser.add_argument('--queue_size', type=int, default=10, metavar='S',
help='sampler queue size')
"""
parser = argparse.ArgumentParser(
description="RPC Reinforcement Learning Example",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument('--rank', default=0, type=int, metavar='W',
help='rank of this worker')
parser.add_argument('--world_size', default=1, type=int, metavar='W',
help='number of workers')
args = parser.parse_args()
from sklearn.metrics import average_precision_score, roc_auc_score
import random
import dgl
import numpy as np
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.rank)
os.environ["RANK"] = str(args.rank)
os.environ["WORLD_SIZE"] = str(args.world_size)
os.environ["LOCAL_RANK"] = str(0)
os.environ["MASTER_ADDR"] = '127.0.0.1'
os.environ["MASTER_PORT"] = '9337'
def seed_everything(seed=42):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
seed_everything(1234)
def main():
use_cuda = True
sample_param, memory_param, gnn_param, train_param = parse_config('./config/TGN.yml')
torch.set_num_threads(12)
ctx = DistributedContext.init(backend="nccl", use_gpu=True)
device_id = torch.cuda.current_device()
print('use cuda on',device_id)
pdata = partition_load("./dataset/here/MOOC", algo="metis_for_tgnn")
graph = GraphData(pdata = pdata)
sample_graph = TemporalNeighborSampleGraph(sample_graph = pdata.sample_graph,mode = 'full')
mailbox = SharedMailBox(pdata.ids.shape[0], memory_param, dim_edge_feat = pdata.edge_attr.shape[1] if pdata.edge_attr is not None else 0)
sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=1, fanout=[10],graph_data=sample_graph, workers=10,policy = 'recent',graph_name = "wiki_train")
train_data = torch.masked_select(graph.edge_index,pdata.train_mask.to(graph.edge_index.device)).reshape(2,-1)
train_ts = torch.masked_select(graph.edge_ts,pdata.train_mask.to(graph.edge_index.device))
val_data = torch.masked_select(graph.edge_index,pdata.val_mask.to(graph.edge_index.device)).reshape(2,-1)
val_ts = torch.masked_select(graph.edge_ts,pdata.val_mask.to(graph.edge_index.device))
test_data = torch.masked_select(graph.edge_index,pdata.test_mask.to(graph.edge_index.device)).reshape(2,-1)
test_ts = torch.masked_select(graph.edge_ts,pdata.test_mask.to(graph.edge_index.device))
print(train_data.shape[1],val_data.shape[1],test_data.shape[1])
train_data = DataSet(edges = train_data,ts =train_ts,eids = torch.nonzero(pdata.train_mask).view(-1))
test_data = DataSet(edges = test_data,ts =test_ts,eids = torch.nonzero(pdata.test_mask).view(-1))
val_data = DataSet(edges = val_data,ts = val_ts,eids = torch.nonzero(pdata.val_mask).view(-1))
#train_neg_sampler = PreNegativeSampling('triplet',torch.masked_select(pdata.edge_index['pos_edge'],graph.data.train_mask).reshape(2,-1))
neg_sampler = NegativeSampling('triplet')
trainloader = DistributedDataLoader(graph,train_data,sampler = sampler,
sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES,
neg_sampler=neg_sampler,
batch_size = 1000,
shuffle=False,
drop_last=True,
chunk_size = None,
train=True,
queue_size = 1000,
mailbox = mailbox)
testloader = DistributedDataLoader(graph,test_data,sampler = sampler,
sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES,
neg_sampler=neg_sampler,
batch_size = 1000,
shuffle=False,
drop_last=False,
chunk_size = None,
train=False,
queue_size = 100,
mailbox = mailbox)
valloader = DistributedDataLoader(graph,val_data,sampler = sampler,
sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES,
neg_sampler=neg_sampler,
batch_size = 1000,
shuffle=False,
drop_last=False,
chunk_size = None,
train=False,
queue_size = 100,
mailbox = mailbox)
gnn_dim_node = 0 if graph.x is None else pdata.x.shape[1]
gnn_dim_edge = 0 if graph.edge_attr is None else pdata.edge_attr.shape[1]
avg_time = 0
if use_cuda:
model = GeneralModel(gnn_dim_node, gnn_dim_edge, sample_param, memory_param, gnn_param, train_param).cuda()
device = torch.device('cuda')
else:
model = GeneralModel(gnn_dim_node, gnn_dim_edge, sample_param, memory_param, gnn_param, train_param)
device = torch.device('cpu')
model = DDP(model,find_unused_parameters=True)
train_stream = torch.cuda.Stream()
send_stream = torch.cuda.Stream()
scatter_stream = torch.cuda.Stream()
val_losses = list()
def eval(mode='val'):
neg_samples = 1
model.eval()
aps = list()
aucs_mrrs = list()
if mode == 'val':
loader = valloader
elif mode == 'test':
loader = testloader
elif mode == 'train':
loader = trainloader
with torch.no_grad():
total_loss = 0
signal = torch.tensor([0],dtype = int,device = device)
for roots,mfgs,metadata in loader:
pred_pos, pred_neg = model(mfgs,metadata)
total_loss += creterion(pred_pos, torch.ones_like(pred_pos))
total_loss += creterion(pred_neg, torch.zeros_like(pred_neg))
y_pred = torch.cat([pred_pos, pred_neg], dim=0).sigmoid().cpu()
y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0)
aps.append(average_precision_score(y_true, y_pred.detach().numpy()))
aucs_mrrs.append(roc_auc_score(y_true, y_pred))
if mailbox is not None:
src = metadata['src_pos_index']
dst = metadata['dst_pos_index']
ts = roots.ts
if(graph.edge_attr.device == torch.device('cpu')):
edge_feats = graph.edge_attr[roots.eids.to('cpu')].to('cuda') if graph.edge_attr is not None else None
else:
edge_feats = graph.edge_attr[roots.eids] if graph.edge_attr is not None else None
dist_index_mapper = mfgs[0][0].srcdata['ID']
root_index = torch.cat((src,dst))
last_updated_nid = model.module.memory_updater.last_updated_nid[root_index]
last_updated_memory = model.module.memory_updater.last_updated_memory[root_index]
last_updated_ts=model.module.memory_updater.last_updated_ts[root_index]
index, memory, memory_ts = mailbox.get_update_memory(last_updated_nid,
last_updated_memory,
last_updated_ts)
index, mail, mail_ts = mailbox.get_update_mail(dist_index_mapper,
src,dst,ts,edge_feats,
model.module.memory_updater.last_updated_memory,
)
mailbox.set_mailbox_all_to_all(index,memory,memory_ts,mail,mail_ts,reduce_Op = 'max')
#ap = float(torch.tensor(aps).mean())
#if neg_samples > 1:
# auc_mrr = float(torch.cat(aucs_mrrs).mean())
#else:
# auc_mrr = float(torch.tensor(aucs_mrrs).mean())
world_size = dist.get_world_size()
apc = torch.empty([loader.expected_idx*world_size],dtype = torch.float,device='cuda')
auc_mrr = torch.empty([loader.expected_idx*world_size],dtype = torch.float,device = 'cuda')
dist.all_gather_into_tensor(apc,torch.tensor(aps,device ='cuda',dtype=torch.float))
dist.all_gather_into_tensor(auc_mrr,torch.tensor(aucs_mrrs,device ='cuda',dtype=torch.float))
ap = float(torch.tensor(apc).mean())
auc_mrr = float(torch.tensor(auc_mrr).mean())
return ap, auc_mrr
creterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=train_param['lr'])
for e in range(train_param['epoch']):
torch.cuda.synchronize()
epoch_start_time = time.time()
train_aps = list()
print('Epoch {:d}:'.format(e))
time_prep = 0
total_loss = 0
model.train()
if mailbox is not None:
mailbox.reset()
model.module.memory_updater.last_updated_nid = None
model.module.memory_updater.last_updated_memory = None
model.module.memory_updater.last_updated_ts = None
for roots,mfgs,metadata in trainloader:
t_prep_s = time.time()
with torch.cuda.stream(train_stream):
optimizer.zero_grad()
pred_pos, pred_neg = model(mfgs,metadata)
loss = creterion(pred_pos, torch.ones_like(pred_pos))
loss += creterion(pred_neg, torch.zeros_like(pred_neg))
total_loss += float(loss)
loss.backward()
optimizer.step()
#torch.cuda.synchronize()
t_prep_s = time.time()
y_pred = torch.cat([pred_pos, pred_neg], dim=0).sigmoid().cpu()
y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0)
train_aps.append(average_precision_score(y_true, y_pred.detach().numpy()))
if mailbox is not None:
src = metadata['src_pos_index']
dst = metadata['dst_pos_index']
ts = roots.ts
if(graph.edge_attr.device == torch.device('cpu')):
edge_feats = graph.edge_attr[roots.eids.to('cpu')].to('cuda') if graph.edge_attr is not None else None
else:
edge_feats = graph.edge_attr[roots.eids] if graph.edge_attr is not None else None
dist_index_mapper = mfgs[0][0].srcdata['ID']
root_index = torch.cat((src,dst))
last_updated_nid = model.module.memory_updater.last_updated_nid[root_index]
last_updated_memory = model.module.memory_updater.last_updated_memory[root_index]
last_updated_ts=model.module.memory_updater.last_updated_ts[root_index]
index, memory, memory_ts = mailbox.get_update_memory(last_updated_nid,
last_updated_memory,
last_updated_ts)
index, mail, mail_ts = mailbox.get_update_mail(dist_index_mapper,
src,dst,ts,edge_feats,
model.module.memory_updater.last_updated_memory,
)
mailbox.set_mailbox_all_to_all(index,memory,memory_ts,mail,mail_ts,reduce_Op = 'max')
torch.cuda.synchronize()
time_prep = time.time() - epoch_start_time
avg_time += time.time() - epoch_start_time
train_ap = float(torch.tensor(train_aps).mean())
ap = 0
auc = 0
ap, auc = eval('val')
print('\ttrain loss:{:.4f} train ap:{:4f} val ap:{:4f} val auc:{:4f}'.format(total_loss,train_ap, ap, auc))
print('\ttotal time:{:.2f}s prep time:{:.2f}s'.format(time.time()-epoch_start_time, time_prep))
model.eval()
if mailbox is not None:
mailbox.reset()
model.module.memory_updater.last_updated_nid = None
eval('train')
eval('val')
ap, auc = eval('test')
eval_neg_samples = 1
if eval_neg_samples > 1:
print('\ttest AP:{:4f} test MRR:{:4f}'.format(ap, auc))
else:
print('\ttest AP:{:4f} test AUC:{:4f}'.format(ap, auc))
print('test_dataset',test_data.edges.shape[1],'avg_time',avg_time/train_param['epoch'])
ctx.shutdown()
if __name__ == "__main__":
main()
@@ -133,7 +133,6 @@ def all_to_all_s(
output_sizes = [t-s for s, t in zip(output_rowptr, output_rowptr[1:])]
input_sizes = [t-s for s, t in zip(input_rowptr, input_rowptr[1:])]
return dist.all_to_all_single(
output=output_tensor,
input=input_tensor,
......
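A hedged illustration (not the repo's code) of what this wrapper reduces to: an uneven all-to-all whose per-rank send/receive row counts come from the two rowptr lists, with the optional process group threaded through as in the callers below.

import torch.distributed as dist

def all_to_all_rows(output_tensor, input_tensor, output_rowptr, input_rowptr, group=None):
    # split sizes are the consecutive differences of the row pointers
    output_sizes = [t - s for s, t in zip(output_rowptr, output_rowptr[1:])]
    input_sizes = [t - s for s, t in zip(input_rowptr, input_rowptr[1:])]
    return dist.all_to_all_single(
        output=output_tensor,
        input=input_tensor,
        output_split_sizes=output_sizes,
        input_split_sizes=input_sizes,
        group=group,
    )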
@@ -17,6 +17,7 @@ class TensorAccessor:
self._data = data
self._ctx = DistributedContext.get_default_context()
self._rref = rpc.RRef(data)
+self.stream = torch.cuda.Stream()
@property
def data(self):
@@ -29,6 +30,12 @@ class TensorAccessor:
@property
def ctx(self):
return self._ctx
+@staticmethod
+@rpc.functions.async_execution
+def _index_selet(self):
+fut = torch.futures.Future()
+fut.set_result(None)
+return fut
def all_gather_rrefs(self) -> List[rpc.RRef]:
return self.ctx.all_gather_remote_objects(self.rref)
@@ -180,7 +187,7 @@ class DistributedTensor:
def ctx(self):
return self.accessor.ctx
-def all_to_all_ind2ptr(self, dist_index: Union[Tensor, DistIndex]) -> Dict[str, Union[List[int], Tensor]]:
+def all_to_all_ind2ptr(self, dist_index: Union[Tensor, DistIndex],group = None) -> Dict[str, Union[List[int], Tensor]]:
if isinstance(dist_index, Tensor):
dist_index = DistIndex(dist_index)
send_ptr = torch.ops.torch_sparse.ind2ptr(dist_index.part, self.num_parts())
@@ -196,7 +203,7 @@ class DistributedTensor:
recv_ptr = recv_ptr.tolist()
recv_ind = torch.full((recv_ptr[-1],), (2**62-1)*2+1, dtype=dist_index.dtype, device=dist_index.device)
-all_to_all_s(recv_ind, dist_index.loc, send_ptr, recv_ptr)
+all_to_all_s(recv_ind, dist_index.loc, recv_ptr, send_ptr,group=group)
return {
"send_ptr": send_ptr,
@@ -209,6 +216,7 @@ class DistributedTensor:
send_ptr: Optional[List[int]] = None,
recv_ptr: Optional[List[int]] = None,
recv_ind: Optional[List[int]] = None,
+group = None
) -> Tensor:
if dist_index is not None:
dist_dict = self.all_to_all_ind2ptr(dist_index)
@@ -218,7 +226,7 @@ class DistributedTensor:
data = self.accessor.data[recv_ind]
recv = torch.empty(send_ptr[-1], *data.shape[1:], dtype=data.dtype, device=data.device)
-all_to_all_s(recv, data, send_ptr, recv_ptr)
+all_to_all_s(recv, data, send_ptr, recv_ptr,group=group)
return recv
def all_to_all_set(self,
@@ -227,6 +235,7 @@ class DistributedTensor:
send_ptr: Optional[List[int]] = None,
recv_ptr: Optional[List[int]] = None,
recv_ind: Optional[List[int]] = None,
+group = None
):
if dist_index is not None:
dist_dict = self.all_to_all_ind2ptr(dist_index)
@@ -235,7 +244,7 @@ class DistributedTensor:
recv_ind = dist_dict["recv_ind"]
recv = torch.empty(recv_ptr[-1], *data.shape[1:], dtype=data.dtype, device=data.device)
-all_to_all_s(recv, data, recv_ptr, send_ptr)
+all_to_all_s(recv, data, recv_ptr, send_ptr,group=group)
self.accessor.data.index_copy_(0, recv_ind, recv)
def index_select(self, dist_index: Union[Tensor, DistIndex]):
......
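A hedged usage sketch of the new group keyword, mirroring how to_block in batch_data.py calls it after this commit; dist_tensor and dist_eid are stand-ins for a DistributedTensor and a tensor of distributed ids, and the sketch assumes torch.distributed is already initialized.

import torch.distributed as dist

group = dist.new_group(list(range(dist.get_world_size())))
ind_dict = dist_tensor.all_to_all_ind2ptr(dist_eid, group=group)
rows = dist_tensor.all_to_all_get(group=group, **ind_dict)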
@@ -7,6 +7,8 @@ from starrygl.sample.graph_core import DataSet
from starrygl.sample.graph_core import GraphData
from starrygl.sample.sample_core.base import BaseSampler, NegativeSampling
import dgl
+from starrygl.sample.stream_manager import PipelineManager, getPipelineManger
"""
The input parameters are unchanged; the return values become:
sample_from_nodes
@@ -42,7 +44,7 @@ def prepare_input(node_feat, edge_feat, mem_embedding,mfgs,dist_nid,dist_eid):
#print(idx.shape[0],b.srcdata['mem_ts'].shape)
return mfgs
-def to_block(graph: GraphData, data, sample_out, mailbox:MailBox = None,device = torch.device('cuda')):
+def to_block(graph: GraphData, data, sample_out, mailbox:MailBox = None,device = torch.device('cuda'),group = None):
if len(sample_out) > 1:
sample_out,metadata = sample_out
@@ -76,26 +78,25 @@ def to_block(graph: GraphData, data, sample_out, mailbox:MailBox = None,device =
dist_nid = nid_mapper[nid_tensor].to(device)
dist_nid,nid_inv = dist_nid.unique(return_inverse = True)
if isinstance(graph.edge_attr,DistributedTensor):
-local_index, input_split,output_split = graph.edge_attr.gather_select_index(dist_eid)
+ind_dict = graph.edge_attr.all_to_all_ind2ptr(dist_eid,group = group)
-edge_feat = graph.edge_attr.scatter_data(local_index,input_split=input_split,out_split=output_split)
+edge_feat = graph.edge_attr.all_to_all_get(group = group,**ind_dict)
else:
edge_feat = graph._get_edge_attr(dist_eid)
-local_index = None
+ind_dict = None
if isinstance(graph.x,DistributedTensor):
-local_index, input_split,output_split = graph.x.gather_select_index(dist_nid)
+ind_dict = graph.x.all_to_all_ind2ptr(dist_nid,group = group)
-node_feat = graph.x.scatter_data(local_index,input_split=input_split,out_split=output_split)
+node_feat = graph.x.all_to_all_get(group = group,**ind_dict)
else:
node_feat = graph._get_node_attr(dist_nid)
if mailbox is not None:
if torch.distributed.get_world_size() > 1:
if node_feat is None:
-local_index, input_split,output_split = mailbox.node_memory.gather_select_index(dist_nid)
+ind_dict = mailbox.node_memory.all_to_all_ind2ptr(dist_nid,group = group)
-mem = mailbox.gather_memory(local_index,input_split,output_split)
+mem = mailbox.gather_memory(**ind_dict)
else:
mem = mailbox.get_memory(dist_nid)
else:
mem = None
def build_block():
mfgs = list()
col = torch.arange(0,root_len,device = device)
@@ -110,23 +111,12 @@ def to_block(graph: GraphData, data, sample_out, mailbox:MailBox = None,device =
device = device)
idx = nid_inv[0:row_len + elen]
e_idx = eid_inv[col_len:col_len+elen]
-b.srcdata['ID'] = idx#dist_nid[idx]
+b.srcdata['ID'] = idx
if sample_out[r].delta_ts().shape[0] > 0:
b.edata['dt'] = sample_out[r].delta_ts().to(device)
if src_ts is not None:
b.srcdata['ts'] = src_ts[0:row_len + eid_len[r]]
-b.edata['ID'] = e_idx#dist_eid[e_idx]
+b.edata['ID'] = e_idx
-#if edge_feat is not None:
-# b.edata['f'] = edge_feat[e_idx]
-#if r == len(eid_len)-1:
-# if node_feat is not None:
-# b.srcdata['h'] = node_feat[idx]
-# if mem_embedding is not None:
-# node_memory,node_memory_ts,mailbox,mailbox_ts = mem_embedding
-# b.srcdata['mem'] = node_memory[idx]
-# b.srcdata['mem_ts'] = node_memory_ts[idx]
-# b.srcdata['mem_input'] = mailbox[idx].cuda().reshape(b.srcdata['ID'].shape[0], -1)
-# b.srcdata['mail_ts'] = mailbox_ts[idx]
col = row
col_len += eid_len[r]
row_len += eid_len[r]
@@ -134,27 +124,7 @@ def to_block(graph: GraphData, data, sample_out, mailbox:MailBox = None,device =
mfgs = list(map(list, zip(*[iter(mfgs)])))
mfgs.reverse()
return data,mfgs,metadata
data,mfgs,metadata = build_block()
-#if dist.get_world_size() > 1:
-# if(node_feat is None):
-# node_feat = torch.futures.Future()
-# node_feat.set_result(None)
-# if(edge_feat is None):
-# edge_feat = torch.futures.Future()
-# edge_feat.set_result(None)
-# if(mem is None):
-# mem = torch.futures.Future()
-# mem.set_result(None)
-# def callback(fs,mfgs,dist_nid,dist_eid):
-# node_feat,edge_feat,mem_embedding = fs.value()
-# node_feat = node_feat.value()
-# edge_feat = edge_feat.value()
-# mem_embedding = mem_embedding.value()
-# return prepare_input(node_feat,edge_feat,mem_embedding,mfgs,dist_nid,dist_eid)
-# cal = lambda fut: callback(fs=fut,mfgs = mfgs,dist_nid = dist_nid,dist_eid =dist_eid)
-# return data,torch.futures.collect_all([node_feat,edge_feat,mem]).then(cal),metadata
-#else:
mfgs = prepare_input(node_feat,edge_feat,mem,mfgs,dist_nid,dist_eid)
#return build_block(node_feat,edge_feat,mem)#data,mfgs,metadata
return data,mfgs,metadata
@@ -164,9 +134,15 @@ def graph_sample(graph, sampler:BaseSampler,
sample_fn, data,
neg_sampling = None,
mailbox = None,
-device = torch.device('cuda')):
+device = torch.device('cuda'),
+async_op = False):
out = sample_fn(sampler,data,neg_sampling)
+if async_op == False:
return to_block(graph,data,out,mailbox,device)
+else:
+manger = getPipelineManger()
+future = manger.submit('lookup',to_block,graph,data,out,mailbox,device)
+return future
def sample_from_nodes(sampler:BaseSampler, data:DataSet, **kwargs):
out = sampler.sample_from_nodes(nodes=data.nodes.reshape(-1))
......
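A hedged sketch of the new asynchronous path: with async_op=True the block construction is handed to the shared PipelineManager instead of running inline. Only submit() appears in this diff, so how the returned handle is awaited is an assumption, and graph, sampler, sample_fn, batch, neg_sampler and mailbox below are stand-ins for the objects built in the test script.

handle = graph_sample(graph, sampler, sample_fn, batch,
                      neg_sampling=neg_sampler, mailbox=mailbox,
                      device=torch.device('cuda'), async_op=True)
# ... overlap other work here ...
data, mfgs, metadata = handle.result()   # assumed accessor on the manager's future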
#include <omp.h>
#include <torch/extension.h>
#include <time.h>
#include <random>
#include <phmap.h>
#include <boost/thread/mutex.hpp>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#define MTX boost::mutex
using namespace std;
namespace py = pybind11;
namespace th = torch;
typedef int64_t NodeIDType;
typedef int64_t EdgeIDType;
typedef float WeightType;
typedef float TimeStampType;
#define EXTRAARGS , phmap::priv::hash_default_hash<K>, \
phmap::priv::hash_default_eq<K>, \
std::allocator<K>, 4, MTX
template <class K, class V>
using HashM = phmap::parallel_flat_hash_map<K, V EXTRAARGS>;
// Cache buffer API: stubs only in this commit.
void init_cache_buffer(long long size){
}
at::Storage set_cache_buffer(at::Storage ind){
    // TODO: not implemented yet; return an empty storage placeholder
    return at::Storage();
}
// Update/replace the cached data according to the given policy and return the corresponding indices.
at::Storage update_buffer_index(string policy, at::Storage ind){
    // TODO: not implemented yet; a third, untyped parameter `u` in the draft signature is omitted until it is specified
    return at::Storage();
}
at::Storage get_buffer_index(at::Storage ind){
    // TODO: not implemented yet
    return at::Storage();
}
PYBIND11_MODULE(sample_cores, m)
{
    m.def("get_neighbors",
          &get_neighbors,
          py::return_value_policy::reference);
}
\ No newline at end of file
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cooperative_groups.h>
#include <cuda_runtime_api.h>
#include <atomic>
#include <iostream>
#include <limits>
#include <mutex>
#include <uvm_table.hpp>
namespace cg = cooperative_groups;
namespace {
constexpr int set_size = 4;
constexpr int block_size = 256;
template <typename key_type>
__host__ __device__ key_type hash(key_type key) {
return key;
}
template <typename key_type>
__global__ void hash_add_kernel(const key_type* new_keys, const int num_keys, key_type* keys,
const int num_sets, int* set_sizes, const int max_set_size,
key_type* missing_keys, int* num_missing_keys) {
__shared__ key_type s_missing_keys[block_size];
__shared__ int s_missing_count;
__shared__ size_t s_missing_idx;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
if (block.thread_rank() == 0) {
s_missing_count = 0;
}
block.sync();
size_t idx = grid.thread_rank();
if (idx < num_keys) {
auto key = new_keys[idx];
size_t idx_set = hash(key) % num_sets;
int prev_set_size = atomicAdd(&set_sizes[idx_set], 1);
if (prev_set_size < max_set_size) {
keys[idx_set * max_set_size + prev_set_size] = key;
} else {
int count = atomicAdd(&s_missing_count, 1);
s_missing_keys[count] = key;
}
}
block.sync();
if (block.thread_rank() == 0) {
s_missing_idx = atomicAdd(num_missing_keys, s_missing_count);
}
block.sync();
for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
missing_keys[s_missing_idx + i] = s_missing_keys[i];
}
}
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, int* num_keys_ptr,
const key_type* keys, const size_t num_sets,
const int max_set_size, index_type* output_indices) {
constexpr int tile_size = set_size;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<tile_size>(block);
int num_keys = *num_keys_ptr;
if (num_keys == 0) return;
#if (CUDA_VERSION < 11060)
size_t num_threads_per_grid = grid.size();
#else
size_t num_threads_per_grid = grid.num_threads();
#endif
size_t step = (num_keys - 1) / num_threads_per_grid + 1;
for (size_t i = 0; i < step; i++) {
size_t idx = i * num_threads_per_grid + grid.thread_rank();
key_type query_key = std::numeric_limits<key_type>::max();
if (idx < num_keys) {
query_key = query_keys[idx];
}
auto idx_set = hash(query_key) % num_sets;
for (int j = 0; j < tile_size; j++) {
auto current_idx_set = tile.shfl(idx_set, j);
auto current_query_key = tile.shfl(query_key, j);
if (current_query_key == std::numeric_limits<key_type>::max()) {
continue;
}
auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
int existed = tile.ballot(current_query_key == candidate_key);
auto current_idx = tile.shfl(idx, 0) + j;
if (existed) {
int src_lane = __ffs(existed) - 1;
size_t found_idx = current_idx_set * set_size + src_lane;
output_indices[current_idx] = num_sets * src_lane + current_idx_set;
} else {
output_indices[current_idx] = std::numeric_limits<index_type>::max();
}
}
}
}
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, const int num_keys,
const key_type* keys, const size_t num_sets,
const int max_set_size, index_type* output_indices,
key_type* missing_keys, int* missing_positions,
int* missing_count) {
__shared__ key_type s_missing_keys[block_size];
__shared__ key_type s_missing_positions[block_size];
__shared__ int s_missing_count;
__shared__ int s_missing_idx;
constexpr int tile_size = set_size;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<tile_size>(block);
if (block.thread_rank() == 0) {
s_missing_count = 0;
}
block.sync();
size_t idx = grid.thread_rank();
key_type query_key = std::numeric_limits<key_type>::max();
if (idx < num_keys) {
query_key = query_keys[idx];
}
auto idx_set = hash(query_key) % num_sets;
for (int j = 0; j < tile_size; j++) {
auto current_idx_set = tile.shfl(idx_set, j);
auto current_query_key = tile.shfl(query_key, j);
if (current_query_key == std::numeric_limits<key_type>::max()) {
continue;
}
auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
int existed = tile.ballot(current_query_key == candidate_key);
if (existed) {
int src_lane = __ffs(existed) - 1;
size_t found_idx = current_idx_set * set_size + src_lane;
output_indices[tile.shfl(idx, 0) + j] = num_sets * src_lane + current_idx_set;
} else {
auto current_idx = tile.shfl(idx, 0) + j;
output_indices[current_idx] = std::numeric_limits<index_type>::max();
if (tile.thread_rank() == 0) {
int s_count = atomicAdd(&s_missing_count, 1);
s_missing_keys[s_count] = current_query_key;
s_missing_positions[s_count] = current_idx;
}
}
}
if (missing_keys == nullptr) {
if (grid.thread_rank() == 0 && missing_count) {
*missing_count = 0;
}
return;
}
block.sync();
if (block.thread_rank() == 0) {
s_missing_idx = atomicAdd(missing_count, s_missing_count);
}
block.sync();
for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
missing_keys[s_missing_idx + i] = s_missing_keys[i];
missing_positions[s_missing_idx + i] = s_missing_positions[i];
}
}
template <int warp_size>
__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx,
const size_t emb_vec_size_in_float,
volatile float* d_dst, const float* d_src) {
// 16 bytes align
if (emb_vec_size_in_float % 4 != 0 || (size_t)d_dst % 16 != 0 || (size_t)d_src % 16 != 0) {
#pragma unroll
for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) {
d_dst[i] = d_src[i];
}
} else {
#pragma unroll
for (size_t i = lane_idx; i < emb_vec_size_in_float / 4; i += warp_size) {
*(float4*)(d_dst + i * 4) = __ldg((const float4*)(d_src + i * 4));
}
}
}
template <typename index_type, typename vec_type>
__global__ void read_vectors_kernel(const index_type* query_indices, const int num_keys,
const vec_type* vectors, const int vec_size,
vec_type* output_vectors) {
constexpr int warp_size = 32;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<warp_size>(block);
#if (CUDA_VERSION < 11060)
auto num_threads_per_grid = grid.size();
#else
auto num_threads_per_grid = grid.num_threads();
#endif
for (int step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
int key_num = step * num_threads_per_grid + grid.thread_rank();
index_type idx = std::numeric_limits<index_type>::max();
if (key_num < num_keys) {
idx = query_indices[key_num];
}
#pragma unroll 4
for (size_t j = 0; j < warp_size; j++) {
index_type current_idx = tile.shfl(idx, j);
index_type idx_write = tile.shfl(key_num, 0) + j;
if (current_idx == std::numeric_limits<index_type>::max()) continue;
warp_tile_copy<warp_size>(tile.thread_rank(), vec_size, output_vectors + idx_write * vec_size,
vectors + current_idx * vec_size);
}
}
}
template <typename index_type, typename vec_type>
__global__ void distribute_vectors_kernel(const index_type* positions, const size_t num_keys,
const vec_type* vectors, const int vec_size,
vec_type* output_vectors) {
constexpr int warp_size = 32;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<warp_size>(block);
#if (CUDA_VERSION < 11060)
auto num_threads_per_grid = grid.size();
#else
auto num_threads_per_grid = grid.num_threads();
#endif
for (size_t step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
size_t key_num = step * num_threads_per_grid + grid.thread_rank();
index_type idx = std::numeric_limits<index_type>::max();
if (key_num < num_keys) {
idx = positions[key_num];
}
#pragma unroll 4
for (size_t j = 0; j < warp_size; j++) {
size_t idx_write = tile.shfl(idx, j);
size_t idx_read = tile.shfl(key_num, 0) + j;
if (idx_write == std::numeric_limits<index_type>::max()) continue;
warp_tile_copy<warp_size>(tile.thread_rank(), vec_size,
output_vectors + (size_t)idx_write * vec_size,
vectors + (size_t)idx_read * vec_size);
}
}
}
} // namespace
namespace gpu_cache {
template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::UvmTable(const size_t device_table_capacity,
const size_t host_table_capacity,
const int max_batch_size, const int vec_size,
const vec_type default_value)
: max_batch_size_(std::max(100000, max_batch_size)),
vec_size_(vec_size),
num_set_((device_table_capacity - 1) / set_size + 1),
num_host_set_((host_table_capacity - 1) / set_size + 1),
table_capacity_(num_set_ * set_size),
default_vector_(vec_size, default_value),
device_table_(device_table_capacity, set_size, max_batch_size_),
host_table_(host_table_capacity * 1.1, set_size, max_batch_size_) {
CUDA_CHECK(cudaMalloc(&d_keys_buffer_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_vectors_buffer_, sizeof(vec_type) * max_batch_size_ * vec_size_));
CUDA_CHECK(cudaMalloc(&d_vectors_, sizeof(vec_type) * device_table_.capacity * vec_size_));
CUDA_CHECK(cudaMalloc(&d_output_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_output_host_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(cudaMallocHost(&h_output_host_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_missing_keys_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_missing_positions_, sizeof(int) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_missing_count_, sizeof(int)));
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(int)));
CUDA_CHECK(cudaStreamCreate(&query_stream_));
for (int i = 0; i < num_buffers_; i++) {
int batch_size_per_buffer = ceil(1.0 * max_batch_size_ / num_buffers_);
CUDA_CHECK(
cudaMallocHost(&h_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
CUDA_CHECK(cudaMalloc(&d_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
CUDA_CHECK(cudaStreamCreate(&cpy_streams_[i]));
CUDA_CHECK(cudaEventCreate(&cpy_events_[i]));
}
CUDA_CHECK(cudaMallocHost(&h_missing_keys_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(cudaEventCreate(&query_event_));
h_vectors_.resize(host_table_.capacity * vec_size_);
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::add(const key_type* h_keys,
const vec_type* h_vectors,
const size_t num_keys) {
std::vector<key_type> h_missing_keys;
size_t num_batches = (num_keys - 1) / max_batch_size_ + 1;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size =
i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_;
CUDA_CHECK(cudaMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_,
sizeof(*d_keys_buffer_) * this_batch_size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
device_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0);
CUDA_CHECK(cudaDeviceSynchronize());
int num_missing_keys;
CUDA_CHECK(cudaMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
cudaMemcpyDeviceToHost));
size_t prev_size = h_missing_keys.size();
h_missing_keys.resize(prev_size + num_missing_keys);
CUDA_CHECK(cudaMemcpy(h_missing_keys.data() + prev_size, d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, cudaMemcpyDeviceToHost));
}
std::vector<key_type> h_final_missing_keys;
num_batches = h_missing_keys.size() ? (h_missing_keys.size() - 1) / max_batch_size_ + 1 : 0;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size =
i != num_batches - 1 ? max_batch_size_ : h_missing_keys.size() - i * max_batch_size_;
CUDA_CHECK(cudaMemcpy(d_keys_buffer_, h_missing_keys.data() + i * max_batch_size_,
sizeof(*d_keys_buffer_) * this_batch_size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
host_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0);
CUDA_CHECK(cudaDeviceSynchronize());
int num_missing_keys;
CUDA_CHECK(cudaMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
cudaMemcpyDeviceToHost));
size_t prev_size = h_final_missing_keys.size();
h_final_missing_keys.resize(prev_size + num_missing_keys);
CUDA_CHECK(cudaMemcpy(h_final_missing_keys.data() + prev_size, d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, cudaMemcpyDeviceToHost));
}
std::vector<key_type> h_keys_buffer(max_batch_size_);
std::vector<index_type> h_indices_buffer(max_batch_size_);
std::vector<int> h_positions_buffer(max_batch_size_);
num_batches = (num_keys - 1) / max_batch_size_ + 1;
size_t num_hit_keys = 0;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size =
i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_;
CUDA_CHECK(cudaMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_,
sizeof(*d_keys_buffer_) * this_batch_size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
device_table_.query(d_keys_buffer_, this_batch_size, d_output_indices_, d_missing_keys_,
d_missing_positions_, d_missing_count_, 0);
CUDA_CHECK(cudaStreamSynchronize(0));
CUDA_CHECK(cudaMemcpy(d_vectors_buffer_, h_vectors + i * max_batch_size_ * vec_size_,
sizeof(*d_vectors_) * this_batch_size * vec_size_,
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaStreamSynchronize(0));
if (num_hit_keys < device_table_.capacity) {
distribute_vectors_kernel<<<(this_batch_size - 1) / block_size + 1, block_size, 0, 0>>>(
d_output_indices_, this_batch_size, d_vectors_buffer_, vec_size_, d_vectors_);
CUDA_CHECK(cudaStreamSynchronize(0));
}
int num_missing_keys;
CUDA_CHECK(cudaMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
cudaMemcpyDeviceToHost));
num_hit_keys += this_batch_size - num_missing_keys;
host_table_.query(d_missing_keys_, num_missing_keys, d_output_indices_, nullptr, nullptr,
nullptr, 0);
CUDA_CHECK(cudaMemcpy(h_keys_buffer.data(), d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, cudaMemcpyDeviceToHost))
CUDA_CHECK(cudaMemcpy(h_indices_buffer.data(), d_output_indices_,
sizeof(*d_output_indices_) * num_missing_keys, cudaMemcpyDeviceToHost))
CUDA_CHECK(cudaMemcpy(h_positions_buffer.data(), d_missing_positions_,
sizeof(*d_missing_positions_) * num_missing_keys, cudaMemcpyDeviceToHost))
for (int j = 0; j < num_missing_keys; j++) {
if (h_indices_buffer[j] != std::numeric_limits<index_type>::max()) {
memcpy(h_vectors_.data() + h_indices_buffer[j] * vec_size_,
h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_,
sizeof(*h_vectors) * vec_size_);
} else {
size_t prev_idx = h_vectors_.size() / vec_size_;
h_final_missing_items_.emplace(h_keys_buffer[j], prev_idx);
h_vectors_.resize(h_vectors_.size() + vec_size_);
memcpy(h_vectors_.data() + prev_idx * vec_size_,
h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_,
sizeof(*h_vectors) * vec_size_);
}
}
}
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::query(const key_type* d_keys, const int num_keys,
vec_type* d_vectors, cudaStream_t stream) {
if (!num_keys) return;
CUDA_CHECK(cudaEventRecord(query_event_, stream));
CUDA_CHECK(cudaStreamWaitEvent(query_stream_, query_event_));
static_assert(num_buffers_ >= 2);
device_table_.query(d_keys, num_keys, d_output_indices_, d_missing_keys_, d_missing_positions_,
d_missing_count_, query_stream_);
CUDA_CHECK(cudaEventRecord(query_event_, query_stream_));
CUDA_CHECK(cudaStreamWaitEvent(cpy_streams_[0], query_event_));
int num_missing_keys;
CUDA_CHECK(cudaMemcpyAsync(&num_missing_keys, d_missing_count_, sizeof(*d_missing_count_),
cudaMemcpyDeviceToHost, cpy_streams_[0]));
host_table_.query(d_missing_keys_, d_missing_count_, d_output_host_indices_, query_stream_);
CUDA_CHECK(cudaStreamSynchronize(cpy_streams_[0]));
CUDA_CHECK(cudaMemsetAsync(d_missing_count_, 0, sizeof(*d_missing_count_), query_stream_));
CUDA_CHECK(cudaMemcpyAsync(h_output_host_indices_, d_output_host_indices_,
sizeof(index_type) * num_missing_keys, cudaMemcpyDeviceToHost,
query_stream_));
CUDA_CHECK(cudaMemcpyAsync(h_missing_keys_, d_missing_keys_, sizeof(key_type) * num_missing_keys,
cudaMemcpyDeviceToHost, cpy_streams_[0]));
read_vectors_kernel<<<(num_keys - 1) / block_size + 1, block_size, 0, cpy_streams_[1]>>>(
d_output_indices_, num_keys, d_vectors_, vec_size_, d_vectors);
CUDA_CHECK(cudaStreamSynchronize(query_stream_));
CUDA_CHECK(cudaStreamSynchronize(cpy_streams_[0]));
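  // Split the missing keys across num_buffers_ pinned staging buffers: fill each buffer with
  // OpenMP threads, then overlap the host-to-device copy and the scatter kernel on that
  // buffer's stream.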
int num_keys_per_buffer = ceil(1.0 * num_missing_keys / num_buffers_);
for (int buffer_num = 0; buffer_num < num_buffers_; buffer_num++) {
int num_keys_this_buffer = buffer_num != num_buffers_ - 1
? num_keys_per_buffer
: num_missing_keys - num_keys_per_buffer * buffer_num;
if (!num_keys_this_buffer) break;
#pragma omp parallel for num_threads(8)
for (size_t i = 0; i < static_cast<size_t>(num_keys_this_buffer); i++) {
size_t idx_key = buffer_num * num_keys_per_buffer + i;
index_type index = h_output_host_indices_[idx_key];
if (index == std::numeric_limits<index_type>::max()) {
key_type key = h_missing_keys_[idx_key];
auto iterator = h_final_missing_items_.find(key);
if (iterator != h_final_missing_items_.end()) {
index = iterator->second;
}
}
if (index != std::numeric_limits<index_type>::max()) {
memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, h_vectors_.data() + index * vec_size_,
sizeof(vec_type) * vec_size_);
} else {
memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, default_vector_.data(),
sizeof(vec_type) * vec_size_);
}
}
CUDA_CHECK(cudaMemcpyAsync(d_cpy_buffers_[buffer_num], h_cpy_buffers_[buffer_num],
sizeof(vec_type) * num_keys_this_buffer * vec_size_,
cudaMemcpyHostToDevice, cpy_streams_[buffer_num]));
distribute_vectors_kernel<<<(num_keys_this_buffer - 1) / block_size + 1, block_size, 0,
cpy_streams_[buffer_num]>>>(
d_missing_positions_ + buffer_num * num_keys_per_buffer, num_keys_this_buffer,
d_cpy_buffers_[buffer_num], vec_size_, d_vectors);
}
for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK(cudaEventRecord(cpy_events_[i], cpy_streams_[i]));
CUDA_CHECK(cudaStreamWaitEvent(stream, cpy_events_[i]));
}
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::clear(cudaStream_t stream) {
device_table_.clear(stream);
host_table_.clear(stream);
}
template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::~UvmTable() {
CUDA_CHECK(cudaFree(d_keys_buffer_));
CUDA_CHECK(cudaFree(d_vectors_buffer_));
CUDA_CHECK(cudaFree(d_vectors_));
CUDA_CHECK(cudaFree(d_output_indices_));
CUDA_CHECK(cudaFree(d_output_host_indices_));
CUDA_CHECK(cudaFreeHost(h_output_host_indices_));
CUDA_CHECK(cudaFree(d_missing_keys_));
CUDA_CHECK(cudaFree(d_missing_positions_));
CUDA_CHECK(cudaFree(d_missing_count_));
CUDA_CHECK(cudaFreeHost(h_missing_keys_));
CUDA_CHECK(cudaStreamDestroy(query_stream_));
CUDA_CHECK(cudaEventDestroy(query_event_));
for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK(cudaFreeHost(h_cpy_buffers_[i]));
CUDA_CHECK(cudaFree(d_cpy_buffers_[i]));
CUDA_CHECK(cudaStreamDestroy(cpy_streams_[i]));
CUDA_CHECK(cudaEventDestroy(cpy_events_[i]));
}
}
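// HashBlock: a fixed-capacity, set-associative key table in device memory. Keys hash into
// num_sets sets of at most max_set_size_ entries each; set_sizes_ tracks the fill of every set.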
template <typename key_type, typename index_type>
HashBlock<key_type, index_type>::HashBlock(size_t expected_capacity, int set_size, int batch_size)
: max_set_size_(set_size), batch_size_(batch_size) {
if (expected_capacity) {
num_sets = (expected_capacity - 1) / set_size + 1;
} else {
num_sets = 10000;
}
capacity = num_sets * set_size;
CUDA_CHECK(cudaMalloc(&keys, sizeof(*keys) * capacity));
CUDA_CHECK(cudaMalloc(&set_sizes_, sizeof(*set_sizes_) * num_sets));
CUDA_CHECK(cudaMemset(set_sizes_, 0, sizeof(*set_sizes_) * num_sets));
}
template <typename key_type, typename index_type>
HashBlock<key_type, index_type>::~HashBlock() {
CUDA_CHECK(cudaFree(keys));
CUDA_CHECK(cudaFree(set_sizes_));
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::query(const key_type* query_keys, const size_t num_keys,
index_type* output_indices, key_type* missing_keys,
int* missing_positions, int* num_missing_keys,
cudaStream_t stream) {
if (num_keys == 0) {
return;
}
size_t num_batches = (num_keys - 1) / batch_size_ + 1;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_;
hash_query_kernel<<<(this_batch_size - 1) / block_size + 1, block_size, 0, stream>>>(
query_keys, this_batch_size, keys, num_sets, max_set_size_, output_indices, missing_keys,
missing_positions, num_missing_keys);
}
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::query(const key_type* query_keys, int* num_keys,
index_type* output_indices, cudaStream_t stream) {
hash_query_kernel<<<128, 64, 0, stream>>>(query_keys, num_keys, keys, num_sets, max_set_size_,
output_indices);
}
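// Insert keys in batches; keys whose target set is already full are reported back through
// missing_keys / num_missing_keys so the caller can place them elsewhere.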
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::add(const key_type* new_keys, const size_t num_keys,
key_type* missing_keys, int* num_missing_keys,
cudaStream_t stream) {
if (num_keys == 0) {
return;
}
size_t num_batches = (num_keys - 1) / batch_size_ + 1;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_;
hash_add_kernel<<<(this_batch_size - 1) / block_size + 1, block_size, 0, stream>>>(
new_keys + i * this_batch_size, this_batch_size, keys, num_sets, set_sizes_, max_set_size_,
missing_keys, num_missing_keys);
}
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::clear(cudaStream_t stream) {
CUDA_CHECK(cudaMemsetAsync(set_sizes_, 0, sizeof(*set_sizes_) * num_sets, stream));
}
template class HashBlock<int, size_t>;
template class HashBlock<int64_t, size_t>;
template class HashBlock<size_t, size_t>;
template class HashBlock<unsigned int, size_t>;
template class HashBlock<long long, size_t>;
template class UvmTable<int, size_t>;
template class UvmTable<int64_t, size_t>;
template class UvmTable<size_t, size_t>;
template class UvmTable<unsigned int, size_t>;
template class UvmTable<long long, size_t>;
} // namespace gpu_cache
\ No newline at end of file
...@@ -148,7 +148,7 @@ class DistributedDataLoader:
                 return batch_data
             else :
                 raise StopIteration
-        else :
+        if self.queue_size > 0 :
             num_reqs = min(self.queue_size - self.num_pending,self.expected_idx - self.submitted)
             for _ in range(num_reqs):
                 if len(self.result_queue)>0:
......
...@@ -4,7 +4,7 @@ import os.path as osp
 import torch
 import torch.distributed as dist
 from torch_geometric.data import Data
-class GraphData():
+class GraphStore():
     def __init__(self, pdata, device = torch.device('cuda'), all_on_gpu = False):
         self.device = device
         self.ids = pdata.ids.to(device)
...@@ -14,8 +14,8 @@ class GraphData():
         else:
             self.edge_ts = None
         self.sample_graph = pdata.sample_graph
-        self.nids_mapper = build_mapper(nids=pdata.ids.to(device)).data.to('cpu')
-        self.eids_mapper = build_mapper(nids=pdata.eids.to(device)).data.to('cpu')
+        self.nids_mapper = build_mapper(nids=pdata.ids.to(device)).dist.to('cpu')
+        self.eids_mapper = build_mapper(nids=pdata.eids.to(device)).dist.to('cpu')
         if all_on_gpu:
             self.nids_mapper = self.nids_mapper.to(device)
             self.eids_mapper = self.eids_mapper.to(device)
...@@ -30,9 +30,9 @@ class GraphData():
             self.x = None
         if hasattr(pdata,'edge_attr') and pdata.edge_attr is not None:
             if world_size > 1:
-                self.edge_attr = DistributedTensor(pdata.edge_attr.to('cpu').to(torch.float))
+                self.edge_attr = DistributedTensor(pdata.edge_attr.to('cuda').to(torch.float))
             else:
-                self.edge_attr = pdata.edge_attr.to('cpu').to(torch.float)
+                self.edge_attr = pdata.edge_attr.to('cuda').to(torch.float)
         else:
             self.edge_attr = None
...@@ -106,7 +106,7 @@ class DataSet:
             setattr(d,k,v[indx])
         return d
-class TemporalGraphData():
+class TemporalGraphData(GraphStore):
     def __init__(self,pdata,device):
         super(TemporalGraphData,self).__init__(pdata,device)
     def _set_temporal_batch_cache(self,size,pin_size):
...@@ -117,7 +117,7 @@ class TemporalGraphData():
-class TemporalNeighborSampleGraph(GraphData):
+class TemporalNeighborSampleGraph(GraphStore):
     def __init__(self, sample_graph=None, mode='full', eids_mapper=None):
         self.edge_index = sample_graph['edge_index']
         self.num_edges = self.edge_index.shape[1]
......
+from typing import Union
 from typing import List
+from typing import Optional
 import torch
 from torch.distributed import rpc
 import torch_scatter
...@@ -118,7 +120,7 @@ class SharedMailBox():
     def set_mailbox_all_to_all(self,index,memory,
                                memory_ts,mail,mail_ts,
-                               reduce_Op = None):
+                               reduce_Op = None,group = None):
         #futs: List[torch.futures.Future] = []
         if self.num_parts == 1:
             dist_index = DistIndex(index)
...@@ -132,7 +134,7 @@ class SharedMailBox():
                                           device = self.device)
             indic = torch.searchsorted(index,self.partptr,right=False)
             scatter_len_list = indic[1:] - indic[0:-1]
-            torch.distributed.all_to_all_single(gather_len_list,scatter_len_list)
+            torch.distributed.all_to_all_single(gather_len_list,scatter_len_list,group = group)
             input_split = scatter_len_list.tolist()
             output_split = gather_len_list.tolist()
             gather_id_list = torch.empty(
...@@ -143,7 +145,7 @@ class SharedMailBox():
             output_split = gather_len_list.tolist()
             torch.distributed.all_to_all_single(
                 gather_id_list,index,output_split_sizes=output_split,
-                input_split_sizes=input_split)
+                input_split_sizes=input_split,group = group)
             index = gather_id_list
             gather_memory = torch.empty(
                 [gather_len_list.sum(),memory.shape[1]],
...@@ -160,22 +162,81 @@ class SharedMailBox():
             torch.distributed.all_to_all_single(
                 gather_memory,memory,
                 output_split_sizes=output_split,
-                input_split_sizes=input_split)
+                input_split_sizes=input_split,group = group)
             torch.distributed.all_to_all_single(
                 gather_memory_ts,memory_ts,
                 output_split_sizes=output_split,
-                input_split_sizes=input_split)
+                input_split_sizes=input_split,group = group)
             torch.distributed.all_to_all_single(
                 gather_mail,mail,
                 output_split_sizes=output_split,
-                input_split_sizes=input_split)
+                input_split_sizes=input_split,group = group)
             torch.distributed.all_to_all_single(
                 gather_mail_ts,mail_ts,
                 output_split_sizes=output_split,
-                input_split_sizes=input_split)
+                input_split_sizes=input_split,group = group)
         self.set_mailbox_local(DistIndex(index).loc,gather_mail,gather_mail_ts,Reduce_Op = reduce_Op)
         self.set_memory_local(DistIndex(index).loc,gather_memory,gather_memory_ts, Reduce_Op = reduce_Op)
+    def set_mailbox_all_to_all(self,index,memory,
+                               memory_ts,mail,mail_ts,
+                               reduce_Op = None,group = None):
+        #futs: List[torch.futures.Future] = []
+        if self.num_parts == 1:
+            dist_index = DistIndex(index)
+            part_idx = dist_index.part
+            index = dist_index.loc
+            self.set_mailbox_local(index,mail,mail_ts)
+            self.set_memory_local(index,memory,memory_ts)
+        else:
+            gather_len_list = torch.empty([self.num_parts],
+                                          dtype = int,
+                                          device = self.device)
+            indic = torch.searchsorted(index,self.partptr,right=False)
+            scatter_len_list = indic[1:] - indic[0:-1]
+            torch.distributed.all_to_all_single(gather_len_list,scatter_len_list,group = group)
+            input_split = scatter_len_list.tolist()
+            output_split = gather_len_list.tolist()
+            gather_id_list = torch.empty(
+                [gather_len_list.sum()],
+                dtype = torch.long,
+                device = self.device)
+            input_split = scatter_len_list.tolist()
+            output_split = gather_len_list.tolist()
+            torch.distributed.all_to_all_single(
+                gather_id_list,index,output_split_sizes=output_split,
+                input_split_sizes=input_split,group = group)
+            index = gather_id_list
+            gather_memory = torch.empty(
+                [gather_len_list.sum(),memory.shape[1]],
+                dtype = memory.dtype,device = self.device)
+            gather_memory_ts = torch.empty(
+                [gather_len_list.sum()],
+                dtype = memory_ts.dtype,device = self.device)
+            gather_mail = torch.empty(
+                [gather_len_list.sum(),mail.shape[1]],
+                dtype = mail.dtype,device = self.device)
+            gather_mail_ts = torch.empty(
+                [gather_len_list.sum()],
+                dtype = mail_ts.dtype,device = self.device)
+            torch.distributed.all_to_all_single(
+                gather_memory,memory,
+                output_split_sizes=output_split,
+                input_split_sizes=input_split,group = group)
+            torch.distributed.all_to_all_single(
+                gather_memory_ts,memory_ts,
+                output_split_sizes=output_split,
+                input_split_sizes=input_split,group = group)
+            torch.distributed.all_to_all_single(
+                gather_mail,mail,
+                output_split_sizes=output_split,
+                input_split_sizes=input_split,group = group)
+            torch.distributed.all_to_all_single(
+                gather_mail_ts,mail_ts,
+                output_split_sizes=output_split,
+                input_split_sizes=input_split,group = group)
+        self.set_mailbox_local(DistIndex(index).loc,gather_mail,gather_mail_ts,Reduce_Op = reduce_Op)
+        self.set_memory_local(DistIndex(index).loc,gather_memory,gather_memory_ts, Reduce_Op = reduce_Op)
     def get_update_mail(self,dist_indx_mapper,
                         src,dst,ts,edge_feats,
...@@ -234,9 +295,16 @@ class SharedMailBox():
             #print(memory.shape[0])
             return memory,memory_ts,mail,mail_ts
         return torch.futures.collect_all([memory,memory_ts,mail,mail_ts]).then(callback)
-    def gather_memory(self,index,input_split,out_split):
-        return self.node_memory.scatter_data(index,input_split,out_split),\
-               self.node_memory_ts.scatter_data(index,input_split,out_split),\
-               self.mailbox.scatter_data(index,input_split,out_split),\
-               self.mailbox_ts.scatter_data(index,input_split,out_split)
+    def gather_memory(
+            self,
+            dist_index: Union[torch.Tensor, DistIndex, None] = None,
+            send_ptr: Optional[List[int]] = None,
+            recv_ptr: Optional[List[int]] = None,
+            recv_ind: Optional[List[int]] = None,
+            group = None
+        ):
+        return self.node_memory.all_to_all_get(dist_index,send_ptr,recv_ptr,recv_ind,group),\
+               self.node_memory_ts.all_to_all_get(dist_index,send_ptr,recv_ptr,recv_ind,group),\
+               self.mailbox.all_to_all_get(dist_index,send_ptr,recv_ptr,recv_ind,group),\
+               self.mailbox_ts.all_to_all_get(dist_index,send_ptr,recv_ptr,recv_ind,group)
...@@ -2,7 +2,8 @@ import parser
 from torch_sparse import SparseTensor
 from torch_geometric.data import Data
 from torch_geometric.utils import degree
-from .torch_utils import get_norm_temporal
+from starrygl.lib.libstarrygl_ops import metis_partition
+from starrygl.lib.libstarrygl_ops_sampler import get_norm_temporal
 import os.path as osp
 import os
 import shutil
......
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CppExtension
setup(
name='torch_utils',
ext_modules=[
CppExtension(
name='torch_utils',
sources=['./torch_utils.cpp'],
extra_compile_args=['-fopenmp','-Xlinker',' -export-dynamic'],
include_dirs=["../sample_core"],
),
],
cmdclass={
'build_ext': BuildExtension
})#
#setup(
# name='cpu_cache_manager',
# ext_modules=[
# CppExtension(
# name='cpu_cache_manager',
# sources=['cpu_cache_manager.cpp'],
# extra_compile_args=['-fopenmp','-Xlinker',' -export-dynamic'],
# include_dirs=["./"],
# ),
# ],
# cmdclass={
# 'build_ext': BuildExtension
# })#
#
\ No newline at end of file
+from concurrent.futures import ThreadPoolExecutor, thread
+from multiprocessing import Process
+from multiprocessing.pool import ThreadPool
+from typing import Deque
 import torch
-class WorkStreamEvent:
-    def __init__(self):
-        self.train_stream = torch.cuda.Stream()
-        self.write_memory_stream = torch.cuda.Stream()
-        self.fetch_stream = torch.cuda.Stream()
-        self.write_mail_stream = torch.cuda.Stream()
-        self.event = None
-event = WorkStreamEvent()
-def get_event():
-    return event.event
+import torch.distributed as dist
+stage = ['train_stream','write_memory','write_mail','lookup']
+class PipelineManager:
+    def __init__(self,num_tasks = 10):
+        self.stream_set = {}
+        self.dist_set = {}
+        self.args_queue = {}
+        self.thread_pool = ThreadPoolExecutor(num_tasks)
+        for k in stage:
+            self.stream_set[k] = torch.cuda.Stream()
+            self.dist_set[k] = dist.new_group()
+            self.args_queue[k] = Deque()
+    def submit(self,state,func,*args):
+        future = self.thread_pool.submit(target=self.run, args=(state,func,args))
+        return future
+    def run(self,state,func,*args):
+        while torch.cuda.stream(self.stream_set[state]):
+            func(args, group = self.dist_set[state])
+manger = None
+def getPipelineManger():
+    global manger
+    if manger == None:
+        manger = PipelineManager()
+    return manger
\ No newline at end of file
nohup python train_tgnn.py --dataname TaoBao --world_size 4 --rank 0 > taobao4.out &
nohup python train_tgnn.py --dataname TaoBao --world_size 4 --rank 1 > taobao4.out &
nohup python train_tgnn.py --dataname TaoBao --world_size 4 --rank 2 > taobao4.out &
nohup python train_tgnn.py --dataname TaoBao --world_size 4 --rank 3 > taobao4.out &
nohup python train_tgnn.py --dataname GDELT --world_size 4 --rank 0 > GDELT4.out &
nohup python train_tgnn.py --dataname GDELT --world_size 4 --rank 1 > GDELT4.out &
nohup python train_tgnn.py --dataname GDELT --world_size 4 --rank 2 > GDELT4.out &
nohup python train_tgnn.py --dataname GDELT --world_size 4 --rank 3 > GDELT4.out &
nohup python train_tgnn.py --dataname ML25M --world_size 2 --rank 0 > ML25M2.out &
nohup python train_tgnn.py --dataname ML25M --world_size 2 --rank 1 > ML25M2.out &
nohup python train_tgnn.py --dataname TaoBao --world_size 2 --rank 0 > TaoBao2.out &
nohup python train_tgnn.py --dataname TaoBao --world_size 2 --rank 1 > TaoBao2.out &
nohup python train_tgnn.py --dataname ML25M --world_size 1 --rank 0 > ML25M1.out &
nohup python train_tgnn.py --dataname TaoBao --world_size 1 --rank 0 > TaoBao1.out &
nohup python train_tgnn.py --dataname GDELT --world_size 1 --rank 0 > GDELT1.out &
\ No newline at end of file
...@@ -25,24 +25,7 @@ import os
 from starrygl.sample.data_loader import DistributedDataLoader
 from starrygl.sample.batch_data import SAMPLE_TYPE
-"""
-test command
-python test.py --world_size 2 --rank 0
---world_size', default=4, type=int, metavar='W',
-                    help='number of workers')
-parser.add_argument('--rank', default=0, type=int, metavar='W',
-                    help='rank of the worker')
-parser.add_argument('--log_interval', type=int, default=10, metavar='N',
-                    help='interval between training status logs')
-parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
-                    help='how much to value future rewards')
-parser.add_argument('--seed', type=int, default=1, metavar='S',
-                    help='random seed for reproducibility')
-parser.add_argument('--num_sampler', type=int, default=10, metavar='S',
-                    help='number of samplers')
-parser.add_argument('--queue_size', type=int, default=10, metavar='S',
-                    help='sampler queue size')
-"""
 parser = argparse.ArgumentParser(
     description="RPC Reinforcement Learning Example",
     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
...@@ -51,6 +34,8 @@ parser.add_argument('--rank', default=0, type=str, metavar='W',
                     help='name of dataset')
 parser.add_argument('--world_size', default=1, type=int, metavar='W',
                     help='number of negative samples')
+parser.add_argument('--dataname', default=1, type=str, metavar='W',
+                    help='number of negative samples')
 args = parser.parse_args()
 from sklearn.metrics import average_precision_score, roc_auc_score
 import torch
...@@ -82,7 +67,7 @@ def main():
     ctx = DistributedContext.init(backend="nccl", use_gpu=True)
     device_id = torch.cuda.current_device()
     print('use cuda on',device_id)
-    pdata = partition_load("./dataset/here/GDELT", algo="metis_for_tgnn")
+    pdata = partition_load("./dataset/here/{}".format(args.dataname), algo="metis_for_tgnn")
     graph = GraphData(pdata = pdata)
     sample_graph = TemporalNeighborSampleGraph(sample_graph = pdata.sample_graph,mode = 'full')
...@@ -174,10 +159,12 @@ def main():
             dst = metadata['dst_pos_index']
             ts = roots.ts
-            if(graph.edge_attr.device == torch.device('cpu')):
-                edge_feats = graph.edge_attr[roots.eids.to('cpu')].to('cuda') if graph.edge_attr is not None else None
+            if graph.edge_attr is None:
+                edge_feats = None
+            elif(graph.edge_attr.device == torch.device('cpu')):
+                edge_feats = graph.edge_attr[roots.eids.to('cpu')].to('cuda')
             else:
-                edge_feats = graph.edge_attr[roots.eids] if graph.edge_attr is not None else None
+                edge_feats = graph.edge_attr[roots.eids]
             dist_index_mapper = mfgs[0][0].srcdata['ID']
             root_index = torch.cat((src,dst))
             last_updated_nid = model.module.memory_updater.last_updated_nid[root_index]
...@@ -247,10 +234,12 @@ def main():
             dst = metadata['dst_pos_index']
             ts = roots.ts
-            if(graph.edge_attr.device == torch.device('cpu')):
-                edge_feats = graph.edge_attr[roots.eids.to('cpu')].to('cuda') if graph.edge_attr is not None else None
+            if graph.edge_attr is None:
+                edge_feats = None
+            elif(graph.edge_attr.device == torch.device('cpu')):
+                edge_feats = graph.edge_attr[roots.eids.to('cpu')].to('cuda')
             else:
-                edge_feats = graph.edge_attr[roots.eids] if graph.edge_attr is not None else None
+                edge_feats = graph.edge_attr[roots.eids]
             dist_index_mapper = mfgs[0][0].srcdata['ID']
             root_index = torch.cat((src,dst))
             last_updated_nid = model.module.memory_updater.last_updated_nid[root_index]
......