Commit 7f481360 by xxx

Merge branch 'master' into hzq

parents cd3f3cd2 6acf7ed1
install.sh merge=ours
\ No newline at end of file
*.tgz
*.my
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
......@@ -169,8 +171,12 @@ cython_debug/
/third_party
/.vscode
/.history
/.cache
/run_route.py
/dataset
/test_*
/*.ipynb
saved_models/
saved_checkpoints/
\ No newline at end of file
sampling:
- layer: 1
neighbor:
- 10
strategy: 'recent'
prop_time: False
history: 1
duration: 0
num_thread: 32
memory:
- type: 'node'
dim_time: 100
deliver_to: 'self'
mail_combine: 'last'
memory_update: 'gru'
mailbox_size: 1
combine_node_feature: True
dim_out: 100
gnn:
- arch: 'transformer_attention'
use_src_emb: True
use_dst_emb: True
layer: 1
att_head: 2
dim_time: 100
dim_out: 100
train:
- epoch: 50
batch_size: 100
# reorder: 16
lr: 0.0001
dropout: 0.1
att_dropout: 0.2
all_on_gpu: True
\ No newline at end of file
sampling:
- no_sample: True
history: 1
memory:
- type: 'node'
dim_time: 100
deliver_to: 'self'
mail_combine: 'last'
memory_update: 'rnn'
mailbox_size: 1
combine_node_feature: True
dim_out: 100
gnn:
- arch: 'identity'
use_src_emb: False
use_dst_emb: False
time_transform: 'JODIE'
train:
- epoch: 20
batch_size: 200
lr: 0.0001
dropout: 0.1
all_on_gpu: True
\ No newline at end of file
sampling:
- layer: 2
neighbor:
- 10
- 10
strategy: 'uniform'
prop_time: False
history: 1
duration: 0
num_thread: 32
memory:
- type: 'none'
dim_out: 0
gnn:
- arch: 'transformer_attention'
layer: 2
att_head: 2
dim_time: 100
dim_out: 100
train:
- epoch: 100
batch_size: 600
lr: 0.0001
dropout: 0.1
att_dropout: 0.1
all_on_gpu: True
\ No newline at end of file
......@@ -18,13 +18,15 @@ memory:
dim_out: 100
gnn:
- arch: 'transformer_attention'
use_src_emb: False
use_dst_emb: False
layer: 1
att_head: 2
dim_time: 100
dim_out: 100
train:
- epoch: 5
#batch_size: 100
- epoch: 20
batch_size: 200
# reorder: 16
lr: 0.0001
dropout: 0.2
......
sampling:
- layer: 1
neighbor:
- 10
strategy: 'recent'
prop_time: False
history: 1
duration: 0
num_thread: 32
memory:
- type: 'node'
dim_time: 100
deliver_to: 'self'
mail_combine: 'last'
memory_update: 'gru'
mailbox_size: 1
combine_node_feature: True
dim_out: 100
gnn:
- arch: 'transformer_attention'
use_src_emb: True
use_dst_emb: True
layer: 1
att_head: 2
dim_time: 100
dim_out: 100
train:
- epoch: 20
batch_size: 200
# reorder: 16
lr: 0.0001
dropout: 0.2
att_dropout: 0.2
all_on_gpu: True
\ No newline at end of file
#include<head.h>
#include <sampler.h>
#include <tppr.h>
#include <output.h>
#include <neighbors.h>
#include <temporal_utils.h>
......@@ -88,4 +89,22 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("reset", &ParallelSampler::reset)
.def("get_ret", [](const ParallelSampler &ps) { return ps.ret; });
py::class_<ParallelTppRComputer>(m, "ParallelTppRComputer")
.def(py::init<TemporalNeighborBlock &, NodeIDType, EdgeIDType, int,
int, int, int, vector<float>&, vector<float>& >())
.def_readonly("ret", &ParallelTppRComputer::ret, py::return_value_policy::reference)
.def("reset_ret", &ParallelTppRComputer::reset_ret)
.def("reset_tppr", &ParallelTppRComputer::reset_tppr)
.def("reset_val_tppr", &ParallelTppRComputer::reset_val_tppr)
.def("backup_tppr", &ParallelTppRComputer::backup_tppr)
.def("restore_tppr", &ParallelTppRComputer::restore_tppr)
.def("restore_val_tppr", &ParallelTppRComputer::restore_val_tppr)
.def("get_pruned_topk", &ParallelTppRComputer::get_pruned_topk)
.def("extract_streaming_tppr", &ParallelTppRComputer::extract_streaming_tppr)
.def("streaming_topk", &ParallelTppRComputer::streaming_topk)
.def("single_streaming_topk", &ParallelTppRComputer::single_streaming_topk)
.def("streaming_topk_no_fake", &ParallelTppRComputer::streaming_topk_no_fake)
.def("compute_val_tppr", &ParallelTppRComputer::compute_val_tppr)
.def("get_ret", [](const ParallelTppRComputer &ps) { return ps.ret; });
}
\ No newline at end of file
#pragma once
#include <iostream>
#include <algorithm>
#include <torch/extension.h>
#include <omp.h>
#include <time.h>
......@@ -17,6 +18,12 @@ typedef int64_t NodeIDType;
typedef int64_t EdgeIDType;
typedef float WeightType;
typedef float TimeStampType;
typedef tuple<NodeIDType, EdgeIDType, TimeStampType> PPRKeyType;
typedef double PPRValueType;
typedef phmap::parallel_flat_hash_map<PPRKeyType, PPRValueType> PPRDictType;
typedef vector<PPRDictType> PPRListDictType;
typedef vector<vector<PPRDictType>> PPRListListDictType;
typedef vector<vector<double>> NormListType;
class TemporalNeighborBlock;
class TemporalGraphBlock;
......@@ -28,6 +35,7 @@ int nodeIdToInOut(NodeIDType nid, int pid, const vector<NodeIDType>& part_ptr);
int nodeIdToPartId(NodeIDType nid, const vector<NodeIDType>& part_ptr);
vector<th::Tensor> divide_nodes_to_part(th::Tensor nodes, const vector<NodeIDType>& part_ptr, int threads);
NodeIDType sample_multinomial(const vector<WeightType>& weights, default_random_engine& e);
vector<int64_t> sample_max(const vector<WeightType>& weights, int k);
......@@ -173,3 +181,17 @@ NodeIDType sample_multinomial(const vector<WeightType>& weights, default_random_
sample_indice = distance(cumulative_weights.begin(), it);
return sample_indice;
}
vector<int64_t> sample_max(const vector<WeightType>& weights, int k) {
vector<int64_t> indices(weights.size());
for (int i = 0; i < weights.size(); ++i) {
indices[i] = i;
}
// use partial_sort (a selection algorithm) to find the indices of the top-k largest values
partial_sort(indices.begin(), indices.begin() + k, indices.end(),
[&weights](int64_t a, int64_t b) { return weights[a] > weights[b]; });
// return the indices of the top-k largest values
return vector<int64_t>(indices.begin(), indices.begin() + k);
}
\ No newline at end of file
......@@ -287,10 +287,15 @@ void TemporalNeighborBlock::update_edge_weight(
for(int64_t i=0; i<edge_num; i++){
//update the weight of the edge between the node and its neighbor
AT_ASSERTM(this->inverted_index[dst[i]].count(src[i])==1, "Unexist Edge Index: "+to_string(src[i])+", "+to_string(dst[i]));
int index;
if(this->with_eid) index = this->inverted_index[dst[i]][eid_ptr[i]];
else index = this->inverted_index[dst[i]][src[i]];
if(this->with_eid){
AT_ASSERTM(this->inverted_index[dst[i]].count(eid_ptr[i])==1, "Unexist Eid --> Col: "+to_string(eid_ptr[i])+"-->"+to_string(dst[i]));
index = this->inverted_index[dst[i]][eid_ptr[i]];
}
else{
AT_ASSERTM(this->inverted_index[dst[i]].count(src[i])==1, "Unexist Edge Index: "+to_string(src[i])+", "+to_string(dst[i]));
index = this->inverted_index[dst[i]][src[i]];
}
this->edge_weight[dst[i]][index] = ew[i];
}
}
......
......@@ -11,6 +11,7 @@ class TemporalGraphBlock
vector<int64_t> src_index;
vector<NodeIDType> sample_nodes;
vector<TimeStampType> sample_nodes_ts;
vector<WeightType> e_weights;
double sample_time = 0;
double tot_time = 0;
int64_t sample_edge_num = 0;
......
......@@ -105,13 +105,13 @@ void ParallelSampler :: neighbor_sample_from_nodes_static_layer(th::Tensor nodes
// uniform_int_distribution<> u(0, tnb.deg[node]-1);
// while(temp_s.size()!=fanout && temp_s.size()<tnb.neighbors_set[node].size()){
for(int i=0;i<fanout;i++){
//loop to pick fanout neighbors
NodeIDType indice;
if(policy == "weighted"){//consider edge weight information
const vector<WeightType>& ew = tnb.edge_weight[node];
indice = sample_multinomial(ew, e);
}
else if(policy == "uniform"){//uniform sampling
// indice = u(e);
indice = rand_r(&loc_seed) % (nei.size());
}
......@@ -119,7 +119,7 @@ void ParallelSampler :: neighbor_sample_from_nodes_static_layer(th::Tensor nodes
auto chosen_e_iter = edge.begin() + indice;
if(part_unique){
auto rst = temp_s.insert(*chosen_n_iter);
if(rst.second){ //no duplicates
eid_threads[tid].emplace_back(*chosen_e_iter);
node_s_threads[tid].insert(*chosen_n_iter);
if(!tnb.neighbors_set.empty() && temp_s.size()<fanout && temp_s.size()<tnb.neighbors_set[node].size()) fanout++;
......@@ -229,7 +229,7 @@ void ParallelSampler :: neighbor_sample_from_nodes_with_before_layer(
}
}
else{
//if the number of candidate neighbor edges exceeds the fanout, randomly pick fanout neighbors
tgb_i[tid].src_index.insert(tgb_i[tid].src_index.end(), fanout, i);
uniform_int_distribution<> u(0, end_index-1);
//cout<<end_index<<endl;
......
......@@ -114,11 +114,15 @@ edge_weight_dict = {}
edge_weight_dict['edata'] = 2*neg_nums
edge_weight_dict['sample_data'] = 1*neg_nums
edge_weight_dict['neg_data'] = 1
partition_save('./dataset/here/'+data_name, data, 1, 'metis_for_tgnn',
edge_weight_dict=edge_weight_dict)
partition_save('./dataset/here/'+data_name, data, 2, 'metis_for_tgnn',
edge_weight_dict=edge_weight_dict)
partition_save('./dataset/here/'+data_name, data, 4, 'metis_for_tgnn',
#partition_save('./dataset/here/'+data_name, data, 1, 'metis_for_tgnn',
# edge_weight_dict=edge_weight_dict)
#partition_save('./dataset/here/'+data_name, data, 2, 'metis_for_tgnn',
# edge_weight_dict=edge_weight_dict)
#partition_save('./dataset/here/'+data_name, data, 4, 'metis_for_tgnn',
# edge_weight_dict=edge_weight_dict)
#partition_save('./dataset/here/'+data_name, data, 8, 'metis_for_tgnn',
# edge_weight_dict=edge_weight_dict)
partition_save('./dataset/here/'+data_name, data, 16, 'metis_for_tgnn',
edge_weight_dict=edge_weight_dict)
#
# partition_save('./dataset/here/'+data_name, data, 4, 'metis_for_tgnn',
......
Advanced Data Preprocessing
===========================
.. note::
Describe in detail the usage of StarryGL's data management classes (e.g. GraphData), the design of their internal index structures, and the underlying operations.
\ No newline at end of file
......@@ -4,4 +4,4 @@ Advanced Concepts
.. toctree::
sampling_parallel/index
partition_parallel/index
timeline_parallel/index
\ No newline at end of file
timeline_parallel/index
Distributed Partition Parallel
==============================
.. note::
This part covers distributed partition-parallel training.
\ No newline at end of file
Distributed Timeline Parallel
=============================
.. note::
Distributed timeline parallelism.
\ No newline at end of file
Distributed Temporal Sampling
=============================
.. note::
Training modes based on distributed temporal graph sampling.
\ No newline at end of file
starrygl.sample.cache.fetch_cache
=================================
.. note::
The cache used in feature fetching
.. currentmodule:: starrygl.sample.cache.fetch_cache
.. autoclass::
FetchFeatureCache
:members:
starrygl.sample.graph_core
==========================
.. note::
Distributed Data Structure used in sampling training
.. currentmodule:: starrygl.sample.graph_core
.. autoclass::
DistributedGraphStore
:members:
.. autoclass::
DataSet
.. autoclass::
TemporalNeighborSampleGraph
\ No newline at end of file
......@@ -5,4 +5,6 @@ Package References
distributed
neighbor_sampler
memory
data_loader
\ No newline at end of file
data_loader
graph_core
cache
Preparing the Temporal Graph Dataset
====================================
.. note::
Covers the data cleaning and preprocessing steps, starting from the raw data, that produce data files usable by StarryGL.
\ No newline at end of file
In this tutorial, we will show the preparation process of a temporal graph dataset that can be used by StarryGL.
Read Raw Data
-------------
Take Wikipedia dataset as an example, the raw data files are as follows:
- `edges.csv`: the temporal edges of the graph
- `node_features.pt`: the node features of the graph
- `edge_features.pt`: the edge features of the graph
Here is an example to read the raw data files:
.. code-block:: python
data_name = args.data_name
df = pd.read_csv('raw_data/'+data_name+'/edges.csv')
if os.path.exists('raw_data/'+data_name+'/node_features.pt'):
n_feat = torch.load('raw_data/'+data_name+'/node_features.pt')
else:
n_feat = None
if os.path.exists('raw_data/'+data_name+'/edge_features.pt'):
e_feat = torch.load('raw_data/'+data_name+'/edge_features.pt')
else:
e_feat = None
src = torch.from_numpy(np.array(df.src.values)).long()
dst = torch.from_numpy(np.array(df.dst.values)).long()
ts = torch.from_numpy(np.array(df.time.values)).long()
neg_nums = args.num_neg_sample
edge_index = torch.cat((src[np.newaxis, :], dst[np.newaxis, :]), 0)
num_nodes = edge_index.view(-1).max().item()+1
num_edges = edge_index.shape[1]
print('the number of nodes in graph is {}, \
the number of edges in graph is {}'.format(num_nodes, num_edges))
Preprocess Data
---------------
After reading the raw data, we need to preprocess the data to get the data format that can be used by StarryGL. The following code shows the preprocessing process:
.. code-block:: python
sample_graph = {}
sample_src = torch.cat([src.view(-1, 1), dst.view(-1, 1)], dim=1)\
.reshape(1, -1)
sample_dst = torch.cat([dst.view(-1, 1), src.view(-1, 1)], dim=1)\
.reshape(1, -1)
sample_ts = torch.cat([ts.view(-1, 1), ts.view(-1, 1)], dim=1).reshape(-1)
sample_eid = torch.arange(num_edges).view(-1, 1).repeat(1, 2).reshape(-1)
sample_graph['edge_index'] = torch.cat([sample_src, sample_dst], dim=0)
sample_graph['ts'] = sample_ts
sample_graph['eids'] = sample_eid
neg_sampler = NegativeSampling('triplet')
neg_src = neg_sampler.sample(edge_index.shape[1]*neg_nums, num_nodes)
neg_sample = neg_src.reshape(-1, neg_nums)
edge_ts = torch.from_numpy(np.array(ts)).float()
data = Data() #torch_geometric.data.Data()
data.num_nodes = num_nodes
data.num_edges = num_edges
data.edge_index = edge_index
data.edge_ts = edge_ts
data.neg_sample = neg_sample
if n_feat is not None:
data.x = n_feat
if e_feat is not None:
data.edge_attr = e_feat
data.train_mask = (torch.from_numpy(np.array(df.ext_roll.values)) == 0)
data.val_mask = (torch.from_numpy(np.array(df.ext_roll.values)) == 1)
data.test_mask = (torch.from_numpy(np.array(df.ext_roll.values)) == 2)
sample_graph['train_mask'] = data.train_mask[sample_eid]
sample_graph['test_mask'] = data.test_mask[sample_eid]
sample_graph['val_mask'] = data.val_mask[sample_eid]
data.sample_graph = sample_graph
data.y = torch.zeros(edge_index.shape[1])
edge_index_dict = {}
edge_index_dict['edata'] = data.edge_index
edge_index_dict['sample_data'] = data.sample_graph['edge_index']
edge_index_dict['neg_data'] = torch.cat([neg_src.view(1, -1),
dst.view(-1, 1).repeat(1, neg_nums).
reshape(1, -1)], dim=0)
data.edge_index_dict = edge_index_dict
edge_weight_dict = {}
edge_weight_dict['edata'] = 2*neg_nums
edge_weight_dict['sample_data'] = 1*neg_nums
edge_weight_dict['neg_data'] = 1
We construct a torch_geometric.data.Data object to store the data. The data object contains the following attributes:
- `num_nodes`: the number of nodes in the graph
- `num_edges`: the number of edges in the graph
- `edge_index`: the edge index of the graph
- `edge_ts`: the timestamp of the edges
- `neg_sample`: the negative samples of the edges
- `x`: the node features of the graph
- `edge_attr`: the edge features of the graph
- `train_mask`: the train mask of the edges
- `val_mask`: the validation mask of the edges
- `test_mask`: the test mask of the edges
- `sample_graph`: the sampled graph
- `edge_index_dict`: the edge index of the sampled graph
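As a quick sanity check, we can inspect the constructed object; this is a minimal sketch that only uses the fields built above:
.. code-block:: python
print(data.num_nodes, data.num_edges)
print(data.edge_index.shape) # torch.Size([2, num_edges])
print(int(data.train_mask.sum()), int(data.val_mask.sum()), int(data.test_mask.sum()))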
Finally, we can partition the graph and save the data:
.. code-block:: python
partition_save('./dataset/here/'+data_name, data, 16, 'metis_for_tgnn',
edge_weight_dict=edge_weight_dict)
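Each trainer can later load its partition back with :code:`partition_load`, mirroring the usage in the sampling and data-loader examples elsewhere in this repository; a minimal sketch, assuming the same path and partitioning algorithm as above:
.. code-block:: python
from starrygl.sample.part_utils.partition_tgnn import partition_load
# load the partition assigned to this rank (path and algo follow the partition_save call above)
pdata = partition_load('./dataset/here/' + data_name, algo='metis_for_tgnn')
print(pdata.ids.shape[0]) # number of nodes owned by this partition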
......@@ -5,5 +5,4 @@ Tutorials
intro
module
dataset
application
distributed
\ No newline at end of file
Introduction to Temporal GNN
==============================================
.. note::
A brief introduction to temporal GNNs, their application scenarios, and the problems they need to solve; this serves as an overall overview.
Many real-world systems can be formulated as temporal interaction graphs, such as social networks and citation networks. In these systems, nodes represent entities and edges represent interactions between entities. The interactions are usually time-stamped, which means the edges are associated with time. Temporal interaction graphs are dynamic: the graph structure changes over time. For example, in a social network, the friendship between two people may be established or broken at different times; in a citation network, papers cite other papers at different times.
To encapsulate the temporal information present in these graphs and learn dynamic representations, researchers have introduced temporal graph neural networks (GNNs). These networks are capable of modeling both structural and temporal dependencies within the graph. Numerous innovative frameworks have been proposed to date, achieving outstanding performance in specific tasks such as link prediction. Based on two different methods to represent temporal graphs, we can divide temporal GNNs into two categories:
1. continuous-time temporal GNNs, which model the temporal graph as a sequence of interactions
2. discrete-time temporal GNNs, which model the temporal graph as a sequence of snapshots
However, as the temporal graph expands—potentially encompassing millions of nodes and billions of edges—it becomes increasingly challenging to scale temporal GNN training to accommodate these larger graphs. The reasons are twofold: first, sampling neighbors from a larger graph demands more time; second, chronological training also incurs a higher time cost. To address these challenges, we introduce StarryGL in this tutorial. StarryGL is a distributed temporal GNN framework designed to efficiently navigate the complexities of training larger temporal graphs.
\ No newline at end of file
Creating Temporal GNN Models
============================
.. note::
Explains how to create GNN models, using the two most classic and concise examples: building **discrete-time dynamic graph models** and **continuous-time dynamic graph models**.
\ No newline at end of file
Continuous-time Temporal GNN Models
-----------------------------------
To create a continuous-time temporal GNN model, we first need to define a configuration file with the :code:`.yml` suffix to specify the model structure and parameters. Here we use the configuration file :code:`TGN.yml` for the TGN model as an example:
.. code-block:: yaml
sampling:
- layer: 1
neighbor:
- 10
strategy: 'recent'
prop_time: False
history: 1
duration: 0
num_thread: 32
memory:
- type: 'node'
dim_time: 100
deliver_to: 'self'
mail_combine: 'last'
memory_update: 'gru'
mailbox_size: 1
combine_node_feature: True
dim_out: 100
gnn:
- arch: 'transformer_attention'
use_src_emb: False
use_dst_emb: False
layer: 1
att_head: 2
dim_time: 100
dim_out: 100
train:
- epoch: 20
batch_size: 200
# reorder: 16
lr: 0.0001
dropout: 0.2
att_dropout: 0.2
all_on_gpu: True
The configuration file is composed of four parts: :code:`sampling`, :code:`memory`, :code:`gnn` and :code:`train`. Here are their meanings:
- :code:`sampling`: This part specifies the sampling strategy for the temporal graph. :code:`layer` field specifies the number of layers in the sampling strategy. The :code:`neighbor` field specifies the number of neighbors to sample for each layer. The :code:`strategy` field specifies the sampling strategy (recent or uniform). The :code:`prop_time` field specifies whether to propagate the time information. The :code:`history` field specifies the number of historical timestamps to use. The :code:`duration` field specifies the duration of the time window. The :code:`num_thread` field specifies the number of threads to use for sampling.
- :code:`memory`: This part specifies the memory module. :code:`type` field specifies the type of memory module (node or none). :code:`dim_time` field specifies the dimension of the time embedding. :code:`deliver_to` field specifies the destination of the message. :code:`mail_combine` field specifies the way to combine the messages. :code:`memory_update` field specifies the way to update the memory. :code:`mailbox_size` field specifies the size of the mailbox. :code:`combine_node_feature` field specifies whether to combine the node features. :code:`dim_out` field specifies the dimension of the output.
- :code:`gnn`: This part specifies the GNN module. :code:`arch` field specifies the architecture of the GNN module. :code:`use_src_emb` field specifies whether to use the source embedding. :code:`use_dst_emb` field specifies whether to use the destination embedding. :code:`layer` field specifies the number of layers in the GNN module. :code:`att_head` field specifies the number of attention heads. :code:`dim_time` field specifies the dimension of the time embedding. :code:`dim_out` field specifies the dimension of the output.
- :code:`train`: This part specifies the training parameters. :code:`epoch` field specifies the number of epochs. :code:`batch_size` field specifies the batch size. :code:`lr` field specifies the learning rate. :code:`dropout` field specifies the dropout rate. :code:`att_dropout` field specifies the attention dropout rate. :code:`all_on_gpu` field specifies whether to put all the data on GPU.
After defining the configuration file, we can first read the parameters from it and create the model by constructing a :code:`GeneralModel` object:
.. code-block:: python
def parse_config(f):
conf = yaml.safe_load(open(f, 'r'))
sample_param = conf['sampling'][0]
memory_param = conf['memory'][0]
gnn_param = conf['gnn'][0]
train_param = conf['train'][0]
return sample_param, memory_param, gnn_param, train_param
sample_param, memory_param, gnn_param, train_param = parse_config('./config/{}.yml'.format(args.model))
model = GeneralModel(gnn_dim_node, gnn_dim_edge, sample_param, memory_param, gnn_param, train_param).cuda()
model = DDP(model)
Then a :code:`GeneralModel` object is created. If needed, we can adjust the model's parameters by modifying the contents of the configuration file, or by overriding the parsed values in code before constructing the model.
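This is a minimal sketch, assuming the parsed sections behave like the plain Python dictionaries returned by :code:`parse_config`; the overridden keys and the config path are only examples:
.. code-block:: python
sample_param, memory_param, gnn_param, train_param = parse_config('./config/TGN.yml')
gnn_param['att_head'] = 4 # e.g. use more attention heads
train_param['dropout'] = 0.3 # e.g. stronger regularization
model = GeneralModel(gnn_dim_node, gnn_dim_edge, sample_param, memory_param, gnn_param, train_param).cuda()
Here we provide 5 models for continuous-time temporal GNNs: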
- :code:`TGN`: The TGN model proposed in `Temporal Graph Networks for Deep Learning on Dynamic Graphs <https://arxiv.org/abs/2006.10637>`__.
- :code:`DyRep`: The DyRep model proposed in `Representation Learning over Dynamic Graphs <https://arxiv.org/abs/1803.04051>`__.
- :code:`TIGER`: The TIGER model proposed in `TIGER: Temporal Interaction Graph Embedding with Restarts <https://arxiv.org/abs/2302.06057>`__.
- :code:`Jodie`: The JODIE model proposed in `Predicting Dynamic Embedding Trajectory in Temporal Interaction Networks <https://arxiv.org/abs/1908.01207>`__.
- :code:`TGAT`: The TGAT model proposed in `Inductive Representation Learning on Temporal Graphs <https://arxiv.org/abs/2002.07962>`__.
\ No newline at end of file
......@@ -3,11 +3,17 @@
mkdir -p build && cd build
cmake .. \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DCMAKE_PREFIX_PATH="/home/hwj/.miniconda3/envs/sgl/lib/python3.10/site-packages" \
-DPython3_ROOT_DIR="/home/hwj/.miniconda3/envs/sgl" \
-DCUDA_TOOLKIT_ROOT_DIR="/home/hwj/.local/cuda-11.8" \
-DCMAKE_PREFIX_PATH=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())") \
-DPython3_ROOT_DIR=$(python -c "import sys; print(sys.prefix)") \
-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_HOME:-"$(realpath $(dirname $(which nvcc))/../)"} \
&& make -j32 \
&& rm -rf ../starrygl/lib \
&& mkdir ../starrygl/lib \
&& cp lib*.so ../starrygl/lib/ \
&& patchelf --set-rpath '$ORIGIN:$ORIGIN/lib' --force-rpath ../starrygl/lib/*.so
&& patchelf --set-rpath '$ORIGIN:$ORIGIN/lib' --force-rpath ../starrygl/lib/*.so
\ No newline at end of file
......@@ -3,7 +3,7 @@ torch==2.1.1+cu118
torchvision==0.16.1+cu118
torchaudio==2.1.1+cu118
--extra-index-url https://data.pyg.org/whl/torch-2.1.0+cu118.html
--find-links https://data.pyg.org/whl/torch-2.1.0+cu118.html
torch_geometric==2.4.0
pyg_lib==0.3.1+pt21cu118
torch_scatter==2.1.2+pt21cu118
......@@ -11,6 +11,12 @@ torch_sparse==0.6.18+pt21cu118
torch_cluster==1.6.3+pt21cu118
torch_spline_conv==1.2.2+pt21cu118
--find-links https://data.dgl.ai/wheels/cu118/repo.html
dgl==1.1.3+cu118
--find-links https://data.dgl.ai/wheels-test/repo.html
dglgo==0.0.2
ogb
tqdm
networkx
\ No newline at end of file
networkx
......@@ -294,7 +294,7 @@ class DistributedTensor:
index = dist_index.loc
futs: List[torch.futures.Future] = []
for i in range(self.num_parts()):
for i in range(self.num_parts):
mask = part_idx == i
f = self.accessor.async_index_copy_(0, index[mask], source[mask], self.rrefs[i])
futs.append(f)
......@@ -308,7 +308,7 @@ class DistributedTensor:
index = dist_index.loc
futs: List[torch.futures.Future] = []
for i in range(self.num_parts()):
for i in range(self.num_parts):
mask = part_idx == i
f = self.accessor.async_index_add_(0, index[mask], source[mask], self.rrefs[i])
futs.append(f)
......
import torch
import dgl
from os.path import abspath, join, dirname
import sys
sys.path.insert(0, join(abspath(dirname(__file__))))
......@@ -47,7 +46,7 @@ class GeneralModel(torch.nn.Module):
self.edge_predictor = EdgePredictor(gnn_param['dim_out'])
if 'combine' in gnn_param and gnn_param['combine'] == 'rnn':
self.combiner = torch.nn.RNN(gnn_param['dim_out'], gnn_param['dim_out'])
def forward(self, mfgs, metadata = None,neg_samples=1):
if self.memory_param['type'] == 'node':
......@@ -68,8 +67,14 @@ class GeneralModel(torch.nn.Module):
out = torch.stack(out, dim=0)
out = self.combiner(out)[0][-1, :, :]
# metadata needs to have recorded the ids during the earlier deduplication step
if self.gnn_param['use_src_emb'] or self.gnn_param['use_dst_emb']:
self.embedding = out.detach().clone()
else:
self.embedding = None
if metadata is not None:
#out = torch.cat((out[metadata['dst_pos_pos']],out[metadata['src_id_pos']],out[metadata['dst_neg_pos']]),0)
if self.gnn_param['dyrep']:
out = self.memory_updater.last_updated_memory
out = torch.cat((out[metadata['src_pos_index']],out[metadata['dst_pos_index']],out[metadata['src_neg_index']]),0)
return self.edge_predictor(out, neg_samples=neg_samples)
......
import yaml
import numpy as np
def parse_config(f):
conf = yaml.safe_load(open(f, 'r'))
......@@ -7,4 +7,32 @@ def parse_config(f):
memory_param = conf['memory'][0]
gnn_param = conf['gnn'][0]
train_param = conf['train'][0]
return sample_param, memory_param, gnn_param, train_param
\ No newline at end of file
return sample_param, memory_param, gnn_param, train_param
class EarlyStopMonitor(object):
def __init__(self, max_round=3, higher_better=True, tolerance=1e-10):
self.max_round = max_round
self.num_round = 0
self.epoch_count = 0
self.best_epoch = 0
self.last_best = None
self.higher_better = higher_better
self.tolerance = tolerance
def early_stop_check(self, curr_val):
if not self.higher_better:
curr_val *= -1
if self.last_best is None:
self.last_best = curr_val
elif (curr_val - self.last_best) / np.abs(self.last_best) > self.tolerance:
self.last_best = curr_val
self.num_round = 0
self.best_epoch = self.epoch_count
else:
self.num_round += 1
self.epoch_count += 1
return self.num_round >= self.max_round
\ No newline at end of file
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from torch import Tensor
from typing import *
from torch_scatter import segment_csr, gather_csr
from torch_sparse import SparseTensor
__all__ = [
"EmmaAttention",
"EmmaSum",
]
class EmmaAttention(nn.Module):
def __init__(self) -> None:
super().__init__()
self.register_buffer(
"his_x",
torch.empty(0),
persistent=False,
)
self.register_buffer(
"his_m",
torch.empty(0),
persistent=False,
)
self.register_buffer(
"inv_w",
torch.empty(0),
persistent=False,
)
self.reset_parameters()
def reset_parameters(self):
self.get_buffer("his_x").zero_()
self.get_buffer("his_m").fill_(-torch.inf)
self.get_buffer("inv_w").zero_()
def forward(self, x: Tensor, max_a: Tensor, agg_n: Tensor):
if self.training:
his_x = self.get_buffer("his_x")
his_m = self.get_buffer("his_m")
inv_w = self.get_buffer("inv_w")
x = EmmaAttentionFunction.apply(
x, max_a, his_x, his_m, agg_n, inv_w)
else:
inv_w = 1.0 / agg_n.data
inv_w[agg_n == 0] = 0.0
self._copy_or_clone("his_x", x)
self._copy_or_clone("his_m", max_a)
self._copy_or_clone("inv_w", inv_w)
return x
def _copy_or_clone(self, name: str, x: Tensor):
_x = self.get_buffer(name)
if _x.size() != x.size():
self.register_buffer(
name, x.data.clone(), persistent=False)
else:
_x.copy_(x.data)
@staticmethod
def softmax_gat(
src_a: Tensor,
dst_a: Tensor,
adj_t: SparseTensor,
negative_slope: float = 0.01,
) -> Tuple[SparseTensor, Tensor]:
assert src_a.dim() in {1, 2}
assert src_a.dim() == dst_a.dim()
ptr, ind, val = adj_t.csr()
a = src_a[ind] + gather_csr(dst_a, ptr)
a = F.leaky_relu(a, negative_slope=negative_slope)
with torch.no_grad():
max_a = torch.full_like(dst_a, -torch.inf)
max_a = segment_csr(a, ptr, reduce="max", out=max_a)
exp_a = torch.exp(a - gather_csr(max_a, ptr))
if val is not None:
assert val.dim() == 1
if exp_a.dim() == 1:
exp_a = exp_a * val
else:
exp_a = exp_a * val.unsqueeze(-1)
sum_exp_a = segment_csr(exp_a, ptr, reduce="sum")
exp_a = exp_a / gather_csr(sum_exp_a, ptr)
with torch.no_grad():
max_a.add_(sum_exp_a.log())
adj_t = SparseTensor(rowptr=ptr, col=ind, value=exp_a)
return adj_t, max_a
@staticmethod
def apply_gat(
x: Tensor,
src_a: Tensor,
dst_a: Tensor,
adj_t: SparseTensor,
negative_slope: float = 0.01,
) -> Tuple[Tensor, Tensor]:
adj_t, max_a = EmmaAttention.softmax_gat(
src_a=src_a, dst_a=dst_a,
adj_t=adj_t, negative_slope=negative_slope,
)
ptr, ind, val = adj_t.csr()
if val.dim() == 1:
assert x.dim() == 2
x = adj_t @ x
elif val.dim() == 2:
assert x.dim() == 3
assert x.size(1) == val.size(1)
xs = []
for i in range(x.size(1)):
xs.append(
SparseTensor(
rowptr=ptr, col=ind, value=val[:,i],
) @ x[:,i,:]
)
x = torch.cat(xs, dim=1).view(-1, *x.shape[1:])
return x, max_a
class EmmaAttentionFunction(autograd.Function):
@staticmethod
def forward(
ctx: autograd.function.FunctionCtx,
x: Tensor,
max_a: Tensor,
his_x: Tensor,
his_m: Tensor,
agg_n: Tensor,
inv_w: Tensor,
):
assert x.dim() in {2, 3}
assert x.dim() == his_x.dim()
assert max_a.dim() == his_m.dim()
beta = (1.0 - inv_w * agg_n).clamp_(0.0, 1.0)
if x.dim() == 2:
assert max_a.dim() == 1
elif x.dim() == 3:
assert max_a.dim() == 2
beta = beta.unsqueeze_(-1)
max_m = torch.max(max_a, his_m)
p = (his_m - max_m).nan_to_num_(0.0).exp_().mul_(beta)
q = (max_a - max_m).nan_to_num_(0.0).exp_()
t = p + q
p.div_(t).unsqueeze_(-1)
q.div_(t).unsqueeze_(-1)
his_x.mul_(p).add_(x * q)
his_m.copy_(max_m).add_(t.log_())
ctx.save_for_backward(q)
return his_x
@staticmethod
def backward(
ctx: autograd.function.FunctionCtx,
grad: Tensor,
):
q, = ctx.saved_tensors
return grad * q, None, None, None, None, None
class EmmaSum(nn.Module):
def __init__(self) -> None:
super().__init__()
self.register_buffer(
"his_x",
torch.empty(0),
persistent=False,
)
self.register_buffer(
"inv_w",
torch.empty(0),
persistent=False,
)
self.reset_parameters()
def reset_parameters(self):
self.get_buffer("his_x").zero_()
self.get_buffer("inv_w").zero_()
def forward(self, x: Tensor, agg_n: Tensor, aggr: str = "sum"):
assert aggr in {"sum", "mean"}
if self.training:
his_x = self.get_buffer("his_x")
inv_w = self.get_buffer("inv_w")
x = EmmaSumFunction.apply(x, his_x, agg_n, inv_w)
else:
inv_w = 1.0 / agg_n.data
inv_w[agg_n == 0] = 0.0
self._copy_or_clone("his_x", x)
self._copy_or_clone("inv_w", inv_w)
if aggr == "mean":
x = x * inv_w[:,None]
return x
def _copy_or_clone(self, name: str, x: Tensor):
_x = self.get_buffer(name)
if _x.size() != x.size():
self.register_buffer(
name, x.data.clone(), persistent=False)
else:
_x.copy_(x.data)
class EmmaSumFunction(autograd.Function):
@staticmethod
def forward(
ctx: autograd.function.FunctionCtx,
x: Tensor,
his_x: Tensor,
agg_n: Tensor,
inv_w: Tensor,
):
assert x.dim() == 2
assert his_x.dim() == x.dim()
beta = (1.0 - inv_w * agg_n) \
.clamp_(0.0, 1.0).unsqueeze_(-1)
his_x.mul_(beta).add_(x)
# ctx.save_for_backward(inv_w)
return his_x
@staticmethod
def backward(
ctx: autograd.function.FunctionCtx,
grad: Tensor,
):
# inv_w, = ctx.saved_tensors
# return grad * inv_w[:,None], None, None, None
return grad, None, None, None
\ No newline at end of file
......@@ -75,6 +75,12 @@ class LayerPipe(ABC):
models.append((key, val))
return tuple(models)
def parameters(self):
params: List[nn.Parameter] = []
for name, m in self.get_model():
params.extend(m.parameters())
return params
def register_route(self, *xs: Tensor):
for t in xs:
t.requires_route = True
......
......@@ -55,6 +55,12 @@ class SequencePipe(ABC):
models.append((key, val))
return tuple(models)
def parameters(self):
params: List[nn.Parameter] = []
for name, m in self.get_model():
params.extend(m.parameters())
return params
def to(self, device: Any):
for _, net in self.get_model():
net.to(device)
......
......@@ -17,12 +17,31 @@ class FetchFeatureCache:
graph: DistributedGraphStore,
mailbox:SharedMailBox = None,
policy = 'lru'):
"""
method to create a fetch cache instance.
Args:
num_nodes: Total number of nodes in the graph.
num_edges: Total number of edges in the graph.
edge_cache_ratio: Proportion of edges to keep in the cache.
node_cache_ratio: Proportion of nodes to keep in the cache.
graph: Distributed graph store.
mailbox: Shared mailbox used for storing node memory and messages.
policy: Caching policy, either 'lru' or 'static'.
"""
global _FetchCache
_FetchCache = FetchFeatureCache(num_nodes, num_edges,
edge_cache_ratio, node_cache_ratio,
graph,mailbox,policy)
@staticmethod
def getFetchCache():
"""
method to get the existing fetch cache instance.
Returns:
FetchFeatureCache: The existing fetch cache instance.
"""
global _FetchCache
return _FetchCache
def __init__(self, num_nodes: int, num_edges: int,
......@@ -31,6 +50,19 @@ class FetchFeatureCache:
mailbox:SharedMailBox = None,
policy = 'lru'
):
"""
Initializes the FetchFeatureCache instance.
Args:
num_nodes: Total number of nodes in the graph.
num_edges: Total number of edges in the graph.
edge_cache_ratio: Proportion of edges to keep in the cache.
node_cache_ratio: Proportion of nodes to keep in the cache.
graph: Distributed graph store.
mailbox: Shared mailbox used for storing node memory and messages.
policy: Caching policy, either 'lru' or 'static'.
"""
if policy == 'lru':
init_fn = LRU_cache.LRUCache
elif policy == 'static':
......@@ -62,7 +94,17 @@ class FetchFeatureCache:
def fetch_feature(self, nid: Optional[torch.Tensor] = None, dist_nid = None,
eid: Optional[torch.Tensor] = None, dist_eid = None
):
):
"""
Fetches node and edge features along with mailbox memory.
Args:
nid: Node indices to fetch features for.
dist_nid: The distributed indices corresponding to nid, used for remote communication.
eid: Edge indices to fetch features for.
dist_eid: The distributed indices corresponding to eid, used for remote communication.
"""
nfeat = None
mem = None
efeat = None
......@@ -147,6 +189,14 @@ class FetchFeatureCache:
return nfeat,efeat,mem
def init_cache_with_presample(self,dataloader, num_epoch:int = 10):
"""
Initializes the cache with pre-sampled data from the provided dataloader.
Args:
dataloader: The data loader we implement, containing the graph data.
num_epoch: Number of epochs to pre-sample the data.
"""
node_size = self.node_cache.capacity if self.node_cache is not None else 0
edge_size = self.edge_cache.capacity if self.edge_cache is not None else 0
node_counts,edge_counts = pre_sample(dataloader=dataloader,
......
......@@ -21,10 +21,54 @@ import math
class DistributedDataLoader:
'''
Args:
data_path: the path of loaded graph ,each part 0 of graph is saved on $path$/rank_0
num_replicas: the num of worker
We will perform feature fetching in the data loader.
You can simply define a data loader for use, while StarryGL assists in fetching node or edge features:
Args:
graph: distributed graph store
data: the graph data
sampler: a parallel sampler like `NeighborSampler` above
sampler_fn: sample type
neg_sampler: negative sampler
batch_size: batch size
mailbox: APAN's mailbox and TGN's memory implemented by starrygl
Examples:
.. code-block:: python
import torch
from starrygl.sample.data_loader import DistributedDataLoader
from starrygl.sample.part_utils.partition_tgnn import partition_load
from starrygl.sample.graph_core import DataSet, DistributedGraphStore, TemporalNeighborSampleGraph
from starrygl.sample.memory.shared_mailbox import SharedMailBox
from starrygl.sample.sample_core.neighbor_sampler import NeighborSampler
from starrygl.sample.sample_core.base import NegativeSampling
from starrygl.sample.batch_data import SAMPLE_TYPE
pdata = partition_load("PATH/{}".format(dataname), algo="metis_for_tgnn")
graph = DistributedGraphStore(pdata = pdata, uvm_edge = False, uvm_node = False)
sample_graph = TemporalNeighborSampleGraph(sample_graph = pdata.sample_graph,mode = 'full')
mailbox = SharedMailBox(pdata.ids.shape[0], memory_param, dim_edge_feat=pdata.edge_attr.shape[1] if pdata.edge_attr is not None else 0)
sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=1, fanout=[10], graph_data=sample_graph, workers=15, policy='recent', graph_name="wiki_train")
neg_sampler = NegativeSampling('triplet')
train_data = torch.masked_select(graph.edge_index, pdata.train_mask.to(graph.edge_index.device)).reshape(2, -1)
trainloader = DistributedDataLoader(graph, train_data, sampler=sampler, sampler_fn=SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES, neg_sampler=neg_sampler, batch_size=1000, shuffle=False, drop_last=True, chunk_size=None, train=True, mailbox=mailbox)
In the data loader we call `graph_sample`, sourced from `starrygl.sample.batch_data`, and its `to_block` function performs the feature fetching.
If the cache is not used, node or edge features are fetched directly from the graph data; otherwise `fetch_data` is called to fetch the features.
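A minimal sketch of iterating the loader (the exact contents of each batch depend on the sampler, negative sampler and mailbox configuration):
.. code-block:: python
for batch_data in trainloader:
    # node/edge features (and mailbox memory, if configured) are already fetched here
    ...  # hypothetical training step using batch_data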
'''
def __init__(
......@@ -111,10 +155,10 @@ class DistributedDataLoader:
self.expected_idx = data_size // self.batch_size if self.drop_last is True else int(math.ceil(data_size/self.batch_size))
if dist.get_world_size() > 1:
num_epochs = torch.tensor([self.expected_idx],dtype = torch.long,device=self.device)
print(num_epochs)
dist.all_reduce(num_epochs, op=op)
self.expected_idx = int(num_epochs.item())
num_batchs = torch.tensor([self.expected_idx],dtype = torch.long,device=self.device)
print("num_batchs:", num_batchs)
dist.all_reduce(num_batchs, op=op)
self.expected_idx = int(num_batchs.item())
def _next_data(self):
if self.current_pos >= self.dataset.len:
......@@ -148,6 +192,7 @@ class DistributedDataLoader:
self.device)
self.recv_idxs += 1
assert batch_data is not None
torch.cuda.synchronize()
return batch_data
else :
raise StopIteration
......
import starrygl
from starrygl.distributed.context import DistributedContext
from starrygl.distributed.utils import DistIndex, DistributedTensor
from starrygl.sample.graph_core.utils import build_mapper
......@@ -6,8 +7,22 @@ import torch
import torch.distributed as dist
from torch_geometric.data import Data
from starrygl.utils.uvm import cudaMemoryAdvise, uvm_advise, uvm_empty, uvm_prefetch, uvm_share
class DistributedGraphStore:
'''
Initializes the DistributedGraphStore with distributed graph data.
Args:
pdata: Graph data object containing ids, eids, edge_index, edge_ts, sample_graph, x, and edge_attr.
device: Device to which tensors are moved (default is 'cuda').
uvm_node: If True, enables Unified Virtual Memory (UVM) for node data.
uvm_edge: If True, enables Unified Virtual Memory (UVM) for edge data.
'''
def __init__(self, pdata, device = torch.device('cuda'),
uvm_node = False,
uvm_edge = False):
......@@ -36,12 +51,12 @@ class DistributedGraphStore:
x = pdata.x.to(self.device)
else:
if self.device.type == 'cuda':
x = uvm_empty(*pdata.x.size(),
x = starrygl.utils.uvm.uvm_empty(*pdata.x.size(),
dtype=pdata.x.dtype,
device=ctx.device)
uvm_share(x,device = ctx.device)
uvm_advise(x,cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
uvm_prefetch(x)
starrygl.utils.uvm.uvm_share(x,device = ctx.device)
starrygl.utils.uvm.uvm_advise(x,starrygl.utils.uvm.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
starrygl.utils.uvm.uvm_prefetch(x)
if world_size > 1:
self.x = DistributedTensor(pdata.x.to(self.device).to(torch.float))
else:
......@@ -56,12 +71,12 @@ class DistributedGraphStore:
edge_attr = pdata.edge_attr.to(self.device)
else:
if self.device.type == 'cuda':
edge_attr = uvm_empty(*pdata.edge_attr.size(),
edge_attr = starrygl.utils.uvm.uvm_empty(*pdata.edge_attr.size(),
dtype=pdata.edge_attr.dtype,
device=ctx.device)
uvm_share(edge_attr,device = ctx.device)
uvm_advise(edge_attr,cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
uvm_prefetch(edge_attr)
starrygl.utils.uvm.uvm_share(edge_attr,device = ctx.device)
starrygl.utils.uvm.uvm_advise(edge_attr,starrygl.utils.uvm.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
starrygl.utils.uvm.uvm_prefetch(edge_attr)
if world_size > 1:
self.edge_attr = DistributedTensor(edge_attr)
else:
......@@ -70,6 +85,15 @@ class DistributedGraphStore:
self.edge_attr = None
def _get_node_attr(self,ids,asyncOp = False):
'''
Retrieves node attributes for the specified node IDs.
Args:
ids: Node IDs for which to retrieve attributes.
asyncOp: If True, performs asynchronous operation for distributed data.
'''
if self.x is None:
return None
elif dist.get_world_size() == 1:
......@@ -81,6 +105,15 @@ class DistributedGraphStore:
return self.x.index_select(ids)
def _get_edge_attr(self,ids,asyncOp = False):
'''
Retrieves edge attributes for the specified edge IDs.
Args:
ids: Edge IDs for which to retrieve attributes.
asyncOp: If True, performs asynchronous operation for distributed data.
'''
if self.edge_attr is None:
return None
elif dist.get_world_size() == 1:
......@@ -93,9 +126,32 @@ class DistributedGraphStore:
return self.edge_attr.index_select(ids)
def _get_dist_index(self,ind,mapper):
'''
Retrieves the distributed index for the specified local index using the provided mapper.
Args:
ind: Local index for which to retrieve the distributed index.
mapper: Mapper providing the distributed index.
'''
return mapper[ind.to(mapper.device)]
class DataSet:
'''
Args:
nodes: Tensor representing nodes. If not None, it is moved to the specified device.
edges: Tensor representing edges. If not None, it is moved to the specified device.
labels: Optional parameter for labels.
ts: Tensor representing timestamps. If not None, it is moved to the specified device.
device: Device to which tensors are moved (default is 'cuda').
'''
def __init__(self,nodes = None,
edges = None,
labels = None,
......@@ -110,10 +166,15 @@ class DataSet:
if labels is not None:
self.labels = labels
self.len = self.nodes.shape[0] if nodes is not None else self.edges.shape[1]
for k, v in kwargs.items():
assert isinstance(v,torch.Tensor) and v.shape[0]==self.len
setattr(self, k, v.to(device))
def _get_empty(self):
'''
Creates an empty dataset with the same device and data types as the current instance.
'''
nodes = torch.empty([],dtype = self.nodes.dtype,device= self.nodes.device)if hasattr(self,'nodes') else None
edges = torch.empty([[],[]],dtype = self.edges.dtype,device= self.edges.device)if hasattr(self,'edges') else None
d = DataSet(nodes,edges)
......@@ -126,6 +187,13 @@ class DataSet:
#@staticmethod
def get_next(self,indx):
'''
Retrieves the next dataset based on the provided index.
Args:
indx: Index specifying the dataset to retrieve.
'''
nodes = self.nodes[indx] if hasattr(self,'nodes') else None
edges = self.edges[:,indx] if hasattr(self,'edges') else None
d = DataSet(nodes,edges)
......@@ -138,6 +206,10 @@ class DataSet:
#@staticmethod
def shuffle(self):
'''
Shuffles the dataset and returns a new dataset with the same attributes.
'''
indx = torch.randperm(self.len)
nodes = self.nodes[indx] if hasattr(self,'nodes') else None
edges = self.edges[:,indx] if hasattr(self,'edges') else None
......@@ -151,7 +223,7 @@ class DataSet:
class TemporalGraphData(DistributedGraphStore):
def __init__(self,pdata,device):
super(TemporalGraphData,self).__init__(pdata,device)
super(DistributedGraphStore,self).__init__(pdata,device)
def _set_temporal_batch_cache(self,size,pin_size):
pass
def _load_feature_to_cuda(self,ids):
......@@ -161,6 +233,17 @@ class TemporalGraphData(DistributedGraphStore):
class TemporalNeighborSampleGraph(DistributedGraphStore):
'''
Args:
sample_graph: A dictionary containing graph structure information, including 'edge_index', 'ts' (edge timestamp), and 'eids' (edge identifiers).
mode: Specifies the dataset mode ('train', 'val', 'test', or 'full').
eids_mapper: Optional parameter for edge identifiers mapping.
'''
def __init__(self, sample_graph=None, mode='full', eids_mapper=None):
self.edge_index = sample_graph['edge_index']
self.num_edges = self.edge_index.shape[1]
......
import starrygl
from typing import Union
from typing import List
from typing import Optional
......@@ -8,9 +9,41 @@ from starrygl.distributed.context import DistributedContext
from starrygl.distributed.utils import DistIndex, DistributedTensor
import torch.distributed as dist
from starrygl.utils.uvm import cudaMemoryAdvise, uvm_advise, uvm_empty, uvm_prefetch, uvm_share
#from starrygl.utils.uvm import cudaMemoryAdvise
class SharedMailBox():
'''
We will first define our mailbox, including the definitions of the mailbox and memory:
.. code-block:: python
from starrygl.sample.memory.shared_mailbox import SharedMailBox
mailbox = SharedMailBox(num_nodes=num_nodes, memory_param=memory_param, dim_edge_feat=dim_edge_feat)
Args:
num_nodes (int): number of nodes
memory_param (dict): the memory parameters in the yaml file; refer to TGL
dim_edge_feat (int): the dim of edge feature
device (torch.device): the device used to store MailBox
uvm (bool): 1-use uvm, 0-don't use uvm
Examples:
.. code-block:: python
from starrygl.sample.part_utils.partition_tgnn import partition_load
from starrygl.sample.memory.shared_mailbox import SharedMailBox
pdata = partition_load("PATH/{}".format(dataname), algo="metis_for_tgnn")
mailbox = SharedMailBox(pdata.ids.shape[0], memory_param, dim_edge_feat=pdata.edge_attr.shape[1] if pdata.edge_attr is not None else 0)
We then need to hand over the mailbox to the data loader as in the above example, so that the relevant memory/mailbox can be directly loaded during training.
During training, we constantly call the `get_update_memory`/`get_update_mail` functions to update the relevant storage, following the idea of TGN.
'''
def __init__(self,
num_nodes,
memory_param,
......@@ -47,18 +80,18 @@ class SharedMailBox():
if uvm is True:
ctx = DistributedContext.get_default_context()
node_memory = uvm_empty(*node_memory.shape,
node_memory = starrygl.utils.uvm.uvm_empty(*node_memory.shape,
dtype=node_memory.dtype,
device=ctx.device)
uvm_share(node_memory,device = ctx.device)
uvm_advise(node_memory,cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
uvm_prefetch(node_memory)
mailbox = uvm_empty(*mailbox.shape,
starrygl.utils.uvm.uvm_share(node_memory,device = ctx.device)
starrygl.utils.uvm.uvm_advise(node_memory,starrygl.utils.uvm.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
starrygl.utils.uvm.uvm_prefetch(node_memory)
mailbox = starrygl.utils.uvm.uvm_empty(*mailbox.shape,
dtype=mailbox.dtype,
device=ctx.device)
uvm_share(mailbox,device = ctx.device)
uvm_advise(mailbox,cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
uvm_prefetch(mailbox)
starrygl.utils.uvm.uvm_share(mailbox,device = ctx.device)
starrygl.utils.uvm.uvm_advise(mailbox,starrygl.utils.uvm.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
starrygl.utils.uvm.uvm_prefetch(mailbox)
self.node_memory = DistributedTensor(node_memory)
self.node_memory_ts = DistributedTensor(node_memory_ts)
self.mailbox = DistributedTensor(mailbox)
......@@ -266,7 +299,7 @@ class SharedMailBox():
def get_update_mail(self,dist_indx_mapper,
src,dst,ts,edge_feats,
memory):
memory,embedding=None,use_src_emb=False,use_dst_emb=False):
if edge_feats is not None:
edge_feats = edge_feats.to(self.device).to(self.mailbox.dtype)
src = src.to(self.device)
......@@ -276,12 +309,14 @@ class SharedMailBox():
mem_src = memory[src]
mem_dst = memory[dst]
if embedding is not None:
emb_src = embedding[src]
emb_dst = embedding[dst]
src_mail = torch.cat([emb_src if use_src_emb else mem_src, emb_dst if use_dst_emb else mem_dst], dim=1)
dst_mail = torch.cat([emb_dst if use_src_emb else mem_dst, emb_src if use_dst_emb else mem_src], dim=1)
if edge_feats is not None:
src_mail = torch.cat([mem_src, mem_dst, edge_feats], dim=1)
dst_mail = torch.cat([mem_dst, mem_src, edge_feats], dim=1)
else:
src_mail = torch.cat([mem_src, mem_dst], dim=1)
dst_mail = torch.cat([mem_dst, mem_src], dim=1)
src_mail = torch.cat([src_mail, edge_feats], dim=1)
dst_mail = torch.cat([dst_mail, edge_feats], dim=1)
mail = torch.cat([src_mail, dst_mail], dim=1).reshape(-1, src_mail.shape[1])
mail_ts = torch.cat((ts,ts),-1).to(self.device).to(self.mailbox_ts.dtype)
unq_index,inv = torch.unique(index,return_inverse = True)
......@@ -291,7 +326,6 @@ class SharedMailBox():
index = unq_index
return index,mail,mail_ts
def get_update_memory(self,index,memory,memory_ts):
unq_index,inv = torch.unique(index,return_inverse = True)
max_ts,idx = torch_scatter.scatter_max(memory_ts,inv,0)
......
from torch_sparse import SparseTensor
from torch_geometric.data import Data
from torch_geometric.utils import degree
import os.path as osp
import os
import shutil
......
import os.path as osp
import torch
class GraphData():
def __init__(self, path):
assert path is not None and osp.exists(path),'path does not exist'
id,edge_index,data,partptr =torch.load(path)
# index of the current partition
self.partition_id = id
# total number of partitions
self.partitions = partptr.numel() - 1
# full-graph structure data
self.num_nodes = partptr[self.partitions]
self.num_edges = edge_index[0].numel()
self.edge_index = edge_index
# data of this partition (feature vectors and subgraph structure), a PyG Data object
self.data = data
# partition mapping
self.partptr = partptr
self.eid = [i for i in range(self.num_edges)]
def __init__(self, id, edge_index, data, partptr, timestamp=None):
# index of the current partition
self.partition_id = id
# total number of partitions
self.partitions = partptr.numel() - 1
# full-graph structure data
self.num_nodes = partptr[self.partitions]
if edge_index is not None:
self.num_edges = edge_index[0].numel()
self.edge_index = edge_index
self.edge_ts = timestamp
# data of this partition (feature vectors and subgraph structure), a PyG Data object
self.data = data
# partition mapping
self.partptr = partptr
# edge id
self.eid = torch.tensor([i for i in range(0, self.num_edges)])
def select_attr(self,index):
return torch.index_select(self.data.x,0,index)
# return the partition that a global node id belongs to
def get_part_num(self):
return self.data.x.size()[0]
def select_attr(self,index):
return torch.index_select(self.data.x,0,index)
def select_y(self,index):
return torch.index_select(self.data.y,0,index)
# return the partition that a global node id belongs to
def get_localId_by_partitionId(self,id,index):
#print(index)
if(id == -1 or id == 0):
return index
else:
return torch.add(index,-self.partptr[id])
def get_globalId_by_partitionId(self,id,index):
if(id == -1 or id == 0):
return index
else:
return torch.add(index,self.partptr[id])
def get_node_num(self):
return self.num_nodes
def localId_to_globalId(self,id,partitionId:int = -1):
'''
Map a node id within partition partitionId to its global id.
'''
if partitionId == -1:
partitionId = self.partition_id
assert id >=self.partptr[self.partition_id] and id < self.partptr[self.partition_id+1]
ids_before = 0
if self.partition_id>0:
ids_before = self.partptr[self.partition_id-1]
return id+ids_before
def get_partitionId_by_globalId(self,id):
'''
Get the partition index corresponding to a global id.
'''
partitionId = -1
assert id>=0 and id<self.num_nodes,'id out of range'
for i in range(self.partitions):
if id>=self.partptr[i] and id<self.partptr[i+1]:
partitionId = i
break
assert partitionId>=0, 'no partition found for this id'
return partitionId
def get_nodes_by_partitionId(self,id):
'''
Return the number of nodes in the partition given by partitionId.
'''
assert id>=0 and id<self.partitions,'invalid partitionId'
return (int)(self.partptr[id+1]-self.partptr[id])
def __repr__(self):
return (f'{self.__class__.__name__}(\n'
f' partition_id={self.partition_id}\n'
f' data={self.data},\n'
f' global_info('
f'num_nodes={self.num_nodes},'
f' num_edges={self.num_edges},'
f' num_parts={self.partitions},'
f' edge_index=[2,{self.edge_index[0].numel()}])\n'
f')')
import starrygl
import sys
from os.path import abspath, join, dirname
sys.path.insert(0, join(abspath(dirname(__file__))))
import math
import torch
......@@ -9,16 +9,72 @@ from typing import Optional, Tuple
from .base import BaseSampler, NegativeSampling, SampleOutput, SampleType
# from sample_cores import ParallelSampler, get_neighbors, heads_unique
from starrygl.lib.libstarrygl_sampler import ParallelSampler, get_neighbors
from torch.distributed.rpc import rpc_async
# def outer_sample(graph_name, nodes, ts, fanout_index, with_outer_sample = SampleType.Outer): # by default, continue sampling outward here
# local_sampler = get_local_sampler(graph_name)
# assert local_sampler is not None, 'Local_sampler is None!!!'
# out = local_sampler.sample_from_nodes(nodes, with_outer_sample, ts, fanout_index)
# return out
from torch.distributed.rpc import rpc_async
class NeighborSampler(BaseSampler):
r'''
Parallel sampling is crucial for scaling model training to large amounts of data. Due to the large scale and complexity of graph data, traditional serial sampling can waste significant computing and storage resources. Parallel sampling improves the efficiency and overall speed of sampling by sampling from multiple nodes or neighbors simultaneously.
This helps to accelerate the training and inference process of the model, making it more scalable and practical when dealing with large-scale graph data.
Our parallel sampling adopts a hybrid CPU/GPU approach: the entire graph structure is stored on the CPU, the sampling is performed on the CPU, and the sampled results are then uploaded to the GPU. Each trainer has a separate sampler for parallel training.
We have encapsulated the functions for parallel sampling, and you can easily use them in the following ways:
.. code-block:: python
# First, you need to import the Python packages
from starrygl.sample.sample_core.neighbor_sampler import NeighborSampler
# Then, you can use our parallel sampler
sampler = NeighborSampler(num_nodes=num_nodes, num_layers=num_layers, fanout=fanout, graph_data=graph_data,
workers=workers, is_distinct = is_distinct, policy = policy, edge_weight= edge_weight, graph_name = graph_name)
Args:
num_nodes (int): the num of all nodes in the graph
num_layers (int): the num of layers to be sampled
fanout (list): the maximum number of neighbors chosen for each layer
graph_data (:class: starrygl.sample.sample_core.neighbor_sampler): the graph data you want to sample
workers (int): the number of threads, default value is 1
is_distinct (bool): 1 - need distinct multi-edges, 0 - don't need distinct multi-edges
policy (str): "uniform" or "recent" or "weighted"
edge_weight (torch.Tensor,Optional): the initial weights of edges
graph_name (str): the name of the graph; either edge_index or (neighbors, deg) should be provided
Examples:
.. code-block:: python
from starrygl.sample.part_utils.partition_tgnn import partition_load
from starrygl.sample.graph_core import DataSet, DistributedGraphStore, TemporalNeighborSampleGraph
from starrygl.sample.sample_core.neighbor_sampler import NeighborSampler
pdata = partition_load("PATH/{}".format(dataname), algo="metis_for_tgnn")
graph = DistributedGraphStore(pdata = pdata,uvm_edge = False,uvm_node = False)
sample_graph = TemporalNeighborSampleGraph(sample_graph = pdata.sample_graph,mode = 'full')
sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=1, fanout=[10],
graph_data=sample_graph, workers=15, policy = 'recent', graph_name = "wiki_train")
If you want to directly call parallel sampling functions, use the following methods:
.. code-block:: python
# the parameter meaning is the same as the `Args` above
from starrygl.lib.libstarrygl_sampler import ParallelSampler, get_neighbors
# get the neighbor information table; row and col come from graph_data.edge_index = (row, col)
tnb = get_neighbors(graph_name, row.contiguous(), col.contiguous(), num_nodes, is_distinct, graph_data.eid, edge_weight, timestamp)
# call parallel sampler
p_sampler = ParallelSampler(self.tnb, num_nodes, graph_data.num_edges, workers, fanout, num_layers, policy)
For complete usage and more details, please refer to `~starrygl.sample.sample_core.neighbor_sampler`
'''
def __init__(
self,
num_nodes: int,
......@@ -68,11 +124,11 @@ class NeighborSampler(BaseSampler):
row, col = graph_data.edge_index
if(edge_weight is not None):
edge_weight = edge_weight.float().contiguous()
self.tnb = get_neighbors(graph_name, row.contiguous(), col.contiguous(), num_nodes, is_distinct, eid, edge_weight, timestamp)
self.tnb = starrygl.sampler_ops.get_neighbors(graph_name, row.contiguous(), col.contiguous(), num_nodes, is_distinct, eid, edge_weight, timestamp)
else:
assert tnb is not None
self.tnb = tnb
self.p_sampler = ParallelSampler(self.tnb, num_nodes, graph_data.num_edges, workers,
self.p_sampler = starrygl.sampler_ops.ParallelSampler(self.tnb, num_nodes, graph_data.num_edges, workers,
fanout, num_layers, policy)
def _get_sample_info(self):
......