commit for temporary

023440c3 · xxx · 77ed8a6d · 98ad16d4 · 77ed8a6d · 023440c3
Commit 023440c3 authored Jan 23, 2024 by xxx
11 changed files
--- a/csrc/sampler/include/negative_sample.cpp
+++ b/csrc/sampler/include/negative_sample.cpp
-#pragma once
-#include <head.h>
-
-void random_sample_with_collision_check(int size, th::Tensor batch_src_node_id, th::Tensor batch_dst_node_id){
-    auto src_ptr = batch_src_node_id.data_ptr<NodeIDType>();
-    auto dst_ptr = batch_dst_node_id.data_ptr<NodeIDType>();
-    
-#pragma omp for num_threads(10)
-    for(int i = 0; i<size; i++){
-
-    }
-}
\ No newline at end of file
--- a/docs/source/tutorial/dataset.rst
+++ b/docs/source/tutorial/dataset.rst
 Preparing the Temporal Graph Dataset
 ====================================

-.. note::
-    包含从原始数据开始的数据清洗和预处理步骤，最终形成可以被StarryGL使用的数据文件
\ No newline at end of file
+In this tutorial, we will show the preparation process of the temporal graph datase that can be used by StarryGL.
+
+Read Raw Data
+-------------
+
+Take Wikipedia dataset as an example, the raw data files are as follows:
+
+- `edges.csv`: the temporal edges of the graph
+- `node_features.pt`: the node features of the graph
+- `edge_features.pt`: the edge features of the graph
+
+Here is an example to read the raw data files:
+
+.. code-block:: python
+    
+    data_name = args.data_name
+    df = pd.read_csv('raw_data/'+data_name+'/edges.csv')
+    if os.path.exists('raw_data/'+data_name+'/node_features.pt'):
+        n_feat = torch.load('raw_data/'+data_name+'/node_features.pt')
+    else:
+        n_feat = None
+    if os.path.exists('raw_data/'+data_name+'/edge_features.pt'):
+        e_feat = torch.load('raw_data/'+data_name+'/edge_features.pt')
+    else:
+        e_feat = None
+    src = torch.from_numpy(np.array(df.src.values)).long()
+    dst = torch.from_numpy(np.array(df.dst.values)).long()
+    ts = torch.from_numpy(np.array(df.time.values)).long()
+    neg_nums = args.num_neg_sample
+
+    edge_index = torch.cat((src[np.newaxis, :], dst[np.newaxis, :]), 0)
+    num_nodes = edge_index.view(-1).max().item()+1
+    num_edges = edge_index.shape[1]
+    print('the number of nodes in graph is {}, \
+        the number of edges in graph is {}'.format(num_nodes, num_edges))
+
+Preprocess Data
+---------------
+
+After reading the raw data, we need to preprocess the data to get the data format that can be used by StarryGL. The following code shows the preprocessing process:
+
+.. code-block:: python
+
+    sample_graph = {}
+    sample_src = torch.cat([src.view(-1, 1), dst.view(-1, 1)], dim=1)\
+        .reshape(1, -1)
+    sample_dst = torch.cat([dst.view(-1, 1), src.view(-1, 1)], dim=1)\
+        .reshape(1, -1)
+    sample_ts = torch.cat([ts.view(-1, 1), ts.view(-1, 1)], dim=1).reshape(-1)
+    sample_eid = torch.arange(num_edges).view(-1, 1).repeat(1, 2).reshape(-1)
+    sample_graph['edge_index'] = torch.cat([sample_src, sample_dst], dim=0)
+    sample_graph['ts'] = sample_ts
+    sample_graph['eids'] = sample_eid
+    neg_sampler = NegativeSampling('triplet')
+    neg_src = neg_sampler.sample(edge_index.shape[1]*neg_nums, num_nodes)
+    neg_sample = neg_src.reshape(-1, neg_nums)
+
+
+    edge_ts = torch.torch.from_numpy(np.array(ts)).float()
+    data = Data() #torch_geometric.data.Data()
+    data.num_nodes = num_nodes
+    data.num_edges = num_edges
+    data.edge_index = edge_index
+    data.edge_ts = edge_ts
+    data.neg_sample = neg_sample
+
+    if n_feat is not None:
+        data.x = n_feat
+    if e_feat is not None:
+        data.edge_attr = e_feat
+
+    data.train_mask = (torch.from_numpy(np.array(df.ext_roll.values)) == 0)
+    data.val_mask = (torch.from_numpy(np.array(df.ext_roll.values)) == 1)
+    data.test_mask = (torch.from_numpy(np.array(df.ext_roll.values)) == 2)
+    sample_graph['train_mask'] = data.train_mask[sample_eid]
+    sample_graph['test_mask'] = data.test_mask[sample_eid]
+    sample_graph['val_mask'] = data.val_mask[sample_eid]
+    data.sample_graph = sample_graph
+
+    data.y = torch.zeros(edge_index.shape[1])
+    edge_index_dict = {}
+    edge_index_dict['edata'] = data.edge_index
+    edge_index_dict['sample_data'] = data.sample_graph['edge_index']
+    edge_index_dict['neg_data'] = torch.cat([neg_src.view(1, -1),
+                                            dst.view(-1, 1).repeat(1, neg_nums).
+                                            reshape(1, -1)], dim=0)
+    data.edge_index_dict = edge_index_dict
+    edge_weight_dict = {}
+    edge_weight_dict['edata'] = 2*neg_nums
+    edge_weight_dict['sample_data'] = 1*neg_nums
+    edge_weight_dict['neg_data'] = 1
+
+We construct a torch_geometric.data.Data object to store the data. The data object contains the following attributes:
+
+- `num_nodes`: the number of nodes in the graph
+- `num_edges`: the number of edges in the graph
+- `edge_index`: the edge index of the graph
+- `edge_ts`: the timestamp of the edges
+- `neg_sample`: the negative samples of the edges
+- `x`: the node features of the graph
+- `edge_attr`: the edge features of the graph
+- `train_mask`: the train mask of the edges
+- `val_mask`: the validation mask of the edges
+- `test_mask`: the test mask of the edges
+- `sample_graph`: the sampled graph
+- `edge_index_dict`: the edge index of the sampled graph
+
+Finally, we can partition the graph and save the data:
+
+.. code-block:: python
+
+    partition_save('./dataset/here/'+data_name, data, 16, 'metis_for_tgnn',
+               edge_weight_dict=edge_weight_dict)
--- a/docs/source/tutorial/distributed.rst
+++ b/docs/source/tutorial/distributed.rst
--- a/docs/source/tutorial/index.rst
+++ b/docs/source/tutorial/index.rst
@@ -5,5 +5,4 @@ Tutorials
    intro
    module
    dataset
-    application
    distributed
\ No newline at end of file
--- a/docs/source/tutorial/intro.rst
+++ b/docs/source/tutorial/intro.rst
 Introduction to Temporal GNN
 ==============================================

-.. note::
-    简单介绍一下时序GNN，应用场景，需要解决的问题等，相当于一个总体的介绍
+There are so many real-word systems that can be formulated as temporal interaction graphs, such as social network and citation network. In these systems, the nodes represent the entities and the edges represent the interactions between entities. The interactions are usually time-stamped, which means the edges are associated with time. Temporal interaction graphs are dynamic, which means the graph structure changes over time. For example, in a social network, the friendship between two people may be established or broken at different time. In a citation network, a paper may cite another paper at different time. 
+
+To encapsulate the temporal information present in these graphs and learn dynamic representations, researchers have introduced temporal graph neural networks (GNNs). These networks are capable of modeling both structural and temporal dependencies within the graph. Numerous innovative frameworks have been proposed to date, achieving outstanding performance in specific tasks such as link prediction. Based on two different methods to represent temporal graphs, we can divide temporal GNNs into two categories: 
+
+1. continuous-time temporal GNNs, which model the temporal graph as a sequence of interactions
+2. discrete-time temporal GNNs, which model the temporal graph as a sequence of snapshots
+
+However, as the temporal graph expands—potentially encompassing millions of nodes and billions of edges—it becomes increasingly challenging to scale temporal GNN training to accommodate these larger graphs. The reasons are twofold: first, sampling neighbors from a larger graph demands more time; second, chronological training also incurs a higher time cost. To address these challenges, we introduce StarryGL in this tutorial. StarryGL is a distributed temporal GNN framework designed to efficiently navigate the complexities of training larger temporal graphs.
\ No newline at end of file
--- a/docs/source/tutorial/module.rst
+++ b/docs/source/tutorial/module.rst
 Creating Temporal GNN Models
 ============================

-.. note::
-    介绍如何创建GNN模型，找最经典最简洁的两个例子即可。包括 **离散时间动态图模型** 模型构建和 **连续时间动态图模型**。
\ No newline at end of file
+Continuous-time Temporal GNN Models
+-----------------------------------
+
+To create a continuous-time temporal GNN model, we first need to define a configuration file with the suffix yml to specify the model structures and parameters. Here we use the configuration file :code:`TGN.yml` for TGN model as an example:
+
+.. code-block:: yaml
+
+    sampling:
+      - layer: 1
+        neighbor: 
+          - 10
+        strategy: 'recent'
+        prop_time: False
+        history: 1
+        duration: 0
+        num_thread: 32
+    memory: 
+      - type: 'node'
+        dim_time: 100
+        deliver_to: 'self'
+        mail_combine: 'last'
+        memory_update: 'gru'
+        mailbox_size: 1
+        combine_node_feature: True
+        dim_out: 100
+    gnn:
+      - arch: 'transformer_attention'
+        use_src_emb: False
+        use_dst_emb: False
+        layer: 1
+        att_head: 2
+        dim_time: 100
+        dim_out: 100
+    train:
+      - epoch: 20
+        batch_size: 200
+        # reorder: 16
+        lr: 0.0001
+        dropout: 0.2
+        att_dropout: 0.2
+        all_on_gpu: True
+
+The configuration file is composed of four parts: :code:`sampling`, :code:`memory`, :code:`gnn` and :code:`train`. Here are their meanings:
+
+- :code:`sampling`: This part specifies the sampling strategy for the temporal graph. :code:`layer` field specifies the number of layers in the sampling strategy. The :code:`neighbor` field specifies the number of neighbors to sample for each layer. The :code:`strategy` field specifies the sampling strategy(recent or uniform). The :code:`prop_time` field specifies whether to propagate the time information. The :code:`history` field specifies the number of historical timestamps to use. The :code:`duration` field specifies the duration of the time window. The :code:`num_thread` field specifies the number of threads to use for sampling.
+- :code:`memory`: This part specifies the memory module. :code:`type` field specifies the type of memory module(node or none). :code:`dim_time` field specifies the dimension of the time embedding. :code:`deliver_to` field specifies the destination of the message. :code:`mail_combine` field specifies the way to combine the messages. :code:`memory_update` field specifies the way to update the memory. :code:`mailbox_size` field specifies the size of the mailbox. :code:`combine_node_feature` field specifies whether to combine the node features. :code:`dim_out` field specifies the dimension of the output.
+- :code:`gnn`: This part specifies the GNN module. :code:`arch` field specifies the architecture of the GNN module. :code:`use_src_emb` field specifies whether to use the source embedding. :code:`use_dst_emb` field specifies whether to use the destination embedding. :code:`layer` field specifies the number of layers in the GNN module. :code:`att_head` field specifies the number of attention heads. :code:`dim_time` field specifies the dimension of the time embedding. :code:`dim_out` field specifies the dimension of the output.
+- :code:`train`: This part specifies the training parameters. :code:`epoch` field specifies the number of epochs. :code:`batch_size` field specifies the batch size. :code:`lr` field specifies the learning rate. :code:`dropout` field specifies the dropout rate. :code:`att_dropout` field specifies the attention dropout rate. :code:`all_on_gpu` field specifies whether to put all the data on GPU.
+
+After defining the configuration file, we can firstly read the parameters from the configuration file and create the model by constructing a :code:`General Model` object:
+
+.. code-block:: python
+
+    def parse_config(f):
+        conf = yaml.safe_load(open(f, 'r'))
+        sample_param = conf['sampling'][0]
+        memory_param = conf['memory'][0]
+        gnn_param = conf['gnn'][0]
+        train_param = conf['train'][0]
+        return sample_param, memory_param, gnn_param, train_param
+    
+    sample_param, memory_param, gnn_param, train_param = parse_config('./config/{}.yml'.format(args.model))
+    model = GeneralModel(gnn_dim_node, gnn_dim_edge, sample_param, memory_param, gnn_param, train_param).cuda()
+    model = DDP(model)
+
+Then a :code:`GeneralModel` object is created. If needed, we can adjust the model's parameters by modifying the contents of the configuration file. Here we provide 5 models for continuous-time temporal GNNs:
+
+- :code:`TGN`: The TGN model proposed in `Temporal Graph Networks for Deep Learning on Dynamic Graphs <https://arxiv.org/abs/2006.10637>`__.
+- :code:`DyRep`: The DyRep model proposed in `Representation Learning and Reasoning on Temporal Knowledge Graphs <https://arxiv.org/abs/1803.04051>`__.
+- :code:`TIGER`: The TIGER model proposed in `TIGER: A Transformer-Based Framework for Temporal Knowledge Graph Completion <https://arxiv.org/abs/2302.06057>`__.
+- :code:`Jodie`: The Jodie model proposed in `JODIE: Joint Optimization of Dynamics and Importance for Online Embedding <https://arxiv.org/abs/1908.01207>`__.
+- :code:`TGAT`: The TGAT model proposed in `Temporal Graph Attention for Deep Temporal Modeling <https://arxiv.org/abs/2002.07962>`__.
\ No newline at end of file
--- a/install.sh
+++ b/install.sh
@@ -3,9 +3,9 @@
 mkdir -p build && cd build
 cmake .. \
    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-    -DCMAKE_PREFIX_PATH="/home/zlj/.miniconda3/envs/dgnn/lib/python3.10/site-packages" \
-    -DPython3_ROOT_DIR="/home/zlj/.miniconda3/envs/dgnn" \
-    -DCUDA_TOOLKIT_ROOT_DIR="/home/zlj/.local/cuda-12.2" \
+    -DCMAKE_PREFIX_PATH=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())") \
+    -DPython3_ROOT_DIR=$(python -c "import sys; print(sys.prefix)") \
+    -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_HOME:-"$(realpath $(dirname $(which nvcc))/../)"} \
 && make -j32 \
 && rm -rf ../starrygl/lib \
 && mkdir ../starrygl/lib \

--- a/starrygl/module/modules.py
+++ b/starrygl/module/modules.py
@@ -58,6 +58,7 @@ class GeneralModel(torch.nn.Module):
                rst = self.layers['l' + str(l) + 'h' + str(h)](mfgs[l][h])
                if 'time_transform' in self.gnn_param and self.gnn_param['time_transform'] == 'JODIE':
                    rst = self.layers['l0h' + str(h) + 't'](rst, mfgs[l][h].srcdata['mem_ts'], mfgs[l][h].srcdata['ts'])
+                print(rst,mfgs[l][h])
                if l != self.gnn_param['layer'] - 1:
                    mfgs[l + 1][h].srcdata['h'] = rst
                else:

--- a/starrygl/sample/batch_data.py
+++ b/starrygl/sample/batch_data.py
@@ -135,7 +135,7 @@ def to_block(graph: DistributedGraphStore, data, sample_out, mailbox:MailBox = N
            if sample_out[r].delta_ts().shape[0] > 0:
                b.edata['dt'] = sample_out[r].delta_ts().to(device)
            if src_ts is not None:
-                b.srcdata['ts'] = src_ts[0:row_len + eid_len[r]]
+                b.srcdata['ts'] = src_ts[0:row_len + elen]
            b.edata['ID'] = e_idx
            #print(b.all_edges)
            #print(dist_nid[b.srcdata['ID']],dist_nid[b.srcdata['ID'][col[sample_out[r].src_index().to(device)]]])

--- a/starrygl/sample/part_utils/partition_tgnn.py
+++ b/starrygl/sample/part_utils/partition_tgnn.py
-import parser
 from torch_sparse import SparseTensor
 from torch_geometric.data import Data
 from torch_geometric.utils import degree
@@ -8,7 +7,7 @@ import os
 import shutil
 import torch
 import torch.utils.data
-import metis
+#import metis
 import networkx as nx
 import torch.distributed as dist


--- a/train_tgnn.py
+++ b/train_tgnn.py
@@ -56,11 +56,11 @@ import dgl
 import numpy as np
 from sklearn.metrics import average_precision_score, roc_auc_score
 from torch.nn.parallel import DistributedDataParallel as DDP
-#os.environ['CUDA_VISIBLE_DEVICES'] = str(args.rank)
-#os.environ["RANK"] = str(args.rank)
-#os.environ["WORLD_SIZE"] = str(args.world_size)
-#os.environ["LOCAL_RANK"] = str(0)
-torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+os.environ['CUDA_VISIBLE_DEVICES'] = str(args.rank)
+os.environ["RANK"] = str(args.rank)
+os.environ["WORLD_SIZE"] = str(args.world_size)
+os.environ["LOCAL_RANK"] = str(0)
+#torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
 os.environ["MASTER_ADDR"] = '127.0.0.1'
 os.environ["MASTER_PORT"] = '9437'
 def seed_everything(seed=42):
@@ -80,7 +80,7 @@ def main():
    torch.set_num_threads(int(40/torch.distributed.get_world_size()))
    device_id = torch.cuda.current_device()
    print('use cuda on',device_id)
-    pdata = partition_load("/mnt/data/part_data/v2/here/{}".format(args.dataname), algo="metis_for_tgnn")    
+    pdata = partition_load("/mnt/data/part_data/here/{}".format(args.dataname), algo="metis_for_tgnn")    
    graph = DistributedGraphStore(pdata = pdata)

    Path("./saved_models/").mkdir(parents=True, exist_ok=True)
@@ -103,15 +103,15 @@ def main():
    print(sample_graph.eid.shape,sample_graph.edge_index.shape,sample_graph.edge_ts.shape,graph.edge_attr)
    train_data = torch.masked_select(graph.edge_index,pdata.train_mask.to(graph.edge_index.device)).reshape(2,-1)
    train_ts = torch.masked_select(graph.edge_ts,pdata.train_mask.to(graph.edge_index.device))
-    val_data = torch.masked_select(graph.edge_index,pdata.val_mask.to(graph.edge_index.device)).reshape(2,-1)
-    val_ts = torch.masked_select(graph.edge_ts,pdata.val_mask.to(graph.edge_index.device))
-    test_data = torch.masked_select(graph.edge_index,pdata.test_mask.to(graph.edge_index.device)).reshape(2,-1)
-    test_ts = torch.masked_select(graph.edge_ts,pdata.test_mask.to(graph.edge_index.device)) 
+    test_data = torch.masked_select(graph.edge_index,pdata.val_mask.to(graph.edge_index.device)).reshape(2,-1)
+    test_ts = torch.masked_select(graph.edge_ts,pdata.val_mask.to(graph.edge_index.device))
+    val_data = torch.masked_select(graph.edge_index,pdata.test_mask.to(graph.edge_index.device)).reshape(2,-1)
+    val_ts = torch.masked_select(graph.edge_ts,pdata.test_mask.to(graph.edge_index.device)) 
    print(train_ts.max(),val_ts.max(),test_ts.max())
    #print(train_data.shape[1],val_data.shape[1],test_data.shape[1])
    train_data = DataSet(edges = train_data,ts =train_ts,eids = torch.nonzero(pdata.train_mask).view(-1))
-    test_data = DataSet(edges = test_data,ts =test_ts,eids = torch.nonzero(pdata.test_mask).view(-1))
-    val_data = DataSet(edges = val_data,ts = val_ts,eids = torch.nonzero(pdata.val_mask).view(-1))
+    test_data = DataSet(edges = test_data,ts =test_ts,eids = torch.nonzero(pdata.val_mask).view(-1))
+    val_data = DataSet(edges = val_data,ts = val_ts,eids = torch.nonzero(pdata.test_mask).view(-1))
    neg_sampler = NegativeSampling('triplet')
    trainloader = DistributedDataLoader(graph,train_data,sampler = sampler,
                                        sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES,
@@ -185,6 +185,7 @@ def main():
                    y_pred = torch.cat([pred_pos, pred_neg], dim=0).sigmoid().cpu()
                    y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0)
                    aps.append(average_precision_score(y_true, y_pred.detach().numpy()))
+
                    aucs_mrrs.append(roc_auc_score(y_true, y_pred))
                    if mailbox is not None:
                        src = metadata['src_pos_index']
@@ -256,7 +257,6 @@ def main():

                optimizer.zero_grad()
                pred_pos, pred_neg = model(mfgs,metadata)
-                
                loss = creterion(pred_pos, torch.ones_like(pred_pos))
                loss += creterion(pred_neg, torch.zeros_like(pred_neg))
                total_loss += float(loss)