Commit 54424543 by zlj

change the torch_utils dependency path

parent 8768fe6f
set(CMAKE_SOURCE_DIR "/mnt/data/")
cmake_minimum_required(VERSION 3.15)
project(starrygl_ops VERSION 0.1)
@@ -45,7 +46,7 @@ if(WITH_METIS)
add_definitions(-DWITH_METIS)
set(GKLIB_DIR "${CMAKE_SOURCE_DIR}/third_party/GKlib")
set(METIS_DIR "${CMAKE_SOURCE_DIR}/third_party/METIS")
message(${METIS_DIR})
set(GKLIB_INCLUDE_DIRS "${GKLIB_DIR}/include")
file(GLOB_RECURSE GKLIB_LIBRARIES "${GKLIB_DIR}/lib/lib*.a")
@@ -104,10 +105,10 @@ endif()
# add libsampler.so
set(SAMLPER_NAME "${PROJECT_NAME}_sampler")
set(BOOST_INCLUDE_DIRS "${CMAKE_SOURCE_DIR}/third_party/boost_1_83_0")
include_directories(${BOOST_INCLUDE_DIRS})
file(GLOB_RECURSE SAMPLER_SRCS "csrc/sampler/*.cpp")
add_library(${SAMLPER_NAME} SHARED ${SAMPLER_SRCS})
target_include_directories(${SAMLPER_NAME} PRIVATE "csrc/sampler/include")
target_compile_options(${SAMLPER_NAME} PRIVATE -O3)
......
# CMAKE generated file: DO NOT EDIT!
# Generated by "Unix Makefiles" Generator, CMake Version 3.16
# Default target executed when no arguments are given to make.
default_target: all
.PHONY : default_target
# Allow only one "make -f Makefile2" at a time, but pass parallelism.
.NOTPARALLEL:
#=============================================================================
# Special targets provided by cmake.
# Disable implicit rules so canonical targets will work.
.SUFFIXES:
# Remove some rules from gmake that .SUFFIXES does not remove.
SUFFIXES =
.SUFFIXES: .hpux_make_needs_suffix_list
# Suppress display of executed commands.
$(VERBOSE).SILENT:
# A target that is always out of date.
cmake_force:
.PHONY : cmake_force
#=============================================================================
# Set environment variables for the build.
# The shell in which to execute make rules.
SHELL = /bin/sh
# The CMake executable.
CMAKE_COMMAND = /usr/bin/cmake
# The command to remove a file.
RM = /usr/bin/cmake -E remove -f
# Escaping for special characters.
EQUALS = =
# The top-level source directory on which CMake was run.
CMAKE_SOURCE_DIR = /home/zlj/starrygl
# The top-level build directory on which CMake was run.
CMAKE_BINARY_DIR = /home/zlj/starrygl
#=============================================================================
# Targets provided globally by CMake.
# Special rule for the target rebuild_cache
rebuild_cache:
@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
/usr/bin/cmake -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
.PHONY : rebuild_cache
# Special rule for the target rebuild_cache
rebuild_cache/fast: rebuild_cache
.PHONY : rebuild_cache/fast
# Special rule for the target edit_cache
edit_cache:
@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
.PHONY : edit_cache
# Special rule for the target edit_cache
edit_cache/fast: edit_cache
.PHONY : edit_cache/fast
# The main all target
all: cmake_check_build_system
$(CMAKE_COMMAND) -E cmake_progress_start /home/zlj/starrygl/CMakeFiles /home/zlj/starrygl/CMakeFiles/progress.marks
$(MAKE) -f CMakeFiles/Makefile2 all
$(CMAKE_COMMAND) -E cmake_progress_start /home/zlj/starrygl/CMakeFiles 0
.PHONY : all
# The main clean target
clean:
$(MAKE) -f CMakeFiles/Makefile2 clean
.PHONY : clean
# The main clean target
clean/fast: clean
.PHONY : clean/fast
# Prepare targets for installation.
preinstall: all
$(MAKE) -f CMakeFiles/Makefile2 preinstall
.PHONY : preinstall
# Prepare targets for installation.
preinstall/fast:
$(MAKE) -f CMakeFiles/Makefile2 preinstall
.PHONY : preinstall/fast
# clear depends
depend:
$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
.PHONY : depend
#=============================================================================
# Target rules for targets named starrygl_ops
# Build rule for target.
starrygl_ops: cmake_check_build_system
$(MAKE) -f CMakeFiles/Makefile2 starrygl_ops
.PHONY : starrygl_ops
# fast build rule for target.
starrygl_ops/fast:
$(MAKE) -f CMakeFiles/starrygl_ops.dir/build.make CMakeFiles/starrygl_ops.dir/build
.PHONY : starrygl_ops/fast
#=============================================================================
# Target rules for targets named mtmetis
# Build rule for target.
mtmetis: cmake_check_build_system
$(MAKE) -f CMakeFiles/Makefile2 mtmetis
.PHONY : mtmetis
# fast build rule for target.
mtmetis/fast:
$(MAKE) -f CMakeFiles/mtmetis.dir/build.make CMakeFiles/mtmetis.dir/build
.PHONY : mtmetis/fast
#=============================================================================
# Target rules for targets named metis
# Build rule for target.
metis: cmake_check_build_system
$(MAKE) -f CMakeFiles/Makefile2 metis
.PHONY : metis
# fast build rule for target.
metis/fast:
$(MAKE) -f CMakeFiles/metis.dir/build.make CMakeFiles/metis.dir/build
.PHONY : metis/fast
#=============================================================================
# Target rules for targets named starrygl_ops_sampler
# Build rule for target.
starrygl_ops_sampler: cmake_check_build_system
$(MAKE) -f CMakeFiles/Makefile2 starrygl_ops_sampler
.PHONY : starrygl_ops_sampler
# fast build rule for target.
starrygl_ops_sampler/fast:
$(MAKE) -f CMakeFiles/starrygl_ops_sampler.dir/build.make CMakeFiles/starrygl_ops_sampler.dir/build
.PHONY : starrygl_ops_sampler/fast
#=============================================================================
# Target rules for targets named uvm
# Build rule for target.
uvm: cmake_check_build_system
$(MAKE) -f CMakeFiles/Makefile2 uvm
.PHONY : uvm
# fast build rule for target.
uvm/fast:
$(MAKE) -f CMakeFiles/uvm.dir/build.make CMakeFiles/uvm.dir/build
.PHONY : uvm/fast
csrc/export.o: csrc/export.cpp.o
.PHONY : csrc/export.o
# target to build an object file
csrc/export.cpp.o:
$(MAKE) -f CMakeFiles/starrygl_ops.dir/build.make CMakeFiles/starrygl_ops.dir/csrc/export.cpp.o
.PHONY : csrc/export.cpp.o
csrc/export.i: csrc/export.cpp.i
.PHONY : csrc/export.i
# target to preprocess a source file
csrc/export.cpp.i:
$(MAKE) -f CMakeFiles/starrygl_ops.dir/build.make CMakeFiles/starrygl_ops.dir/csrc/export.cpp.i
.PHONY : csrc/export.cpp.i
csrc/export.s: csrc/export.cpp.s
.PHONY : csrc/export.s
# target to generate assembly for a file
csrc/export.cpp.s:
$(MAKE) -f CMakeFiles/starrygl_ops.dir/build.make CMakeFiles/starrygl_ops.dir/csrc/export.cpp.s
.PHONY : csrc/export.cpp.s
csrc/partition/metis.o: csrc/partition/metis.cpp.o
.PHONY : csrc/partition/metis.o
# target to build an object file
csrc/partition/metis.cpp.o:
$(MAKE) -f CMakeFiles/metis.dir/build.make CMakeFiles/metis.dir/csrc/partition/metis.cpp.o
.PHONY : csrc/partition/metis.cpp.o
csrc/partition/metis.i: csrc/partition/metis.cpp.i
.PHONY : csrc/partition/metis.i
# target to preprocess a source file
csrc/partition/metis.cpp.i:
$(MAKE) -f CMakeFiles/metis.dir/build.make CMakeFiles/metis.dir/csrc/partition/metis.cpp.i
.PHONY : csrc/partition/metis.cpp.i
csrc/partition/metis.s: csrc/partition/metis.cpp.s
.PHONY : csrc/partition/metis.s
# target to generate assembly for a file
csrc/partition/metis.cpp.s:
$(MAKE) -f CMakeFiles/metis.dir/build.make CMakeFiles/metis.dir/csrc/partition/metis.cpp.s
.PHONY : csrc/partition/metis.cpp.s
csrc/partition/mtmetis.o: csrc/partition/mtmetis.cpp.o
.PHONY : csrc/partition/mtmetis.o
# target to build an object file
csrc/partition/mtmetis.cpp.o:
$(MAKE) -f CMakeFiles/mtmetis.dir/build.make CMakeFiles/mtmetis.dir/csrc/partition/mtmetis.cpp.o
.PHONY : csrc/partition/mtmetis.cpp.o
csrc/partition/mtmetis.i: csrc/partition/mtmetis.cpp.i
.PHONY : csrc/partition/mtmetis.i
# target to preprocess a source file
csrc/partition/mtmetis.cpp.i:
$(MAKE) -f CMakeFiles/mtmetis.dir/build.make CMakeFiles/mtmetis.dir/csrc/partition/mtmetis.cpp.i
.PHONY : csrc/partition/mtmetis.cpp.i
csrc/partition/mtmetis.s: csrc/partition/mtmetis.cpp.s
.PHONY : csrc/partition/mtmetis.s
# target to generate assembly for a file
csrc/partition/mtmetis.cpp.s:
$(MAKE) -f CMakeFiles/mtmetis.dir/build.make CMakeFiles/mtmetis.dir/csrc/partition/mtmetis.cpp.s
.PHONY : csrc/partition/mtmetis.cpp.s
csrc/sampler/export.o: csrc/sampler/export.cpp.o
.PHONY : csrc/sampler/export.o
# target to build an object file
csrc/sampler/export.cpp.o:
$(MAKE) -f CMakeFiles/starrygl_ops_sampler.dir/build.make CMakeFiles/starrygl_ops_sampler.dir/csrc/sampler/export.cpp.o
.PHONY : csrc/sampler/export.cpp.o
csrc/sampler/export.i: csrc/sampler/export.cpp.i
.PHONY : csrc/sampler/export.i
# target to preprocess a source file
csrc/sampler/export.cpp.i:
$(MAKE) -f CMakeFiles/starrygl_ops_sampler.dir/build.make CMakeFiles/starrygl_ops_sampler.dir/csrc/sampler/export.cpp.i
.PHONY : csrc/sampler/export.cpp.i
csrc/sampler/export.s: csrc/sampler/export.cpp.s
.PHONY : csrc/sampler/export.s
# target to generate assembly for a file
csrc/sampler/export.cpp.s:
$(MAKE) -f CMakeFiles/starrygl_ops_sampler.dir/build.make CMakeFiles/starrygl_ops_sampler.dir/csrc/sampler/export.cpp.s
.PHONY : csrc/sampler/export.cpp.s
csrc/uvm/uvm.o: csrc/uvm/uvm.cpp.o
.PHONY : csrc/uvm/uvm.o
# target to build an object file
csrc/uvm/uvm.cpp.o:
$(MAKE) -f CMakeFiles/uvm.dir/build.make CMakeFiles/uvm.dir/csrc/uvm/uvm.cpp.o
.PHONY : csrc/uvm/uvm.cpp.o
csrc/uvm/uvm.i: csrc/uvm/uvm.cpp.i
.PHONY : csrc/uvm/uvm.i
# target to preprocess a source file
csrc/uvm/uvm.cpp.i:
$(MAKE) -f CMakeFiles/uvm.dir/build.make CMakeFiles/uvm.dir/csrc/uvm/uvm.cpp.i
.PHONY : csrc/uvm/uvm.cpp.i
csrc/uvm/uvm.s: csrc/uvm/uvm.cpp.s
.PHONY : csrc/uvm/uvm.s
# target to generate assembly for a file
csrc/uvm/uvm.cpp.s:
$(MAKE) -f CMakeFiles/uvm.dir/build.make CMakeFiles/uvm.dir/csrc/uvm/uvm.cpp.s
.PHONY : csrc/uvm/uvm.cpp.s
# Help Target
help:
@echo "The following are some of the valid targets for this Makefile:"
@echo "... all (the default if no target is provided)"
@echo "... clean"
@echo "... depend"
@echo "... rebuild_cache"
@echo "... edit_cache"
@echo "... starrygl_ops"
@echo "... mtmetis"
@echo "... metis"
@echo "... starrygl_ops_sampler"
@echo "... uvm"
@echo "... csrc/export.o"
@echo "... csrc/export.i"
@echo "... csrc/export.s"
@echo "... csrc/partition/metis.o"
@echo "... csrc/partition/metis.i"
@echo "... csrc/partition/metis.s"
@echo "... csrc/partition/mtmetis.o"
@echo "... csrc/partition/mtmetis.i"
@echo "... csrc/partition/mtmetis.s"
@echo "... csrc/sampler/export.o"
@echo "... csrc/sampler/export.i"
@echo "... csrc/sampler/export.s"
@echo "... csrc/uvm/uvm.o"
@echo "... csrc/uvm/uvm.i"
@echo "... csrc/uvm/uvm.s"
.PHONY : help
#=============================================================================
# Special targets to cleanup operation of make.
# Special rule to run CMake to check the build system integrity.
# No rule that depends on this can have commands that come from listfiles
# because they might be regenerated.
cmake_check_build_system:
$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
.PHONY : cmake_check_build_system
@@ -23,7 +23,7 @@ gnn:
dim_time: 100
dim_out: 100
train:
-- epoch: 10
+- epoch: 5
#batch_size: 100
# reorder: 16
lr: 0.0001
......
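For reference, a minimal sketch of how this value is consumed, assuming this YAML is the ./config/TGN.yml that the test script later in this commit reads via parse_config (that pairing is an assumption):

# Hedged sketch: parse_config returns per-section dicts; after this change
# train_param['epoch'] is 5 and train_param['lr'] stays 0.0001.
from starrygl.module.utils import parse_config

sample_param, memory_param, gnn_param, train_param = parse_config('./config/TGN.yml')
print(train_param['epoch'], train_param['lr'])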
@@ -2,6 +2,7 @@
#include <sampler.h>
#include <output.h>
#include <neighbors.h>
+#include <temporal_utils.h>
/*------------Python Bind--------------------------------------------------------------*/
@@ -16,7 +17,14 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
py::return_value_policy::reference)
.def("divide_nodes_to_part",
&divide_nodes_to_part,
-py::return_value_policy::reference);
+py::return_value_policy::reference)
+.def("sparse_get_index",
+&sparse_get_index,
+py::return_value_policy::reference)
+.def("get_norm_temporal",
+&get_norm_temporal,
+py::return_value_policy::reference
+);
py::class_<TemporalGraphBlock>(m, "TemporalGraphBlock")
.def(py::init<vector<NodeIDType> &, vector<NodeIDType> &,
@@ -79,4 +87,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
.def("neighbor_sample_from_nodes", &ParallelSampler::neighbor_sample_from_nodes)
.def("reset", &ParallelSampler::reset)
.def("get_ret", [](const ParallelSampler &ps) { return ps.ret; });
}
\ No newline at end of file
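A minimal smoke test for the two new bindings, as a hedged sketch: the module import path below is an assumption, and the exact signatures of sparse_get_index and get_norm_temporal live in temporal_utils.h, which is not part of this diff.

# Hedged sketch: confirm the freshly built extension exposes the new symbols.
import importlib

ops = importlib.import_module("starrygl.lib.starrygl_ops_sampler")  # assumed module path
for name in ("divide_nodes_to_part", "sparse_get_index", "get_norm_temporal"):
    print(name, "bound:", hasattr(ops, name))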
-#include <pybind11/pybind11.h>
+#pragma once
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <torch/extension.h>
#include <parallel_hashmap/phmap.h>
#include <cstring>
......
@@ -44,14 +44,14 @@ def load_feat(d, rand_de=0, rand_dn=0):
data_name = args.data_name
-g = np.load('../tgl_main/DATA/'+data_name+'/ext_full.npz')
+g = np.load('/mnt/nfs/fzz/TGL-DATA/'+data_name+'/ext_full.npz')
-df = pd.read_csv('../tgl_main/DATA/'+data_name+'/edges.csv')
+df = pd.read_csv('/mnt/nfs/fzz/TGL-DATA/'+data_name+'/edges.csv')
-if os.path.exists('../tgl_main/DATA/'+data_name+'/node_features.pt'):
+if os.path.exists('/mnt/nfs/fzz/TGL-DATA/'+data_name+'/node_features.pt'):
-n_feat = torch.load('../tgl_main/DATA/'+data_name+'/node_features.pt')
+n_feat = torch.load('/mnt/nfs/fzz/TGL-DATA/'+data_name+'/node_features.pt')
else:
n_feat = None
-if os.path.exists('../tgl_main/DATA/'+data_name+'/edge_features.pt'):
+if os.path.exists('/mnt/nfs/fzz/TGL-DATA/'+data_name+'/edge_features.pt'):
-e_feat = torch.load('../tgl_main/DATA/'+data_name+'/edge_features.pt')
+e_feat = torch.load('/mnt/nfs/fzz/TGL-DATA/'+data_name+'/edge_features.pt')
else:
e_feat = None
......
#include <cuda_runtime.h>
#include <cstdio>
int main()
{
int count = 0;
if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;
if (count == 0) return -1;
for (int device = 0; device < count; ++device)
{
cudaDeviceProp prop;
if (cudaSuccess == cudaGetDeviceProperties(&prop, device))
std::printf("%d.%d ", prop.major, prop.minor);
}
return 0;
}
#include <cuda.h>
#include <cstdio>
int main() {
printf("%d.%d", CUDA_VERSION / 1000, (CUDA_VERSION / 10) % 100);
return 0;
}
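As an aside (not part of the commit), the same facts these two probe programs print are available from PyTorch at runtime, which is handy for checking that the interpreter matches the toolkit the extension was built against:

import torch

print(torch.version.cuda)                       # CUDA version PyTorch was built with
if torch.cuda.is_available():
    print(torch.cuda.get_device_capability(0))  # (major, minor) compute capability of GPU 0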
@@ -3,9 +3,9 @@
mkdir -p build && cd build
cmake .. \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
--DCMAKE_PREFIX_PATH="/home/hwj/.miniconda3/envs/sgl/lib/python3.10/site-packages" \
+-DCMAKE_PREFIX_PATH="/home/zlj/.miniconda3/envs/dgnn/lib/python3.10/site-packages" \
--DPython3_ROOT_DIR="/home/hwj/.miniconda3/envs/sgl" \
+-DPython3_ROOT_DIR="/home/zlj/.miniconda3/envs/dgnn" \
--DCUDA_TOOLKIT_ROOT_DIR="/home/hwj/.local/cuda-11.8" \
+-DCUDA_TOOLKIT_ROOT_DIR="/home/zlj/local/cuda-12.2" \
&& make -j32 \
&& rm -rf ../starrygl/lib \
&& mkdir ../starrygl/lib \
......
import argparse
import os
import sys
from os.path import abspath, join, dirname
from starrygl.distributed.context import DistributedContext
from starrygl.distributed.utils import DistIndex
from starrygl.module.modules import GeneralModel
from starrygl.module.utils import parse_config
from starrygl.sample.graph_core import DataSet, GraphData, TemporalNeighborSampleGraph
from starrygl.sample.memory.shared_mailbox import SharedMailBox
from starrygl.sample.sample_core.base import NegativeSampling
from starrygl.sample.sample_core.neighbor_sampler import NeighborSampler
from starrygl.sample.part_utils.partition_tgnn import partition_load
import torch
import time
import torch
import torch.nn.functional as F
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
import os
from starrygl.sample.data_loader import DistributedDataLoader
from starrygl.sample.batch_data import SAMPLE_TYPE
"""
test command
python test.py --world_size 2 --rank 0
parser.add_argument('--world_size', default=4, type=int, metavar='W',
help='number of workers')
parser.add_argument('--rank', default=0, type=int, metavar='W',
help='rank of the worker')
parser.add_argument('--log_interval', type=int, default=10, metavar='N',
help='interval between training status logs')
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
help='how much to value future rewards')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed for reproducibility')
parser.add_argument('--num_sampler', type=int, default=10, metavar='S',
help='number of samplers')
parser.add_argument('--queue_size', type=int, default=10, metavar='S',
help='sampler queue size')
"""
parser = argparse.ArgumentParser(
description="RPC Reinforcement Learning Example",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument('--rank', default=0, type=int, metavar='W',
help='rank of this worker')
parser.add_argument('--world_size', default=1, type=int, metavar='W',
help='number of workers')
args = parser.parse_args()
from sklearn.metrics import average_precision_score, roc_auc_score
import random
import dgl
import numpy as np
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.rank)
os.environ["RANK"] = str(args.rank)
os.environ["WORLD_SIZE"] = str(args.world_size)
os.environ["LOCAL_RANK"] = str(0)
os.environ["MASTER_ADDR"] = '127.0.0.1'
os.environ["MASTER_PORT"] = '9337'
def seed_everything(seed=42):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
seed_everything(1234)
def main():
use_cuda = True
sample_param, memory_param, gnn_param, train_param = parse_config('./config/TGN.yml')
torch.set_num_threads(12)
ctx = DistributedContext.init(backend="nccl", use_gpu=True)
device_id = torch.cuda.current_device()
print('use cuda on',device_id)
pdata = partition_load("./dataset/here/MOOC", algo="metis_for_tgnn")
graph = GraphData(pdata = pdata)
sample_graph = TemporalNeighborSampleGraph(sample_graph = pdata.sample_graph,mode = 'full')
mailbox = SharedMailBox(pdata.ids.shape[0], memory_param, dim_edge_feat = pdata.edge_attr.shape[1] if pdata.edge_attr is not None else 0)
sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=1, fanout=[10],graph_data=sample_graph, workers=10,policy = 'recent',graph_name = "wiki_train")
train_data = torch.masked_select(graph.edge_index,pdata.train_mask.to(graph.edge_index.device)).reshape(2,-1)
train_ts = torch.masked_select(graph.edge_ts,pdata.train_mask.to(graph.edge_index.device))
val_data = torch.masked_select(graph.edge_index,pdata.val_mask.to(graph.edge_index.device)).reshape(2,-1)
val_ts = torch.masked_select(graph.edge_ts,pdata.val_mask.to(graph.edge_index.device))
test_data = torch.masked_select(graph.edge_index,pdata.test_mask.to(graph.edge_index.device)).reshape(2,-1)
test_ts = torch.masked_select(graph.edge_ts,pdata.test_mask.to(graph.edge_index.device))
print(train_data.shape[1],val_data.shape[1],test_data.shape[1])
train_data = DataSet(edges = train_data,ts =train_ts,eids = torch.nonzero(pdata.train_mask).view(-1))
test_data = DataSet(edges = test_data,ts =test_ts,eids = torch.nonzero(pdata.test_mask).view(-1))
val_data = DataSet(edges = val_data,ts = val_ts,eids = torch.nonzero(pdata.val_mask).view(-1))
#train_neg_sampler = PreNegativeSampling('triplet',torch.masked_select(pdata.edge_index['pos_edge'],graph.data.train_mask).reshape(2,-1))
neg_sampler = NegativeSampling('triplet')
trainloader = DistributedDataLoader(graph,train_data,sampler = sampler,
sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES,
neg_sampler=neg_sampler,
batch_size = 1000,
shuffle=False,
drop_last=True,
chunk_size = None,
train=True,
queue_size = 1000,
mailbox = mailbox)
testloader = DistributedDataLoader(graph,test_data,sampler = sampler,
sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES,
neg_sampler=neg_sampler,
batch_size = 1000,
shuffle=False,
drop_last=False,
chunk_size = None,
train=False,
queue_size = 100,
mailbox = mailbox)
valloader = DistributedDataLoader(graph,val_data,sampler = sampler,
sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES,
neg_sampler=neg_sampler,
batch_size = 1000,
shuffle=False,
drop_last=False,
chunk_size = None,
train=False,
queue_size = 100,
mailbox = mailbox)
gnn_dim_node = 0 if graph.x is None else pdata.x.shape[1]
gnn_dim_edge = 0 if graph.edge_attr is None else pdata.edge_attr.shape[1]
avg_time = 0
if use_cuda:
model = GeneralModel(gnn_dim_node, gnn_dim_edge, sample_param, memory_param, gnn_param, train_param).cuda()
device = torch.device('cuda')
else:
model = GeneralModel(gnn_dim_node, gnn_dim_edge, sample_param, memory_param, gnn_param, train_param)
device = torch.device('cpu')
model = DDP(model,find_unused_parameters=True)
train_stream = torch.cuda.Stream()
send_stream = torch.cuda.Stream()
scatter_stream = torch.cuda.Stream()
val_losses = list()
def eval(mode='val'):
neg_samples = 1
model.eval()
aps = list()
aucs_mrrs = list()
if mode == 'val':
loader = valloader
elif mode == 'test':
loader = testloader
elif mode == 'train':
loader = trainloader
with torch.no_grad():
total_loss = 0
signal = torch.tensor([0],dtype = int,device = device)
for roots,mfgs,metadata in loader:
pred_pos, pred_neg = model(mfgs,metadata)
total_loss += creterion(pred_pos, torch.ones_like(pred_pos))
total_loss += creterion(pred_neg, torch.zeros_like(pred_neg))
y_pred = torch.cat([pred_pos, pred_neg], dim=0).sigmoid().cpu()
y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0)
aps.append(average_precision_score(y_true, y_pred.detach().numpy()))
aucs_mrrs.append(roc_auc_score(y_true, y_pred))
if mailbox is not None:
src = metadata['src_pos_index']
dst = metadata['dst_pos_index']
ts = roots.ts
if(graph.edge_attr.device == torch.device('cpu')):
edge_feats = graph.edge_attr[roots.eids.to('cpu')].to('cuda') if graph.edge_attr is not None else None
else:
edge_feats = graph.edge_attr[roots.eids] if graph.edge_attr is not None else None
dist_index_mapper = mfgs[0][0].srcdata['ID']
root_index = torch.cat((src,dst))
last_updated_nid = model.module.memory_updater.last_updated_nid[root_index]
last_updated_memory = model.module.memory_updater.last_updated_memory[root_index]
last_updated_ts=model.module.memory_updater.last_updated_ts[root_index]
index, memory, memory_ts = mailbox.get_update_memory(last_updated_nid,
last_updated_memory,
last_updated_ts)
index, mail, mail_ts = mailbox.get_update_mail(dist_index_mapper,
src,dst,ts,edge_feats,
model.module.memory_updater.last_updated_memory,
)
mailbox.set_mailbox_all_to_all(index,memory,memory_ts,mail,mail_ts,reduce_Op = 'max')
#ap = float(torch.tensor(aps).mean())
#if neg_samples > 1:
# auc_mrr = float(torch.cat(aucs_mrrs).mean())
#else:
# auc_mrr = float(torch.tensor(aucs_mrrs).mean())
world_size = dist.get_world_size()
apc = torch.empty([loader.expected_idx*world_size],dtype = torch.float,device='cuda')
auc_mrr = torch.empty([loader.expected_idx*world_size],dtype = torch.float,device = 'cuda')
dist.all_gather_into_tensor(apc,torch.tensor(aps,device ='cuda',dtype=torch.float))
dist.all_gather_into_tensor(auc_mrr,torch.tensor(aucs_mrrs,device ='cuda',dtype=torch.float))
ap = float(torch.tensor(apc).mean())
auc_mrr = float(torch.tensor(auc_mrr).mean())
return ap, auc_mrr
creterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=train_param['lr'])
for e in range(train_param['epoch']):
torch.cuda.synchronize()
epoch_start_time = time.time()
train_aps = list()
print('Epoch {:d}:'.format(e))
time_prep = 0
total_loss = 0
model.train()
if mailbox is not None:
mailbox.reset()
model.module.memory_updater.last_updated_nid = None
model.module.memory_updater.last_updated_memory = None
model.module.memory_updater.last_updated_ts = None
for roots,mfgs,metadata in trainloader:
t_prep_s = time.time()
with torch.cuda.stream(train_stream):
optimizer.zero_grad()
pred_pos, pred_neg = model(mfgs,metadata)
loss = creterion(pred_pos, torch.ones_like(pred_pos))
loss += creterion(pred_neg, torch.zeros_like(pred_neg))
total_loss += float(loss)
loss.backward()
optimizer.step()
#torch.cuda.synchronize()
t_prep_s = time.time()
y_pred = torch.cat([pred_pos, pred_neg], dim=0).sigmoid().cpu()
y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0)
train_aps.append(average_precision_score(y_true, y_pred.detach().numpy()))
if mailbox is not None:
src = metadata['src_pos_index']
dst = metadata['dst_pos_index']
ts = roots.ts
if(graph.edge_attr.device == torch.device('cpu')):
edge_feats = graph.edge_attr[roots.eids.to('cpu')].to('cuda') if graph.edge_attr is not None else None
else:
edge_feats = graph.edge_attr[roots.eids] if graph.edge_attr is not None else None
dist_index_mapper = mfgs[0][0].srcdata['ID']
root_index = torch.cat((src,dst))
last_updated_nid = model.module.memory_updater.last_updated_nid[root_index]
last_updated_memory = model.module.memory_updater.last_updated_memory[root_index]
last_updated_ts=model.module.memory_updater.last_updated_ts[root_index]
index, memory, memory_ts = mailbox.get_update_memory(last_updated_nid,
last_updated_memory,
last_updated_ts)
index, mail, mail_ts = mailbox.get_update_mail(dist_index_mapper,
src,dst,ts,edge_feats,
model.module.memory_updater.last_updated_memory,
)
mailbox.set_mailbox_all_to_all(index,memory,memory_ts,mail,mail_ts,reduce_Op = 'max')
torch.cuda.synchronize()
time_prep = time.time() - epoch_start_time
avg_time += time.time() - epoch_start_time
train_ap = float(torch.tensor(train_aps).mean())
ap = 0
auc = 0
ap, auc = eval('val')
print('\ttrain loss:{:.4f} train ap:{:4f} val ap:{:4f} val auc:{:4f}'.format(total_loss,train_ap, ap, auc))
print('\ttotal time:{:.2f}s prep time:{:.2f}s'.format(time.time()-epoch_start_time, time_prep))
model.eval()
if mailbox is not None:
mailbox.reset()
model.module.memory_updater.last_updated_nid = None
eval('train')
eval('val')
ap, auc = eval('test')
eval_neg_samples = 1
if eval_neg_samples > 1:
print('\ttest AP:{:4f} test MRR:{:4f}'.format(ap, auc))
else:
print('\ttest AP:{:4f} test AUC:{:4f}'.format(ap, auc))
print('test_dataset',test_data.edges.shape[1],'avg_time',avg_time/train_param['epoch'])
ctx.shutdown()
if __name__ == "__main__":
main()
@@ -133,7 +133,6 @@ def all_to_all_s(
output_sizes = [t-s for s, t in zip(output_rowptr, output_rowptr[1:])]
input_sizes = [t-s for s, t in zip(input_rowptr, input_rowptr[1:])]
return dist.all_to_all_single(
output=output_tensor,
input=input_tensor,
......
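A hedged illustration (not the repo's code) of what this wrapper reduces to: an uneven all-to-all whose per-rank send/receive row counts come from the two rowptr lists, with the optional process group threaded through as in the callers below.

import torch.distributed as dist

def all_to_all_rows(output_tensor, input_tensor, output_rowptr, input_rowptr, group=None):
    # split sizes are the consecutive differences of the row pointers
    output_sizes = [t - s for s, t in zip(output_rowptr, output_rowptr[1:])]
    input_sizes = [t - s for s, t in zip(input_rowptr, input_rowptr[1:])]
    return dist.all_to_all_single(
        output=output_tensor,
        input=input_tensor,
        output_split_sizes=output_sizes,
        input_split_sizes=input_sizes,
        group=group,
    )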
@@ -17,6 +17,7 @@ class TensorAccessor:
self._data = data
self._ctx = DistributedContext.get_default_context()
self._rref = rpc.RRef(data)
+self.stream = torch.cuda.Stream()
@property
def data(self):
@@ -29,6 +30,12 @@ class TensorAccessor:
@property
def ctx(self):
return self._ctx
+@staticmethod
+@rpc.functions.async_execution
+def _index_selet(self):
+fut = torch.futures.Future()
+fut.set_result(None)
+return fut
def all_gather_rrefs(self) -> List[rpc.RRef]:
return self.ctx.all_gather_remote_objects(self.rref)
@@ -180,7 +187,7 @@ class DistributedTensor:
def ctx(self):
return self.accessor.ctx
-def all_to_all_ind2ptr(self, dist_index: Union[Tensor, DistIndex]) -> Dict[str, Union[List[int], Tensor]]:
+def all_to_all_ind2ptr(self, dist_index: Union[Tensor, DistIndex],group = None) -> Dict[str, Union[List[int], Tensor]]:
if isinstance(dist_index, Tensor):
dist_index = DistIndex(dist_index)
send_ptr = torch.ops.torch_sparse.ind2ptr(dist_index.part, self.num_parts())
@@ -196,7 +203,7 @@ class DistributedTensor:
recv_ptr = recv_ptr.tolist()
recv_ind = torch.full((recv_ptr[-1],), (2**62-1)*2+1, dtype=dist_index.dtype, device=dist_index.device)
-all_to_all_s(recv_ind, dist_index.loc, send_ptr, recv_ptr)
+all_to_all_s(recv_ind, dist_index.loc, recv_ptr, send_ptr,group=group)
return {
"send_ptr": send_ptr,
@@ -209,6 +216,7 @@ class DistributedTensor:
send_ptr: Optional[List[int]] = None,
recv_ptr: Optional[List[int]] = None,
recv_ind: Optional[List[int]] = None,
+group = None
) -> Tensor:
if dist_index is not None:
dist_dict = self.all_to_all_ind2ptr(dist_index)
@@ -218,7 +226,7 @@ class DistributedTensor:
data = self.accessor.data[recv_ind]
recv = torch.empty(send_ptr[-1], *data.shape[1:], dtype=data.dtype, device=data.device)
-all_to_all_s(recv, data, send_ptr, recv_ptr)
+all_to_all_s(recv, data, send_ptr, recv_ptr,group=group)
return recv
def all_to_all_set(self,
@@ -227,6 +235,7 @@ class DistributedTensor:
send_ptr: Optional[List[int]] = None,
recv_ptr: Optional[List[int]] = None,
recv_ind: Optional[List[int]] = None,
+group = None
):
if dist_index is not None:
dist_dict = self.all_to_all_ind2ptr(dist_index)
@@ -235,7 +244,7 @@ class DistributedTensor:
recv_ind = dist_dict["recv_ind"]
recv = torch.empty(recv_ptr[-1], *data.shape[1:], dtype=data.dtype, device=data.device)
-all_to_all_s(recv, data, recv_ptr, send_ptr)
+all_to_all_s(recv, data, recv_ptr, send_ptr,group=group)
self.accessor.data.index_copy_(0, recv_ind, recv)
def index_select(self, dist_index: Union[Tensor, DistIndex]):
......
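A hedged usage sketch of the new group keyword, mirroring how to_block in batch_data.py calls it after this commit; dist_tensor and dist_eid are stand-ins for a DistributedTensor and a tensor of distributed ids, and the sketch assumes torch.distributed is already initialized.

import torch.distributed as dist

group = dist.new_group(list(range(dist.get_world_size())))
ind_dict = dist_tensor.all_to_all_ind2ptr(dist_eid, group=group)
rows = dist_tensor.all_to_all_get(group=group, **ind_dict)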
@@ -7,6 +7,8 @@ from starrygl.sample.graph_core import DataSet
from starrygl.sample.graph_core import GraphData
from starrygl.sample.sample_core.base import BaseSampler, NegativeSampling
import dgl
+from starrygl.sample.stream_manager import PipelineManager, getPipelineManger
"""
The input parameters are unchanged; the return values become:
sample_from_nodes
@@ -42,7 +44,7 @@ def prepare_input(node_feat, edge_feat, mem_embedding,mfgs,dist_nid,dist_eid):
#print(idx.shape[0],b.srcdata['mem_ts'].shape)
return mfgs
-def to_block(graph: GraphData, data, sample_out, mailbox:MailBox = None,device = torch.device('cuda')):
+def to_block(graph: GraphData, data, sample_out, mailbox:MailBox = None,device = torch.device('cuda'),group = None):
if len(sample_out) > 1:
sample_out,metadata = sample_out
@@ -76,26 +78,25 @@ def to_block(graph: GraphData, data, sample_out, mailbox:MailBox = None,device =
dist_nid = nid_mapper[nid_tensor].to(device)
dist_nid,nid_inv = dist_nid.unique(return_inverse = True)
if isinstance(graph.edge_attr,DistributedTensor):
-local_index, input_split,output_split = graph.edge_attr.gather_select_index(dist_eid)
+ind_dict = graph.edge_attr.all_to_all_ind2ptr(dist_eid,group = group)
-edge_feat = graph.edge_attr.scatter_data(local_index,input_split=input_split,out_split=output_split)
+edge_feat = graph.edge_attr.all_to_all_get(group = group,**ind_dict)
else:
edge_feat = graph._get_edge_attr(dist_eid)
-local_index = None
+ind_dict = None
if isinstance(graph.x,DistributedTensor):
-local_index, input_split,output_split = graph.x.gather_select_index(dist_nid)
+ind_dict = graph.x.all_to_all_ind2ptr(dist_nid,group = group)
-node_feat = graph.x.scatter_data(local_index,input_split=input_split,out_split=output_split)
+node_feat = graph.x.all_to_all_get(group = group,**ind_dict)
else:
node_feat = graph._get_node_attr(dist_nid)
if mailbox is not None:
if torch.distributed.get_world_size() > 1:
if node_feat is None:
-local_index, input_split,output_split = mailbox.node_memory.gather_select_index(dist_nid)
+ind_dict = mailbox.node_memory.all_to_all_ind2ptr(dist_nid,group = group)
-mem = mailbox.gather_memory(local_index,input_split,output_split)
+mem = mailbox.gather_memory(**ind_dict)
else:
mem = mailbox.get_memory(dist_nid)
else:
mem = None
def build_block():
mfgs = list()
col = torch.arange(0,root_len,device = device)
@@ -110,23 +111,12 @@ def to_block(graph: GraphData, data, sample_out, mailbox:MailBox = None,device =
device = device)
idx = nid_inv[0:row_len + elen]
e_idx = eid_inv[col_len:col_len+elen]
-b.srcdata['ID'] = idx#dist_nid[idx]
+b.srcdata['ID'] = idx
if sample_out[r].delta_ts().shape[0] > 0:
b.edata['dt'] = sample_out[r].delta_ts().to(device)
if src_ts is not None:
b.srcdata['ts'] = src_ts[0:row_len + eid_len[r]]
-b.edata['ID'] = e_idx#dist_eid[e_idx]
+b.edata['ID'] = e_idx
-#if edge_feat is not None:
-# b.edata['f'] = edge_feat[e_idx]
-#if r == len(eid_len)-1:
-# if node_feat is not None:
-# b.srcdata['h'] = node_feat[idx]
-# if mem_embedding is not None:
-# node_memory,node_memory_ts,mailbox,mailbox_ts = mem_embedding
-# b.srcdata['mem'] = node_memory[idx]
-# b.srcdata['mem_ts'] = node_memory_ts[idx]
-# b.srcdata['mem_input'] = mailbox[idx].cuda().reshape(b.srcdata['ID'].shape[0], -1)
-# b.srcdata['mail_ts'] = mailbox_ts[idx]
col = row
col_len += eid_len[r]
row_len += eid_len[r]
@@ -134,27 +124,7 @@ def to_block(graph: GraphData, data, sample_out, mailbox:MailBox = None,device =
mfgs = list(map(list, zip(*[iter(mfgs)])))
mfgs.reverse()
return data,mfgs,metadata
data,mfgs,metadata = build_block()
-#if dist.get_world_size() > 1:
-# if(node_feat is None):
-# node_feat = torch.futures.Future()
-# node_feat.set_result(None)
-# if(edge_feat is None):
-# edge_feat = torch.futures.Future()
-# edge_feat.set_result(None)
-# if(mem is None):
-# mem = torch.futures.Future()
-# mem.set_result(None)
-# def callback(fs,mfgs,dist_nid,dist_eid):
-# node_feat,edge_feat,mem_embedding = fs.value()
-# node_feat = node_feat.value()
-# edge_feat = edge_feat.value()
-# mem_embedding = mem_embedding.value()
-# return prepare_input(node_feat,edge_feat,mem_embedding,mfgs,dist_nid,dist_eid)
-# cal = lambda fut: callback(fs=fut,mfgs = mfgs,dist_nid = dist_nid,dist_eid =dist_eid)
-# return data,torch.futures.collect_all([node_feat,edge_feat,mem]).then(cal),metadata
-#else:
mfgs = prepare_input(node_feat,edge_feat,mem,mfgs,dist_nid,dist_eid)
#return build_block(node_feat,edge_feat,mem)#data,mfgs,metadata
return data,mfgs,metadata
@@ -164,9 +134,15 @@ def graph_sample(graph, sampler:BaseSampler,
sample_fn, data,
neg_sampling = None,
mailbox = None,
-device = torch.device('cuda')):
+device = torch.device('cuda'),
+async_op = False):
out = sample_fn(sampler,data,neg_sampling)
+if async_op == False:
return to_block(graph,data,out,mailbox,device)
+else:
+manger = getPipelineManger()
+future = manger.submit('lookup',to_block,graph,data,out,mailbox,device)
+return future
def sample_from_nodes(sampler:BaseSampler, data:DataSet, **kwargs):
out = sampler.sample_from_nodes(nodes=data.nodes.reshape(-1))
......
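A hedged sketch of the new asynchronous path: with async_op=True the block construction is handed to the shared PipelineManager instead of running inline. Only submit() appears in this diff, so how the returned handle is awaited is an assumption, and graph, sampler, sample_fn, batch, neg_sampler and mailbox below are stand-ins for the objects built in the test script.

handle = graph_sample(graph, sampler, sample_fn, batch,
                      neg_sampling=neg_sampler, mailbox=mailbox,
                      device=torch.device('cuda'), async_op=True)
# ... overlap other work here ...
data, mfgs, metadata = handle.result()   # assumed accessor on the manager's future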
#include <omp.h>
#include <torch/extension.h>
#include <time.h>
#include <random>
#include <phmap.h>
#include <boost/thread/mutex.hpp>
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#define MTX boost::mutex
using namespace std;
namespace py = pybind11;
namespace th = torch;
typedef int64_t NodeIDType;
typedef int64_t EdgeIDType;
typedef float WeightType;
typedef float TimeStampType;
#define EXTRAARGS , phmap::priv::hash_default_hash<K>, \
phmap::priv::hash_default_eq<K>, \
std::allocator<K>, 4, MTX
template <class K, class V>
using HashM = phmap::parallel_flat_hash_map<K, V EXTRAARGS>;
// Cache buffer API: stubs only in this commit.
void init_cache_buffer(long long size){
}
at::Storage set_cache_buffer(at::Storage ind){
    // TODO: not implemented yet; return an empty storage placeholder
    return at::Storage();
}
// Update/replace the cached data according to the given policy and return the corresponding indices.
at::Storage update_buffer_index(string policy, at::Storage ind){
    // TODO: not implemented yet; a third, untyped parameter `u` in the draft signature is omitted until it is specified
    return at::Storage();
}
at::Storage get_buffer_index(at::Storage ind){
    // TODO: not implemented yet
    return at::Storage();
}
PYBIND11_MODULE(sample_cores, m)
{
    m.def("get_neighbors",
          &get_neighbors,
          py::return_value_policy::reference);
}
\ No newline at end of file
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cooperative_groups.h>
#include <cuda_runtime_api.h>
#include <atomic>
#include <iostream>
#include <limits>
#include <mutex>
#include <uvm_table.hpp>
namespace cg = cooperative_groups;
namespace {
constexpr int set_size = 4;
constexpr int block_size = 256;
template <typename key_type>
__host__ __device__ key_type hash(key_type key) {
return key;
}
template <typename key_type>
__global__ void hash_add_kernel(const key_type* new_keys, const int num_keys, key_type* keys,
const int num_sets, int* set_sizes, const int max_set_size,
key_type* missing_keys, int* num_missing_keys) {
__shared__ key_type s_missing_keys[block_size];
__shared__ int s_missing_count;
__shared__ size_t s_missing_idx;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
if (block.thread_rank() == 0) {
s_missing_count = 0;
}
block.sync();
size_t idx = grid.thread_rank();
if (idx < num_keys) {
auto key = new_keys[idx];
size_t idx_set = hash(key) % num_sets;
int prev_set_size = atomicAdd(&set_sizes[idx_set], 1);
if (prev_set_size < max_set_size) {
keys[idx_set * max_set_size + prev_set_size] = key;
} else {
int count = atomicAdd(&s_missing_count, 1);
s_missing_keys[count] = key;
}
}
block.sync();
if (block.thread_rank() == 0) {
s_missing_idx = atomicAdd(num_missing_keys, s_missing_count);
}
block.sync();
for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
missing_keys[s_missing_idx + i] = s_missing_keys[i];
}
}
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, int* num_keys_ptr,
const key_type* keys, const size_t num_sets,
const int max_set_size, index_type* output_indices) {
constexpr int tile_size = set_size;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<tile_size>(block);
int num_keys = *num_keys_ptr;
if (num_keys == 0) return;
#if (CUDA_VERSION < 11060)
size_t num_threads_per_grid = grid.size();
#else
size_t num_threads_per_grid = grid.num_threads();
#endif
size_t step = (num_keys - 1) / num_threads_per_grid + 1;
for (size_t i = 0; i < step; i++) {
size_t idx = i * num_threads_per_grid + grid.thread_rank();
key_type query_key = std::numeric_limits<key_type>::max();
if (idx < num_keys) {
query_key = query_keys[idx];
}
auto idx_set = hash(query_key) % num_sets;
for (int j = 0; j < tile_size; j++) {
auto current_idx_set = tile.shfl(idx_set, j);
auto current_query_key = tile.shfl(query_key, j);
if (current_query_key == std::numeric_limits<key_type>::max()) {
continue;
}
auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
int existed = tile.ballot(current_query_key == candidate_key);
auto current_idx = tile.shfl(idx, 0) + j;
if (existed) {
int src_lane = __ffs(existed) - 1;
size_t found_idx = current_idx_set * set_size + src_lane;
output_indices[current_idx] = num_sets * src_lane + current_idx_set;
} else {
output_indices[current_idx] = std::numeric_limits<index_type>::max();
}
}
}
}
template <typename key_type, typename index_type>
__global__ void hash_query_kernel(const key_type* query_keys, const int num_keys,
const key_type* keys, const size_t num_sets,
const int max_set_size, index_type* output_indices,
key_type* missing_keys, int* missing_positions,
int* missing_count) {
__shared__ key_type s_missing_keys[block_size];
__shared__ key_type s_missing_positions[block_size];
__shared__ int s_missing_count;
__shared__ int s_missing_idx;
constexpr int tile_size = set_size;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<tile_size>(block);
if (block.thread_rank() == 0) {
s_missing_count = 0;
}
block.sync();
size_t idx = grid.thread_rank();
key_type query_key = std::numeric_limits<key_type>::max();
if (idx < num_keys) {
query_key = query_keys[idx];
}
auto idx_set = hash(query_key) % num_sets;
for (int j = 0; j < tile_size; j++) {
auto current_idx_set = tile.shfl(idx_set, j);
auto current_query_key = tile.shfl(query_key, j);
if (current_query_key == std::numeric_limits<key_type>::max()) {
continue;
}
auto candidate_key = keys[current_idx_set * set_size + tile.thread_rank()];
int existed = tile.ballot(current_query_key == candidate_key);
if (existed) {
int src_lane = __ffs(existed) - 1;
size_t found_idx = current_idx_set * set_size + src_lane;
output_indices[tile.shfl(idx, 0) + j] = num_sets * src_lane + current_idx_set;
} else {
auto current_idx = tile.shfl(idx, 0) + j;
output_indices[current_idx] = std::numeric_limits<index_type>::max();
if (tile.thread_rank() == 0) {
int s_count = atomicAdd(&s_missing_count, 1);
s_missing_keys[s_count] = current_query_key;
s_missing_positions[s_count] = current_idx;
}
}
}
if (missing_keys == nullptr) {
if (grid.thread_rank() == 0 && missing_count) {
*missing_count = 0;
}
return;
}
block.sync();
if (block.thread_rank() == 0) {
s_missing_idx = atomicAdd(missing_count, s_missing_count);
}
block.sync();
for (size_t i = block.thread_rank(); i < s_missing_count; i += block.num_threads()) {
missing_keys[s_missing_idx + i] = s_missing_keys[i];
missing_positions[s_missing_idx + i] = s_missing_positions[i];
}
}
template <int warp_size>
__forceinline__ __device__ void warp_tile_copy(const size_t lane_idx,
const size_t emb_vec_size_in_float,
volatile float* d_dst, const float* d_src) {
// 16 bytes align
if (emb_vec_size_in_float % 4 != 0 || (size_t)d_dst % 16 != 0 || (size_t)d_src % 16 != 0) {
#pragma unroll
for (size_t i = lane_idx; i < emb_vec_size_in_float; i += warp_size) {
d_dst[i] = d_src[i];
}
} else {
#pragma unroll
for (size_t i = lane_idx; i < emb_vec_size_in_float / 4; i += warp_size) {
*(float4*)(d_dst + i * 4) = __ldg((const float4*)(d_src + i * 4));
}
}
}
template <typename index_type, typename vec_type>
__global__ void read_vectors_kernel(const index_type* query_indices, const int num_keys,
const vec_type* vectors, const int vec_size,
vec_type* output_vectors) {
constexpr int warp_size = 32;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<warp_size>(block);
#if (CUDA_VERSION < 11060)
auto num_threads_per_grid = grid.size();
#else
auto num_threads_per_grid = grid.num_threads();
#endif
for (int step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
int key_num = step * num_threads_per_grid + grid.thread_rank();
index_type idx = std::numeric_limits<index_type>::max();
if (key_num < num_keys) {
idx = query_indices[key_num];
}
#pragma unroll 4
for (size_t j = 0; j < warp_size; j++) {
index_type current_idx = tile.shfl(idx, j);
index_type idx_write = tile.shfl(key_num, 0) + j;
if (current_idx == std::numeric_limits<index_type>::max()) continue;
warp_tile_copy<warp_size>(tile.thread_rank(), vec_size, output_vectors + idx_write * vec_size,
vectors + current_idx * vec_size);
}
}
}
template <typename index_type, typename vec_type>
__global__ void distribute_vectors_kernel(const index_type* positions, const size_t num_keys,
const vec_type* vectors, const int vec_size,
vec_type* output_vectors) {
constexpr int warp_size = 32;
auto grid = cg::this_grid();
auto block = cg::this_thread_block();
auto tile = cg::tiled_partition<warp_size>(block);
#if (CUDA_VERSION < 11060)
auto num_threads_per_grid = grid.size();
#else
auto num_threads_per_grid = grid.num_threads();
#endif
for (size_t step = 0; step < (num_keys - 1) / num_threads_per_grid + 1; step++) {
size_t key_num = step * num_threads_per_grid + grid.thread_rank();
index_type idx = std::numeric_limits<index_type>::max();
if (key_num < num_keys) {
idx = positions[key_num];
}
#pragma unroll 4
for (size_t j = 0; j < warp_size; j++) {
size_t idx_write = tile.shfl(idx, j);
size_t idx_read = tile.shfl(key_num, 0) + j;
if (idx_write == std::numeric_limits<index_type>::max()) continue;
warp_tile_copy<warp_size>(tile.thread_rank(), vec_size,
output_vectors + (size_t)idx_write * vec_size,
vectors + (size_t)idx_read * vec_size);
}
}
}
} // namespace
namespace gpu_cache {
template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::UvmTable(const size_t device_table_capacity,
const size_t host_table_capacity,
const int max_batch_size, const int vec_size,
const vec_type default_value)
: max_batch_size_(std::max(100000, max_batch_size)),
vec_size_(vec_size),
num_set_((device_table_capacity - 1) / set_size + 1),
num_host_set_((host_table_capacity - 1) / set_size + 1),
table_capacity_(num_set_ * set_size),
default_vector_(vec_size, default_value),
device_table_(device_table_capacity, set_size, max_batch_size_),
host_table_(host_table_capacity * 1.1, set_size, max_batch_size_) {
CUDA_CHECK(cudaMalloc(&d_keys_buffer_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_vectors_buffer_, sizeof(vec_type) * max_batch_size_ * vec_size_));
CUDA_CHECK(cudaMalloc(&d_vectors_, sizeof(vec_type) * device_table_.capacity * vec_size_));
CUDA_CHECK(cudaMalloc(&d_output_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_output_host_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(cudaMallocHost(&h_output_host_indices_, sizeof(index_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_missing_keys_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_missing_positions_, sizeof(int) * max_batch_size_));
CUDA_CHECK(cudaMalloc(&d_missing_count_, sizeof(int)));
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(int)));
CUDA_CHECK(cudaStreamCreate(&query_stream_));
for (int i = 0; i < num_buffers_; i++) {
int batch_size_per_buffer = ceil(1.0 * max_batch_size_ / num_buffers_);
CUDA_CHECK(
cudaMallocHost(&h_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
CUDA_CHECK(cudaMalloc(&d_cpy_buffers_[i], sizeof(vec_type) * batch_size_per_buffer * vec_size));
CUDA_CHECK(cudaStreamCreate(&cpy_streams_[i]));
CUDA_CHECK(cudaEventCreate(&cpy_events_[i]));
}
CUDA_CHECK(cudaMallocHost(&h_missing_keys_, sizeof(key_type) * max_batch_size_));
CUDA_CHECK(cudaEventCreate(&query_event_));
h_vectors_.resize(host_table_.capacity * vec_size_);
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::add(const key_type* h_keys,
const vec_type* h_vectors,
const size_t num_keys) {
std::vector<key_type> h_missing_keys;
size_t num_batches = (num_keys - 1) / max_batch_size_ + 1;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size =
i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_;
CUDA_CHECK(cudaMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_,
sizeof(*d_keys_buffer_) * this_batch_size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
device_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0);
CUDA_CHECK(cudaDeviceSynchronize());
int num_missing_keys;
CUDA_CHECK(cudaMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
cudaMemcpyDeviceToHost));
size_t prev_size = h_missing_keys.size();
h_missing_keys.resize(prev_size + num_missing_keys);
CUDA_CHECK(cudaMemcpy(h_missing_keys.data() + prev_size, d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, cudaMemcpyDeviceToHost));
}
std::vector<key_type> h_final_missing_keys;
num_batches = h_missing_keys.size() ? (h_missing_keys.size() - 1) / max_batch_size_ + 1 : 0;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size =
i != num_batches - 1 ? max_batch_size_ : h_missing_keys.size() - i * max_batch_size_;
CUDA_CHECK(cudaMemcpy(d_keys_buffer_, h_missing_keys.data() + i * max_batch_size_,
sizeof(*d_keys_buffer_) * this_batch_size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
host_table_.add(d_keys_buffer_, this_batch_size, d_missing_keys_, d_missing_count_, 0);
CUDA_CHECK(cudaDeviceSynchronize());
int num_missing_keys;
CUDA_CHECK(cudaMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
cudaMemcpyDeviceToHost));
size_t prev_size = h_final_missing_keys.size();
h_final_missing_keys.resize(prev_size + num_missing_keys);
CUDA_CHECK(cudaMemcpy(h_final_missing_keys.data() + prev_size, d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, cudaMemcpyDeviceToHost));
}
std::vector<key_type> h_keys_buffer(max_batch_size_);
std::vector<index_type> h_indices_buffer(max_batch_size_);
std::vector<int> h_positions_buffer(max_batch_size_);
num_batches = (num_keys - 1) / max_batch_size_ + 1;
size_t num_hit_keys = 0;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size =
i != num_batches - 1 ? max_batch_size_ : num_keys - i * max_batch_size_;
CUDA_CHECK(cudaMemcpy(d_keys_buffer_, h_keys + i * max_batch_size_,
sizeof(*d_keys_buffer_) * this_batch_size, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
device_table_.query(d_keys_buffer_, this_batch_size, d_output_indices_, d_missing_keys_,
d_missing_positions_, d_missing_count_, 0);
CUDA_CHECK(cudaStreamSynchronize(0));
CUDA_CHECK(cudaMemcpy(d_vectors_buffer_, h_vectors + i * max_batch_size_ * vec_size_,
sizeof(*d_vectors_) * this_batch_size * vec_size_,
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaStreamSynchronize(0));
if (num_hit_keys < device_table_.capacity) {
distribute_vectors_kernel<<<(this_batch_size - 1) / block_size + 1, block_size, 0, 0>>>(
d_output_indices_, this_batch_size, d_vectors_buffer_, vec_size_, d_vectors_);
CUDA_CHECK(cudaStreamSynchronize(0));
}
int num_missing_keys;
CUDA_CHECK(cudaMemcpy(&num_missing_keys, d_missing_count_, sizeof(num_missing_keys),
cudaMemcpyDeviceToHost));
num_hit_keys += this_batch_size - num_missing_keys;
host_table_.query(d_missing_keys_, num_missing_keys, d_output_indices_, nullptr, nullptr,
nullptr, 0);
CUDA_CHECK(cudaMemcpy(h_keys_buffer.data(), d_missing_keys_,
sizeof(*d_missing_keys_) * num_missing_keys, cudaMemcpyDeviceToHost))
CUDA_CHECK(cudaMemcpy(h_indices_buffer.data(), d_output_indices_,
sizeof(*d_output_indices_) * num_missing_keys, cudaMemcpyDeviceToHost))
CUDA_CHECK(cudaMemcpy(h_positions_buffer.data(), d_missing_positions_,
sizeof(*d_missing_positions_) * num_missing_keys, cudaMemcpyDeviceToHost))
for (int j = 0; j < num_missing_keys; j++) {
if (h_indices_buffer[j] != std::numeric_limits<index_type>::max()) {
memcpy(h_vectors_.data() + h_indices_buffer[j] * vec_size_,
h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_,
sizeof(*h_vectors) * vec_size_);
} else {
size_t prev_idx = h_vectors_.size() / vec_size_;
h_final_missing_items_.emplace(h_keys_buffer[j], prev_idx);
h_vectors_.resize(h_vectors_.size() + vec_size_);
memcpy(h_vectors_.data() + prev_idx * vec_size_,
h_vectors + (i * max_batch_size_ + h_positions_buffer[j]) * vec_size_,
sizeof(*h_vectors) * vec_size_);
}
}
}
CUDA_CHECK(cudaMemset(d_missing_count_, 0, sizeof(*d_missing_count_)));
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::query(const key_type* d_keys, const int num_keys,
vec_type* d_vectors, cudaStream_t stream) {
if (!num_keys) return;
CUDA_CHECK(cudaEventRecord(query_event_, stream));
CUDA_CHECK(cudaStreamWaitEvent(query_stream_, query_event_));
static_assert(num_buffers_ >= 2);
device_table_.query(d_keys, num_keys, d_output_indices_, d_missing_keys_, d_missing_positions_,
d_missing_count_, query_stream_);
CUDA_CHECK(cudaEventRecord(query_event_, query_stream_));
CUDA_CHECK(cudaStreamWaitEvent(cpy_streams_[0], query_event_));
int num_missing_keys;
CUDA_CHECK(cudaMemcpyAsync(&num_missing_keys, d_missing_count_, sizeof(*d_missing_count_),
cudaMemcpyDeviceToHost, cpy_streams_[0]));
host_table_.query(d_missing_keys_, d_missing_count_, d_output_host_indices_, query_stream_);
CUDA_CHECK(cudaStreamSynchronize(cpy_streams_[0]));
CUDA_CHECK(cudaMemsetAsync(d_missing_count_, 0, sizeof(*d_missing_count_), query_stream_));
CUDA_CHECK(cudaMemcpyAsync(h_output_host_indices_, d_output_host_indices_,
sizeof(index_type) * num_missing_keys, cudaMemcpyDeviceToHost,
query_stream_));
CUDA_CHECK(cudaMemcpyAsync(h_missing_keys_, d_missing_keys_, sizeof(key_type) * num_missing_keys,
cudaMemcpyDeviceToHost, cpy_streams_[0]));
read_vectors_kernel<<<(num_keys - 1) / block_size + 1, block_size, 0, cpy_streams_[1]>>>(
d_output_indices_, num_keys, d_vectors_, vec_size_, d_vectors);
CUDA_CHECK(cudaStreamSynchronize(query_stream_));
CUDA_CHECK(cudaStreamSynchronize(cpy_streams_[0]));
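  // Split the missing keys across num_buffers_ pinned staging buffers: fill each buffer with
  // OpenMP threads, then overlap the host-to-device copy and the scatter kernel on that
  // buffer's stream.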
int num_keys_per_buffer = ceil(1.0 * num_missing_keys / num_buffers_);
for (int buffer_num = 0; buffer_num < num_buffers_; buffer_num++) {
int num_keys_this_buffer = buffer_num != num_buffers_ - 1
? num_keys_per_buffer
: num_missing_keys - num_keys_per_buffer * buffer_num;
if (!num_keys_this_buffer) break;
#pragma omp parallel for num_threads(8)
for (size_t i = 0; i < static_cast<size_t>(num_keys_this_buffer); i++) {
size_t idx_key = buffer_num * num_keys_per_buffer + i;
index_type index = h_output_host_indices_[idx_key];
if (index == std::numeric_limits<index_type>::max()) {
key_type key = h_missing_keys_[idx_key];
auto iterator = h_final_missing_items_.find(key);
if (iterator != h_final_missing_items_.end()) {
index = iterator->second;
}
}
if (index != std::numeric_limits<index_type>::max()) {
memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, h_vectors_.data() + index * vec_size_,
sizeof(vec_type) * vec_size_);
} else {
memcpy(h_cpy_buffers_[buffer_num] + i * vec_size_, default_vector_.data(),
sizeof(vec_type) * vec_size_);
}
}
CUDA_CHECK(cudaMemcpyAsync(d_cpy_buffers_[buffer_num], h_cpy_buffers_[buffer_num],
sizeof(vec_type) * num_keys_this_buffer * vec_size_,
cudaMemcpyHostToDevice, cpy_streams_[buffer_num]));
distribute_vectors_kernel<<<(num_keys_this_buffer - 1) / block_size + 1, block_size, 0,
cpy_streams_[buffer_num]>>>(
d_missing_positions_ + buffer_num * num_keys_per_buffer, num_keys_this_buffer,
d_cpy_buffers_[buffer_num], vec_size_, d_vectors);
}
for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK(cudaEventRecord(cpy_events_[i], cpy_streams_[i]));
CUDA_CHECK(cudaStreamWaitEvent(stream, cpy_events_[i]));
}
}
template <typename key_type, typename index_type, typename vec_type>
void UvmTable<key_type, index_type, vec_type>::clear(cudaStream_t stream) {
device_table_.clear(stream);
host_table_.clear(stream);
}
template <typename key_type, typename index_type, typename vec_type>
UvmTable<key_type, index_type, vec_type>::~UvmTable() {
CUDA_CHECK(cudaFree(d_keys_buffer_));
CUDA_CHECK(cudaFree(d_vectors_buffer_));
CUDA_CHECK(cudaFree(d_vectors_));
CUDA_CHECK(cudaFree(d_output_indices_));
CUDA_CHECK(cudaFree(d_output_host_indices_));
CUDA_CHECK(cudaFreeHost(h_output_host_indices_));
CUDA_CHECK(cudaFree(d_missing_keys_));
CUDA_CHECK(cudaFree(d_missing_positions_));
CUDA_CHECK(cudaFree(d_missing_count_));
CUDA_CHECK(cudaFreeHost(h_missing_keys_));
CUDA_CHECK(cudaStreamDestroy(query_stream_));
CUDA_CHECK(cudaEventDestroy(query_event_));
for (int i = 0; i < num_buffers_; i++) {
CUDA_CHECK(cudaFreeHost(h_cpy_buffers_[i]));
CUDA_CHECK(cudaFree(d_cpy_buffers_[i]));
CUDA_CHECK(cudaStreamDestroy(cpy_streams_[i]));
CUDA_CHECK(cudaEventDestroy(cpy_events_[i]));
}
}
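// HashBlock: a fixed-capacity, set-associative key table in device memory. Keys hash into
// num_sets sets of at most max_set_size_ entries each; set_sizes_ tracks the fill of every set.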
template <typename key_type, typename index_type>
HashBlock<key_type, index_type>::HashBlock(size_t expected_capacity, int set_size, int batch_size)
: max_set_size_(set_size), batch_size_(batch_size) {
if (expected_capacity) {
num_sets = (expected_capacity - 1) / set_size + 1;
} else {
num_sets = 10000;
}
capacity = num_sets * set_size;
CUDA_CHECK(cudaMalloc(&keys, sizeof(*keys) * capacity));
CUDA_CHECK(cudaMalloc(&set_sizes_, sizeof(*set_sizes_) * num_sets));
CUDA_CHECK(cudaMemset(set_sizes_, 0, sizeof(*set_sizes_) * num_sets));
}
template <typename key_type, typename index_type>
HashBlock<key_type, index_type>::~HashBlock() {
CUDA_CHECK(cudaFree(keys));
CUDA_CHECK(cudaFree(set_sizes_));
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::query(const key_type* query_keys, const size_t num_keys,
index_type* output_indices, key_type* missing_keys,
int* missing_positions, int* num_missing_keys,
cudaStream_t stream) {
if (num_keys == 0) {
return;
}
size_t num_batches = (num_keys - 1) / batch_size_ + 1;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_;
hash_query_kernel<<<(this_batch_size - 1) / block_size + 1, block_size, 0, stream>>>(
query_keys, this_batch_size, keys, num_sets, max_set_size_, output_indices, missing_keys,
missing_positions, num_missing_keys);
}
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::query(const key_type* query_keys, int* num_keys,
index_type* output_indices, cudaStream_t stream) {
hash_query_kernel<<<128, 64, 0, stream>>>(query_keys, num_keys, keys, num_sets, max_set_size_,
output_indices);
}
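// Insert keys in batches; keys whose target set is already full are reported back through
// missing_keys / num_missing_keys so the caller can place them elsewhere.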
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::add(const key_type* new_keys, const size_t num_keys,
key_type* missing_keys, int* num_missing_keys,
cudaStream_t stream) {
if (num_keys == 0) {
return;
}
size_t num_batches = (num_keys - 1) / batch_size_ + 1;
for (size_t i = 0; i < num_batches; i++) {
size_t this_batch_size = i != num_batches - 1 ? batch_size_ : num_keys - i * batch_size_;
hash_add_kernel<<<(this_batch_size - 1) / block_size + 1, block_size, 0, stream>>>(
new_keys + i * this_batch_size, this_batch_size, keys, num_sets, set_sizes_, max_set_size_,
missing_keys, num_missing_keys);
}
}
template <typename key_type, typename index_type>
void HashBlock<key_type, index_type>::clear(cudaStream_t stream) {
CUDA_CHECK(cudaMemsetAsync(set_sizes_, 0, sizeof(*set_sizes_) * num_sets, stream));
}
template class HashBlock<int, size_t>;
template class HashBlock<int64_t, size_t>;
template class HashBlock<size_t, size_t>;
template class HashBlock<unsigned int, size_t>;
template class HashBlock<long long, size_t>;
template class UvmTable<int, size_t>;
template class UvmTable<int64_t, size_t>;
template class UvmTable<size_t, size_t>;
template class UvmTable<unsigned int, size_t>;
template class UvmTable<long long, size_t>;
} // namespace gpu_cache
\ No newline at end of file
...@@ -148,7 +148,7 @@ class DistributedDataLoader:
                 return batch_data
             else :
                 raise StopIteration
-        else :
+        if self.queue_size > 0 :
             num_reqs = min(self.queue_size - self.num_pending,self.expected_idx - self.submitted)
             for _ in range(num_reqs):
                 if len(self.result_queue)>0:
......
...@@ -4,7 +4,7 @@ import os.path as osp
 import torch
 import torch.distributed as dist
 from torch_geometric.data import Data
-class GraphData():
+class GraphStore():
     def __init__(self, pdata, device = torch.device('cuda'), all_on_gpu = False):
         self.device = device
         self.ids = pdata.ids.to(device)
...@@ -14,8 +14,8 @@ class GraphData():
         else:
             self.edge_ts = None
         self.sample_graph = pdata.sample_graph
-        self.nids_mapper = build_mapper(nids=pdata.ids.to(device)).data.to('cpu')
-        self.eids_mapper = build_mapper(nids=pdata.eids.to(device)).data.to('cpu')
+        self.nids_mapper = build_mapper(nids=pdata.ids.to(device)).dist.to('cpu')
+        self.eids_mapper = build_mapper(nids=pdata.eids.to(device)).dist.to('cpu')
         if all_on_gpu:
             self.nids_mapper = self.nids_mapper.to(device)
             self.eids_mapper = self.eids_mapper.to(device)
...@@ -30,9 +30,9 @@ class GraphData():
             self.x = None
         if hasattr(pdata,'edge_attr') and pdata.edge_attr is not None:
             if world_size > 1:
-                self.edge_attr = DistributedTensor(pdata.edge_attr.to('cpu').to(torch.float))
+                self.edge_attr = DistributedTensor(pdata.edge_attr.to('cuda').to(torch.float))
             else:
-                self.edge_attr = pdata.edge_attr.to('cpu').to(torch.float)
+                self.edge_attr = pdata.edge_attr.to('cuda').to(torch.float)
         else:
             self.edge_attr = None
...@@ -106,7 +106,7 @@ class DataSet:
             setattr(d,k,v[indx])
         return d
-class TemporalGraphData():
+class TemporalGraphData(GraphStore):
     def __init__(self,pdata,device):
         super(TemporalGraphData,self).__init__(pdata,device)
     def _set_temporal_batch_cache(self,size,pin_size):
...@@ -117,7 +117,7 @@ class TemporalGraphData():
-class TemporalNeighborSampleGraph(GraphData):
+class TemporalNeighborSampleGraph(GraphStore):
     def __init__(self, sample_graph=None, mode='full', eids_mapper=None):
         self.edge_index = sample_graph['edge_index']
         self.num_edges = self.edge_index.shape[1]
......
+from typing import Union
 from typing import List
+from typing import Optional
 import torch
 from torch.distributed import rpc
 import torch_scatter
...@@ -118,7 +120,7 @@ class SharedMailBox():
     def set_mailbox_all_to_all(self,index,memory,
                                memory_ts,mail,mail_ts,
-                               reduce_Op = None):
+                               reduce_Op = None,group = None):
         #futs: List[torch.futures.Future] = []
         if self.num_parts == 1:
             dist_index = DistIndex(index)
...@@ -132,7 +134,7 @@ class SharedMailBox():
                                           device = self.device)
             indic = torch.searchsorted(index,self.partptr,right=False)
             scatter_len_list = indic[1:] - indic[0:-1]
-            torch.distributed.all_to_all_single(gather_len_list,scatter_len_list)
+            torch.distributed.all_to_all_single(gather_len_list,scatter_len_list,group = group)
             input_split = scatter_len_list.tolist()
             output_split = gather_len_list.tolist()
             gather_id_list = torch.empty(
...@@ -143,7 +145,7 @@ class SharedMailBox():
             output_split = gather_len_list.tolist()
             torch.distributed.all_to_all_single(
                 gather_id_list,index,output_split_sizes=output_split,
-                input_split_sizes=input_split)
+                input_split_sizes=input_split,group = group)
             index = gather_id_list
             gather_memory = torch.empty(
                 [gather_len_list.sum(),memory.shape[1]],
...@@ -160,22 +162,81 @@ class SharedMailBox():
             torch.distributed.all_to_all_single(
                 gather_memory,memory,
                 output_split_sizes=output_split,
-                input_split_sizes=input_split)
+                input_split_sizes=input_split,group = group)
             torch.distributed.all_to_all_single(
                 gather_memory_ts,memory_ts,
                 output_split_sizes=output_split,
-                input_split_sizes=input_split)
+                input_split_sizes=input_split,group = group)
             torch.distributed.all_to_all_single(
                 gather_mail,mail,
                 output_split_sizes=output_split,
-                input_split_sizes=input_split)
+                input_split_sizes=input_split,group = group)
             torch.distributed.all_to_all_single(
                 gather_mail_ts,mail_ts,
                 output_split_sizes=output_split,
-                input_split_sizes=input_split)
+                input_split_sizes=input_split,group = group)
         self.set_mailbox_local(DistIndex(index).loc,gather_mail,gather_mail_ts,Reduce_Op = reduce_Op)
         self.set_memory_local(DistIndex(index).loc,gather_memory,gather_memory_ts, Reduce_Op = reduce_Op)
+    def set_mailbox_all_to_all(self,index,memory,
+                               memory_ts,mail,mail_ts,
+                               reduce_Op = None,group = None):
+        #futs: List[torch.futures.Future] = []
+        if self.num_parts == 1:
+            dist_index = DistIndex(index)
+            part_idx = dist_index.part
+            index = dist_index.loc
+            self.set_mailbox_local(index,mail,mail_ts)
+            self.set_memory_local(index,memory,memory_ts)
+        else:
+            gather_len_list = torch.empty([self.num_parts],
+                                          dtype = int,
+                                          device = self.device)
+            indic = torch.searchsorted(index,self.partptr,right=False)
+            scatter_len_list = indic[1:] - indic[0:-1]
+            torch.distributed.all_to_all_single(gather_len_list,scatter_len_list,group = group)
+            input_split = scatter_len_list.tolist()
+            output_split = gather_len_list.tolist()
+            gather_id_list = torch.empty(
+                [gather_len_list.sum()],
+                dtype = torch.long,
+                device = self.device)
+            input_split = scatter_len_list.tolist()
+            output_split = gather_len_list.tolist()
+            torch.distributed.all_to_all_single(
+                gather_id_list,index,output_split_sizes=output_split,
+                input_split_sizes=input_split,group = group)
+            index = gather_id_list
+            gather_memory = torch.empty(
+                [gather_len_list.sum(),memory.shape[1]],
+                dtype = memory.dtype,device = self.device)
+            gather_memory_ts = torch.empty(
+                [gather_len_list.sum()],
+                dtype = memory_ts.dtype,device = self.device)
+            gather_mail = torch.empty(
+                [gather_len_list.sum(),mail.shape[1]],
+                dtype = mail.dtype,device = self.device)
+            gather_mail_ts = torch.empty(
+                [gather_len_list.sum()],
+                dtype = mail_ts.dtype,device = self.device)
+            torch.distributed.all_to_all_single(
+                gather_memory,memory,
+                output_split_sizes=output_split,
+                input_split_sizes=input_split,group = group)
+            torch.distributed.all_to_all_single(
+                gather_memory_ts,memory_ts,
+                output_split_sizes=output_split,
+                input_split_sizes=input_split,group = group)
+            torch.distributed.all_to_all_single(
+                gather_mail,mail,
+                output_split_sizes=output_split,
+                input_split_sizes=input_split,group = group)
+            torch.distributed.all_to_all_single(
+                gather_mail_ts,mail_ts,
+                output_split_sizes=output_split,
+                input_split_sizes=input_split,group = group)
+        self.set_mailbox_local(DistIndex(index).loc,gather_mail,gather_mail_ts,Reduce_Op = reduce_Op)
+        self.set_memory_local(DistIndex(index).loc,gather_memory,gather_memory_ts, Reduce_Op = reduce_Op)
     def get_update_mail(self,dist_indx_mapper,
                         src,dst,ts,edge_feats,
...@@ -234,9 +295,16 @@ class SharedMailBox():
             #print(memory.shape[0])
             return memory,memory_ts,mail,mail_ts
         return torch.futures.collect_all([memory,memory_ts,mail,mail_ts]).then(callback)
-    def gather_memory(self,index,input_split,out_split):
-        return self.node_memory.scatter_data(index,input_split,out_split),\
-               self.node_memory_ts.scatter_data(index,input_split,out_split),\
-               self.mailbox.scatter_data(index,input_split,out_split),\
-               self.mailbox_ts.scatter_data(index,input_split,out_split)
+    def gather_memory(
+            self,
+            dist_index: Union[torch.Tensor, DistIndex, None] = None,
+            send_ptr: Optional[List[int]] = None,
+            recv_ptr: Optional[List[int]] = None,
+            recv_ind: Optional[List[int]] = None,
+            group = None
+        ):
+        return self.node_memory.all_to_all_get(dist_index,send_ptr,recv_ptr,recv_ind,group),\
+               self.node_memory_ts.all_to_all_get(dist_index,send_ptr,recv_ptr,recv_ind,group),\
+               self.mailbox.all_to_all_get(dist_index,send_ptr,recv_ptr,recv_ind,group),\
+               self.mailbox_ts.all_to_all_get(dist_index,send_ptr,recv_ptr,recv_ind,group)
...@@ -2,7 +2,8 @@ import parser
 from torch_sparse import SparseTensor
 from torch_geometric.data import Data
 from torch_geometric.utils import degree
-from .torch_utils import get_norm_temporal
+from starrygl.lib.libstarrygl_ops import metis_partition
+from starrygl.lib.libstarrygl_ops_sampler import get_norm_temporal
 import os.path as osp
 import os
 import shutil
......
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CppExtension
setup(
name='torch_utils',
ext_modules=[
CppExtension(
name='torch_utils',
sources=['./torch_utils.cpp'],
extra_compile_args=['-fopenmp','-Xlinker',' -export-dynamic'],
include_dirs=["../sample_core"],
),
],
cmdclass={
'build_ext': BuildExtension
})#
#setup(
# name='cpu_cache_manager',
# ext_modules=[
# CppExtension(
# name='cpu_cache_manager',
# sources=['cpu_cache_manager.cpp'],
# extra_compile_args=['-fopenmp','-Xlinker',' -export-dynamic'],
# include_dirs=["./"],
# ),
# ],
# cmdclass={
# 'build_ext': BuildExtension
# })#
#
\ No newline at end of file
+from concurrent.futures import ThreadPoolExecutor, thread
+from multiprocessing import Process
+from multiprocessing.pool import ThreadPool
+from typing import Deque
 import torch
-class WorkStreamEvent:
-    def __init__(self):
-        self.train_stream = torch.cuda.Stream()
-        self.write_memory_stream = torch.cuda.Stream()
-        self.fetch_stream = torch.cuda.Stream()
-        self.write_mail_stream = torch.cuda.Stream()
-        self.event = None
-event = WorkStreamEvent()
-def get_event():
-    return event.event
+import torch.distributed as dist
+stage = ['train_stream','write_memory','write_mail','lookup']
+class PipelineManager:
+    def __init__(self,num_tasks = 10):
+        self.stream_set = {}
+        self.dist_set = {}
+        self.args_queue = {}
+        self.thread_pool = ThreadPoolExecutor(num_tasks)
+        for k in stage:
+            self.stream_set[k] = torch.cuda.Stream()
+            self.dist_set[k] = dist.new_group()
+            self.args_queue[k] = Deque()
+    def submit(self,state,func,*args):
+        future = self.thread_pool.submit(target=self.run, args=(state,func,args))
+        return future
+    def run(self,state,func,*args):
+        while torch.cuda.stream(self.stream_set[state]):
+            func(args, group = self.dist_set[state])
+manger = None
+def getPipelineManger():
+    global manger
+    if manger == None:
+        manger = PipelineManager()
+    return manger
\ No newline at end of file
nohup python train_tgnn.py --dataname TaoBao --world_size 4 --rank 0 > taobao4.out &
nohup python train_tgnn.py --dataname TaoBao --world_size 4 --rank 1 > taobao4.out &
nohup python train_tgnn.py --dataname TaoBao --world_size 4 --rank 2 > taobao4.out &
nohup python train_tgnn.py --dataname TaoBao --world_size 4 --rank 3 > taobao4.out &
nohup python train_tgnn.py --dataname GDELT --world_size 4 --rank 0 > GDELT4.out &
nohup python train_tgnn.py --dataname GDELT --world_size 4 --rank 1 > GDELT4.out &
nohup python train_tgnn.py --dataname GDELT --world_size 4 --rank 2 > GDELT4.out &
nohup python train_tgnn.py --dataname GDELT --world_size 4 --rank 3 > GDELT4.out &
nohup python train_tgnn.py --dataname ML25M --world_size 2 --rank 0 > ML25M2.out &
nohup python train_tgnn.py --dataname ML25M --world_size 2 --rank 1 > ML25M2.out &
nohup python train_tgnn.py --dataname TaoBao --world_size 2 --rank 0 > TaoBao2.out &
nohup python train_tgnn.py --dataname TaoBao --world_size 2 --rank 1 > TaoBao2.out &
nohup python train_tgnn.py --dataname ML25M --world_size 1 --rank 0 > ML25M1.out &
nohup python train_tgnn.py --dataname TaoBao --world_size 1 --rank 0 > TaoBao1.out &
nohup python train_tgnn.py --dataname GDELT --world_size 1 --rank 0 > GDELT1.out &
\ No newline at end of file
...@@ -25,24 +25,7 @@ import os
 from starrygl.sample.data_loader import DistributedDataLoader
 from starrygl.sample.batch_data import SAMPLE_TYPE
-"""
-test command
-python test.py --world_size 2 --rank 0
---world_size', default=4, type=int, metavar='W',
-                    help='number of workers')
-parser.add_argument('--rank', default=0, type=int, metavar='W',
-                    help='rank of the worker')
-parser.add_argument('--log_interval', type=int, default=10, metavar='N',
-                    help='interval between training status logs')
-parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
-                    help='how much to value future rewards')
-parser.add_argument('--seed', type=int, default=1, metavar='S',
-                    help='random seed for reproducibility')
-parser.add_argument('--num_sampler', type=int, default=10, metavar='S',
-                    help='number of samplers')
-parser.add_argument('--queue_size', type=int, default=10, metavar='S',
-                    help='sampler queue size')
-"""
 parser = argparse.ArgumentParser(
     description="RPC Reinforcement Learning Example",
     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
...@@ -51,6 +34,8 @@ parser.add_argument('--rank', default=0, type=str, metavar='W',
                     help='name of dataset')
 parser.add_argument('--world_size', default=1, type=int, metavar='W',
                     help='number of negative samples')
+parser.add_argument('--dataname', default=1, type=str, metavar='W',
+                    help='number of negative samples')
 args = parser.parse_args()
 from sklearn.metrics import average_precision_score, roc_auc_score
 import torch
...@@ -82,7 +67,7 @@ def main():
     ctx = DistributedContext.init(backend="nccl", use_gpu=True)
     device_id = torch.cuda.current_device()
     print('use cuda on',device_id)
-    pdata = partition_load("./dataset/here/GDELT", algo="metis_for_tgnn")
+    pdata = partition_load("./dataset/here/{}".format(args.dataname), algo="metis_for_tgnn")
     graph = GraphData(pdata = pdata)
     sample_graph = TemporalNeighborSampleGraph(sample_graph = pdata.sample_graph,mode = 'full')
...@@ -174,10 +159,12 @@ def main():
             dst = metadata['dst_pos_index']
             ts = roots.ts
-            if(graph.edge_attr.device == torch.device('cpu')):
-                edge_feats = graph.edge_attr[roots.eids.to('cpu')].to('cuda') if graph.edge_attr is not None else None
+            if graph.edge_attr is None:
+                edge_feats = None
+            elif(graph.edge_attr.device == torch.device('cpu')):
+                edge_feats = graph.edge_attr[roots.eids.to('cpu')].to('cuda')
             else:
-                edge_feats = graph.edge_attr[roots.eids] if graph.edge_attr is not None else None
+                edge_feats = graph.edge_attr[roots.eids]
             dist_index_mapper = mfgs[0][0].srcdata['ID']
             root_index = torch.cat((src,dst))
             last_updated_nid = model.module.memory_updater.last_updated_nid[root_index]
...@@ -247,10 +234,12 @@ def main():
             dst = metadata['dst_pos_index']
             ts = roots.ts
-            if(graph.edge_attr.device == torch.device('cpu')):
-                edge_feats = graph.edge_attr[roots.eids.to('cpu')].to('cuda') if graph.edge_attr is not None else None
+            if graph.edge_attr is None:
+                edge_feats = None
+            elif(graph.edge_attr.device == torch.device('cpu')):
+                edge_feats = graph.edge_attr[roots.eids.to('cpu')].to('cuda')
             else:
-                edge_feats = graph.edge_attr[roots.eids] if graph.edge_attr is not None else None
+                edge_feats = graph.edge_attr[roots.eids]
             dist_index_mapper = mfgs[0][0].srcdata['ID']
             root_index = torch.cat((src,dst))
             last_updated_nid = model.module.memory_updater.last_updated_nid[root_index]
......