zhlj / starrygl-DynamicHistory · Commits

Commit 2bfb3403, authored Dec 14, 2023 by Wenjie Huang
add install.sh

Parent: d1488225
Showing 7 changed files, with 154 additions and 90 deletions:

.gitignore                       +1   -0
CMakeLists.txt                   +28  -8
csrc/include/utils.h             +0   -6
csrc/metis/metis.cpp             +13  -7
install.sh                       +11  -0
starrygl/distributed/cclib.py    +24  -17
starrygl/distributed/context.py  +77  -52
.gitignore
@@ -162,3 +162,4 @@ cython_debug/
 /dataset
 /test_*
 /*.ipynb
+/third_party
CMakeLists.txt
@@ -5,6 +5,7 @@ project(starrygl_ops VERSION 0.1)
 option(WITH_PYTHON "Link to Python when building" ON)
 option(WITH_CUDA "Link to CUDA when building" ON)
 option(WITH_METIS "Link to METIS when building" ON)
+option(WITH_MTMETIS "Link to multi-threaded METIS when building" ON)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -27,18 +28,38 @@ if(WITH_CUDA)
 endif()

 if(WITH_METIS)
-    set(GKLIB_DIR "third_party/GKlib")
-    set(METIS_DIR "third_party/METIS")
+    add_definitions(-DWITH_METIS)
+    set(GKLIB_DIR "${CMAKE_SOURCE_DIR}/third_party/GKlib")
+    set(METIS_DIR "${CMAKE_SOURCE_DIR}/third_party/METIS")
+    add_subdirectory(${GKLIB_DIR})
+    set(GKLIB_INCLUDE_DIRS "${GKLIB_DIR}/include")
+    set(GKLIB_LIBRARIES "${GKLIB_DIR}/lib")
+    set(METIS_INCLUDE_DIRS "${METIS_DIR}/include")
+    set(METIS_LIBRARIES "${METIS_DIR}/lib")
+    include_directories(${METIS_INCLUDE_DIRS})
+    link_libraries(${METIS_LIBRARIES})
+endif()
+
+if(WITH_MTMETIS)
+    add_definitions(-DWITH_MTMETIS)
+    set(MTMETIS_DIR "${CMAKE_SOURCE_DIR}/third_party/mt-metis")
+    set(MTMETIS_INCLUDE_DIRS "${MTMETIS_DIR}/include")
+    set(MTMETIS_LIBRARIES "${MTMETIS_DIR}/lib")
+    include_directories(${MTMETIS_INCLUDE_DIRS})
+    link_libraries(${MTMETIS_LIBRARIES})
 endif()

+find_package(OpenMP REQUIRED)
+link_libraries(OpenMP::OpenMP_CXX)
+
 find_package(Torch REQUIRED)
 include_directories(${TORCH_INCLUDE_DIRS})
 add_compile_options(${TORCH_CXX_FLAGS})
-# find_package(OpenMP REQUIRED)

 include_directories("csrc/include")
 file(GLOB_RECURSE UVM_SRCS "csrc/uvm/*.cpp")
@@ -50,10 +71,9 @@ if(WITH_PYTHON)
     find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
     target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_PYTHON_LIBRARY})
 endif()

 target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})
-# target_link_libraries(${PROJECT_NAME} PRIVATE OpenMP::OpenMP_CXX)
 target_compile_definitions(${PROJECT_NAME} PRIVATE -DTORCH_EXTENSION_NAME=lib${PROJECT_NAME})

 # set_target_properties(${PROJECT_NAME} PROPERTIES PREFIX "" OUTPUT_NAME "_C")
-# install(TARGETS ${PROJECT_NAME} DESTINATION "${CMAKE_SOURCE_DIR}/starrygl/lib")
+install(TARGETS ${PROJECT_NAME} DESTINATION "${CMAKE_SOURCE_DIR}/starrygl/lib")
csrc/include/utils.h (deleted, mode 100644 → 0)
-#pragma once
-#include "extension.h"
-
-#define CHECK_CUDA(x) \
-    AT_ASSERTM(x.device().is_cuda(), #x "must be CUDA Tensor")
csrc/metis/metis.cpp
-// at::Tensor metis_partition(
-//     at::Tensor rowptr,
-//     at::Tensor col,
-//     at::optional<at::Tensor> optional_value,
-// ) {
-// }
-// at::Tensor metis_mt_partition() {
-// }
+#include <torch/torch.h>
+#include <torch/all.h>
+#include <metis.h>
+#include <mtmetis.h>
+
+at::Tensor metis_partition() {
+
+}
+
+at::Tensor metis_mt_partition() {
+
+}
\ No newline at end of file
install.sh (new file, mode 100644)
+#!/bin/bash
+
+mkdir -p build && cd build
+
+cmake .. \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+    -DCMAKE_PREFIX_PATH="/home/hwj/.miniconda3/envs/sgl/lib/python3.10/site-packages" \
+    -DPython3_ROOT_DIR="/home/hwj/.miniconda3/envs/sgl" \
+    -DCUDA_TOOLKIT_ROOT_DIR="/home/hwj/.local/cuda-11.7" \
+&& make -j32 \
+&& cp libstarrygl_ops.so ../starrygl/lib/
\ No newline at end of file
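The cmake paths above are specific to the author's machine; anyone reproducing the build would substitute their own conda environment and CUDA toolkit locations. As a quick post-install check, the copied extension can be imported directly from Python. A minimal sketch, not part of the commit: it assumes the module name libstarrygl_ops implied by -DTORCH_EXTENSION_NAME=lib${PROJECT_NAME} in CMakeLists.txt, and that it runs from the repository root after install.sh succeeds.

import importlib.util

import torch  # load libtorch symbols before importing the extension

spec = importlib.util.spec_from_file_location(
    "libstarrygl_ops", "starrygl/lib/libstarrygl_ops.so"
)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)  # raises ImportError on ABI or path mismatch
print([name for name in dir(mod) if not name.startswith("_")])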
starrygl/distributed/cclib.py
@@ -45,8 +45,8 @@ def all_to_all_v(
     assert len(output_tensor_list) == world_size
     assert len(input_tensor_list) == world_size

-    # if group is None:
-    #     group = dist.distributed_c10d._get_default_group()
+    if group is None:
+        group = dist.distributed_c10d._get_default_group()

     backend = dist.get_backend(group)
     if backend == "nccl":
@@ -101,20 +101,28 @@ def all_to_all_v(
         return work
     work.wait()

-    # else:
-    #     assert backend == "gloo", f"backend must be nccl, mpi or gloo"
-    #     rank = dist.get_rank(group)
-    #     world_size = dist.get_world_size(group)
-    #     p2p_op_list: List[dist.P2POp] = []
-    #     for i in range(1, world_size):
-    #         send_i = (rank + i) % world_size
-    #         recv_i = (rank - i + world_size) % world_size
-    #         p2p_op_list.extend([
-    #             dist.P2POp(dist.isend, input_tensor_list[send_i], send_i, group=group),
-    #             dist.P2POp(dist.irecv, output_tensor_list[recv_i], recv_i, group=group),
-    #         ])
-    #     dist.batch_isend_irecv(p2p_op_list)
-    #     output_tensor_list[rank][:] = input_tensor_list[rank]
+def all_to_all_s(
+    output_tensor: Tensor,
+    input_tensor: Tensor,
+    output_rowptr: List[int],
+    input_rowptr: List[int],
+    group: Optional[Any] = None,
+    async_op: bool = False,
+):
+    world_size = dist.get_world_size(group)
+    # rank = dist.get_rank(group)
+
+    assert len(output_rowptr) == len(input_rowptr)
+    assert len(output_rowptr) == world_size + 1
+
+    output_sizes = [t - s for s, t in zip(output_rowptr, output_rowptr[1:])]
+    input_sizes = [t - s for s, t in zip(input_rowptr, input_rowptr[1:])]
+
+    return dist.all_to_all_single(
+        output=output_tensor,
+        input=input_tensor,
+        output_split_sizes=output_sizes,
+        input_split_sizes=input_sizes,
+        group=group,
+        async_op=async_op,
+    )
\ No newline at end of file
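For orientation, a minimal usage sketch of the new all_to_all_s, illustrative rather than part of the commit: the rowptr lists are prefix sums over per-peer chunk sizes, so rank r sends input_tensor[input_rowptr[i]:input_rowptr[i+1]] to rank i and receives into output_tensor[output_rowptr[i]:output_rowptr[i+1]].

import torch
import torch.distributed as dist
from starrygl.distributed.cclib import all_to_all_s

def exchange(rank: int):
    # assumes dist.init_process_group(...) has already run on both of two ranks
    input_rowptr = [0, 2, 4]    # send 2 values to rank 0 and 2 values to rank 1
    output_rowptr = [0, 2, 4]   # receive 2 values from each peer (sizes must agree across ranks)
    input_tensor = torch.full((4,), float(rank))
    output_tensor = torch.empty(4)
    all_to_all_s(output_tensor, input_tensor, output_rowptr, input_rowptr)
    # on both ranks: output_tensor == [0., 0., 1., 1.]
    return output_tensor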
starrygl/distributed/context.py
@@ -7,9 +7,11 @@ import os
 from torch import Tensor
 from typing import *
+from contextlib import contextmanager

 import logging

-from .cclib import all_to_all_v
+from .cclib import all_to_all_v, all_to_all_s
 from .rpc import rpc_remote_call, rpc_remote_void_call
@@ -157,61 +159,84 @@ class DistributedContext:
     def remote_void_call(self, method, rref: rpc.RRef, *args, **kwargs):
         return rpc_remote_void_call(method, rref, *args, **kwargs)

-    # def all_to_all_v(self,
-    #     output_tensor_list: List[Tensor],
-    #     input_tensor_list: List[Tensor],
-    #     group: Any = None,
-    #     async_op: bool = False,
-    # ):
-    #     return all_to_all_v(
-    #         output_tensor_list,
-    #         input_tensor_list,
-    #         group=group,
-    #         async_op=async_op,
-    #     )
+    def all_to_all_v(self,
+        output_tensor_list: List[Tensor],
+        input_tensor_list: List[Tensor],
+        group: Optional[Any] = None,
+        async_op: bool = False,
+    ):
+        return all_to_all_v(
+            output_tensor_list,
+            input_tensor_list,
+            group=group,
+            async_op=async_op,
+        )

-    # def all_to_all_g(self,
-    #     input_tensor_list: List[Tensor],
-    #     group: Any = None,
-    #     async_op: bool = False,
-    # ):
-    #     send_sizes = [t.size(0) for t in input_tensor_list]
-    #     recv_sizes = self.get_all_to_all_recv_sizes(send_sizes, group)
-    #     output_tensor_list: List[Tensor] = []
-    #     for s, t in zip(recv_sizes, input_tensor_list):
-    #         output_tensor_list.append(
-    #             torch.empty(s, *t.shape[1:], dtype=t.dtype, device=t.device),
-    #         )
-    #     work = all_to_all_v(
-    #         output_tensor_list,
-    #         input_tensor_list,
-    #         group=group,
-    #         async_op=async_op,
-    #     )
-    #     if async_op:
-    #         assert work is not None
-    #         return output_tensor_list, work
-    #     else:
-    #         return output_tensor_list
+    def all_to_all_g(self,
+        input_tensor_list: List[Tensor],
+        group: Optional[Any] = None,
+        async_op: bool = False,
+    ):
+        send_sizes = [t.size(0) for t in input_tensor_list]
+        recv_sizes = self.get_all_to_all_recv_sizes(send_sizes, group)
+
+        output_tensor_list: List[Tensor] = []
+        for s, t in zip(recv_sizes, input_tensor_list):
+            output_tensor_list.append(
+                torch.empty(s, *t.shape[1:], dtype=t.dtype, device=t.device),
+            )
+
+        work = all_to_all_v(
+            output_tensor_list,
+            input_tensor_list,
+            group=group,
+            async_op=async_op,
+        )
+
+        if async_op:
+            assert work is not None
+            return output_tensor_list, work
+        else:
+            return output_tensor_list

-    # def all_to_all_s(self,
-    #     output_tensor: Tensor,
-    #     input_tensor: Tensor,
-    #     output_rowptr: List[int],
-    #     input_rowptr: List[int],
-    #     group: Any = None,
-    #     async_op: bool = False,
-    # ):
-    #     return all_to_all_s(
-    #         output_tensor, input_tensor,
-    #         output_rowptr, input_rowptr,
-    #         group=group, async_op=async_op,
-    #     )
+    def get_all_to_all_recv_sizes(self,
+        send_sizes: List[int],
+        group: Optional[Any] = None,
+    ) -> List[int]:
+        world_size = dist.get_world_size(group)
+        assert len(send_sizes) == world_size
+
+        if dist.get_backend(group) == "gloo":
+            send_t = torch.tensor(send_sizes, dtype=torch.long)
+        else:
+            send_t = torch.tensor(send_sizes, dtype=torch.long, device=self.device)
+        recv_t = torch.empty_like(send_t)
+        dist.all_to_all_single(recv_t, send_t, group=group)
+        return recv_t.tolist()

-    # def get_all_to_all_recv_sizes(self,
-    #     send_sizes: List[int],
-    #     group: Optional[Any] = None,
-    # ) -> List[int]:
-    #     world_size = dist.get_world_size(group)
-    #     assert len(send_sizes) == world_size
-    #     if dist.get_backend(group) == "gloo":
-    #         send_t = torch.tensor(send_sizes, dtype=torch.long)
-    #     else:
-    #         send_t = torch.tensor(send_sizes, dtype=torch.long, device=self.device)
-    #     recv_t = torch.empty_like(send_t)
-    #     dist.all_to_all_single(recv_t, send_t, group=group)
-    #     return recv_t.tolist()
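The new code follows a two-phase pattern: get_all_to_all_recv_sizes first exchanges the per-peer row counts with a fixed-size all_to_all_single, so that all_to_all_g can pre-allocate its receive buffers before the variable-size exchange. A usage sketch, illustrative rather than part of the commit; ctx stands for an initialised DistributedContext:

import torch

def exchange_variable_rows(ctx, world_size: int):
    # rank-local inputs: the tensor destined for peer i has (i + 1) rows of width 8
    inputs = [torch.randn(i + 1, 8) for i in range(world_size)]

    # blocking form: output buffers are allocated internally from the size exchange
    outputs = ctx.all_to_all_g(inputs)

    # async form returns the buffers plus a work handle to wait on
    outputs, work = ctx.all_to_all_g(inputs, async_op=True)
    work.wait()
    return outputs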
+    @contextmanager
+    def use_stream(self, stream: torch.cuda.Stream, with_event: bool = True):
+        event = torch.cuda.Event() if with_event else None
+        stream.wait_stream(torch.cuda.current_stream(self.device))
+        with torch.cuda.stream(stream):
+            yield event
+            if with_event:
+                event.record()
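use_stream is a context manager for running work on a side CUDA stream: the stream first waits on the current stream, the body executes on it, and an optional event is recorded on exit. A small illustrative sketch, not part of the commit, assuming ctx.device is a CUDA device:

import torch

def copy_on_side_stream(ctx, x: torch.Tensor):
    side = torch.cuda.Stream(device=ctx.device)
    with ctx.use_stream(side) as event:
        y = x.to(ctx.device, non_blocking=True)  # enqueued on `side`
    # `event` was recorded on `side` when the block exited
    torch.cuda.current_stream(ctx.device).wait_event(event)
    return y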
     def all_gather_remote_objects(self, obj: Any) -> List[rpc.RRef]:
         if not isinstance(obj, rpc.RRef):
 ...