Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
B
BTS-MTGNN
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhlj
BTS-MTGNN
Commits
e074b837
Commit
e074b837
authored
Nov 28, 2024
by
zlj
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fix time count in code
parent
b9ca4758
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
183 additions
and
164 deletions
+183
-164
config/TGN_large.yml
+1
-1
examples/test_all.sh
+155
-155
examples/train_boundery.py
+7
-4
starrygl/sample/data_loader.py
+18
-2
starrygl/sample/memory/shared_mailbox.py
+2
-2
No files found.
config/TGN_large.yml
View file @
e074b837
...
...
@@ -27,7 +27,7 @@ gnn:
dim_time
:
100
dim_out
:
100
train
:
-
epoch
:
50
-
epoch
:
1
batch_size
:
3000
# reorder: 16
lr
:
0.0004
...
...
examples/test_all.sh
View file @
e074b837
...
...
@@ -2,7 +2,7 @@
#跑了4卡的TaoBao
# 定义数组变量
seed
=
$1
addr
=
"192.168.1.10
5
"
addr
=
"192.168.1.10
7
"
partition_params
=(
"ours"
)
#"metis" "ldg" "random")
#("ours" "metis" "ldg" "random")
...
...
@@ -10,14 +10,14 @@ partitions="8"
node_per
=
"4"
nnodes
=
"2"
node_rank
=
"0"
probability_params
=(
"0.1"
)
sample_type_params
=(
"boundery_recent_decay"
)
probability_params
=(
"0.1"
"0.05"
"0.01"
"0"
)
sample_type_params
=(
"
recent"
"
boundery_recent_decay"
)
#sample_type_params=("recent" "boundery_recent_decay") #"boundery_recent_uniform")
#memory_type=("all_update" "p2p" "all_reduce" "historical" "local")
memory_type
=(
"all_update"
"historical"
"local"
)
memory_type
=(
"all_update"
)
#"historical")
#memory_type=("local" "all_update" "historical" "all_reduce")
shared_memory_ssim
=(
"0.3"
"0.7"
)
shared_memory_ssim
=(
"0.3"
)
#"historical")
#memory_type=("local" "all_update" "historical" "all_reduce")
#data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk")
...
...
@@ -99,143 +99,78 @@ for data in "${data_param[@]}"; do
done
done
for
data
in
"
${
data_param
[@]
}
"
;
do
model
=
"JODIE_large"
if
[
"
$data
"
=
"WIKI"
]
||
[
"
$data
"
=
"REDDIT"
]
||
[
"
$data
"
=
"LASTFM"
]
;
then
model
=
"JODIE"
#continue
fi
#model="APAN"
mkdir all_
"
$seed
"
/
"
$data
"
mkdir all_
"
$seed
"
/
"
$data
"
/
"
$model
"
mkdir all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/comm
#torchrun --nnodes "$nnodes" --node_rank 0 --nproc-per-node 1 --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition ours --memory_type local --sample_type recent --topk 0 --seed "$seed" > all_"$seed"/"$data"/"$model"/1.out &
wait
for
partition
in
"
${
partition_params
[@]
}
"
;
do
for
sample
in
"
${
sample_type_params
[@]
}
"
;
do
if
[
"
$sample
"
=
"recent"
]
;
then
for
mem
in
"
${
memory_type
[@]
}
"
;
do
if
[
"
$mem
"
=
"historical"
]
;
then
for
ssim
in
"
${
shared_memory_ssim
[@]
}
"
;
do
if
[
"
$partition
"
=
"ours"
]
||
[
"
$partition
"
=
"metis"
]
;
then
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0.1
--sample_type
"
$sample
"
--memory_type
"
$mem
"
--shared_memory_ssim
"
$ssim
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0
.01-
"
$mem
"
-
"
$ssim
"
-
"
$sample
"
.out &
wait
fi
done
elif
[
"
$mem
"
=
"all_reduce"
]
;
then
if
[
"
$partition
"
=
"ours"
]
||
[
"
$partition
"
=
"metis"
]
;
then
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0.1
--sample_type
"
$sample
"
--memory_type
"
$mem
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0
.01-
"
$mem
"
-
"
$sample
"
.out &
wait
fi
else
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0
--sample_type
"
$sample
"
--memory_type
"
$mem
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0-
"
$mem
"
-
"
$sample
"
.out &
wait
if
[
"
$partition
"
=
"ours"
]
&&
[
"
$mem
"
!=
"all_local"
]
;
then
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0.1
--sample_type
"
$sample
"
--memory_type
"
$mem
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0
.01-
"
$mem
"
-
"
$sample
"
.out &
wait
fi
fi
done
else
for
pro
in
"
${
probability_params
[@]
}
"
;
do
for
mem
in
"
${
memory_type
[@]
}
"
;
do
if
[
"
$mem
"
=
"historical"
]
;
then
for
ssim
in
"
${
shared_memory_ssim
[@]
}
"
;
do
if
[
"
$partition
"
=
"ours"
]
||
[
"
$partition
"
=
"metis"
]
;
then
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0.1
--sample_type
"
$sample
"
--probability
"
$pro
"
--memory_type
"
$mem
"
--shared_memory_ssim
"
$ssim
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0
.01-
"
$mem
"
-
"
$ssim
"
-
"
$sample
"
-
"
$pro
"
.out &
wait
fi
done
elif
[
"
$mem
"
=
"all_reduce"
]
;
then
if
[
"
$partition
"
=
"ours"
]
;
then
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0.1
--sample_type
"
$sample
"
--probability
"
$pro
"
--memory_type
"
$mem
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0
.01-
"
$mem
"
-
"
$sample
"
-
"
$pro
"
.out&
wait
fi
else
#torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --seed "$seed" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample"-"$pro".out &
wait
if
[
"
$partition
"
=
"ours"
]
&&
[
"
$mem
"
!=
"all_local"
]
;
then
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0.1
--sample_type
"
$sample
"
--probability
"
$pro
"
--memory_type
"
$mem
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0
.01-
"
$mem
"
-
"
$sample
"
-
"
$pro
"
.out &
wait
fi
fi
done
done
fi
done
done
done
for
data
in
"
${
data_param
[@]
}
"
;
do
model
=
"APAN_large"
if
[
"
$data
"
=
"WIKI"
]
||
[
"
$data
"
=
"REDDIT"
]
||
[
"
$data
"
=
"LASTFM"
]
;
then
model
=
"APAN"
#continue
fi
#model="APAN"
mkdir all_
"
$seed
"
/
"
$data
"
mkdir all_
"
$seed
"
/
"
$data
"
/
"
$model
"
mkdir all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/comm
#torchrun --nnodes "$nnodes" --node_rank 0 --nproc-per-node 1 --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition ours --memory_type local --sample_type recent --topk 0 --seed "$seed" > all_"$seed"/"$data"/"$model"/1.out &
wait
for
partition
in
"
${
partition_params
[@]
}
"
;
do
for
sample
in
"
${
sample_type_params
[@]
}
"
;
do
if
[
"
$sample
"
=
"recent"
]
;
then
for
mem
in
"
${
memory_type
[@]
}
"
;
do
if
[
"
$mem
"
=
"historical"
]
;
then
for
ssim
in
"
${
shared_memory_ssim
[@]
}
"
;
do
if
[
"
$partition
"
=
"ours"
]
||
[
"
$partition
"
=
"metis"
]
;
then
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0.1
--sample_type
"
$sample
"
--memory_type
"
$mem
"
--shared_memory_ssim
"
$ssim
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0
.01-
"
$mem
"
-
"
$ssim
"
-
"
$sample
"
.out &
wait
fi
done
elif
[
"
$mem
"
=
"all_reduce"
]
;
then
if
[
"
$partition
"
=
"ours"
]
||
[
"
$partition
"
=
"metis"
]
;
then
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0.1
--sample_type
"
$sample
"
--memory_type
"
$mem
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0
.01-
"
$mem
"
-
"
$sample
"
.out &
wait
fi
else
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0
--sample_type
"
$sample
"
--memory_type
"
$mem
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0-
"
$mem
"
-
"
$sample
"
.out &
wait
if
[
"
$partition
"
=
"ours"
]
&&
[
"
$mem
"
!=
"all_local"
]
;
then
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0.1
--sample_type
"
$sample
"
--memory_type
"
$mem
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0
.01-
"
$mem
"
-
"
$sample
"
.out &
wait
fi
fi
done
else
for
pro
in
"
${
probability_params
[@]
}
"
;
do
for
mem
in
"
${
memory_type
[@]
}
"
;
do
if
[
"
$mem
"
=
"historical"
]
;
then
for
ssim
in
"
${
shared_memory_ssim
[@]
}
"
;
do
if
[
"
$partition
"
=
"ours"
]
||
[
"
$partition
"
=
"metis"
]
;
then
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0.1
--sample_type
"
$sample
"
--probability
"
$pro
"
--memory_type
"
$mem
"
--shared_memory_ssim
"
$ssim
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
_shared-0.01-
"
$mem
"
-
"
$ssim
"
-
"
$sample
"
-
"
$pro
"
.out &
wait
fi
done
elif
[
"
$mem
"
=
"all_reduce"
]
;
then
if
[
"
$partition
"
=
"ours"
]
;
then
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0.1
--sample_type
"
$sample
"
--probability
"
$pro
"
--memory_type
"
$mem
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0
.01-
"
$mem
"
-
"
$sample
"
-
"
$pro
"
.out&
wait
fi
else
#torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --seed "$seed" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample"-"$pro".out &
wait
if
[
"
$partition
"
=
"ours"
]
&&
[
"
$mem
"
!=
"all_local"
]
;
then
torchrun
--nnodes
"
$nnodes
"
--node_rank
"
$node_rank
"
--nproc-per-node
"
$node_per
"
--master-addr
"
$addr
"
--master-port
9445 train_boundery.py
--dataname
"
$data
"
--mode
"
$model
"
--partition
"
$partition
"
--topk
0.1
--sample_type
"
$sample
"
--probability
"
$pro
"
--memory_type
"
$mem
"
--seed
"
$seed
"
>
all_
"
$seed
"
/
"
$data
"
/
"
$model
"
/
"
$partitions
"
-
"
$partition
"
-0
.01-
"
$mem
"
-
"
$sample
"
-
"
$pro
"
.out &
wait
fi
fi
done
done
fi
done
done
done
# for data in "${data_param[@]}"; do
# model="JODI
LE
"
# model="JODI
E_large
"
# if [ "$data" = "WIKI" ] || [ "$data" = "REDDIT" ] || [ "$data" = "LASTFM" ]; then
# model="JODIE"
# #continue
# fi
# #model="APAN"
# mkdir all_"$seed"/"$data"
# mkdir all_"$seed"/"$data"/"$model"
# mkdir all_"$seed"/"$data"/"$model"/comm
# #torchrun --nnodes "$nnodes" --node_rank 0 --nproc-per-node 1 --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition ours --memory_type local --sample_type recent --topk 0 --seed "$seed" > all_"$seed"/"$data"/"$model"/1.out &
# wait
# for partition in "${partition_params[@]}"; do
# for sample in "${sample_type_params[@]}"; do
# if [ "$sample" = "recent" ]; then
# for mem in "${memory_type[@]}"; do
# if [ "$mem" = "historical" ]; then
# for ssim in "${shared_memory_ssim[@]}"; do
# if [ "$partition" = "ours" ] || [ "$partition" = "metis" ]; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --memory_type "$mem" --shared_memory_ssim "$ssim" --seed "$seed" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$ssim"-"$sample".out &
# wait
# fi
# done
# elif [ "$mem" = "all_reduce" ]; then
# if [ "$partition" = "ours" ] || [ "$partition" = "metis" ]; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --memory_type "$mem" --seed "$seed" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample".out &
# wait
# fi
# else
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --memory_type "$mem" --seed "$seed" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample".out &
# wait
# if [ "$partition" = "ours" ] && [ "$mem" != "all_local" ]; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --memory_type "$mem" --seed "$seed" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample".out &
# wait
# fi
# fi
# done
# else
# for pro in "${probability_params[@]}"; do
# for mem in "${memory_type[@]}"; do
# if [ "$mem" = "historical" ]; then
# for ssim in "${shared_memory_ssim[@]}"; do
# if [ "$partition" = "ours" ] || [ "$partition" = "metis" ]; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --shared_memory_ssim "$ssim" --seed "$seed" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$ssim"-"$sample"-"$pro".out &
# wait
# fi
# done
# elif [ "$mem" = "all_reduce" ]; then
# if [ "$partition" = "ours"]; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --seed "$seed" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample"-"$pro".out&
# wait
# fi
# else
# #torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --seed "$seed" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample"-"$pro".out &
# wait
# if [ "$partition" = "ours" ] && [ "$mem" != "all_local" ]; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --seed "$seed" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample"-"$pro".out &
# wait
# fi
# fi
# done
# done
# fi
# done
# done
# done
# for data in "${data_param[@]}"; do
# model="APAN_large"
# if [ "$data" = "WIKI" ] || [ "$data" = "REDDIT" ] || [ "$data" = "LASTFM" ]; then
# model="APAN"
# #continue
# fi
# #model="APAN"
# mkdir all_"$seed"/"$data"
...
...
@@ -249,21 +184,21 @@ done
# for mem in "${memory_type[@]}"; do
# if [ "$mem" = "historical" ]; then
# for ssim in "${shared_memory_ssim[@]}"; do
# if [ "$partition" = "ours" ]; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.
01 --sample_type "$sample" --memory_type "$mem" --shared_memory_ssim "$ssim
" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$ssim"-"$sample".out &
# if [ "$partition" = "ours" ]
|| [ "$partition" = "metis" ]
; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.
1 --sample_type "$sample" --memory_type "$mem" --shared_memory_ssim "$ssim" --seed "$seed
" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$ssim"-"$sample".out &
# wait
# fi
# done
# elif [ "$mem" = "all_reduce" ]; then
# if [ "$partition" = "ours" ]; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.
01 --sample_type "$sample" --memory_type "$mem"
> all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample".out &
# if [ "$partition" = "ours" ]
|| [ "$partition" = "metis" ]
; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.
1 --sample_type "$sample" --memory_type "$mem" --seed "$seed"
> all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample".out &
# wait
# fi
# else
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --memory_type "$mem" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample".out &
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --memory_type "$mem"
--seed "$seed"
> all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample".out &
# wait
# if [ "$partition" = "ours" ] && [ "$mem" != "all_local" ]; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.
01 --sample_type "$sample" --memory_type "$mem
" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample".out &
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.
1 --sample_type "$sample" --memory_type "$mem" --seed "$seed
" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample".out &
# wait
# fi
# fi
...
...
@@ -272,23 +207,22 @@ done
# for pro in "${probability_params[@]}"; do
# for mem in "${memory_type[@]}"; do
# if [ "$mem" = "historical" ]; then
# continue
# # for ssim in "${shared_memory_ssim[@]}"; do
# # if [ "$partition" = "ours" ]; then
# # torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --shared_memory_ssim "$ssim" > all_"$seed"/"$data"/"$partitions"-"$partition"-0.01"$mem"-"$ssim"-"$sample"-"$pro".out &
# # wait
# # fi
# # done
# for ssim in "${shared_memory_ssim[@]}"; do
# if [ "$partition" = "ours" ] || [ "$partition" = "metis" ]; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --shared_memory_ssim "$ssim" --seed "$seed" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"_shared-0.01-"$mem"-"$ssim"-"$sample"-"$pro".out &
# wait
# fi
# done
# elif [ "$mem" = "all_reduce" ]; then
# if [ "$partition" = "ours"]; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.
01 --sample_type "$sample" --probability "$pro" --memory_type "$mem"
> all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample"-"$pro".out&
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.
1 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --seed "$seed"
> all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample"-"$pro".out&
# wait
# fi
# else
#
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --probability "$pro" --memory_type "$mem
" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample"-"$pro".out &
#
#torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --seed "$seed
" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample"-"$pro".out &
# wait
# if [ "$partition" = "ours" ] && [ "$mem" != "all_local" ]; then
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.
01 --sample_type "$sample" --probability "$pro" --memory_type "$mem
" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample"-"$pro".out &
# torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.
1 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --seed "$seed
" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample"-"$pro".out &
# wait
# fi
# fi
...
...
@@ -298,3 +232,69 @@ done
# done
# done
# done
# # for data in "${data_param[@]}"; do
# # model="JODILE"
# # if [ "$data" = "WIKI" ] || [ "$data" = "REDDIT" ] || [ "$data" = "LASTFM" ]; then
# # model="JODIE"
# # fi
# # #model="APAN"
# # mkdir all_"$seed"/"$data"
# # mkdir all_"$seed"/"$data"/"$model"
# # mkdir all_"$seed"/"$data"/"$model"/comm
# # #torchrun --nnodes "$nnodes" --node_rank 0 --nproc-per-node 1 --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition ours --memory_type local --sample_type recent --topk 0 --seed "$seed" > all_"$seed"/"$data"/"$model"/1.out &
# # wait
# # for partition in "${partition_params[@]}"; do
# # for sample in "${sample_type_params[@]}"; do
# # if [ "$sample" = "recent" ]; then
# # for mem in "${memory_type[@]}"; do
# # if [ "$mem" = "historical" ]; then
# # for ssim in "${shared_memory_ssim[@]}"; do
# # if [ "$partition" = "ours" ]; then
# # torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" --shared_memory_ssim "$ssim" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$ssim"-"$sample".out &
# # wait
# # fi
# # done
# # elif [ "$mem" = "all_reduce" ]; then
# # if [ "$partition" = "ours" ]; then
# # torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample".out &
# # wait
# # fi
# # else
# # torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --memory_type "$mem" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample".out &
# # wait
# # if [ "$partition" = "ours" ] && [ "$mem" != "all_local" ]; then
# # torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample".out &
# # wait
# # fi
# # fi
# # done
# # else
# # for pro in "${probability_params[@]}"; do
# # for mem in "${memory_type[@]}"; do
# # if [ "$mem" = "historical" ]; then
# # continue
# # # for ssim in "${shared_memory_ssim[@]}"; do
# # # if [ "$partition" = "ours" ]; then
# # # torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --shared_memory_ssim "$ssim" > all_"$seed"/"$data"/"$partitions"-"$partition"-0.01"$mem"-"$ssim"-"$sample"-"$pro".out &
# # # wait
# # # fi
# # # done
# # elif [ "$mem" = "all_reduce" ]; then
# # if [ "$partition" = "ours"]; then
# # torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample"-"$pro".out&
# # wait
# # fi
# # else
# # torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample"-"$pro".out &
# # wait
# # if [ "$partition" = "ours" ] && [ "$mem" != "all_local" ]; then
# # torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all_"$seed"/"$data"/"$model"/"$partitions"-"$partition"-0.01-"$mem"-"$sample"-"$pro".out &
# # wait
# # fi
# # fi
# # done
# # done
# # fi
# # done
# # done
# # done
examples/train_boundery.py
View file @
e074b837
...
...
@@ -202,7 +202,7 @@ def main():
else
:
graph
,
full_sampler_graph
,
train_mask
,
val_mask
,
test_mask
,
full_train_mask
,
cache_route
=
load_from_speed
(
args
.
dataname
,
seed
=
123457
,
top
=
args
.
topk
,
sampler_graph_add_rev
=
True
,
feature_device
=
torch
.
device
(
'cuda:{}'
.
format
(
ctx
.
local_rank
)),
partition
=
args
.
partition
)
#torch.device('cpu'))
if
(
args
.
dataname
==
'GDELT'
):
train_param
[
'epoch'
]
=
1
train_param
[
'epoch'
]
=
2
#torch.autograd.set_detect_anomaly(True)
# 确保 CUDA 可用
if
torch
.
cuda
.
is_available
():
...
...
@@ -295,7 +295,7 @@ def main():
mode
=
'train'
,
queue_size
=
200
,
mailbox
=
mailbox
,
is_pipeline
=
Fals
e
,
is_pipeline
=
Tru
e
,
use_local_feature
=
False
,
device
=
torch
.
device
(
'cuda:{}'
.
format
(
local_rank
)),
probability
=
args
.
probability
,
...
...
@@ -554,7 +554,8 @@ def main():
optimizer
.
zero_grad
()
ones
=
torch
.
ones
(
metadata
[
'dst_neg_index'
]
.
shape
[
0
],
device
=
model
.
device
,
dtype
=
torch
.
float
)
pred_pos
,
pred_neg
=
model
(
mfgs
,
metadata
,
neg_samples
=
args
.
neg_samples
,
async_param
=
param
)
time_count
.
time_backward
+=
time_count
.
elapsed_event
(
t1
)
t2
=
time_count
.
start_gpu
()
#print(time_count.elapsed_event(t2))
loss
=
creterion
(
pred_pos
,
torch
.
ones_like
(
pred_pos
))
if
args
.
local_neg_sample
is
False
:
...
...
@@ -569,16 +570,18 @@ def main():
#torch.cuda.synchronize()
loss
.
backward
()
optimizer
.
step
()
time_count
.
time_forward
+=
time_count
.
elapsed_event
(
t
1
)
time_count
.
time_forward
+=
time_count
.
elapsed_event
(
t
2
)
#torch.cuda.synchronize()
## train aps
#y_pred = torch.cat([pred_pos, pred_neg], dim=0).sigmoid().cpu()
#y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0)
#train_aps.append(average_precision_score(y_true, y_pred.detach().numpy()))
#torch.cuda.synchronize()
t3
=
time_count
.
start_gpu
()
mailbox
.
update_shared
()
mailbox
.
update_p2p_mem
()
mailbox
.
update_p2p_mail
()
time_count
.
time_memory_sync
+=
time_count
.
elapsed_event
(
t3
)
#start = time_count.start_gpu()
#torch.cuda.empty_cache()
...
...
starrygl/sample/data_loader.py
View file @
e074b837
...
...
@@ -152,6 +152,8 @@ class DistributedDataLoader:
self
.
local_root
=
0
self
.
probability
=
probability
print
(
'pro {}
\n
'
.
format
(
self
.
probability
))
self
.
time_count
=
[]
def
__iter__
(
self
):
if
self
.
chunk_size
is
None
:
...
...
@@ -255,6 +257,7 @@ class DistributedDataLoader:
return
while
(
len
(
self
.
result_queue
)
==
0
):
pass
t0
=
tt
.
start_gpu
()
batch_data
,
dist_nid
,
dist_eid
=
self
.
result_queue
[
0
]
.
result
()
b
=
batch_data
[
1
][
0
][
0
]
self
.
remote_node
+=
(
DistIndex
(
dist_nid
)
.
part
!=
dist
.
get_rank
())
.
sum
()
.
item
()
...
...
@@ -268,6 +271,8 @@ class DistributedDataLoader:
#end = torch.cuda.Event(enable_timing=True)
#start.record()
stream
.
synchronize
()
tt
.
time_sample_and_build
+=
tt
.
elapsed_event
(
t0
)
t1
=
tt
.
start_gpu
()
#end.record()
#end.synchronize()
#print(start.elapsed_time(end))
...
...
@@ -287,8 +292,11 @@ class DistributedDataLoader:
edge_feat
=
None
t3
=
time
.
time
()
self
.
result_queue
.
append
((
batch_data
,
dist_nid
,
dist_eid
,
edge_feat
,
node_feat
))
tt
.
time_memory_fetch
+=
tt
.
elapsed_event
(
t1
)
t1
=
tt
.
start_gpu
()
self
.
submit
()
tt
.
time_sample_and_build
+=
tt
.
elapsed_event
(
t1
)
@torch.no_grad
()
def
__next__
(
self
):
ctx
=
DistributedContext
.
get_default_context
()
...
...
@@ -333,6 +341,7 @@ class DistributedDataLoader:
raise
StopIteration
else
:
if
self
.
recv_idxs
==
0
:
t0
=
tt
.
start_gpu
()
data
=
self
.
_next_data
()
batch_data
,
dist_nid
,
dist_eid
=
graph_sample
(
self
.
graph
,
...
...
@@ -346,17 +355,21 @@ class DistributedDataLoader:
)
edge_feat
=
get_edge_feature_by_dist
(
self
.
graph
,
dist_eid
,
is_local
,
out_device
=
self
.
device
)
node_feat
,
mem
=
get_node_feature_by_dist
(
self
.
graph
,
self
.
mailbox
,
dist_nid
,
is_local
,
out_device
=
self
.
device
)
t_sample
=
tt
.
elapsed_event
(
t0
)
tt
.
time_sample_and_build
+=
t_sample
t1
=
tt
.
start_gpu
()
prepare_input
(
node_feat
,
edge_feat
,
mem
,
batch_data
[
1
],
dist_nid
,
dist_eid
)
if
(
self
.
mailbox
is
not
None
and
self
.
mailbox
.
historical_cache
is
not
None
):
batch_data
[
1
][
0
][
0
]
.
srcdata
[
'his_mem'
]
=
batch_data
[
1
][
0
][
0
]
.
srcdata
[
'mem'
]
.
clone
()
batch_data
[
1
][
0
][
0
]
.
srcdata
[
'his_ts'
]
=
batch_data
[
1
][
0
][
0
]
.
srcdata
[
'mail_ts'
]
.
clone
()
tt
.
time_memory_fetch
+=
tt
.
elapsed_event
(
t1
)
#if(self.mailbox is not None and self.mailbox.historical_cache is not None):
# id = batch_data[1][0][0].srcdata['ID']
# mask = DistIndex(id).is_shared
#batch_data[1][0][0].srcdata['mem'][mask] = self.mailbox.historical_cache.local_historical_data[DistIndex(id).loc[mask]]
self
.
recv_idxs
+=
1
else
:
t0
=
time_count
.
start_gpu
()
if
(
self
.
recv_idxs
<
self
.
expected_idx
):
assert
len
(
self
.
result_queue
)
>
0
#print(len(self.result_queue[0]))
...
...
@@ -383,6 +396,7 @@ class DistributedDataLoader:
node_feat0
=
node_feat0
[
0
]
node_feat
=
None
mem
=
self
.
mailbox
.
unpack
(
node_feat0
,
mailbox
=
True
)
time_count
.
time_memory_fetch
+=
time_count
.
elapsed_event
(
t0
)
#print(node_feat.shape,edge_feat.shape,mem[0].shape)
#node_feat[1].wait()
#node_feat = node_feat[0]
...
...
@@ -417,7 +431,9 @@ class DistributedDataLoader:
global
executor
if
(
len
(
self
.
result_queue
)
==
0
):
#if(self.recv_idxs+1<=self.expected_idx):
t0
=
tt
.
start_gpu
()
self
.
submit
()
time_count
.
time_sample_and_build
=
tt
.
elapsed_event
(
t0
)
"""
graph_sample(
graph=self.graph,
...
...
starrygl/sample/memory/shared_mailbox.py
View file @
e074b837
...
...
@@ -514,8 +514,8 @@ class SharedMailBox():
self
.
update_shared
()
self
.
update_p2p_mail
()
self
.
update_p2p_mem
()
self
.
handle_last_async
()
self
.
sychronize_shared
()
#
self.handle_last_async()
#
self.sychronize_shared()
#self.historical_cache.add_shared_to_queue(handle0,handle1,shared_id_list,shared_list)
"""
shared_memory = self.node_memory.accessor.data[self.shared_nodes_index]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment