fix smooth aggregation

9359ae3b · zlj · ff19c482 · 9359ae3b · 9359ae3b
Commit 9359ae3b authored Oct 05, 2024 by zlj
Hide whitespace changes
Inline Side-by-side

Showing with 78 additions and 80 deletions

examples/test_all.sh
+76 -78

examples/train_boundery.py
+2 -2

No files found.
--- a/examples/test_all.sh
+++ b/examples/test_all.sh
 #!/bin/bash

 # 定义数组变量
-addr="192.168.1.105"
+addr="192.168.1.107"
 partition_params=("ours" )
 #"metis" "ldg" "random")
 #("ours" "metis" "ldg" "random")
@@ -9,17 +9,16 @@ partitions="4"
 node_per="4"
 nnodes="1"
 node_rank="0"
-probability_params=("1" "0.5" "0.1" "0.05" "0.01" "0")
-sample_type_params=("recent" "boundery_recent_decay" "boundery_recent_uniform")
+probability_params=("0.1" "0.05" "0.01" "0")
+sample_type_params=("boundery_recent_decay")
 #sample_type_params=("recent" "boundery_recent_decay") #"boundery_recent_uniform")
-sample_type_params=("recent")
 #memory_type=("all_update" "p2p" "all_reduce" "historical" "local")
-memory_type=("all_update" "local")
-#"historical" "all_update") #"local" "historical")
+#memory_type=("all_update" "historical" "local")
+memory_type=("historical")
 #memory_type=("local" "all_update" "historical" "all_reduce")
-shared_memory_ssim=("0" "0.3" "0.5" "0.7")
+shared_memory_ssim=("0.3" "0.7")
 #data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk")
-data_param=("WIKI" "LASTFM")
+data_param=("LASTFM" "WikiTalk" "StackOverflow" "GDELT" "TaoBao")
 #data_param=("WIKI" "REDDIT" "LASTFM" "DGraphFin" "WikiTalk" "StackOverflow")
 #data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk" "StackOverflow")
 #data_param=("REDDIT" "WikiTalk")
@@ -28,7 +27,7 @@ mkdir -p all

 # 遍历数组并执行命令
 for data in "${data_param[@]}"; do
-    model="TGN"
+    model="TGN_large"
    if [ "$data" = "WIKI" ] || [ "$data" = "REDDIT" ] || [ "$data" = "LASTFM" ]; then
        model="TGN"
    fi
@@ -45,6 +44,74 @@ for data in "${data_param[@]}"; do
                    if [ "$mem" = "historical" ]; then
                        for ssim in "${shared_memory_ssim[@]}"; do
                            if [ "$partition" = "ours" ]; then
+                                torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out &
+                                wait
+                            fi
+                        done
+                    elif [ "$mem" = "all_reduce" ]; then
+                        if [ "$partition" = "ours" ]; then
+                            torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --memory_type "$mem"  > all/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$sample".out &
+                            wait
+                        fi
+                    else
+                        torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample".out &
+                        wait
+                        if [ "$partition" = "ours" ] && [ "$mem" != "all_local" ]; then
+                            torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$sample".out &
+                            wait
+                        fi
+                    fi
+                done
+            else
+                for pro in "${probability_params[@]}"; do
+                    for mem in "${memory_type[@]}"; do
+                        if [ "$mem" = "historical" ]; then
+                            for ssim in "${shared_memory_ssim[@]}"; do
+                                 if [ "$partition" = "ours" ]; then
+                                     torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$model"/"$partitions"-ours_shared-0.01"$mem"-"$ssim"-"$sample"-"$pro".out &
+                                     wait
+                                 fi
+                             done
+                        elif [ "$mem" = "all_reduce" ]; then
+                            if [ "$partition" = "ours" ]; then  # all_reduce runs are launched only for the "ours" partition; `[` needs `]` as its own word
+                                torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --probability "$pro" --memory_type "$mem"  > all/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$sample"-"$pro".out&
+                                wait
+                            fi
+                        else
+                            torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample"-"$pro".out &
+                            wait
+                            if [ "$partition" = "ours" ] && [ "$mem" != "all_local" ]; then
+                                torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$sample"-"$pro".out &
+                                wait
+                            fi
+                        fi
+                    done
+                done
+            fi
+        done
+    done
+done
+
+
+
+for data in "${data_param[@]}"; do
+    model="JODILE"
+    if [ "$data" = "WIKI" ] || [ "$data" = "REDDIT" ] || [ "$data" = "LASTFM" ]; then
+        model="JODIE"
+    fi
+    #model="APAN"
+    mkdir all/"$data"
+    mkdir all/"$data"/"$model"
+    mkdir all/"$data"/"$model"/comm
+    #torchrun --nnodes "$nnodes" --node_rank 0 --nproc-per-node 1 --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition ours --memory_type local --sample_type recent --topk 0 > all/"$data"/"$model"/1.out &
+    wait
+    for partition in "${partition_params[@]}"; do
+        for sample in "${sample_type_params[@]}"; do
+            if [ "$sample" = "recent" ]; then
+                for mem in "${memory_type[@]}"; do
+                    if [ "$mem" = "historical" ]; then
+                        for ssim in "${shared_memory_ssim[@]}"; do
+                            if [ "$partition" = "ours" ]; then
                                torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out &
                                wait
                            fi
@@ -93,72 +160,3 @@ for data in "${data_param[@]}"; do
        done
    done
 done
-
-
-
-# for data in "${data_param[@]}"; do
-#     model="JODILE"
-#     if [ "$data" = "WIKI" ] || [ "$data" = "REDDIT" ] || [ "$data" = "LASTFM" ]; then
-#         model="JODIE"
-#     fi
-#     #model="APAN"
-#     mkdir all/"$data"
-#     mkdir all/"$data"/"$model"
-#     mkdir all/"$data"/"$model"/comm
-#     #torchrun --nnodes "$nnodes" --node_rank 0 --nproc-per-node 1 --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition ours --memory_type local --sample_type recent --topk 0 > all/"$data"/"$model"/1.out &
-#     wait
-#     for partition in "${partition_params[@]}"; do
-#         for sample in "${sample_type_params[@]}"; do
-#             if [ "$sample" = "recent" ]; then
-#                 for mem in "${memory_type[@]}"; do
-#                     if [ "$mem" = "historical" ]; then
-#                         for ssim in "${shared_memory_ssim[@]}"; do
-#                             if [ "$partition" = "ours" ]; then
-#                                 torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out &
-#                                 wait
-#                             fi
-#                         done
-#                     elif [ "$mem" = "all_reduce" ]; then
-#                         if [ "$partition" = "ours" ]; then
-#                             torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem"  > all/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$sample".out &
-#                             wait
-#                         fi
-#                     else
-#                         torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample".out &
-#                         wait
-#                         if [ "$partition" = "ours" ] && [ "$mem" != "all_local" ]; then
-#                             torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$sample".out &
-#                             wait
-#                         fi
-#                     fi
-#                 done
-#             else
-#                 for pro in "${probability_params[@]}"; do
-#                     for mem in "${memory_type[@]}"; do
-#                         if [ "$mem" = "historical" ]; then
-#                             continue
-#                             # for ssim in "${shared_memory_ssim[@]}"; do
-#                             #     if [ "$partition" = "ours" ]; then
-#                             #         torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$partitions"-ours_shared-0.01"$mem"-"$ssim"-"$sample"-"$pro".out &
-#                             #         wait
-#                             #     fi
-#                             # done
-#                         elif [ "$mem" = "all_reduce" ]; then
-#                             if [ "$partition" = "ours"]; then
-#                                 torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem"  > all/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$sample"-"$pro".out&
-#                                 wait
-#                             fi
-#                         else
-#                             torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$model"/"$partitions"-"$partition"-0-"$mem"-"$sample"-"$pro".out &
-#                             wait
-#                             if [ "$partition" = "ours" ] && [ "$mem" != "all_local" ]; then
-#                                 torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$sample"-"$pro".out &
-#                                 wait
-#                             fi
-#                         fi
-#                     done
-#                 done
-#             fi
-#         done
-#     done
-# done
--- a/examples/train_boundery.py
+++ b/examples/train_boundery.py
@@ -63,7 +63,7 @@ parser.add_argument('--partition', default='part', type=str, metavar='W',
                    help='name of model')
 parser.add_argument('--topk', default='0', type=str, metavar='W',
                    help='name of model')
-parser.add_argument('--probability', default=0.1, type=float, metavar='W',
+parser.add_argument('--probability', default=1, type=float, metavar='W',
                    help='name of model')
 parser.add_argument('--sample_type', default='recent', type=str, metavar='W',
                    help='name of model')
@@ -653,7 +653,7 @@ def main():
            break
        else:
            print('\ttrain loss:{:.4f}  train ap:{:4f}  val ap:{:4f}  val auc:{:4f} test ap {:4f} test auc{:4f}\n'.format(total_loss,train_ap, ap, auc,test_ap,test_auc))
-            print('\ttotal time:{:.2f}s  prep time:{:.2f}s\n test time {:.2f}'.format(time.time()-epoch_start_time, time_prep),t_test)    
+            print('\ttotal time:{:.2f}s  prep time:{:.2f}s\n test time {:.2f}'.format(time.time()-epoch_start_time, time_prep,t_test))    
            torch.save(model.module.state_dict(), get_checkpoint_path(e))
        if args.model == 'TGN':
            print('weight {} {}\n'.format(tt.weight_count_local,tt.weight_count_remote))