Commit ef367556 by zhlj

Fix bugs in JODIE and APAN

parents 4927a9e0 ce4b726f
# Introduction
\ No newline at end of file
@@ -22,7 +22,7 @@ gnn:
 train:
 - epoch: 100
   batch_size: 1000
-  lr: 0.0001
+  lr: 0.0002
   dropout: 0.1
   att_dropout: 0.1
   # all_on_gpu: True
\ No newline at end of file
LOCAL RANK 0, RANK0
use cuda on 0
9228
get_neighbors consume: 0.0103759s
Epoch 0:
train loss:377.5712 train ap:0.903848 val ap:0.886584 val auc:0.904656
total time:11.40s prep time:9.88s
fetch time:0.00s write back time:0.00s
Epoch 1:
train loss:329.1190 train ap:0.920000 val ap:0.885216 val auc:0.904735
total time:11.32s prep time:9.79s
fetch time:0.00s write back time:0.00s
Epoch 2:
train loss:316.1359 train ap:0.924376 val ap:0.895123 val auc:0.912622
total time:11.49s prep time:9.95s
fetch time:0.00s write back time:0.00s
Epoch 3:
train loss:311.4889 train ap:0.926138 val ap:0.893922 val auc:0.912589
total time:11.50s prep time:9.97s
fetch time:0.00s write back time:0.00s
Epoch 4:
train loss:302.2057 train ap:0.929684 val ap:0.889695 val auc:0.909766
total time:11.48s prep time:9.95s
fetch time:0.00s write back time:0.00s
Epoch 5:
train loss:300.2464 train ap:0.931034 val ap:0.897774 val auc:0.916421
total time:11.48s prep time:9.95s
fetch time:0.00s write back time:0.00s
Epoch 6:
train loss:293.5465 train ap:0.934657 val ap:0.896159 val auc:0.914983
total time:11.55s prep time:10.02s
fetch time:0.00s write back time:0.00s
Epoch 7:
train loss:285.9396 train ap:0.937834 val ap:0.905351 val auc:0.922268
total time:11.52s prep time:9.99s
fetch time:0.00s write back time:0.00s
Epoch 8:
train loss:281.7048 train ap:0.941035 val ap:0.909690 val auc:0.924262
total time:11.51s prep time:9.98s
fetch time:0.00s write back time:0.00s
Epoch 9:
train loss:273.8330 train ap:0.945250 val ap:0.913860 val auc:0.928068
total time:11.56s prep time:10.00s
fetch time:0.00s write back time:0.00s
Epoch 10:
train loss:268.6164 train ap:0.947141 val ap:0.917379 val auc:0.930309
total time:11.77s prep time:10.19s
fetch time:0.00s write back time:0.00s
Epoch 11:
train loss:265.0121 train ap:0.949457 val ap:0.918648 val auc:0.931452
total time:11.62s prep time:10.08s
fetch time:0.00s write back time:0.00s
Epoch 12:
train loss:255.6320 train ap:0.953506 val ap:0.919272 val auc:0.932783
total time:11.50s prep time:9.98s
fetch time:0.00s write back time:0.00s
Epoch 13:
train loss:252.6296 train ap:0.954798 val ap:0.924649 val auc:0.936515
total time:11.50s prep time:9.96s
fetch time:0.00s write back time:0.00s
Epoch 14:
train loss:248.4476 train ap:0.956243 val ap:0.925952 val auc:0.938199
total time:11.53s prep time:10.00s
fetch time:0.00s write back time:0.00s
Epoch 15:
train loss:243.4459 train ap:0.958749 val ap:0.929440 val auc:0.940865
total time:11.54s prep time:10.01s
fetch time:0.00s write back time:0.00s
Epoch 16:
train loss:238.6286 train ap:0.960667 val ap:0.936339 val auc:0.946161
total time:17.48s prep time:15.12s
fetch time:0.00s write back time:0.00s
Epoch 17:
train loss:234.5283 train ap:0.961787 val ap:0.933828 val auc:0.944680
total time:18.09s prep time:15.69s
fetch time:0.00s write back time:0.00s
Epoch 18:
train loss:227.3527 train ap:0.964591 val ap:0.932110 val auc:0.943765
total time:18.17s prep time:15.46s
fetch time:0.00s write back time:0.00s
Epoch 19:
train loss:223.7772 train ap:0.965486 val ap:0.937780 val auc:0.947312
total time:17.80s prep time:15.43s
fetch time:0.00s write back time:0.00s
Epoch 20:
train loss:221.9428 train ap:0.966139 val ap:0.938104 val auc:0.948022
total time:18.31s prep time:15.82s
fetch time:0.00s write back time:0.00s

mkdir: cannot create directory ‘all_12347/WIKI’: File exists
mkdir: cannot create directory ‘all_12347/WIKI/JODIE’: File exists
mkdir: cannot create directory ‘all_12347/WIKI/JODIE/comm’: File exists
[2024-11-04 12:37:21,971] torch.distributed.run: [WARNING] *****************************************
[2024-11-04 12:37:21,971] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
[2024-11-04 12:37:21,971] torch.distributed.run: [WARNING] *****************************************
libibverbs: Warning: couldn't load driver 'libefa-rdmav34.so': libefa-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libqedr-rdmav34.so': libqedr-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libmthca-rdmav34.so': libmthca-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libsiw-rdmav34.so': libsiw-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav34.so': libbnxt_re-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav34.so': libipathverbs-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav34.so': libmlx4-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libhns-rdmav34.so': libhns-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav34.so': libocrdma-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'librxe-rdmav34.so': librxe-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav34.so': libhfi1verbs-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav34.so': libcxgb4-rdmav34.so: cannot open shared object file: No such file or directory
Traceback (most recent call last):
  File "/home/zlj/BTS-MTGNN/examples/train_boundery.py", line 700, in <module>
    main()
  File "/home/zlj/BTS-MTGNN/examples/train_boundery.py", line 484, in main
    model.module.memory_updater.empty_cache()
  File "/home/zlj/BTS-MTGNN/starrygl/module/memorys.py", line 533, in empty_cache
    self.filter.clear()
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1695, in __getattr__
    raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
AttributeError: 'AsyncMemeoryUpdater' object has no attribute 'filter'
[W tensorpipe_agent.cpp:725] RPC agent for worker2 encountered error when reading incoming request from worker0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[2024-11-04 12:37:31,980] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 538187) of binary: /home/zlj/.miniconda3/envs/tgnn_3.10/bin/python
Traceback (most recent call last):
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==2.1.1+cu118', 'console_scripts', 'torchrun')())
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
    run(args)
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
    elastic_launch(
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
train_boundery.py FAILED
------------------------------------------------------------
Failures:
[1]:
  time      : 2024-11-04_12:37:31
  host      : gpu05
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 538188)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
  time      : 2024-11-04_12:37:31
  host      : gpu05
  rank      : 2 (local_rank: 2)
  exitcode  : 1 (pid: 538189)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
  time      : 2024-11-04_12:37:31
  host      : gpu05
  rank      : 3 (local_rank: 3)
  exitcode  : 1 (pid: 538190)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-11-04_12:37:31
  host      : gpu05
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 538187)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
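The repeated AttributeError: 'AsyncMemeoryUpdater' object has no attribute 'filter' raised in empty_cache happens because torch.nn.Module overrides __getattr__ to look only in its registered parameters, buffers, and submodules, so any attribute never assigned in __init__ raises. A minimal pure-Python sketch of the mechanism and of one kind of defensive guard (ModuleLike is a simplified stand-in for nn.Module, and the guard is illustrative, not necessarily the fix this commit applies):

```python
# Simplified stand-in for torch.nn.Module: its __getattr__ only consults the
# registered parameter/buffer/submodule dicts and raises for anything else.
class ModuleLike:
    def __getattr__(self, name):
        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{name}'")

class AsyncMemeoryUpdater(ModuleLike):  # name as spelled in the traceback
    def empty_cache(self):
        # Defensive guard: some updater variants never create a filter, so
        # clear it only when it actually exists on this instance.
        f = getattr(self, "filter", None)
        if f is not None:
            f.clear()

u = AsyncMemeoryUpdater()
u.empty_cache()  # no longer raises, even though 'filter' was never set
```

The same `getattr(self, "filter", None)` guard works on a real nn.Module subclass, because `getattr` with a default swallows the AttributeError that nn.Module.__getattr__ raises.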
Epoch 21:
train loss:216.8870 train ap:0.968285 val ap:0.942088 val auc:0.950660
total time:18.14s prep time:15.48s
fetch time:0.00s write back time:0.00s
Epoch 22:
train loss:213.5077 train ap:0.968911 val ap:0.944023 val auc:0.951869
total time:18.09s prep time:15.56s
fetch time:0.00s write back time:0.00s
Epoch 23:
train loss:210.1412 train ap:0.970743 val ap:0.944840 val auc:0.952554
total time:17.74s prep time:15.47s
fetch time:0.00s write back time:0.00s
Epoch 24:
train loss:208.9109 train ap:0.971101 val ap:0.944029 val auc:0.952720
total time:18.47s prep time:15.73s
fetch time:0.00s write back time:0.00s
Epoch 25:
train loss:207.5198 train ap:0.970606 val ap:0.944518 val auc:0.952912
total time:17.97s prep time:15.66s
fetch time:0.00s write back time:0.00s
Epoch 26:
train loss:203.6585 train ap:0.971611 val ap:0.940218 val auc:0.949371
total time:17.70s prep time:15.42s
fetch time:0.00s write back time:0.00s
Epoch 27:
train loss:203.3531 train ap:0.972317 val ap:0.949000 val auc:0.956595
total time:18.01s prep time:15.33s
fetch time:0.00s write back time:0.00s
Epoch 28:
train loss:198.1525 train ap:0.973525 val ap:0.948420 val auc:0.955604
total time:17.78s prep time:15.31s
fetch time:0.00s write back time:0.00s
Epoch 29:
train loss:197.6365 train ap:0.973818 val ap:0.944911 val auc:0.953313
total time:17.74s prep time:15.49s
fetch time:0.00s write back time:0.00s
Epoch 30:
train loss:197.7800 train ap:0.973573 val ap:0.950356 val auc:0.958595
total time:18.24s prep time:15.60s
fetch time:0.00s write back time:0.00s
Epoch 31:
train loss:194.4391 train ap:0.974730 val ap:0.952775 val auc:0.959729
total time:17.84s prep time:15.23s
fetch time:0.00s write back time:0.00s
Epoch 32:
train loss:190.1150 train ap:0.976038 val ap:0.953111 val auc:0.959360
total time:17.72s prep time:15.46s
fetch time:0.00s write back time:0.00s
Epoch 33:
train loss:185.7417 train ap:0.976925 val ap:0.954769 val auc:0.961057
total time:18.04s prep time:15.56s
fetch time:0.00s write back time:0.00s
Epoch 34:
train loss:189.0004 train ap:0.976267 val ap:0.954641 val auc:0.961198
total time:17.89s prep time:15.12s
fetch time:0.00s write back time:0.00s
Epoch 35:
train loss:185.4487 train ap:0.977420 val ap:0.954675 val auc:0.960969
total time:17.65s prep time:15.13s
fetch time:0.00s write back time:0.00s
Epoch 36:
train loss:185.9187 train ap:0.977260 val ap:0.955284 val auc:0.961039
total time:17.67s prep time:15.36s
fetch time:0.00s write back time:0.00s
Epoch 37:
train loss:184.6686 train ap:0.977626 val ap:0.955124 val auc:0.961923
total time:17.90s prep time:15.42s
fetch time:0.00s write back time:0.00s
Epoch 38:
train loss:183.1190 train ap:0.977930 val ap:0.956069 val auc:0.962114
total time:18.10s prep time:15.26s
fetch time:0.00s write back time:0.00s
Epoch 39:
train loss:179.3445 train ap:0.978350 val ap:0.958382 val auc:0.963833
total time:18.05s prep time:15.60s
fetch time:0.00s write back time:0.00s
Epoch 40:
train loss:174.6380 train ap:0.980014 val ap:0.956793 val auc:0.963013
total time:18.28s prep time:15.77s
fetch time:0.00s write back time:0.00s
Epoch 41:
train loss:178.2737 train ap:0.979067 val ap:0.958580 val auc:0.964227
total time:18.24s prep time:15.51s
fetch time:0.00s write back time:0.00s
Epoch 42:
train loss:175.7294 train ap:0.979611 val ap:0.960288 val auc:0.965754
total time:17.98s prep time:15.62s
fetch time:0.00s write back time:0.00s

mkdir: cannot create directory ‘all_12347/LASTFM’: File exists
mkdir: cannot create directory ‘all_12347/LASTFM/JODIE’: File exists
mkdir: cannot create directory ‘all_12347/LASTFM/JODIE/comm’: File exists
[2024-11-04 12:37:33,221] torch.distributed.run: [WARNING] *****************************************
[2024-11-04 12:37:33,221] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
[2024-11-04 12:37:33,221] torch.distributed.run: [WARNING] *****************************************
libibverbs: Warning: couldn't load driver 'libefa-rdmav34.so': libefa-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libqedr-rdmav34.so': libqedr-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libmthca-rdmav34.so': libmthca-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libsiw-rdmav34.so': libsiw-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav34.so': libbnxt_re-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav34.so': libipathverbs-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav34.so': libmlx4-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libhns-rdmav34.so': libhns-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav34.so': libocrdma-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'librxe-rdmav34.so': librxe-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav34.so': libhfi1verbs-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav34.so': libcxgb4-rdmav34.so: cannot open shared object file: No such file or directory
Traceback (most recent call last):
  File "/home/zlj/BTS-MTGNN/examples/train_boundery.py", line 700, in <module>
    main()
  File "/home/zlj/BTS-MTGNN/examples/train_boundery.py", line 484, in main
    model.module.memory_updater.empty_cache()
  File "/home/zlj/BTS-MTGNN/starrygl/module/memorys.py", line 533, in empty_cache
    self.filter.clear()
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1695, in __getattr__
    raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
AttributeError: 'AsyncMemeoryUpdater' object has no attribute 'filter'
[W tensorpipe_agent.cpp:725] RPC agent for worker0 encountered error when reading incoming request from worker2: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:725] RPC agent for worker0 encountered error when reading incoming request from worker3: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[2024-11-04 12:37:48,237] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 538841) of binary: /home/zlj/.miniconda3/envs/tgnn_3.10/bin/python
Traceback (most recent call last):
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==2.1.1+cu118', 'console_scripts', 'torchrun')())
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
    run(args)
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
    elastic_launch(
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
train_boundery.py FAILED
------------------------------------------------------------
Failures:
[1]:
  time      : 2024-11-04_12:37:48
  host      : gpu05
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 538842)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[2]:
  time      : 2024-11-04_12:37:48
  host      : gpu05
  rank      : 2 (local_rank: 2)
  exitcode  : 1 (pid: 538843)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
[3]:
  time      : 2024-11-04_12:37:48
  host      : gpu05
  rank      : 3 (local_rank: 3)
  exitcode  : 1 (pid: 538844)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-11-04_12:37:48
  host      : gpu05
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 538841)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
mkdir: cannot create directory ‘all_12347/WikiTalk’: File exists
mkdir: cannot create directory ‘all_12347/WikiTalk/JODIE’: File exists
mkdir: cannot create directory ‘all_12347/WikiTalk/JODIE/comm’: File exists
[2024-11-04 12:37:49,479] torch.distributed.run: [WARNING]
Epoch 43: [2024-11-04 12:37:49,479] torch.distributed.run: [WARNING] *****************************************
train loss:173.2326 train ap:0.980324 val ap:0.960428 val auc:0.965867 [2024-11-04 12:37:49,479] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
[2024-11-04 12:37:49,479] torch.distributed.run: [WARNING] *****************************************
total time:18.21s prep time:15.60s libibverbs: Warning: couldn't load driver 'libefa-rdmav34.so': libefa-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libefa-rdmav34.so': libefa-rdmav34.so: cannot open shared object file: No such file or directory
fetch time:0.00s write back time:0.00s libibverbs: Warning: couldn't load driver 'libefa-rdmav34.so': libefa-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libefa-rdmav34.so': libefa-rdmav34.so: cannot open shared object file: No such file or directory
Epoch 44: libibverbs: Warning: couldn't load driver 'libqedr-rdmav34.so': libqedr-rdmav34.so: cannot open shared object file: No such file or directory
train loss:172.3492 train ap:0.980196 val ap:0.962143 val auc:0.966774 libibverbs: Warning: couldn't load driver 'libqedr-rdmav34.so': libqedr-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libqedr-rdmav34.so': libqedr-rdmav34.so: cannot open shared object file: No such file or directory
total time:18.35s prep time:15.80s libibverbs: Warning: couldn't load driver 'libqedr-rdmav34.so': libqedr-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libmthca-rdmav34.so': libmthca-rdmav34.so: cannot open shared object file: No such file or directory
fetch time:0.00s write back time:0.00s libibverbs: Warning: couldn't load driver 'libmthca-rdmav34.so': libmthca-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libmthca-rdmav34.so': libmthca-rdmav34.so: cannot open shared object file: No such file or directory
Epoch 45: libibverbs: Warning: couldn't load driver 'libmthca-rdmav34.so': libmthca-rdmav34.so: cannot open shared object file: No such file or directory
train loss:168.8601 train ap:0.981180 val ap:0.963014 val auc:0.968132 libibverbs: Warning: couldn't load driver 'libsiw-rdmav34.so': libsiw-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libsiw-rdmav34.so': libsiw-rdmav34.so: cannot open shared object file: No such file or directory
total time:17.73s prep time:15.50s libibverbs: Warning: couldn't load driver 'libsiw-rdmav34.so': libsiw-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libsiw-rdmav34.so': libsiw-rdmav34.so: cannot open shared object file: No such file or directory
fetch time:0.00s write back time:0.00s libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav34.so': libbnxt_re-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav34.so': libbnxt_re-rdmav34.so: cannot open shared object file: No such file or directory
Epoch 46: libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav34.so': libbnxt_re-rdmav34.so: cannot open shared object file: No such file or directory
train loss:169.5997 train ap:0.981473 val ap:0.961124 val auc:0.966405 libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav34.so': libbnxt_re-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav34.so': libipathverbs-rdmav34.so: cannot open shared object file: No such file or directory
total time:13.20s prep time:11.67s libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav34.so': libipathverbs-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav34.so': libipathverbs-rdmav34.so: cannot open shared object file: No such file or directory
fetch time:0.00s write back time:0.00s libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav34.so': libipathverbs-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav34.so': libmlx4-rdmav34.so: cannot open shared object file: No such file or directory
Epoch 47: libibverbs: Warning: couldn't load driver 'libmlx4-rdmav34.so': libmlx4-rdmav34.so: cannot open shared object file: No such file or directory
train loss:167.5232 train ap:0.981394 val ap:0.961333 val auc:0.966534 libibverbs: Warning: couldn't load driver 'libmlx4-rdmav34.so': libmlx4-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav34.so': libmlx4-rdmav34.so: cannot open shared object file: No such file or directory
total time:11.49s prep time:9.96s libibverbs: Warning: couldn't load driver 'libhns-rdmav34.so': libhns-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libhns-rdmav34.so': libhns-rdmav34.so: cannot open shared object file: No such file or directory
fetch time:0.00s write back time:0.00s libibverbs: Warning: couldn't load driver 'libhns-rdmav34.so': libhns-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libhns-rdmav34.so': libhns-rdmav34.so: cannot open shared object file: No such file or directory
Epoch 48: libibverbs: Warning: couldn't load driver 'libocrdma-rdmav34.so': libocrdma-rdmav34.so: cannot open shared object file: No such file or directory
train loss:165.6863 train ap:0.981684 val ap:0.960024 val auc:0.965201 libibverbs: Warning: couldn't load driver 'libocrdma-rdmav34.so': libocrdma-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav34.so': libocrdma-rdmav34.so: cannot open shared object file: No such file or directory
total time:11.50s prep time:9.97s libibverbs: Warning: couldn't load driver 'libocrdma-rdmav34.so': libocrdma-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'librxe-rdmav34.so': librxe-rdmav34.so: cannot open shared object file: No such file or directory
fetch time:0.00s write back time:0.00s libibverbs: Warning: couldn't load driver 'librxe-rdmav34.so': librxe-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'librxe-rdmav34.so': librxe-rdmav34.so: cannot open shared object file: No such file or directory
Epoch 49: libibverbs: Warning: couldn't load driver 'librxe-rdmav34.so': librxe-rdmav34.so: cannot open shared object file: No such file or directory
train loss:165.3790 train ap:0.981795 val ap:0.962299 val auc:0.967019 libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory
total time:11.54s prep time:9.98s libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory
fetch time:0.00s write back time:0.00s libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav34.so': libhfi1verbs-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav34.so': libhfi1verbs-rdmav34.so: cannot open shared object file: No such file or directory
Loading the best model at epoch 45 libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav34.so': libhfi1verbs-rdmav34.so: cannot open shared object file: No such file or directory
test AP:0.946485 test AUC:0.954197 libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav34.so': libhfi1verbs-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav34.so': libcxgb4-rdmav34.so: cannot open shared object file: No such file or directory
test_dataset 23621 avg_time 13.31522078514099 libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav34.so': libcxgb4-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav34.so': libcxgb4-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav34.so': libcxgb4-rdmav34.so: cannot open shared object file: No such file or directory
[2024-11-04 12:38:02,873] torch.distributed.elastic.agent.server.api: [WARNING] Received Signals.SIGINT death signal, shutting down workers
[2024-11-04 12:38:02,874] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 539474 closing signal SIGINT
[2024-11-04 12:38:02,874] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 539475 closing signal SIGINT
[2024-11-04 12:38:02,874] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 539476 closing signal SIGINT
[2024-11-04 12:38:02,874] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 539477 closing signal SIGINT
Traceback (most recent call last):
File "/home/zlj/BTS-MTGNN/examples/train_boundery.py", line 700, in <module>
main()
File "/home/zlj/BTS-MTGNN/examples/train_boundery.py", line 203, in main
graph,full_sampler_graph,train_mask,val_mask,test_mask,full_train_mask,cache_route = load_from_speed(args.dataname,seed=123457,top=args.topk,sampler_graph_add_rev=True, feature_device=torch.device('cuda:{}'.format(ctx.local_rank)),partition=args.partition)#torch.device('cpu'))
File "/home/zlj/BTS-MTGNN/starrygl/sample/part_utils/transformer_from_speed.py", line 361, in load_from_speed
return load_from_shared_node_partition(data,node_i,shared_node,sample_add_rev=sampler_graph_add_rev,edge_i=edge_i,reid=None,device=device,feature_device=feature_device)
File "/home/zlj/BTS-MTGNN/starrygl/sample/part_utils/transformer_from_speed.py", line 134, in load_from_shared_node_partition
print('tot edge {} circ edge {} same edge {}\n'.format(src.shape[0],torch.stack((src,dst)).unique(dim = 1).shape[1],(src==dst).sum().item()))
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/_tensor.py", line 881, in unique
return torch.unique(
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/_jit_internal.py", line 488, in fn
return if_false(*args, **kwargs)
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/_jit_internal.py", line 488, in fn
return if_false(*args, **kwargs)
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/functional.py", line 976, in _return_output
output, _, _ = _unique_impl(input, sorted, return_inverse, return_counts, dim)
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/functional.py", line 882, in _unique_impl
output, inverse_indices, counts = _VF.unique_dim(
KeyboardInterrupt
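The interrupted frame above is the diagnostic print in `load_from_shared_node_partition`, which reports total edges, distinct (src, dst) pairs via `torch.stack((src,dst)).unique(dim=1)`, and self-loops; all ranks were still inside that `unique` call when SIGINT arrived, since it is expensive on edge lists the size of WikiTalk. A torch-free sketch of the same three counts, for illustration only (the helper name `edge_stats` is not from the repo):

```python
# Pure-Python version of the statistics printed by the interrupted line:
# total edges, distinct (src, dst) pairs, and self-loop ("same") edges.
def edge_stats(src, dst):
    tot = len(src)                                  # total edges
    uniq = len(set(zip(src, dst)))                  # distinct (src, dst) pairs
    loops = sum(s == d for s, d in zip(src, dst))   # self-loops
    return tot, uniq, loops

print(edge_stats([0, 0, 1], [1, 1, 1]))  # → (3, 2, 1)
```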
[W tensorpipe_agent.cpp:725] RPC agent for worker0 encountered error when reading incoming request from worker3: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
Traceback (most recent call last):
File "/home/zlj/.miniconda3/envs/tgnn_3.10/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.1.1+cu118', 'console_scripts', 'torchrun')())
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 255, in launch_agent
result = agent.run()
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
result = f(*args, **kwargs)
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 736, in run
result = self._invoke_run(role)
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run
time.sleep(monitor_interval)
File "/home/zlj/.miniconda3/envs/tgnn_3.10/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
torch.distributed.elastic.multiprocessing.api.SignalException: Process 539465 got signal: 2
bash test_all.sh 12347
wait
bash test_all.sh 12357
wait
bash test_all.sh 63457
wait
\ No newline at end of file
@@ -2,41 +2,24 @@
 # ran TaoBao on 4 GPUs
 # define array variables
 seed=$1
-addr="192.168.1.106"
+addr="192.168.1.105"
 partition_params=("ours" )
 #"metis" "ldg" "random")
 #("ours" "metis" "ldg" "random")
-partitions="8"
+partitions="4"
 node_per="4"
-nnodes="2"
-<<<<<<< Updated upstream
+nnodes="1"
 node_rank="0"
 probability_params=("0.1")
 sample_type_params=("boundery_recent_decay")
 #sample_type_params=("recent" "boundery_recent_decay") #"boundery_recent_uniform")
 #memory_type=("all_update" "p2p" "all_reduce" "historical" "local")
-memory_type=( "historical")
-#"historical")
+memory_type=("historical")
 #memory_type=("local" "all_update" "historical" "all_reduce")
 shared_memory_ssim=("0.3")
 #data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk")
-<<<<<<< HEAD
-data_param=("GDELT")
-=======
-data_param=("LASTFM")
->>>>>>> 8233776274204f6cf2f8a2eb37022d426d6197d8
-=======
-node_rank="1"
-probability_params=("0.1" "0.05" "0.01" "0")
-sample_type_params=("recent" "boundery_recent_decay")
-#sample_type_params=("recent" "boundery_recent_decay") #"boundery_recent_uniform")
-#memory_type=("all_update" "p2p" "all_reduce" "historical" "local")
-#memory_type=("all_update" "historical" "local")
-memory_type=("historical" "all_update" "local")
-#memory_type=("local" "all_update" "historical" "all_reduce")
-shared_memory_ssim=("0.3" "0.7")
-#data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk")
-data_param=("WIKI" "LASTFM" "WikiTalk" "StackOverflow" "GDELT" "TaoBao")
->>>>>>> Stashed changes
+data_param=("WIKI" "LASTFM" "WikiTalk" "StackOverflow" "GDELT")
 #data_param=("WIKI" "REDDIT" "LASTFM" "DGraphFin" "WikiTalk" "StackOverflow")
 #data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk" "StackOverflow")
 #data_param=("REDDIT" "WikiTalk")
@@ -48,9 +31,9 @@ data_param=("WIKI" "LASTFM" "WikiTalk" "StackOverflow" "GDELT" "TaoBao")
 #seed=(( RANDOM % 1000000 + 1 ))
 mkdir -p all_"$seed"
 for data in "${data_param[@]}"; do
-  model="APAN"
+  model="JODIE"
   if [ "$data" = "WIKI" ] || [ "$data" = "REDDIT" ] || [ "$data" = "LASTFM" ]; then
-    model="APAN"
+    model="JODIE"
   fi
   #model="APAN"
   mkdir all_"$seed"/"$data"
@@ -89,11 +72,7 @@ for data in "${data_param[@]}"; do
   if [ "$mem" = "historical" ]; then
     for ssim in "${shared_memory_ssim[@]}"; do
       if [ "$partition" = "ours" ]; then
-<<<<<<< Updated upstream
         torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --shared_memory_ssim "$ssim" --seed "$seed" > all_"$seed"/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample"-"$pro".out &
-=======
-        torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.1 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$model"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample"-"$pro".out &
->>>>>>> Stashed changes
         wait
       fi
     done
...
@@ -229,7 +229,8 @@ def main():
     fanout = sample_param['neighbor'] if 'neighbor' in sample_param else [10]
     policy = sample_param['strategy'] if 'strategy' in sample_param else 'recent'
     no_neg = sample_param['no_neg'] if 'no_neg' in sample_param else False
-    if policy != 'recent':
+    print(policy)
+    if policy == 'recent':
         policy_train = args.sample_type#'boundery_recent_decay'
     else:
         policy_train = policy
@@ -480,7 +481,7 @@ def main():
     val_list = []
     loss_list = []
     for e in range(train_param['epoch']):
-        model.module.memory_updater.empty_cache()
+        # model.module.memory_updater.empty_cache()
         tt._zero()
         torch.cuda.synchronize()
        epoch_start_time = time.time()
...
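The first hunk above fixes an inverted branch: with `if policy == 'recent':`, the `--sample_type` passed on the command line now overrides the sampler strategy exactly when the config uses the default 'recent' policy. A minimal self-contained sketch of that selection logic (the helper name `choose_train_policy` and its parameters are illustrative, not from the repo):

```python
# Sketch of the sampling-policy selection after the fix: the CLI value
# (args.sample_type, e.g. 'boundery_recent_decay') only applies when the
# sampler config uses the default 'recent' strategy; an explicit non-recent
# strategy in the config wins.
def choose_train_policy(sample_param: dict, cli_sample_type: str) -> str:
    policy = sample_param.get('strategy', 'recent')
    if policy == 'recent':
        return cli_sample_type
    return policy

print(choose_train_policy({}, 'boundery_recent_decay'))  # → boundery_recent_decay
print(choose_train_policy({'strategy': 'uniform'}, 'boundery_recent_decay'))  # → uniform
```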
...@@ -336,9 +336,12 @@ class TransformerMemoryUpdater(torch.nn.Module): ...@@ -336,9 +336,12 @@ class TransformerMemoryUpdater(torch.nn.Module):
def forward(self, b, param = None): def forward(self, b, param = None):
Q = self.w_q(b.srcdata['mem']).reshape((b.num_src_nodes(), self.att_h, -1)) Q = self.w_q(b.srcdata['mem']).reshape((b.num_src_nodes(), self.att_h, -1))
mails = b.srcdata['mem_input'].reshape((b.num_src_nodes(), self.memory_param['mailbox_size'], -1)) mails = b.srcdata['mem_input'].reshape((b.num_src_nodes(), self.memory_param['mailbox_size'], -1))
#print(mails.shape,b.srcdata['mem_input'].shape,b.srcdata['mail_ts'].shape)
if self.dim_time > 0: if self.dim_time > 0:
time_feat = self.time_enc(b.srcdata['ts'][:, None] - b.srcdata['mail_ts']).reshape((b.num_src_nodes(), self.memory_param['mailbox_size'], -1)) time_feat = self.time_enc(b.srcdata['ts'][:, None] - b.srcdata['mail_ts']).reshape((b.num_src_nodes(), self.memory_param['mailbox_size'], -1))
#print(time_feat.shape)
mails = torch.cat([mails, time_feat], dim=2) mails = torch.cat([mails, time_feat], dim=2)
#print(mails.shape)
K = self.w_k(mails).reshape((b.num_src_nodes(), self.memory_param['mailbox_size'], self.att_h, -1)) K = self.w_k(mails).reshape((b.num_src_nodes(), self.memory_param['mailbox_size'], self.att_h, -1))
V = self.w_v(mails).reshape((b.num_src_nodes(), self.memory_param['mailbox_size'], self.att_h, -1)) V = self.w_v(mails).reshape((b.num_src_nodes(), self.memory_param['mailbox_size'], self.att_h, -1))
att = self.att_act((Q[:,None,:,:]*K).sum(dim=3)) att = self.att_act((Q[:,None,:,:]*K).sum(dim=3))
...@@ -394,7 +397,6 @@ class AsyncMemeoryUpdater(torch.nn.Module): ...@@ -394,7 +397,6 @@ class AsyncMemeoryUpdater(torch.nn.Module):
self.mailbox.handle_last_async() self.mailbox.handle_last_async()
submit_to_queue = False submit_to_queue = False
if nxt_fetch_func is not None: if nxt_fetch_func is not None:
nxt_fetch_func()
submit_to_queue = True submit_to_queue = True
self.mailbox.set_memory_all_reduce( self.mailbox.set_memory_all_reduce(
index,memory,memory_ts, index,memory,memory_ts,
...@@ -404,6 +406,8 @@ class AsyncMemeoryUpdater(torch.nn.Module): ...@@ -404,6 +406,8 @@ class AsyncMemeoryUpdater(torch.nn.Module):
wait_submit=submit_to_queue,spread_mail=spread_mail, wait_submit=submit_to_queue,spread_mail=spread_mail,
update_cross_mm=False, update_cross_mm=False,
) )
if nxt_fetch_func is not None:
nxt_fetch_func()
def local_func(self,index,memory,memory_ts,mail_index,mail,mail_ts,nxt_fetch_func,spread_mail=False): def local_func(self,index,memory,memory_ts,mail_index,mail,mail_ts,nxt_fetch_func,spread_mail=False):
             if nxt_fetch_func is not None:
@@ -471,6 +475,7 @@ class AsyncMemeoryUpdater(torch.nn.Module):
             shared_ind = self.mailbox.is_shared_mask[DistIndex(b.srcdata['ID'][mask]).loc]
             transition_dense = b.srcdata['his_mem'][mask] + self.filter.get_incretment(shared_ind)
+            #print(transition_dense.shape)
             if not (transition_dense.max().item() == 0):
                 transition_dense -= transition_dense.min()
                 transition_dense /= transition_dense.max()
@@ -514,8 +519,8 @@ class AsyncMemeoryUpdater(torch.nn.Module):
         local_mask = (DistIndex(index).part==torch.distributed.get_rank())
         local_mask_mail = (DistIndex(index0).part==torch.distributed.get_rank())
-        #self.mailbox.set_mailbox_local(DistIndex(index0[local_mask_mail]).loc,mail[local_mask_mail],mail_ts[local_mask_mail],Reduce_Op = 'max')
-        #self.mailbox.set_memory_local(DistIndex(index[local_mask]).loc,memory[local_mask],memory_ts[local_mask], Reduce_Op = 'max')
+        self.mailbox.set_mailbox_local(DistIndex(index0[local_mask_mail]).loc,mail[local_mask_mail],mail_ts[local_mask_mail],Reduce_Op = 'max')
+        self.mailbox.set_memory_local(DistIndex(index[local_mask]).loc,memory[local_mask],memory_ts[local_mask], Reduce_Op = 'max')
         is_deliver=(self.mailbox.deliver_to == 'neighbors')
         self.update_hunk(index,memory,memory_ts,index0,mail,mail_ts,nxt_fetch_func,spread_mail= is_deliver)
...
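The hunk above re-enables the local mailbox and memory writes with `Reduce_Op = 'max'`, so each local slot keeps the entry carrying the newest timestamp. A rough pure-Python sketch of that reduced write (helper name and list-based storage are hypothetical, not from the repo):

```python
def set_local_max(store, store_ts, loc, values, ts):
    """Timestamp-max reduced write: for each target slot, keep the
    incoming value only if its timestamp is at least as new as the
    one already stored."""
    for i, slot in enumerate(loc):
        if ts[i] >= store_ts[slot]:
            store[slot] = values[i]
            store_ts[slot] = ts[i]
```

In the real code the stores are shared tensors indexed by `DistIndex(...).loc`, and the reduction guards against stale updates arriving out of order.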
@@ -344,6 +344,7 @@ def to_reversed_block(graph,data, sample_out,device = torch.device('cuda'),uniqu
     else:
         metadata = None
     nid_mapper: torch.Tensor = graph.nids_mapper
+    #print('reverse block {}\n'.format(identity))
     if identity is False:
         assert len(sample_out) == 1
         ret = sample_out[0]
@@ -354,6 +355,8 @@ def to_reversed_block(graph,data, sample_out,device = torch.device('cuda'),uniqu
         dist_eid = torch.tensor([],dtype=torch.long,device=device)
         src_index = ret.src_index().to(device)
     else:
+        #print('is jodie')
+        #print(sample_out)
         src_index = torch.tensor([],dtype=torch.long,device=device)
         dst = torch.tensor([],dtype=torch.long,device=device)
         dist_eid = torch.tensor([],dtype=torch.long,device=device)
@@ -401,6 +404,7 @@ def to_reversed_block(graph,data, sample_out,device = torch.device('cuda'),uniqu
         row_len = root_len
     col = first_block_id[:row_len]
     max_row = col.max().item()+1
+    #print(src_index,dst)
     b = dgl.create_block((col[src_index].to(device),
         torch.arange(dst.shape[0],device=device,dtype=torch.long)),num_src_nodes=first_block_id.max().item()+1,
         num_dst_nodes=dst.shape[0])
@@ -424,6 +428,7 @@ def graph_sample(graph,sampler,sample_fn,data,neg_sampling = None,out_device = t
     t_s = time.time()
     param = {'is_unique':False,'nid_mapper':nid_mapper,'eid_mapper':eid_mapper,'out_device':out_device}
     out = sample_fn(sampler,data,neg_sampling,**param)
+    #print(sampler.policy)
     if reversed is False:
         out,dist_nid,dist_eid = to_block(graph,data,out,out_device)
     else:
...
@@ -18,9 +18,10 @@ class MemoryMoniter:
         #self.memory_ssim.append(self.ssim(pre_memory,now_memory,method = 'cos'))
         #self.nid_list.append(nid)
     def draw(self,degree,data,model,e):
-        torch.save(self.nid_list,'all_args.seed/{}/{}/memorynid_{}.pt'.format(data,model,e))
-        torch.save(self.memorychange,'all_args.seed/{}/{}/memoryF_{}.pt'.format(data,model,e))
-        torch.save(self.memory_ssim,'all_args.seed/{}/{}/memcos_{}.pt'.format(data,model,e))
+        pass
+        #torch.save(self.nid_list,'all_args.seed/{}/{}/memorynid_{}.pt'.format(data,model,e))
+        #torch.save(self.memorychange,'all_args.seed/{}/{}/memoryF_{}.pt'.format(data,model,e))
+        #torch.save(self.memory_ssim,'all_args.seed/{}/{}/memcos_{}.pt'.format(data,model,e))
         # path = './memory/{}/'.format(data)
         # if not os.path.exists(path):
@@ -87,6 +88,7 @@ class MemoryMoniter:
     def set_zero(self):
-        self.memorychange = []
-        self.nid_list =[]
-        self.memory_ssim = []
+        pass
+        #self.memorychange = []
+        #self.nid_list =[]
+        #self.memory_ssim = []
@@ -146,7 +146,7 @@ class SharedMailBox():
             source_ts = max_ts
             source = source[id]
             index = unq_id
+        #print(self.next_mail_pos[index])
         self.mailbox_ts.accessor.data[index, self.next_mail_pos[index]] = source_ts
         self.mailbox.accessor.data[index, self.next_mail_pos[index]] = source
         if self.memory_param['mailbox_size'] > 1:
@@ -180,9 +180,23 @@ class SharedMailBox():
         if self.deliver_to == 'neighbors':
             assert block is not None and Reduce_score is None
-            mail = torch.cat([mail, mail[block.edges()[0].long()]], dim=0)
-            mail_ts = torch.cat([mail_ts, mail_ts[block.edges()[0].long()]], dim=0)
+            # print(block.edges().shape)
+            root = torch.cat([src,dst]).reshape(-1)
+            #pos = torch.empty(root.max()+1,dtype=torch.long,device=block.device)
+            #print('edge {} {}\n'.format(block.num_src_nodes(),block.edges()[0].max()))
+            #print('root is {} {} {} {}\n'.format(root,root.shape,root.max(),block.edges()[0].shape))
+            #pos_index = torch.arange(root.shape[0],device=root.device,dtype=root.dtype)
+            pos,idx = torch_scatter.scatter_max(mail_ts,root,0)
+            mail = torch.cat([mail, mail[idx]],dim=0)
+            mail_ts = torch.cat([mail_ts, mail_ts[idx]], dim=0)
+            #print('pos is {} {}\n'.format(pos,block.edges()[0].long()))
+            #mail = torch.cat([mail, mail[pos[block.edges()[0].long()]]],dim=0)
+            #mail_ts = torch.cat([mail_ts, mail_ts[pos[block.edges()[0].long()]]], dim=0)
+            #print(root,block.edges()[1].long())
             index = torch.cat([index,block.dstdata['ID'][block.edges()[1].long()]],dim=0)
+            #mail = torch.cat([mail, mail[block.edges()[0].long()]], dim=0)
+            #mail_ts = torch.cat([mail_ts, mail_ts[block.edges()[0].long()]], dim=0)
+            #index = torch.cat([index,block.dstdata['ID'][block.edges()[1].long()]],dim=0)
             if Reduce_score is not None:
                 Reduce_score = torch.cat((Reduce_score,Reduce_score),-1).to(self.device)
         if Reduce_score is None:
@@ -192,12 +206,19 @@ class SharedMailBox():
             mail = mail[idx]
             index = unq_index
         else:
-            unq_index,inv = torch.unique(index,return_inverse = True)
+            uni, inv = torch.unique(index, return_inverse=True)
+            perm = torch.arange(inv.size(0), dtype=inv.dtype, device=inv.device)
+            perm = inv.new_empty(uni.size(0)).scatter_(0, inv, perm)
+            index = index[perm]
+            mail = mail[perm]
+            mail_ts = mail_ts[perm]
+            #unq_index,inv = torch.unique(index,return_inverse = True)
             #print(inv.shape,Reduce_score.shape)
-            max_score,idx = torch_scatter.scatter_max(Reduce_score,inv,0)
-            mail_ts = mail_ts[idx]
-            mail = mail[idx]
-            index = unq_index
+            #max_score,idx = torch_scatter.scatter_max(Reduce_score,inv,0)
+            #mail_ts = mail_ts[idx]
+            #mail = mail[idx]
+            #index = unq_index
+        #print('mail {} {}\n'.format(index.shape,mail.shape,mail_ts.shape))
         return index,mail,mail_ts
     def get_update_memory(self,index,memory,memory_ts,embedding):
@@ -205,7 +226,8 @@ class SharedMailBox():
         max_ts,idx = torch_scatter.scatter_max(memory_ts,inv,0)
         ts = max_ts
         index = unq_index
         memory = memory[idx]
+        #print('memory {} {}\n'.format(index.shape,memory.shape,ts.shape))
         return index,memory,ts
     def pack(self,memory=None,memory_ts=None,mail=None,mail_ts=None,index = None,mode=None):
@@ -250,6 +272,7 @@ class SharedMailBox():
             self.set_mailbox_local(DistIndex(gather_id_list).loc,gather_mail,gather_mail_ts,Reduce_Op = reduce_Op)
         else:
             gather_memory,gather_memory_ts = self.unpack(gather_memory)
+            #print(gather_id_list.shape,gather_memory.shape,gather_memory_ts.shape)
             self.set_memory_local(DistIndex(gather_id_list).loc,gather_memory,gather_memory_ts, Reduce_Op = reduce_Op)
     def handle_last_mail(self,reduce_Op=None,):
         if self.last_mail_sync is not None:
@@ -260,6 +283,7 @@ class SharedMailBox():
             if isinstance(gather_memory,list):
                 gather_memory = torch.cat(gather_memory,dim = 0)
             gather_memory,gather_memory_ts = self.unpack(gather_memory)
+            #print(gather_id_list.shape,gather_memory.shape,gather_memory_ts.shape)
             self.set_mailbox_local(DistIndex(gather_id_list).loc,gather_memory,gather_memory_ts, Reduce_Op = reduce_Op)
     def handle_last_async(self,reduce_Op = None):
         self.handle_last_memory(reduce_Op)
@@ -303,6 +327,7 @@ class SharedMailBox():
             return
         index,gather_id_list,mem,gather_memory,input_split,output_split,group,async_op = self.next_wait_mail_job
         self.next_wait_mail_job = None
+        #print(index,gather_id_list)
         handle0 = torch.distributed.all_to_all_single(
             gather_id_list,index,output_split_sizes=output_split,
             input_split_sizes=input_split,group = group,async_op=async_op)
@@ -409,6 +434,7 @@ class SharedMailBox():
             self.update_p2p_mail()
             self.update_p2p_mem()
             self.handle_last_async()
+        ctx = DistributedContext.get_default_context()
@@ -483,7 +509,7 @@ class SharedMailBox():
         unq_index,inv = torch.unique(shared_index,return_inverse = True)
         max_ts,idx = torch_scatter.scatter_max(shared_memory_ts,inv,0)
         shared_memory = shared_memory[idx]
-        shared_memory = shared_memory[idx]
+        #shared_memory = shared_memory[idx]
         shared_memory_ts = shared_memory_ts[idx]
         shared_mail_ts = shared_mail_ts[idx]
         shared_mail = shared_mail[idx]
@@ -495,11 +521,13 @@ class SharedMailBox():
             self.set_mailbox_local(self.shared_nodes_index[shared_mail_indx],shared_mail,shared_mail_ts)
         else:
             self.next_wait_gather_memory_job = (shared_list,mem,shared_id_list,shared_memory_ind)
         if not wait_submit:
             self.update_shared()
             self.update_p2p_mail()
             self.update_p2p_mem()
+            self.handle_last_async()
+            self.sychronize_shared()
         #self.historical_cache.add_shared_to_queue(handle0,handle1,shared_id_list,shared_list)
         """
         shared_memory = self.node_memory.accessor.data[self.shared_nodes_index]
...
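The `get_update_mail` change in this file replaces the `Reduce_score`-based `scatter_max` with a `torch.unique(..., return_inverse=True)` plus `scatter_` idiom that keeps one representative entry per duplicate index (with `scatter_`'s overwrite semantics, a later duplicate wins). A minimal pure-Python sketch of that dedup pattern, with a hypothetical helper name:

```python
def pick_one_per_key(index, values):
    """Keep a single (index, value) pair per duplicate key.
    Later occurrences overwrite earlier ones, mirroring the
    write-order semantics of tensor scatter_ on CPU; keys come
    back sorted, as torch.unique returns them."""
    perm = {}
    for pos, key in enumerate(index):
        perm[key] = pos          # later duplicates overwrite earlier ones
    keys = sorted(perm)
    kept = [perm[k] for k in keys]
    return [index[i] for i in kept], [values[i] for i in kept]
```

In the actual tensor code the same `perm` is then used to slice `index`, `mail`, and `mail_ts` consistently, which is what the old `idx`-based path failed to do for all three.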
@@ -317,8 +317,9 @@ class NeighborSampler(BaseSampler):
         """
         if self.no_neg:
             out,metadata = self.sample_from_nodes(seed[:seed.shape[0]//3*2], seed_ts[:seed.shape[0]//3*2], is_unique=False)
         else:
-            out,metadata = self.sample_from_nodes(seed[:seed.shape[0]], seed_ts[:seed.shape[0]//3*2], is_unique=False)
+            out,metadata = self.sample_from_nodes(seed, seed_ts, is_unique=False)
         src_pos_index = torch.arange(0,num_pos,dtype= torch.long,device=out_device)
         dst_pos_index = torch.arange(num_pos,2*num_pos,dtype= torch.long,device=out_device)
         if neg_sampling.is_triplet() or neg_sampling.is_tgbtriplet():
...
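The one-line sampler fix above aligns the seed and timestamp slices: the old `else` branch passed all seeds but only the first two thirds of `seed_ts`, so the two tensors disagreed in length. The intended invariant, assuming seeds are packed as `[src | dst | neg]` with equal-length thirds, can be sketched with a hypothetical helper:

```python
def split_seeds(seed, seed_ts, no_neg):
    """Seeds and timestamps must always be sliced identically.
    With no_neg, only the positive [src | dst] prefix (2/3 of the
    batch) is sampled; otherwise the full batch is used."""
    assert len(seed) == len(seed_ts)
    if no_neg:
        k = len(seed) // 3 * 2   # drop the trailing negative third
        return seed[:k], seed_ts[:k]
    return seed, seed_ts
```

Mismatched lengths here would silently pair nodes with the wrong sampling timestamps, which is consistent with the "fix bugs in jodie and APAN" commit message.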