Commit 24a069d6 by zlj

fix boundary

parent 342421ab
sampling: sampling:
- layer: 1 - layer: 1
neighbor: neighbor:
- 20 - 10
strategy: 'recent' strategy: 'recent'
prop_time: False prop_time: False
history: 1 history: 1
...@@ -28,9 +28,9 @@ gnn: ...@@ -28,9 +28,9 @@ gnn:
dim_out: 100 dim_out: 100
train: train:
- epoch: 100 - epoch: 100
batch_size: 600 batch_size: 1000
# reorder: 16 # reorder: 16
lr: 0.0005 lr: 0.0004
dropout: 0.2 dropout: 0.2
att_dropout: 0.2 att_dropout: 0.2
all_on_gpu: True all_on_gpu: True
...@@ -289,23 +289,25 @@ void ParallelSampler :: neighbor_sample_from_nodes_with_before_layer( ...@@ -289,23 +289,25 @@ void ParallelSampler :: neighbor_sample_from_nodes_with_before_layer(
TimeStampType delta = end_index-1>=0?(rtts - tnb.timestamp[node][end_index-1])*fanout:0; TimeStampType delta = end_index-1>=0?(rtts - tnb.timestamp[node][end_index-1])*fanout:0;
for(int cid = end_index-1;cid>=0;cid--){ for(int cid = end_index-1;cid>=0;cid--){
cal_cnt++; cal_cnt++;
if(cal_cnt>2*fanout)break; if(cal_cnt>fanout)break;
if(part[tnb.eid[node][cid]] != local_part|| node_part[tnb.neighbors[node][cid]]!= local_part){ if(part[tnb.eid[node][cid]] != local_part|| node_part[tnb.neighbors[node][cid]]!= local_part){
double ep = exp((double)(tnb.timestamp[node][cid]-rtts)/(delta)); double ep = exp((double)(tnb.timestamp[node][cid]-rtts)/(delta));
sum_p+=ep;pr[cal_cnt-1]=ep; sum_p+=ep;pr[cal_cnt-1]=ep;
sum_1++; sum_1++;
} }
} }
if(sum_p<1e-6)sum_p=1;
cal_cnt = 0; cal_cnt = 0;
for(int cid = end_index-1;cid>=0;cid--){ for(int cid = end_index-1;cid>=0;cid--){
cal_cnt++; cal_cnt++;
if(cal_cnt > 2*fanout)break; if(cal_cnt > fanout)break;
int eid = tnb.eid[node][cid]; int eid = tnb.eid[node][cid];
if(part[tnb.eid[node][cid]] != local_part|| node_part[tnb.neighbors[node][cid]]!= local_part){ if(part[tnb.eid[node][cid]] != local_part|| node_part[tnb.neighbors[node][cid]]!= local_part){
double p0 = (double)rand_r(&loc_seeds[tid]) / (RAND_MAX + 1.0); double p0 = (double)rand_r(&loc_seeds[tid]) / (RAND_MAX + 1.0);
double ep = boundery_probility*pr[cal_cnt-1]/sum_p*sum_1; double ep = boundery_probility*pr[cal_cnt-1]/sum_p*sum_1;
if(p0 > ep)continue; if(p0 > ep)continue;
//cout<<"in"<<endl;
} }
tgb_i[tid].src_index.emplace_back(i); tgb_i[tid].src_index.emplace_back(i);
tgb_i[tid].sample_nodes.emplace_back(tnb.neighbors[node][cid]); tgb_i[tid].sample_nodes.emplace_back(tnb.neighbors[node][cid]);
......
LOCAL RANK 0, RANK0
in
local rank is 0 world_size is 1 memory group is 0 memory rank is 0 memory group size is 1
[0]
memory used is torch.Size([9228, 172]) torch.float32 0.005912840366363525
dist rank is 0 after node feats defination:
dist rank is 0 after node feats defination:
local node num 9228 ,local edge num 157474
num nodes is tensor([157474])
init 0
cpu
tensor([ 0, 1, 2, ..., 157471, 157472, 157473])
Total GPU memory: 44.3516845703125
Current GPU memory allocated: 4.76837158203125e-07
Current GPU memory reserved: 0.001953125
Max GPU memory allocated during this session: 9728
Max GPU memory reserved during this session: 2097152
cpu 0
Total GPU memory: 44.3516845703125
Current GPU memory allocated: 4.76837158203125e-07
Current GPU memory reserved: 0.001953125
Max GPU memory allocated during this session: 9728
Max GPU memory reserved during this session: 2097152
torch.Size([157474, 172])
Total GPU memory: 44.3516845703125
Current GPU memory allocated: 4.76837158203125e-07
Current GPU memory reserved: 0.001953125
Max GPU memory allocated during this session: 9728
Max GPU memory reserved during this session: 2097152
init data loader
Total GPU memory: 44.3516845703125
Current GPU memory allocated: 4.76837158203125e-07
Current GPU memory reserved: 0.001953125
Max GPU memory allocated during this session: 9728
Max GPU memory reserved during this session: 2097152
tensor([], device='cuda:0', dtype=torch.int32)
100
get_neighbors consume: 0.0111879s
0 tensor([0, 0, 0, ..., 0, 0, 0])
get_neighbors consume: 0.0105944s
0 tensor([0, 0, 0, ..., 0, 0, 0])
part tensor([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]])
ts tensor([ 0, 36, 77, ..., 1862639, 1862645, 1862652],
device='cuda:0') tensor([ 0, 36, 77, ..., 1862639, 1862645, 1862652],
device='cuda:0') tensor([2218300, 2218303, 2218304, ..., 2678293, 2678333, 2678373],
device='cuda:0') tensor([1862653, 1862659, 1862666, ..., 2218282, 2218288, 2218288],
device='cuda:0')
seed is 69328
tensor([8228, 8229, 8230, 8231, 8232, 8233, 8234, 8235, 8236, 8237, 8238, 8239,
8240, 8241, 8242, 8243, 8244, 8245, 8246, 8247, 8248, 8249, 8250, 8251,
8252, 8253, 8254, 8255, 8256, 8257, 8258, 8259, 8260, 8261, 8262, 8263,
8264, 8265, 8266, 8267, 8268, 8269, 8270, 8271, 8272, 8273, 8274, 8275,
8276, 8277, 8278, 8279, 8280, 8281, 8282, 8283, 8284, 8285, 8286, 8287,
8288, 8289, 8290, 8291, 8292, 8293, 8294, 8295, 8296, 8297, 8298, 8299,
8300, 8301, 8302, 8303, 8304, 8305, 8306, 8307, 8308, 8309, 8310, 8311,
8312, 8313, 8314, 8315, 8316, 8317, 8318, 8319, 8320, 8321, 8322, 8323,
8324, 8325, 8326, 8327, 8328, 8329, 8330, 8331, 8332, 8333, 8334, 8335,
8336, 8337, 8338, 8339, 8340, 8341, 8342, 8343, 8344, 8345, 8346, 8347,
8348, 8349, 8350, 8351, 8352, 8353, 8354, 8355, 8356, 8357, 8358, 8359,
8360, 8361, 8362, 8363, 8364, 8365, 8366, 8367, 8368, 8369, 8370, 8371,
8372, 8373, 8374, 8375, 8376, 8377, 8378, 8379, 8380, 8381, 8382, 8383,
8384, 8385, 8386, 8387, 8388, 8389, 8390, 8391, 8392, 8393, 8394, 8395,
8396, 8397, 8398, 8399, 8400, 8401, 8402, 8403, 8404, 8405, 8406, 8407,
8408, 8409, 8410, 8411, 8412, 8413, 8414, 8415, 8416, 8417, 8418, 8419,
8420, 8421, 8422, 8423, 8424, 8425, 8426, 8427, 8428, 8429, 8430, 8431,
8432, 8433, 8434, 8435, 8436, 8437, 8438, 8439, 8440, 8441, 8442, 8443,
8444, 8445, 8446, 8447, 8448, 8449, 8450, 8451, 8452, 8453, 8454, 8455,
8456, 8457, 8458, 8459, 8460, 8461, 8462, 8463, 8464, 8465, 8466, 8467,
8468, 8469, 8470, 8471, 8472, 8473, 8474, 8475, 8476, 8477, 8478, 8479,
8480, 8481, 8482, 8483, 8484, 8485, 8486, 8487, 8488, 8489, 8490, 8491,
8492, 8493, 8494, 8495, 8496, 8497, 8498, 8499, 8500, 8501, 8502, 8503,
8504, 8505, 8506, 8507, 8508, 8509, 8510, 8511, 8512, 8513, 8514, 8515,
8516, 8517, 8518, 8519, 8520, 8521, 8522, 8523, 8524, 8525, 8526, 8527,
8528, 8529, 8530, 8531, 8532, 8533, 8534, 8535, 8536, 8537, 8538, 8539,
8540, 8541, 8542, 8543, 8544, 8545, 8546, 8547, 8548, 8549, 8550, 8551,
8552, 8553, 8554, 8555, 8556, 8557, 8558, 8559, 8560, 8561, 8562, 8563,
8564, 8565, 8566, 8567, 8568, 8569, 8570, 8571, 8572, 8573, 8574, 8575,
8576, 8577, 8578, 8579, 8580, 8581, 8582, 8583, 8584, 8585, 8586, 8587,
8588, 8589, 8590, 8591, 8592, 8593, 8594, 8595, 8596, 8597, 8598, 8599,
8600, 8601, 8602, 8603, 8604, 8605, 8606, 8607, 8608, 8609, 8610, 8611,
8612, 8613, 8614, 8615, 8616, 8617, 8618, 8619, 8620, 8621, 8622, 8623,
8624, 8625, 8626, 8627, 8628, 8629, 8630, 8631, 8632, 8633, 8634, 8635,
8636, 8637, 8638, 8639, 8640, 8641, 8642, 8643, 8644, 8645, 8646, 8647,
8648, 8649, 8650, 8651, 8652, 8653, 8654, 8655, 8656, 8657, 8658, 8659,
8660, 8661, 8662, 8663, 8664, 8665, 8666, 8667, 8668, 8669, 8670, 8671,
8672, 8673, 8674, 8675, 8676, 8677, 8678, 8679, 8680, 8681, 8682, 8683,
8684, 8685, 8686, 8687, 8688, 8689, 8690, 8691, 8692, 8693, 8694, 8695,
8696, 8697, 8698, 8699, 8700, 8701, 8702, 8703, 8704, 8705, 8706, 8707,
8708, 8709, 8710, 8711, 8712, 8713, 8714, 8715, 8716, 8717, 8718, 8719,
8720, 8721, 8722, 8723, 8724, 8725, 8726, 8727, 8728, 8729, 8730, 8731,
8732, 8733, 8734, 8735, 8736, 8737, 8738, 8739, 8740, 8741, 8742, 8743,
8744, 8745, 8746, 8747, 8748, 8749, 8750, 8751, 8752, 8753, 8754, 8755,
8756, 8757, 8758, 8759, 8760, 8761, 8762, 8763, 8764, 8765, 8766, 8767,
8768, 8769, 8770, 8771, 8772, 8773, 8774, 8775, 8776, 8777, 8778, 8779,
8780, 8781, 8782, 8783, 8784, 8785, 8786, 8787, 8788, 8789, 8790, 8791,
8792, 8793, 8794, 8795, 8796, 8797, 8798, 8799, 8800, 8801, 8802, 8803,
8804, 8805, 8806, 8807, 8808, 8809, 8810, 8811, 8812, 8813, 8814, 8815,
8816, 8817, 8818, 8819, 8820, 8821, 8822, 8823, 8824, 8825, 8826, 8827,
8828, 8829, 8830, 8831, 8832, 8833, 8834, 8835, 8836, 8837, 8838, 8839,
8840, 8841, 8842, 8843, 8844, 8845, 8846, 8847, 8848, 8849, 8850, 8851,
8852, 8853, 8854, 8855, 8856, 8857, 8858, 8859, 8860, 8861, 8862, 8863,
8864, 8865, 8866, 8867, 8868, 8869, 8870, 8871, 8872, 8873, 8874, 8875,
8876, 8877, 8878, 8879, 8880, 8881, 8882, 8883, 8884, 8885, 8886, 8887,
8888, 8889, 8890, 8891, 8892, 8893, 8894, 8895, 8896, 8897, 8898, 8899,
8900, 8901, 8902, 8903, 8904, 8905, 8906, 8907, 8908, 8909, 8910, 8911,
8912, 8913, 8914, 8915, 8916, 8917, 8918, 8919, 8920, 8921, 8922, 8923,
8924, 8925, 8926, 8927, 8928, 8929, 8930, 8931, 8932, 8933, 8934, 8935,
8936, 8937, 8938, 8939, 8940, 8941, 8942, 8943, 8944, 8945, 8946, 8947,
8948, 8949, 8950, 8951, 8952, 8953, 8954, 8955, 8956, 8957, 8958, 8959,
8960, 8961, 8962, 8963, 8964, 8965, 8966, 8967, 8968, 8969, 8970, 8971,
8972, 8973, 8974, 8975, 8976, 8977, 8978, 8979, 8980, 8981, 8982, 8983,
8984, 8985, 8986, 8987, 8988, 8989, 8990, 8991, 8992, 8993, 8994, 8995,
8996, 8997, 8998, 8999, 9000, 9001, 9002, 9003, 9004, 9005, 9006, 9007,
9008, 9009, 9010, 9011, 9012, 9013, 9014, 9015, 9016, 9017, 9018, 9019,
9020, 9021, 9022, 9023, 9024, 9025, 9026, 9027, 9028, 9029, 9030, 9031,
9032, 9033, 9034, 9035, 9036, 9037, 9038, 9039, 9040, 9041, 9042, 9043,
9044, 9045, 9046, 9047, 9048, 9049, 9050, 9051, 9052, 9053, 9054, 9055,
9056, 9057, 9058, 9059, 9060, 9061, 9062, 9063, 9064, 9065, 9066, 9067,
9068, 9069, 9070, 9071, 9072, 9073, 9074, 9075, 9076, 9077, 9078, 9079,
9080, 9081, 9082, 9083, 9084, 9085, 9086, 9087, 9088, 9089, 9090, 9091,
9092, 9093, 9094, 9095, 9096, 9097, 9098, 9099, 9100, 9101, 9102, 9103,
9104, 9105, 9106, 9107, 9108, 9109, 9110, 9111, 9112, 9113, 9114, 9115,
9116, 9117, 9118, 9119, 9120, 9121, 9122, 9123, 9124, 9125, 9126, 9127,
9128, 9129, 9130, 9131, 9132, 9133, 9134, 9135, 9136, 9137, 9138, 9139,
9140, 9141, 9142, 9143, 9144, 9145, 9146, 9147, 9148, 9149, 9150, 9151,
9152, 9153, 9154, 9155, 9156, 9157, 9158, 9159, 9160, 9161, 9162, 9163,
9164, 9165, 9166, 9167, 9168, 9169, 9170, 9171, 9172, 9173, 9174, 9175,
9176, 9177, 9178, 9179, 9180, 9181, 9182, 9183, 9184, 9185, 9186, 9187,
9188, 9189, 9190, 9191, 9192, 9193, 9194, 9195, 9196, 9197, 9198, 9199,
9200, 9201, 9202, 9203, 9204, 9205, 9206, 9207, 9208, 9209, 9210, 9211,
9212, 9213, 9214, 9215, 9216, 9217, 9218, 9219, 9220, 9221, 9222, 9223,
9224, 9225, 9226, 9227])
seed is 46802
tensor(36, device='cuda:0') 110232
tensor([[3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000.,
3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000.,
3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000.,
3000., 3000., 3000., 3000., 3000., 3000., 2232.]])
average: 1.0
max average: 2979.2431640625
min average: 2979.2431640625
all_cross_edge:tensor([0]) local edges num: 110232
tensor(36, device='cuda:0') 110232
tensor([[3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000.,
3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000.,
3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000.,
3000., 3000., 3000., 3000., 3000., 3000., 2232.]])
average: 1.0
max average: 2979.2431640625
min average: 2979.2431640625
all_cross_edge:tensor([0]) local edges num: 110232
tensor(7, device='cuda:0') 23621
tensor([[3000., 3000., 3000., 3000., 3000., 3000., 3000., 2621.]])
average: 1.0
max average: 2952.625
min average: 2952.625
all_cross_edge:tensor([0]) local edges num: 23621
tensor(7, device='cuda:0') 23621
tensor([[3000., 3000., 3000., 3000., 3000., 3000., 3000., 2621.]])
average: 1.0
max average: 2952.625
min average: 2952.625
all_cross_edge:tensor([0]) local edges num: 23621
init dataloader
The model has 1.2405433654785156 trainable parameters
Epoch 0:
train time:1.25s
tensor(117481, device='cuda:0')
local node number tensor([117481]) remote node number tensor([0]) local edge tensor([631632]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.5162225598469377 time_backward = 0.5316816458944231 time_sample = 0 pre_batch = 0 pre_input = 0.19834021909628063 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.009082171716727316 time_attention = 0.14380833425093442
mode: val tensor([0.8996]) tensor([0.8919])
mode: test tensor([0.8854]) tensor([0.8750])
train loss:39.3417 train ap: nan val ap:0.899578 val auc:0.891941 test ap 0.885384 test auc0.874995
total time:1.71s prep time:1.25s
Epoch 1:
train time:0.93s
tensor(119297, device='cuda:0')
local node number tensor([119297]) remote node number tensor([0]) local edge tensor([648507]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.33275268971920013 time_backward = 0.4789364457828924 time_sample = 0 pre_batch = 0 pre_input = 0.1096803741529584 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008531142957508564 time_attention = 0.0745205981656909
mode: val tensor([0.9011]) tensor([0.8921])
mode: test tensor([0.8868]) tensor([0.8738])
train loss:30.7512 train ap: nan val ap:0.901060 val auc:0.892088 test ap 0.886778 test auc0.873774
total time:1.33s prep time:0.93s
Epoch 2:
train time:0.93s
tensor(119258, device='cuda:0')
local node number tensor([119258]) remote node number tensor([0]) local edge tensor([648615]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3358315115328878 time_backward = 0.47259313764516264 time_sample = 0 pre_batch = 0 pre_input = 0.11430373543407768 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008510725339874625 time_attention = 0.07622448226902634
mode: val tensor([0.9154]) tensor([0.9093])
mode: test tensor([0.8961]) tensor([0.8869])
train loss:28.2729 train ap: nan val ap:0.915394 val auc:0.909341 test ap 0.896142 test auc0.886938
total time:1.33s prep time:0.93s
Epoch 3:
train time:0.92s
tensor(119292, device='cuda:0')
local node number tensor([119292]) remote node number tensor([0]) local edge tensor([648859]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.332613394013606 time_backward = 0.4307080403668806 time_sample = 0 pre_batch = 0 pre_input = 0.15153192111756653 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008439739933237433 time_attention = 0.07364114979282022
mode: val tensor([0.9314]) tensor([0.9279])
mode: test tensor([0.9201]) tensor([0.9148])
train loss:27.2176 train ap: nan val ap:0.931445 val auc:0.927941 test ap 0.920075 test auc0.914771
total time:1.32s prep time:0.92s
Epoch 4:
train time:0.93s
tensor(119377, device='cuda:0')
local node number tensor([119377]) remote node number tensor([0]) local edge tensor([648636]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3325000318000093 time_backward = 0.44885063578840345 time_sample = 0 pre_batch = 0 pre_input = 0.13642385101411492 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008459722972474992 time_attention = 0.07424444344360381
mode: val tensor([0.9447]) tensor([0.9432])
mode: test tensor([0.9347]) tensor([0.9314])
train loss:24.5895 train ap: nan val ap:0.944669 val auc:0.943231 test ap 0.934749 test auc0.931430
total time:1.33s prep time:0.93s
Epoch 5:
train time:0.93s
tensor(119388, device='cuda:0')
local node number tensor([119388]) remote node number tensor([0]) local edge tensor([648852]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.32934694492723793 time_backward = 0.45787950325757265 time_sample = 0 pre_batch = 0 pre_input = 0.1311466050101444 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008425442734733224 time_attention = 0.07362530985847116
mode: val tensor([0.9486]) tensor([0.9478])
mode: test tensor([0.9400]) tensor([0.9373])
train loss:22.7186 train ap: nan val ap:0.948616 val auc:0.947762 test ap 0.939990 test auc0.937335
total time:1.33s prep time:0.93s
Epoch 6:
train time:0.93s
tensor(119251, device='cuda:0')
local node number tensor([119251]) remote node number tensor([0]) local edge tensor([648799]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3296502644661814 time_backward = 0.4680681542959064 time_sample = 0 pre_batch = 0 pre_input = 0.12141763977706432 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008361523854546249 time_attention = 0.07355450722388923
mode: val tensor([0.9575]) tensor([0.9550])
mode: test tensor([0.9497]) tensor([0.9463])
train loss:22.6257 train ap: nan val ap:0.957484 val auc:0.955028 test ap 0.949730 test auc0.946280
total time:1.33s prep time:0.93s
Epoch 7:
train time:0.92s
tensor(119277, device='cuda:0')
local node number tensor([119277]) remote node number tensor([0]) local edge tensor([648776]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3338864166289568 time_backward = 0.44001385651063174 time_sample = 0 pre_batch = 0 pre_input = 0.14206511317752302 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008449721965007484 time_attention = 0.07399340544361621
mode: val tensor([0.9656]) tensor([0.9629])
mode: test tensor([0.9580]) tensor([0.9545])
train loss:20.5010 train ap: nan val ap:0.965612 val auc:0.962899 test ap 0.958044 test auc0.954479
total time:1.32s prep time:0.92s
Epoch 8:
train time:0.93s
tensor(119249, device='cuda:0')
local node number tensor([119249]) remote node number tensor([0]) local edge tensor([648369]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.33578664518427104 time_backward = 0.4269245610339567 time_sample = 0 pre_batch = 0 pre_input = 0.154373285244219 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008378260536119342 time_attention = 0.07443791208788753
mode: val tensor([0.9678]) tensor([0.9651])
mode: test tensor([0.9624]) tensor([0.9585])
train loss:18.9889 train ap: nan val ap:0.967802 val auc:0.965149 test ap 0.962384 test auc0.958544
total time:1.33s prep time:0.93s
Epoch 9:
train time:0.93s
tensor(119295, device='cuda:0')
local node number tensor([119295]) remote node number tensor([0]) local edge tensor([648887]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.347587923752144 time_backward = 0.3889702036976814 time_sample = 0 pre_batch = 0 pre_input = 0.18681017577182502 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008404633379541337 time_attention = 0.07437894865870476
mode: val tensor([0.9721]) tensor([0.9702])
mode: test tensor([0.9675]) tensor([0.9645])
train loss:17.9763 train ap: nan val ap:0.972146 val auc:0.970216 test ap 0.967490 test auc0.964478
total time:1.34s prep time:0.93s
Epoch 10:
train time:0.92s
tensor(119326, device='cuda:0')
local node number tensor([119326]) remote node number tensor([0]) local edge tensor([648962]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3354337838245556 time_backward = 0.38386718300171196 time_sample = 0 pre_batch = 0 pre_input = 0.18955101375468075 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008398966281674802 time_attention = 0.07468449766747653
mode: val tensor([0.9729]) tensor([0.9706])
mode: test tensor([0.9671]) tensor([0.9640])
train loss:17.1501 train ap: nan val ap:0.972866 val auc:0.970649 test ap 0.967064 test auc0.963955
total time:1.32s prep time:0.92s
Epoch 11:
train time:0.92s
tensor(119276, device='cuda:0')
local node number tensor([119276]) remote node number tensor([0]) local edge tensor([648907]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3342382105765864 time_backward = 0.3991917232051492 time_sample = 0 pre_batch = 0 pre_input = 0.17684688128065318 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.0083418880822137 time_attention = 0.07429628912359476
mode: val tensor([0.9745]) tensor([0.9726])
mode: test tensor([0.9698]) tensor([0.9669])
train loss:16.3822 train ap: nan val ap:0.974546 val auc:0.972600 test ap 0.969842 test auc0.966880
total time:1.32s prep time:0.92s
Epoch 12:
train time:0.92s
tensor(119081, device='cuda:0')
local node number tensor([119081]) remote node number tensor([0]) local edge tensor([647286]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3341143539873883 time_backward = 0.42863147507887334 time_sample = 0 pre_batch = 0 pre_input = 0.15269588702358305 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008519083610735834 time_attention = 0.07500944682396948
mode: val tensor([0.9769]) tensor([0.9746])
mode: test tensor([0.9705]) tensor([0.9675])
train loss:16.1246 train ap: nan val ap:0.976921 val auc:0.974639 test ap 0.970455 test auc0.967546
total time:1.33s prep time:0.92s
Epoch 13:
train time:0.92s
tensor(119328, device='cuda:0')
local node number tensor([119328]) remote node number tensor([0]) local edge tensor([648707]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3364475581329316 time_backward = 0.38464420218952 time_sample = 0 pre_batch = 0 pre_input = 0.1897556931944564 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008665439207106829 time_attention = 0.07490635523572564
mode: val tensor([0.9791]) tensor([0.9771])
mode: test tensor([0.9723]) tensor([0.9696])
train loss:15.4998 train ap: nan val ap:0.979124 val auc:0.977097 test ap 0.972275 test auc0.969606
total time:1.32s prep time:0.92s
Epoch 14:
train time:0.92s
tensor(119268, device='cuda:0')
local node number tensor([119268]) remote node number tensor([0]) local edge tensor([648277]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.33370649884454906 time_backward = 0.42611016042064875 time_sample = 0 pre_batch = 0 pre_input = 0.15458859503269196 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008416186203248799 time_attention = 0.07499780738726258
mode: val tensor([0.9790]) tensor([0.9766])
mode: test tensor([0.9731]) tensor([0.9699])
train loss:15.0516 train ap: nan val ap:0.978998 val auc:0.976639 test ap 0.973110 test auc0.969935
total time:1.32s prep time:0.92s
Epoch 15:
train time:0.92s
tensor(119402, device='cuda:0')
local node number tensor([119402]) remote node number tensor([0]) local edge tensor([649428]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3383830221137032 time_backward = 0.4082292983075604 time_sample = 0 pre_batch = 0 pre_input = 0.1682694231858477 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008468184154480696 time_attention = 0.07482897094450891
mode: val tensor([0.9777]) tensor([0.9761])
mode: test tensor([0.9726]) tensor([0.9697])
train loss:14.6274 train ap: nan val ap:0.977698 val auc:0.976067 test ap 0.972628 test auc0.969700
total time:1.33s prep time:0.92s
Epoch 16:
train time:0.93s
tensor(119363, device='cuda:0')
local node number tensor([119363]) remote node number tensor([0]) local edge tensor([649219]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.342100684880279 time_backward = 0.4223173810169101 time_sample = 0 pre_batch = 0 pre_input = 0.15758785407524556 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.009801693260669708 time_attention = 0.0757779193809256
mode: val tensor([0.9794]) tensor([0.9774])
mode: test tensor([0.9744]) tensor([0.9717])
train loss:14.2517 train ap: nan val ap:0.979376 val auc:0.977353 test ap 0.974385 test auc0.971685
total time:1.34s prep time:0.93s
Epoch 17:
train time:0.95s
tensor(119286, device='cuda:0')
local node number tensor([119286]) remote node number tensor([0]) local edge tensor([648490]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34202918445225805 time_backward = 0.5509289090987295 time_sample = 0 pre_batch = 0 pre_input = 0.049267802853137255 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.016561739495955408 time_attention = 0.07720356027130038
mode: val tensor([0.9804]) tensor([0.9785])
mode: test tensor([0.9753]) tensor([0.9726])
train loss:13.8701 train ap: nan val ap:0.980375 val auc:0.978506 test ap 0.975348 test auc0.972632
total time:1.36s prep time:0.95s
Epoch 18:
train time:0.95s
tensor(119152, device='cuda:0')
local node number tensor([119152]) remote node number tensor([0]) local edge tensor([648028]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34078817255795 time_backward = 0.5598571858135983 time_sample = 0 pre_batch = 0 pre_input = 0.04400515847373754 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.016431471100077033 time_attention = 0.07700221717823297
mode: val tensor([0.9821]) tensor([0.9803])
mode: test tensor([0.9763]) tensor([0.9735])
train loss:13.3467 train ap: nan val ap:0.982058 val auc:0.980295 test ap 0.976265 test auc0.973481
total time:1.36s prep time:0.95s
Epoch 19:
train time:0.95s
tensor(119270, device='cuda:0')
local node number tensor([119270]) remote node number tensor([0]) local edge tensor([648959]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34164636861532927 time_backward = 0.5451775507535785 time_sample = 0 pre_batch = 0 pre_input = 0.054701198590919375 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01695081579964608 time_attention = 0.07749861106276512
mode: val tensor([0.9815]) tensor([0.9795])
mode: test tensor([0.9748]) tensor([0.9724])
train loss:13.4622 train ap: nan val ap:0.981470 val auc:0.979522 test ap 0.974832 test auc0.972390
total time:1.35s prep time:0.95s
Epoch 20:
train time:0.95s
tensor(119248, device='cuda:0')
local node number tensor([119248]) remote node number tensor([0]) local edge tensor([648423]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34080718306358904 time_backward = 0.5503961594076827 time_sample = 0 pre_batch = 0 pre_input = 0.05070258106570691 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017195376800373197 time_attention = 0.07729481114074588
mode: val tensor([0.9802]) tensor([0.9784])
mode: test tensor([0.9741]) tensor([0.9714])
train loss:13.2978 train ap: nan val ap:0.980217 val auc:0.978400 test ap 0.974100 test auc0.971387
total time:1.35s prep time:0.95s
Epoch 21:
train time:0.95s
tensor(119320, device='cuda:0')
local node number tensor([119320]) remote node number tensor([0]) local edge tensor([649095]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34298782143741846 time_backward = 0.5449241640744731 time_sample = 0 pre_batch = 0 pre_input = 0.05215074634179473 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01692443701904267 time_attention = 0.0773102946113795
mode: val tensor([0.9820]) tensor([0.9807])
mode: test tensor([0.9765]) tensor([0.9738])
train loss:12.9960 train ap: nan val ap:0.982003 val auc:0.980662 test ap 0.976478 test auc0.973790
total time:1.35s prep time:0.95s
Epoch 22:
train time:0.95s
tensor(119239, device='cuda:0')
local node number tensor([119239]) remote node number tensor([0]) local edge tensor([648405]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3450987961841747 time_backward = 0.5491135996999219 time_sample = 0 pre_batch = 0 pre_input = 0.052711950964294374 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017411798588000238 time_attention = 0.07751001499127597
mode: val tensor([0.9822]) tensor([0.9806])
mode: test tensor([0.9769]) tensor([0.9744])
train loss:12.7027 train ap: nan val ap:0.982188 val auc:0.980581 test ap 0.976870 test auc0.974398
total time:1.36s prep time:0.95s
Epoch 23:
train time:0.95s
tensor(119378, device='cuda:0')
local node number tensor([119378]) remote node number tensor([0]) local edge tensor([648240]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3452421477995813 time_backward = 0.5390177856897935 time_sample = 0 pre_batch = 0 pre_input = 0.0565835825400427 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017625993816182017 time_attention = 0.07707663392648101
mode: val tensor([0.9831]) tensor([0.9816])
mode: test tensor([0.9775]) tensor([0.9754])
train loss:12.3543 train ap: nan val ap:0.983065 val auc:0.981607 test ap 0.977524 test auc0.975359
total time:1.36s prep time:0.95s
Epoch 24:
train time:0.96s
tensor(119448, device='cuda:0')
local node number tensor([119448]) remote node number tensor([0]) local edge tensor([649041]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3626376132015139 time_backward = 0.5006735629867762 time_sample = 0 pre_batch = 0 pre_input = 0.08839153102599084 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.016956130508333445 time_attention = 0.07790239271707833
mode: val tensor([0.9831]) tensor([0.9813])
mode: test tensor([0.9776]) tensor([0.9749])
train loss:12.1990 train ap: nan val ap:0.983074 val auc:0.981300 test ap 0.977597 test auc0.974865
total time:1.37s prep time:0.96s
Epoch 25:
train time:0.95s
tensor(119277, device='cuda:0')
local node number tensor([119277]) remote node number tensor([0]) local edge tensor([648323]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34278130857273936 time_backward = 0.553080213139765 time_sample = 0 pre_batch = 0 pre_input = 0.04923592531122267 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01747087435796857 time_attention = 0.07763282326050103
mode: val tensor([0.9834]) tensor([0.9819])
mode: test tensor([0.9767]) tensor([0.9748])
train loss:12.1612 train ap: nan val ap:0.983410 val auc:0.981872 test ap 0.976738 test auc0.974824
total time:1.36s prep time:0.95s
Epoch 26:
train time:0.95s
tensor(119356, device='cuda:0')
local node number tensor([119356]) remote node number tensor([0]) local edge tensor([648680]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3416846259497106 time_backward = 0.5507900801021606 time_sample = 0 pre_batch = 0 pre_input = 0.04809858009684831 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017090636771172285 time_attention = 0.07780991494655609
mode: val tensor([0.9839]) tensor([0.9824])
mode: test tensor([0.9778]) tensor([0.9756])
train loss:11.9506 train ap: nan val ap:0.983882 val auc:0.982390 test ap 0.977783 test auc0.975639
total time:1.35s prep time:0.95s
Epoch 27:
train time:0.97s
tensor(119280, device='cuda:0')
local node number tensor([119280]) remote node number tensor([0]) local edge tensor([648301]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.35982254473492503 time_backward = 0.5581902038538828 time_sample = 0 pre_batch = 0 pre_input = 0.04036963905673474 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01755107450298965 time_attention = 0.08050818706396967
mode: val tensor([0.9835]) tensor([0.9820])
mode: test tensor([0.9780]) tensor([0.9754])
train loss:11.8376 train ap: nan val ap:0.983460 val auc:0.981956 test ap 0.978000 test auc0.975392
total time:1.38s prep time:0.97s
Epoch 28:
train time:0.96s
tensor(119295, device='cuda:0')
local node number tensor([119295]) remote node number tensor([0]) local edge tensor([648768]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.347457638126798 time_backward = 0.5592087459517643 time_sample = 0 pre_batch = 0 pre_input = 0.04261839797254652 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.018026720266789198 time_attention = 0.08068907330743968
mode: val tensor([0.9834]) tensor([0.9821])
mode: test tensor([0.9795]) tensor([0.9772])
train loss:11.7316 train ap: nan val ap:0.983403 val auc:0.982100 test ap 0.979455 test auc0.977188
total time:1.36s prep time:0.96s
Epoch 29:
train time:0.95s
tensor(119324, device='cuda:0')
local node number tensor([119324]) remote node number tensor([0]) local edge tensor([648990]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34383319586049765 time_backward = 0.531271665240638 time_sample = 0 pre_batch = 0 pre_input = 0.06677867460530251 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01735282759182155 time_attention = 0.07825628505088389
mode: val tensor([0.9835]) tensor([0.9822])
mode: test tensor([0.9791]) tensor([0.9769])
train loss:11.7245 train ap: nan val ap:0.983545 val auc:0.982210 test ap 0.979084 test auc0.976873
total time:1.36s prep time:0.95s
Epoch 30:
train time:0.95s
tensor(119323, device='cuda:0')
local node number tensor([119323]) remote node number tensor([0]) local edge tensor([648623]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34352220210712403 time_backward = 0.5586602225666866 time_sample = 0 pre_batch = 0 pre_input = 0.04339289083145559 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017157140071503818 time_attention = 0.07775286072865129
mode: val tensor([0.9838]) tensor([0.9826])
mode: test tensor([0.9788]) tensor([0.9771])
train loss:11.4552 train ap: nan val ap:0.983839 val auc:0.982574 test ap 0.978790 test auc0.977101
total time:1.36s prep time:0.95s
Epoch 31:
train time:0.95s
tensor(119210, device='cuda:0')
local node number tensor([119210]) remote node number tensor([0]) local edge tensor([648221]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34196133306249976 time_backward = 0.5591293250909075 time_sample = 0 pre_batch = 0 pre_input = 0.041276065981946886 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017227028030902147 time_attention = 0.07764211669564247
mode: val tensor([0.9848]) tensor([0.9834])
mode: test tensor([0.9799]) tensor([0.9774])
train loss:11.2347 train ap: nan val ap:0.984829 val auc:0.983405 test ap 0.979930 test auc0.977359
total time:1.36s prep time:0.95s
Epoch 32:
train time:0.95s
tensor(119438, device='cuda:0')
local node number tensor([119438]) remote node number tensor([0]) local edge tensor([649446]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.343736870912835 time_backward = 0.5601736486423761 time_sample = 0 pre_batch = 0 pre_input = 0.04005975800100714 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01796495425514877 time_attention = 0.07848992524668574
mode: val tensor([0.9841]) tensor([0.9828])
mode: test tensor([0.9799]) tensor([0.9780])
train loss:11.1548 train ap: nan val ap:0.984132 val auc:0.982835 test ap 0.979914 test auc0.977990
total time:1.36s prep time:0.95s
Epoch 33:
train time:0.95s
tensor(119236, device='cuda:0')
local node number tensor([119236]) remote node number tensor([0]) local edge tensor([648148]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34337377769406885 time_backward = 0.5570919638266787 time_sample = 0 pre_batch = 0 pre_input = 0.04119874502066523 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01766906923148781 time_attention = 0.07826034002937376
mode: val tensor([0.9840]) tensor([0.9825])
mode: test tensor([0.9798]) tensor([0.9776])
train loss:11.1020 train ap: nan val ap:0.984042 val auc:0.982467 test ap 0.979820 test auc0.977645
total time:1.35s prep time:0.95s
Epoch 34:
train time:0.95s
tensor(119372, device='cuda:0')
local node number tensor([119372]) remote node number tensor([0]) local edge tensor([648901]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34410411852877587 time_backward = 0.5385550218634307 time_sample = 0 pre_batch = 0 pre_input = 0.057488588034175336 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01736996171530336 time_attention = 0.07862102310173213
mode: val tensor([0.9848]) tensor([0.9833])
mode: test tensor([0.9792]) tensor([0.9776])
train loss:10.9194 train ap: nan val ap:0.984812 val auc:0.983250 test ap 0.979217 test auc0.977595
total time:1.35s prep time:0.95s
Epoch 35:
train time:0.95s
tensor(119295, device='cuda:0')
local node number tensor([119295]) remote node number tensor([0]) local edge tensor([648214]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3435489946277812 time_backward = 0.5511866604210809 time_sample = 0 pre_batch = 0 pre_input = 0.04667527589481324 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01792132097762078 time_attention = 0.0790077920537442
mode: val tensor([0.9840]) tensor([0.9826])
mode: test tensor([0.9806]) tensor([0.9786])
train loss:10.9772 train ap: nan val ap:0.984041 val auc:0.982637 test ap 0.980644 test auc0.978567
total time:1.35s prep time:0.95s
Epoch 36:
train time:0.95s
tensor(119329, device='cuda:0')
local node number tensor([119329]) remote node number tensor([0]) local edge tensor([648744]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3426324164029211 time_backward = 0.5523476478410885 time_sample = 0 pre_batch = 0 pre_input = 0.04714676144067198 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017416485701687634 time_attention = 0.07795663003344089
mode: val tensor([0.9847]) tensor([0.9832])
mode: test tensor([0.9803]) tensor([0.9784])
Early stopping at epoch 36
Loading the best model at epoch 31
35.07431387901306
best test AP:0.979930 test auc0.977359
mode: train tensor([0.9898]) tensor([0.9897])
mode: val tensor([0.9833]) tensor([0.9818])
mode: test tensor([0.9787]) tensor([0.9761])
val ap:0.983326 val auc:0.981776 test AP:0.978683 test AUC:0.976127
test_dataset 23621 avg_time 0.9479544291625152
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
import matplotlib.pyplot as plt
import numpy as np
import torch

# Plot test AP and remote-communication volume versus the boundary-sampling
# probability, then the validation-AP convergence curves, for each dataset.
# Result files follow:
#   all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out
ssim_values = [0, 0.1, 0.2, 0.3, 0.4, 2]  # ssim sweep (kept for reference; not used below)
probability_values = [1, 0.5, 0.1, 0.05, 0.01, 0]
data_values = ['WIKI']  # datasets whose result files are read
partition = 'ours_shared'
partitions = 4
topk = 0.01
mem = 'all_update'  # 'historical'
for data in data_values:
    ap_list = []
    comm_list = []
    for p in probability_values:
        path = '{}/{}-{}-{}-{}-boundery_recent_decay-{}.out'.format(
            data, partitions, partition, topk, mem, p)
        prefix = 'best test AP:'
        marker = 'remote node number tensor'
        cnt = 0
        total_comm = 0  # NOTE(review): accumulated but unused, as in the original; possibly total_comm/cnt was intended below
        ap = float('nan')   # stays NaN if the run produced no "best test AP:" line
        comm = 0
        with open(path, 'r') as fh:
            for line in fh:
                if line.startswith(prefix):
                    # The original used line.lstrip(prefix), which strips a
                    # character *set* rather than the literal prefix; slice
                    # off the prefix explicitly instead.
                    ap = float(line[len(prefix):].split(' ')[0])
                # Search every line for the marker. The original nested this
                # under the prefix branch, whose lines never contain the
                # marker, leaving cnt == 0 and comm unbound.
                pos = line.find(marker)
                if pos != -1:
                    posr = line.find(']', pos + 2 + len(marker))
                    comm = int(line[pos + 2 + len(marker):posr])
                    total_comm += comm
                    cnt += 1
        ap_list.append(ap)
        # Guard the division: a file with no marker lines previously crashed.
        comm_list.append(comm / cnt * 4 if cnt else 0)
    # Bar charts: AP and communication volume per probability value.
    bar_width = 0.4
    xs = list(range(len(probability_values)))  # original sized this from ssim_values (same length)
    plt.bar(xs, ap_list, width=bar_width)
    plt.ylim([0.9, 1])
    plt.xticks(xs, probability_values)
    plt.xlabel('probability')
    plt.ylabel('Test AP')
    plt.title('{}({} partitions)'.format(data, partitions))
    plt.savefig('boundary_AP_{}.png'.format(data))
    plt.clf()
    plt.bar(xs, comm_list, width=bar_width)
    plt.xticks(xs, probability_values)
    plt.xlabel('probability')
    plt.ylabel('Communication volume')
    plt.title('{}({} partitions)'.format(data, partitions))
    plt.savefig('boundary_comm_{}.png'.format(data))
    plt.clf()
    # Convergence curves: per-epoch validation AP for each probability.
    partition0 = 'ours' if partition == 'ours_shared' else partition
    for p in probability_values:
        path = '{}/val_{}_{}_{}_0_boundery_recent_decay_{}_all_update_2.pt'.format(
            data, partition0, topk, partitions, float(p))
        val_ap = torch.tensor(torch.load(path))
        epoch = torch.arange(val_ap.shape[0])
        plt.plot(epoch, val_ap, label='probability={}'.format(p))
    plt.xlabel('Epoch')
    plt.ylabel('Val AP')
    plt.title('{}({} partitions)'.format(data, partitions))
    # plt.grid(True)
    plt.legend()
    plt.savefig('{}_boundary_Convergence_rate.png'.format(data))
    plt.clf()
import matplotlib.pyplot as plt
import numpy as np
import torch

# Plot test AP and shared-memory communication volume versus the SSIM
# threshold, then the validation-AP convergence curves, for each dataset.
# Result files follow:
#   all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out
ssim_values = [0, 0.1, 0.2, 0.3, 0.4, 2]  # ssim == 2 denotes the purely-local baseline
data_values = ['WikiTalk']
partition = 'ours_shared'
partitions = 8
topk = 0.01
mem = 'historical'
for data in data_values:
    ap_list = []
    comm_list = []
    for ssim in ssim_values:
        if ssim == 2:
            path = '{}/{}-{}-{}-local-recent.out'.format(data, partitions, partition, topk)
        else:
            path = '{}/{}-{}-{}-{}-{}-recent.out'.format(data, partitions, partition, topk, mem, ssim)
        prefix = 'best test AP:'
        marker = 'shared comm tensor'
        ap = float('nan')   # stays NaN if the run produced no "best test AP:" line
        comm = 0
        with open(path, 'r') as fh:
            for line in fh:
                if line.startswith(prefix):
                    # The original used line.lstrip(prefix), which strips a
                    # character *set* rather than the literal prefix.
                    ap = float(line[len(prefix):].split(' ')[0])
                # Search every line for the marker. The original nested this
                # under the prefix branch, whose lines never contain it.
                pos = line.find(marker)
                if pos != -1:
                    # slice between "tensor([" and the trailing "])\n"
                    comm = int(line[pos + 2 + len(marker):len(line) - 3])
        # Append once per result file. The original appended inside the
        # per-line loop, raising NameError before the first prefix match.
        ap_list.append(ap)
        comm_list.append(comm)
    # Bar charts: AP and communication volume per SSIM threshold.
    bar_width = 0.4
    xs = list(range(len(ssim_values)))
    plt.bar(xs, ap_list, width=bar_width)
    plt.ylim([0.9, 1])
    plt.xticks(xs, ssim_values)
    plt.xlabel('SSIM threshold Values')
    plt.ylabel('Test AP')
    plt.title('{}({} partitions)'.format(data, partitions))
    plt.savefig('ssim_{}.png'.format(data))
    plt.clf()
    plt.bar(xs, comm_list, width=bar_width)
    plt.xticks(xs, ssim_values)
    plt.xlabel('SSIM threshold Values')
    plt.ylabel('Communication volume')
    plt.title('{}({} partitions)'.format(data, partitions))
    plt.savefig('comm_{}.png'.format(data))
    plt.clf()
    # Convergence curves: per-epoch validation AP for each SSIM threshold.
    partition0 = 'ours' if partition == 'ours_shared' else partition
    for ssim in ssim_values:
        if ssim == 2:
            path = '{}/val_{}_{}_{}_0_recent_0.1_local_2.pt'.format(data, partition0, topk, partitions)
        else:
            path = '{}/val_{}_{}_{}_0_recent_0.1_{}_{}.pt'.format(data, partition0, topk, partitions, mem, float(ssim))
        val_ap = torch.tensor(torch.load(path))
        epoch = torch.arange(val_ap.shape[0])
        plt.plot(epoch, val_ap, label='ssim={}'.format(ssim))
    plt.xlabel('Epoch')
    plt.ylabel('Val AP')
    plt.title('{}({} partitions)'.format(data, partitions))
    # plt.grid(True)
    plt.legend()
    plt.savefig('{}_ssim_Convergence_rate.png'.format(data))
    plt.clf()
import matplotlib.pyplot as plt
import numpy as np
import torch

# Plot test AP versus the SSIM threshold, then the validation-AP convergence
# curves, for each dataset.
# Result files follow:
#   all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out
ssim_values = [0, 0.1, 0.2, 0.3, 0.4, 2]  # ssim == 2 denotes the purely-local baseline
data_values = ['WIKI', 'REDDIT']
partition = 'ours_shared'
partitions = 4
topk = 0.01
mem = 'historical'
for data in data_values:
    ap_list = []
    for ssim in ssim_values:
        if ssim == 2:
            path = '{}/{}-{}-{}-local-recent.out'.format(data, partitions, partition, topk)
        else:
            path = '{}/{}-{}-{}-{}-{}-recent.out'.format(data, partitions, partition, topk, mem, ssim)
        prefix = 'best test AP:'
        ap = float('nan')   # stays NaN if the run produced no "best test AP:" line
        with open(path, 'r') as fh:
            for line in fh:
                if line.startswith(prefix):
                    # The original used line.lstrip(prefix), which strips a
                    # character *set* rather than the literal prefix; slice
                    # off the prefix explicitly instead.
                    ap = float(line[len(prefix):].split(' ')[0])
        ap_list.append(ap)
    # Bar chart: AP per SSIM threshold.
    bar_width = 0.4
    xs = list(range(len(ssim_values)))
    plt.bar(xs, ap_list, width=bar_width)
    plt.ylim([0.8, 1])
    plt.xticks(xs, ssim_values)
    plt.xlabel('SSIM threshold Values')
    plt.ylabel('Test AP')
    plt.title('{}({} partitions)'.format(data, partitions))
    plt.savefig('ssim_{}.png'.format(data))
    plt.clf()
    # Convergence curves: per-epoch validation AP for each SSIM threshold.
    partition0 = 'ours' if partition == 'ours_shared' else partition
    for ssim in ssim_values:
        if ssim == 2:
            path = '{}/val_{}_{}_{}_0_recent_0.1_local_2.pt'.format(data, partition0, topk, partitions)
        else:
            path = '{}/val_{}_{}_{}_0_recent_0.1_{}_{}.pt'.format(data, partition0, topk, partitions, mem, float(ssim))
        val_ap = torch.tensor(torch.load(path))
        epoch = torch.arange(val_ap.shape[0])
        plt.plot(epoch, val_ap, label='ssim={}'.format(ssim))
    plt.xlabel('Epoch')
    plt.ylabel('Val AP')
    plt.title('{}({} partitions)'.format(data, partitions))
    # plt.grid(True)
    plt.legend()
    plt.ylim([0.98, 0.99])
    plt.savefig('{}_ssim_Convergence_rate.png'.format(data))
    plt.clf()
...@@ -2,26 +2,35 @@ ...@@ -2,26 +2,35 @@
# 定义数组变量 # 定义数组变量
addr="192.168.1.107" addr="192.168.1.107"
partition_params=("ours" "metis" "ldg" "random") partition_params=("ours")
#"metis" "ldg" "random")
#("ours" "metis" "ldg" "random") #("ours" "metis" "ldg" "random")
partitions="16" partitions="4"
nnodes="4" node_per="4"
nnodes="1"
node_rank="0" node_rank="0"
probability_params=("1" "0.5" "0.1" "0.05" "0.01" "0") probability_params=("1" "0.5" "0.1" "0.05" "0.01" "0")
#sample_type_params=("recent") #"boundery_recent_decay" "boundery_recent_uniform")
sample_type_params=("recent" "boundery_recent_decay" "boundery_recent_uniform") sample_type_params=("recent" "boundery_recent_decay" "boundery_recent_uniform")
#sample_type_params=("recent") #sample_type_params=("recent")
#memory_type=("all_update" "p2p" "all_reduce" "historical" "local") #memory_type=("all_update" "p2p" "all_reduce" "historical" "local")
#memory_type=("all_update" "local" "historical") memory_type=("all_update")
memory_type=("local" "all_update" "historical" "all_reduce") #memory_type=("local" "all_update" "historical" "all_reduce")
shared_memory_ssim=("0" "0.1" "0.2" "0.3" "0.4" ) shared_memory_ssim=("0" "0.1" "0.2" "0.3" "0.4" )
#data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk") #data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk")
data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk" "StackOverflow") data_param=("DGraphFin" "WikiTalk")
#data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk" "StackOverflow")
#data_param=("REDDIT" "WikiTalk") #data_param=("REDDIT" "WikiTalk")
# 创建输出目录 # 创建输出目录
mkdir -p all mkdir -p all
# 遍历数组并执行命令 # 遍历数组并执行命令
for data in "${data_param[@]}"; do for data in "${data_param[@]}"; do
model="TGN_large"
if [ "$data" = "WIKI" ] || [ "$data" = "REDDIT" ] || [ "$data" = "LASTFM" ]; then
model="TGN"
fi
mkdir all/"$data" mkdir all/"$data"
mkdir all/"$data"/comm mkdir all/"$data"/comm
#torchrun --nnodes "$nnodes" --node_rank 0 --nproc-per-node 1 --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition ours --memory_type local --sample_type recent --topk 0 > all/"$data"/1.out & #torchrun --nnodes "$nnodes" --node_rank 0 --nproc-per-node 1 --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition ours --memory_type local --sample_type recent --topk 0 > all/"$data"/1.out &
...@@ -33,20 +42,20 @@ for data in "${data_param[@]}"; do ...@@ -33,20 +42,20 @@ for data in "${data_param[@]}"; do
if [ "$mem" = "historical" ]; then if [ "$mem" = "historical" ]; then
for ssim in "${shared_memory_ssim[@]}"; do for ssim in "${shared_memory_ssim[@]}"; do
if [ "$partition" = "ours" ]; then if [ "$partition" = "ours" ]; then
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out &
wait wait
fi fi
done done
elif [ "$mem" = "all_reduce" ]; then elif [ "$mem" = "all_reduce" ]; then
if [ "$partition" = "ours" ]; then if [ "$partition" = "ours" ]; then
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample".out &
wait wait
fi fi
else else
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$partitions"-"$partition"-0-"$mem"-"$sample".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$partitions"-"$partition"-0-"$mem"-"$sample".out &
wait wait
if [ "$partition" = "ours" ]; then if [ "$partition" = "ours" ]; then
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample".out &
wait wait
fi fi
fi fi
...@@ -57,20 +66,20 @@ for data in "${data_param[@]}"; do ...@@ -57,20 +66,20 @@ for data in "${data_param[@]}"; do
if [ "$mem" = "historical" ]; then if [ "$mem" = "historical" ]; then
for ssim in "${shared_memory_ssim[@]}"; do for ssim in "${shared_memory_ssim[@]}"; do
if [ "$partition" = "ours" ]; then if [ "$partition" = "ours" ]; then
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$partitions"-ours_shared-0.01"$mem"-"$ssim"-"$sample"-"$pro".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$partitions"-ours_shared-0.01"$mem"-"$ssim"-"$sample"-"$pro".out &
wait wait
fi fi
done done
elif [ "$mem" = "all_reduce" ]; then elif [ "$mem" = "all_reduce" ]; then
if [ "$partition" = "ours" ]; then if [ "$partition" = "ours" ]; then
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample"-"$pro".out& torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample"-"$pro".out&
wait wait
fi fi
else else
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$partitions"-"$partition"-0-"$mem"-"$sample"-"$pro".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$partitions"-"$partition"-0-"$mem"-"$sample"-"$pro".out &
wait wait
if [ "$partition" = "ours" ]; then if [ "$partition" = "ours" ]; then
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample"-"$pro".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample"-"$pro".out &
wait wait
fi fi
fi fi
......
LOCAL RANK 0, RANK0 LOCAL RANK 0, RANK0
in
local rank is 0 world_size is 1 memory group is 0 memory rank is 0 memory group size is 1
[0]
use cuda on 0 use cuda on 0
638486
get_neighbors consume: 4.12395s
Epoch 0:
LOCAL RANK 0, RANK0
LOCAL RANK 2, RANK2
LOCAL RANK 1, RANK1 LOCAL RANK 1, RANK1
LOCAL RANK 3, RANK3 LOCAL RANK 3, RANK3
use cuda on 3 LOCAL RANK 0, RANK0
use cuda on 0 LOCAL RANK 2, RANK2
use cuda on 2
use cuda on 1
638486
638486
638486
638486
get_neighbors consume: 3.42567s
get_neighbors consume: 3.42812s
num_batchs: tensor([7069], device='cuda:2')
num_batchs: tensor([6015], device='cuda:0')
get_neighbors consume: 3.68743s
num_batchs: tensor([6948], device='cuda:1')
get_neighbors consume: 4.58464s
num_batchs: tensor([6576], device='cuda:3')
num_batchs: num_batchs: tensor([1254], device='cuda:0')
tensor([1642], device='cuda:3')
num_batchs: num_batchs: tensor([1331], device='cuda:2')
tensor([1478], device='cuda:1')
num_batchs:num_batchs:num_batchs: num_batchs: tensor([1227], device='cuda:0')
tensor([1625], device='cuda:3')tensor([1412], device='cuda:1')tensor([1440], device='cuda:2')
Epoch 0:
Epoch 0:
Epoch 0:
Epoch 0:
train loss:3025.4560 train ap:0.964935 val ap:0.973583 val auc:0.969748
train loss:2842.4385 train ap:0.968786 val ap:0.973583 val auc:0.969748
train loss:3149.4863 train ap:0.960053 val ap:0.973583 val auc:0.969748
train loss:2905.2378 train ap:0.966912 val ap:0.973583 val auc:0.969748
total time:109.11s prep time:90.95s
total time:109.11s prep time:90.95s
total time:109.11s prep time:90.95s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
total time:109.10s prep time:90.95s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 1:
Epoch 1:
Epoch 1:
Epoch 1:
train loss:2756.1783 train ap:0.969414 val ap:0.976812 val auc:0.973716
train loss:2786.7234 train ap:0.970014 val ap:0.976812 val auc:0.973716
train loss:2600.9249 train ap:0.973055 val ap:0.976812 val auc:0.973716
train loss:2561.2065 train ap:0.974347 val ap:0.976812 val auc:0.973716
total time:107.65s prep time:89.41s
total time:107.65s prep time:89.41s
total time:107.65s prep time:89.41s
total time:107.65s prep time:89.41s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 2:
Epoch 2:
Epoch 2:
Epoch 2:
train loss:2430.7610 train ap:0.976709 val ap:0.979544 val auc:0.976893
train loss:2616.8964 train ap:0.972457 val ap:0.979544 val auc:0.976893
train loss:2666.5888 train ap:0.972383 val ap:0.979544 val auc:0.976893
train loss:2477.5472 train ap:0.975493 val ap:0.979544 val auc:0.976893
total time:107.73s prep time:89.82s
total time:107.73s prep time:89.82s
total time:107.73s prep time:89.82s
total time:107.73s prep time:89.82s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 3:Epoch 3:Epoch 3:
Epoch 3:
train loss:2404.6129 train ap:0.977177 val ap:0.979526 val auc:0.976748
train loss:2652.1562 train ap:0.972664 val ap:0.979526 val auc:0.976748
train loss:2561.0276 train ap:0.973517 val ap:0.979526 val auc:0.976748
train loss:2431.4974 train ap:0.976369 val ap:0.979526 val auc:0.976748
total time:107.16s prep time:89.15s
total time:107.16s prep time:89.15s
total time:107.16s prep time:89.15s
total time:107.16s prep time:89.15s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 4:
Epoch 4:Epoch 4:Epoch 4:
train loss:2599.0614 train ap:0.973655 val ap:0.980024 val auc:0.977120
train loss:2343.8141 train ap:0.978188 val ap:0.980024 val auc:0.977120
train loss:2382.7643 train ap:0.977246 val ap:0.980024 val auc:0.977120
total time:107.70s prep time:89.60s
total time:107.70s prep time:89.60s
fetch time:0.00s write back time:0.00s
total time:107.70s prep time:89.60s
train loss:2503.4472 train ap:0.974597 val ap:0.980024 val auc:0.977120
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
total time:107.70s prep time:89.60s
fetch time:0.00s write back time:0.00s
Epoch 5:Epoch 5:
Epoch 5:Epoch 5:
train loss:2377.6717 train ap:0.977300 val ap:0.981272 val auc:0.978760
total time:108.36s prep time:89.99s
train loss:2586.4221 train ap:0.973873 val ap:0.981272 val auc:0.978760
train loss:2510.5564 train ap:0.974502 val ap:0.981272 val auc:0.978760
train loss:2345.5698 train ap:0.978154 val ap:0.981272 val auc:0.978760
fetch time:0.00s write back time:0.00s
total time:108.36s prep time:89.99s
total time:108.36s prep time:89.99s
total time:108.36s prep time:89.99s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 6:
Epoch 6:Epoch 6:
Epoch 6:
train loss:2287.1365 train ap:0.979113 val ap:0.981768 val auc:0.979250
train loss:2541.0882 train ap:0.974732 val ap:0.981768 val auc:0.979250
train loss:2441.7481 train ap:0.975795 val ap:0.981768 val auc:0.979250
total time:108.29s prep time:90.22s
train loss:2313.8948 train ap:0.978471 val ap:0.981768 val auc:0.979250
total time:108.29s prep time:90.22s
total time:108.29s prep time:90.22s
fetch time:0.00s write back time:0.00s
total time:108.29s prep time:90.22s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 7:
Epoch 7:
Epoch 7:
Epoch 7:
train loss:2321.0527 train ap:0.978335 val ap:0.980500 val auc:0.978016
train loss:2558.9959 train ap:0.974414 val ap:0.980500 val auc:0.978016
train loss:2289.0225 train ap:0.979144 val ap:0.980500 val auc:0.978016
train loss:2436.1819 train ap:0.975923 val ap:0.980500 val auc:0.978016
total time:107.98s prep time:90.08s
total time:107.98s prep time:90.08s
total time:107.98s prep time:90.08s
total time:107.98s prep time:90.08s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 8:
Epoch 8:Epoch 8:
Epoch 8:
train loss:2422.3653 train ap:0.976156 val ap:0.982765 val auc:0.980566
train loss:2250.0465 train ap:0.979720 val ap:0.982765 val auc:0.980566
total time:107.98s prep time:89.73s
train loss:2517.5717 train ap:0.975174 val ap:0.982765 val auc:0.980566
total time:107.98s prep time:89.73s
fetch time:0.00s write back time:0.00s
train loss:2284.2223 train ap:0.978957 val ap:0.982765 val auc:0.980566
fetch time:0.00s write back time:0.00s
total time:107.98s prep time:89.73s
total time:107.98s prep time:89.73s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 9:
Epoch 9:Epoch 9:
Epoch 9:
train loss:2495.3455 train ap:0.975555 val ap:0.980162 val auc:0.977624
train loss:2268.7504 train ap:0.979202 val ap:0.980162 val auc:0.977624
train loss:2243.5499 train ap:0.979831 val ap:0.980162 val auc:0.977624
train loss:2392.5389 train ap:0.976669 val ap:0.980162 val auc:0.977624
total time:108.06s prep time:89.87s
total time:108.06s prep time:89.87s
total time:108.06s prep time:89.87s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
total time:108.06s prep time:89.87s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 10:
Epoch 10:
Epoch 10:
Epoch 10:
train loss:2356.5620 train ap:0.977277 val ap:0.983905 val auc:0.981905
train loss:2475.2578 train ap:0.975923 val ap:0.983905 val auc:0.981905
total time:108.50s prep time:90.58s
train loss:2218.5262 train ap:0.980230 val ap:0.983905 val auc:0.981905
train loss:2249.7741 train ap:0.979533 val ap:0.983905 val auc:0.981905
fetch time:0.00s write back time:0.00s
total time:108.50s prep time:90.58s
total time:108.50s prep time:90.58s
total time:108.50s prep time:90.58s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 11:
Epoch 11:Epoch 11:
Epoch 11:
train loss:2371.3061 train ap:0.977063 val ap:0.981130 val auc:0.978457
train loss:2215.4943 train ap:0.980281 val ap:0.981130 val auc:0.978457
train loss:2469.7190 train ap:0.975983 val ap:0.981130 val auc:0.978457
train loss:2243.8975 train ap:0.979617 val ap:0.981130 val auc:0.978457
total time:107.79s prep time:89.81s
total time:107.79s prep time:89.81s
total time:107.79s prep time:89.81s
total time:107.79s prep time:89.81s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 12:
Epoch 12:
Epoch 12:
Epoch 12:
train loss:2454.1705 train ap:0.976276 val ap:0.983270 val auc:0.981183
train loss:2225.2349 train ap:0.979939 val ap:0.983270 val auc:0.981183
train loss:2337.4529 train ap:0.977606 val ap:0.983270 val auc:0.981183
train loss:2189.4448 train ap:0.980683 val ap:0.983270 val auc:0.981183
total time:108.64s prep time:90.63s
total time:108.64s prep time:90.63s
total time:108.64s prep time:90.63s
total time:108.64s prep time:90.63s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 13:
Epoch 13:
Epoch 13:
Epoch 13:
train loss:2374.7256 train ap:0.977024 val ap:0.981550 val auc:0.979260
train loss:2221.0432 train ap:0.980189 val ap:0.981550 val auc:0.979260
train loss:2471.9543 train ap:0.975953 val ap:0.981550 val auc:0.979260
train loss:2241.1903 train ap:0.979649 val ap:0.981550 val auc:0.979260
total time:108.69s prep time:90.62s
total time:108.69s prep time:90.62s
total time:108.69s prep time:90.62s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
total time:108.69s prep time:90.62s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 14:
Epoch 14:
Epoch 14:
Epoch 14:
train loss:2358.8334 train ap:0.977242 val ap:0.981721 val auc:0.979185
train loss:2208.2876 train ap:0.980383 val ap:0.981721 val auc:0.979185
train loss:2227.2542 train ap:0.979885 val ap:0.981721 val auc:0.979185
train loss:2460.0171 train ap:0.976178 val ap:0.981721 val auc:0.979185
total time:107.77s prep time:89.81s
total time:107.77s prep time:89.81s
total time:107.77s prep time:89.81s
total time:107.77s prep time:89.81s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 15:
Epoch 15:Epoch 15:
Epoch 15:
Early stopping at epoch 15
Early stopping at epoch 15
Early stopping at epoch 15
Early stopping at epoch 15
Loading the best model at epoch 10
Loading the best model at epoch 10
Loading the best model at epoch 10
Loading the best model at epoch 10
0.9546157717704773 0.9452952742576599
0.9546157717704773 0.9452952742576599
0.9546157717704773 0.9452952742576599
0.9546157717704773 0.9452952742576599
0.9489824175834656 0.9380446672439575
0.9489824175834656 0.9380446672439575
0.9489824175834656 0.9380446672439575
0.9489824175834656 0.9380446672439575
test AP:0.944646 test AUC:0.934612
test AP:0.944646 test AUC:0.934612
test AP:0.944646 test AUC:0.934612
test AP:0.944646 test AUC:0.934612
test_dataset 798529 avg_time 28.8176681804657
test_dataset 752056 avg_time 28.81766140937805
test_dataset 984603 avg_time 28.817663559913637
test_dataset 886223 avg_time 28.817657227516175
...@@ -50,7 +50,7 @@ parser.add_argument('--rank', default=0, type=int, metavar='W', ...@@ -50,7 +50,7 @@ parser.add_argument('--rank', default=0, type=int, metavar='W',
help='name of dataset') help='name of dataset')
parser.add_argument('--local_rank', default=0, type=int, metavar='W', parser.add_argument('--local_rank', default=0, type=int, metavar='W',
help='name of dataset') help='name of dataset')
parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping') parser.add_argument('--patience', type=int, default=20, help='Patience for early stopping')
parser.add_argument('--world_size', default=1, type=int, metavar='W', parser.add_argument('--world_size', default=1, type=int, metavar='W',
help='number of negative samples') help='number of negative samples')
parser.add_argument('--dataname', default="WIKI", type=str, metavar='W', parser.add_argument('--dataname', default="WIKI", type=str, metavar='W',
...@@ -73,6 +73,8 @@ parser.add_argument('--shared_memory_ssim', default=2, type=float, metavar='W', ...@@ -73,6 +73,8 @@ parser.add_argument('--shared_memory_ssim', default=2, type=float, metavar='W',
help='name of model') help='name of model')
parser.add_argument('--neg_samples', default=1, type=int, metavar='W', parser.add_argument('--neg_samples', default=1, type=int, metavar='W',
help='name of model') help='name of model')
parser.add_argument('--eval_neg_samples', default=1, type=int, metavar='W',
help='name of model')
parser.add_argument('--memory_type', default='all_update', type=str, metavar='W', parser.add_argument('--memory_type', default='all_update', type=str, metavar='W',
help='name of model') help='name of model')
#boundery_recent_uniform boundery_recent_decay #boundery_recent_uniform boundery_recent_decay
...@@ -104,6 +106,7 @@ if not 'MASTER_PORT' in os.environ: ...@@ -104,6 +106,7 @@ if not 'MASTER_PORT' in os.environ:
os.environ["MASTER_PORT"] = '9337' os.environ["MASTER_PORT"] = '9337'
os.environ["NCCL_IB_DISABLE"]='1' os.environ["NCCL_IB_DISABLE"]='1'
os.environ['NCCL_SOCKET_IFNAME']=matching_interfaces[0] os.environ['NCCL_SOCKET_IFNAME']=matching_interfaces[0]
print('rank {}'.format(int(os.environ["LOCAL_RANK"])))
torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
local_rank = int(os.environ["LOCAL_RANK"]) local_rank = int(os.environ["LOCAL_RANK"])
def seed_everything(seed=42): def seed_everything(seed=42):
...@@ -219,7 +222,7 @@ def main(): ...@@ -219,7 +222,7 @@ def main():
else: else:
mailbox = None mailbox = None
sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=num_layers, fanout=fanout,graph_data=sample_graph, workers=1,policy = policy_train, graph_name = "train",local_part=dist.get_rank(),edge_part=DistIndex(graph.eids_mapper).part,node_part=DistIndex(graph.nids_mapper).part,probability=args.probability) sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=num_layers, fanout=fanout,graph_data=sample_graph, workers=10,policy = policy_train, graph_name = "train",local_part=dist.get_rank(),edge_part=DistIndex(graph.eids_mapper).part,node_part=DistIndex(graph.nids_mapper).part,probability=args.probability)
eval_sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=num_layers, fanout=fanout,graph_data=eval_sample_graph, workers=10,policy = 'recent', graph_name = "eval",local_part=dist.get_rank(),edge_part=DistIndex(graph.eids_mapper).part,node_part=DistIndex(graph.nids_mapper).part,probability=args.probability) eval_sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=num_layers, fanout=fanout,graph_data=eval_sample_graph, workers=10,policy = 'recent', graph_name = "eval",local_part=dist.get_rank(),edge_part=DistIndex(graph.eids_mapper).part,node_part=DistIndex(graph.nids_mapper).part,probability=args.probability)
train_data = torch.masked_select(graph.edge_index,train_mask.to(graph.edge_index.device)).reshape(2,-1) train_data = torch.masked_select(graph.edge_index,train_mask.to(graph.edge_index.device)).reshape(2,-1)
...@@ -244,16 +247,16 @@ def main(): ...@@ -244,16 +247,16 @@ def main():
val_data = DataSet(edges = val_data,ts = val_ts,eids = val_mask.nonzero().reshape(-1)) val_data = DataSet(edges = val_data,ts = val_ts,eids = val_mask.nonzero().reshape(-1))
print('ts {} {} {} {}'.format(train_data.ts,eval_train_data.ts,test_data.ts,val_data.ts)) print('ts {} {} {} {}'.format(train_data.ts,eval_train_data.ts,test_data.ts,val_data.ts))
neg_samples = args.neg_samples neg_samples = args.eval_neg_samples
mask = DistIndex(graph.nids_mapper[graph.edge_index[1,:]].to('cpu')).part == dist.get_rank() mask = DistIndex(graph.nids_mapper[graph.edge_index[1,:]].to('cpu')).part == dist.get_rank()
if args.local_neg_sample: if args.local_neg_sample:
print('dst len {} origin len {}'.format(graph.edge_index[1,mask].unique().shape[0],full_dst.unique().shape[0])) print('dst len {} origin len {}'.format(graph.edge_index[1,mask].unique().shape[0],full_dst.unique().shape[0]))
train_neg_sampler = LocalNegativeSampling('triplet',amount = neg_samples,dst_node_list = graph.edge_index[1,mask].unique()) train_neg_sampler = LocalNegativeSampling('triplet',amount = args.neg_samples,dst_node_list = graph.edge_index[1,mask].unique())
else: else:
train_neg_sampler = LocalNegativeSampling('triplet',amount = neg_samples,dst_node_list = full_dst.unique()) train_neg_sampler = LocalNegativeSampling('triplet',amount = args.neg_samples,dst_node_list = full_dst.unique())
print(train_neg_sampler.dst_node_list) print(train_neg_sampler.dst_node_list)
neg_sampler = LocalNegativeSampling('triplet',amount= neg_samples,dst_node_list = full_dst.unique()) neg_sampler = LocalNegativeSampling('triplet',amount= neg_samples,dst_node_list = full_dst.unique(),seed=12357)
trainloader = DistributedDataLoader(graph,eval_train_data,sampler = sampler, trainloader = DistributedDataLoader(graph,eval_train_data,sampler = sampler,
sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES, sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES,
...@@ -354,6 +357,7 @@ def main(): ...@@ -354,6 +357,7 @@ def main():
y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0) y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0)
aps.append(average_precision_score(y_true, y_pred.detach().numpy())) aps.append(average_precision_score(y_true, y_pred.detach().numpy()))
aucs_mrrs.append(roc_auc_score(y_true, y_pred)) aucs_mrrs.append(roc_auc_score(y_true, y_pred))
if mailbox is not None: if mailbox is not None:
src = metadata['src_pos_index'] src = metadata['src_pos_index']
dst = metadata['dst_pos_index'] dst = metadata['dst_pos_index']
...@@ -425,8 +429,6 @@ def main(): ...@@ -425,8 +429,6 @@ def main():
for e in range(train_param['epoch']): for e in range(train_param['epoch']):
model.module.memory_updater.empty_cache() model.module.memory_updater.empty_cache()
tt._zero() tt._zero()
count_empty()
time_count.set_zero()
torch.cuda.synchronize() torch.cuda.synchronize()
epoch_start_time = time.time() epoch_start_time = time.time()
epoch_cnt = epoch_cnt + 1 epoch_cnt = epoch_cnt + 1
...@@ -440,8 +442,6 @@ def main(): ...@@ -440,8 +442,6 @@ def main():
model.module.memory_updater.last_updated_nid = None model.module.memory_updater.last_updated_nid = None
model.module.memory_updater.last_updated_memory = None model.module.memory_updater.last_updated_memory = None
model.module.memory_updater.last_updated_ts = None model.module.memory_updater.last_updated_ts = None
t0 = time.time()
t_s = tt.start()
sum_local_comm = 0 sum_local_comm = 0
sum_remote_comm = 0 sum_remote_comm = 0
sum_local_edge_comm = 0 sum_local_edge_comm = 0
...@@ -470,10 +470,6 @@ def main(): ...@@ -470,10 +470,6 @@ def main():
sum_remote_comm +=remote_comm[b_cnt-1] sum_remote_comm +=remote_comm[b_cnt-1]
sum_local_edge_comm +=local_edge_comm[b_cnt-1] sum_local_edge_comm +=local_edge_comm[b_cnt-1]
sum_remote_edge_comm +=remote_edge_comm[b_cnt-1] sum_remote_edge_comm +=remote_edge_comm[b_cnt-1]
tt.pre_input += tt.elapsed(t_s)
t_prep_s = time.time()
t1 = time.time()
t_s = tt.start()
if mailbox is not None: if mailbox is not None:
if(graph.efeat.device.type != 'cpu'): if(graph.efeat.device.type != 'cpu'):
edge_feats = graph.get_local_efeat(graph.eids_mapper[roots.eids.to('cpu')]).to('cuda') edge_feats = graph.get_local_efeat(graph.eids_mapper[roots.eids.to('cpu')]).to('cuda')
...@@ -490,9 +486,7 @@ def main(): ...@@ -490,9 +486,7 @@ def main():
model.train() model.train()
optimizer.zero_grad() optimizer.zero_grad()
pred_pos, pred_neg = model(mfgs,metadata,neg_samples=neg_samples,async_param = param) pred_pos, pred_neg = model(mfgs,metadata,neg_samples=args.neg_samples,async_param = param)
tt.time_forward += tt.elapsed(t_s)
t_s = tt.start()
if memory_param['historical_fix'] == True: if memory_param['historical_fix'] == True:
loss = creterion(pred_pos, torch.ones_like(pred_pos)) + 0.1*inner_prod(model.module.memory_updater.update_memory,model.module.memory_updater.prev_memory) loss = creterion(pred_pos, torch.ones_like(pred_pos)) + 0.1*inner_prod(model.module.memory_updater.update_memory,model.module.memory_updater.prev_memory)
else: else:
...@@ -502,12 +496,9 @@ def main(): ...@@ -502,12 +496,9 @@ def main():
#mailbox.handle_last_async() #mailbox.handle_last_async()
#trainloader.async_feature() #trainloader.async_feature()
#torch.cuda.synchronize() #torch.cuda.synchronize()
t2 = time.time()
loss.backward() loss.backward()
optimizer.step() optimizer.step()
tt.time_backward += tt.elapsed(t_s)
#torch.cuda.synchronize() #torch.cuda.synchronize()
t3 = time.time()
## train aps ## train aps
#y_pred = torch.cat([pred_pos, pred_neg], dim=0).sigmoid().cpu() #y_pred = torch.cat([pred_pos, pred_neg], dim=0).sigmoid().cpu()
#y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0) #y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0)
...@@ -515,8 +506,6 @@ def main(): ...@@ -515,8 +506,6 @@ def main():
#torch.cuda.synchronize() #torch.cuda.synchronize()
mailbox.update_shared() mailbox.update_shared()
mailbox.update_p2p() mailbox.update_p2p()
t4 = time.time()
t_s = tt.start()
""" """
if mailbox is not None: if mailbox is not None:
#src = metadata['src_pos_index'] #src = metadata['src_pos_index']
...@@ -579,7 +568,6 @@ def main(): ...@@ -579,7 +568,6 @@ def main():
print('memory comm {} shared comm {}\n'.format(tot_comm_count,tot_shared_count)) print('memory comm {} shared comm {}\n'.format(tot_comm_count,tot_shared_count))
if(e==0): if(e==0):
torch.save((local_access,remote_access,local_edge_access,remote_edge_access,local_comm,remote_comm,local_edge_comm,remote_edge_comm),'all/{}/comm/comm_{}_{}_{}_{}_{}_{}_{}_{}.pt'.format(args.dataname,args.partition,args.topk,dist.get_world_size(),dist.get_rank(),args.sample_type,args.probability,args.memory_type,args.shared_memory_ssim)) torch.save((local_access,remote_access,local_edge_access,remote_edge_access,local_comm,remote_comm,local_edge_comm,remote_edge_comm),'all/{}/comm/comm_{}_{}_{}_{}_{}_{}_{}_{}.pt'.format(args.dataname,args.partition,args.topk,dist.get_world_size(),dist.get_rank(),args.sample_type,args.probability,args.memory_type,args.shared_memory_ssim))
tt.print()
ap = 0 ap = 0
auc = 0 auc = 0
ap, auc = eval('val') ap, auc = eval('val')
...@@ -625,17 +613,6 @@ def main(): ...@@ -625,17 +613,6 @@ def main():
best_model_path = get_checkpoint_path(early_stopper.best_epoch) best_model_path = get_checkpoint_path(early_stopper.best_epoch)
model.module.load_state_dict(torch.load(best_model_path)) model.module.load_state_dict(torch.load(best_model_path))
print('best test AP:{:4f} test auc{:4f}'.format(*test_ap_list[early_stopper.best_epoch])) print('best test AP:{:4f} test auc{:4f}'.format(*test_ap_list[early_stopper.best_epoch]))
if mailbox is not None:
mailbox.reset()
model.module.memory_updater.last_updated_nid = None
ap,auc = eval('train')
val_ap,val_auc = eval('val')
ap, auc = eval('test')
eval_neg_samples = 1
if eval_neg_samples > 1:
print('\tval AP:{:4f} val MRR:{:4f} test AP:{:4f} test MRR:{:4f}\n'.format(val_ap,val_auc,ap, auc))
else:
print('\tval ap:{:4f} val auc:{:4f} test AP:{:4f} test AUC:{:4f}\n'.format(val_ap,val_auc,ap, auc))
val_list = torch.tensor(val_list) val_list = torch.tensor(val_list)
loss_list = torch.tensor(loss_list) loss_list = torch.tensor(loss_list)
print('test_dataset {} avg_time {} \n'.format(test_data.edges.shape[1],avg_time/epoch_cnt)) print('test_dataset {} avg_time {} \n'.format(test_data.edges.shape[1],avg_time/epoch_cnt))
......
...@@ -209,7 +209,6 @@ class TransfomerAttentionLayer(torch.nn.Module): ...@@ -209,7 +209,6 @@ class TransfomerAttentionLayer(torch.nn.Module):
self.layer_norm = torch.nn.LayerNorm(dim_out) self.layer_norm = torch.nn.LayerNorm(dim_out)
def forward(self, b): def forward(self, b):
t_s = tt.start()
assert(self.dim_time + self.dim_node_feat + self.dim_edge_feat > 0) assert(self.dim_time + self.dim_node_feat + self.dim_edge_feat > 0)
self.device = b.device self.device = b.device
if b.num_edges() == 0: if b.num_edges() == 0:
...@@ -217,8 +216,6 @@ class TransfomerAttentionLayer(torch.nn.Module): ...@@ -217,8 +216,6 @@ class TransfomerAttentionLayer(torch.nn.Module):
if self.dim_time > 0: if self.dim_time > 0:
time_feat = self.time_enc(b.edata['dt']) time_feat = self.time_enc(b.edata['dt'])
zero_time_feat = self.time_enc(torch.zeros(b.num_dst_nodes(), dtype=torch.float32, device=self.device)) zero_time_feat = self.time_enc(torch.zeros(b.num_dst_nodes(), dtype=torch.float32, device=self.device))
tt.time_nbrs += tt.elapsed(t_s)
t_s = tt.start()
if self.combined: if self.combined:
Q = torch.zeros((b.num_edges(), self.dim_out), device=self.device) Q = torch.zeros((b.num_edges(), self.dim_out), device=self.device)
K = torch.zeros((b.num_edges(), self.dim_out), device=self.device) K = torch.zeros((b.num_edges(), self.dim_out), device=self.device)
...@@ -301,7 +298,6 @@ class TransfomerAttentionLayer(torch.nn.Module): ...@@ -301,7 +298,6 @@ class TransfomerAttentionLayer(torch.nn.Module):
rst = b.dstdata['h'] rst = b.dstdata['h']
rst = self.w_out(rst) rst = self.w_out(rst)
rst = torch.nn.functional.relu(self.dropout(rst)) rst = torch.nn.functional.relu(self.dropout(rst))
tt.time_attention+= tt.elapsed(t_s)
return self.layer_norm(rst) return self.layer_norm(rst)
class IdentityNormLayer(torch.nn.Module): class IdentityNormLayer(torch.nn.Module):
......
...@@ -10,7 +10,7 @@ def parse_config(f): ...@@ -10,7 +10,7 @@ def parse_config(f):
return sample_param, memory_param, gnn_param, train_param return sample_param, memory_param, gnn_param, train_param
class EarlyStopMonitor(object): class EarlyStopMonitor(object):
def __init__(self, max_round=3, higher_better=True, tolerance=1e-10): def __init__(self, max_round=10, higher_better=True, tolerance=1e-10):
self.max_round = max_round self.max_round = max_round
self.num_round = 0 self.num_round = 0
......
...@@ -286,6 +286,7 @@ def to_block(graph,data, sample_out,device = torch.device('cuda'),unique = True) ...@@ -286,6 +286,7 @@ def to_block(graph,data, sample_out,device = torch.device('cuda'),unique = True)
idx = block_node_list[0,b.srcnodes()].to(torch.long) idx = block_node_list[0,b.srcnodes()].to(torch.long)
e_idx = eid_inv[col_len:col_len+elen] e_idx = eid_inv[col_len:col_len+elen]
b.srcdata['__ID'] = idx b.srcdata['__ID'] = idx
if sample_out[r].delta_ts().shape[0] > 0: if sample_out[r].delta_ts().shape[0] > 0:
b.edata['dt'] = sample_out[r].delta_ts().to(device) b.edata['dt'] = sample_out[r].delta_ts().to(device)
b.srcdata['ts'] = block_node_list[1,b.srcnodes()].to(torch.float) b.srcdata['ts'] = block_node_list[1,b.srcnodes()].to(torch.float)
......
...@@ -2,139 +2,53 @@ import os ...@@ -2,139 +2,53 @@ import os
import time import time
import torch import torch
class time_count: class time_count:
total_sample_time = 0
total_next_batch_time = 0
total_sample_core_time = 0
total_fetch_prepare_time = 0
total_comm_time = 0
total_build_time = 0
total_prepare_input_time = 0
total_build_block_time = 0
forward_embedding = 0
forward_all_to_all = 0
backward_all_to_all = 0
memory_historical = 0
memory_update = 0
memory_get = 0
memory_enc = 0
memory_historical_count = 0
time_forward = 0 time_forward = 0
time_backward = 0 time_backward = 0
time_sample = 0 time_memory_updater = 0
pre_batch = 0 time_embedding = 0
pre_input = 0 time_local_update = 0
pos_update = 0 time_memory_sync = 0
mem_update = 0 time_sample_and_build = 0
time_zero = 0 time_memory_fetch = 0
time_nbrs = 0
time_attention = 0
@staticmethod
def add_memory_count(t1,t2,t3,t4):
time_count.memory_update += t1
time_count.memory_get += t2
time_count.memory_enc += t3
time_count.memory_historical_count += t4
@staticmethod
def add_train_forward_embedding(t1):
time_count.forward_embedding += t1
@staticmethod
def add_train_foward_all_to_all(t1):
time_count.forward_all_to_all += t1
@staticmethod
def add_backward_all_to_all(t1):
time_count.backward_all_to_all += t1
@staticmethod
def add_next(t1,t2):
time_count.total_sample_time += t2
time_count.total_next_batch_time +=t1
@staticmethod
def add_batch(t1,t2,t3,t4) :
time_count.total_fetch_prepare_time +=t1
time_count.total_comm_time+=t2
time_count.total_build_time+=t3
time_count.total_prepare_input_time+=t4
@staticmethod
def add_build_block(t1,t2) :
time_count.total_sample_core_time += t1
time_count.total_build_block_time+=t2
@staticmethod
def set_zero():
time_count.total_sample_time =0
time_count.total_next_batch_time=0
time_count.total_sample_core_time =0
time_count.total_fetch_prepare_time=0
time_count.total_comm_time =0
time_count.total_build_time =0
time_count.total_prepare_input_time =0
time_count.total_build_block_time=0
time_count.forward_embedding = 0
time_count.forward_all_to_all = 0
time_count.backward_all_to_all = 0
time_count.memory_update = 0
time_count.memory_get = 0
time_count.memory_enc = 0
time_count.memory_historical_count = 0
@staticmethod
def query():
return {
"total_sample_time":time_count.total_sample_time,
"total_next_batch_time":time_count.total_next_batch_time,
"total_sample_core_time":time_count.total_sample_core_time,
"total_fetch_prepare_time":time_count.total_fetch_prepare_time,
"total_comm_time":time_count.total_comm_time,
"total_build_time":time_count.total_build_time,
"total_prepare_input_time":time_count.total_prepare_input_time,
"total_build_block_time":time_count.total_build_block_time,
"forward_embedding":time_count.forward_embedding,
"forward_all_to_all":time_count.forward_all_to_all,
"backward_all_to_all":time_count.backward_all_to_all,
"memory_update":time_count.memory_update ,
"memory_get":time_count.memory_get ,
"memory_enc":time_count.memory_enc ,
"memory_historical_count":time_count.memory_historical_count ,
}
@staticmethod @staticmethod
def _zero(): def _zero():
time_count.time_forward = 0 time_count.time_forward = 0
time_count.time_backward = 0 time_count.time_backward = 0
time_count.time_sample = 0 time_count.time_memory_updater = 0
time_count.pre_batch = 0 time_count.time_embedding = 0
time_count.pre_input = 0 time_count.time_local_update = 0
time_count.pos_update = 0 time_count.time_memory_sync = 0
time_count.mem_update = 0 time_count.time_sample_and_build = 0
time_count.time_zero = 0 time_count.time_memory_fetch = 0
time_count.time_nbrs = 0 @staticmethod
time_count.time_attention = 0 def start_gpu():
@staticmethod
def start():
# Uncomment for better breakdown timings # Uncomment for better breakdown timings
#torch.cuda.synchronize() #torch.cuda.synchronize()
return time.perf_counter() start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
return start_event,end_event
@staticmethod @staticmethod
def elapsed(start): def start():
# Uncomment for better breakdown timings return time.perf_counter(),0
#torch.cuda.synchronize() @staticmethod
return time.perf_counter() - start def elapsed_event(start_event,end_event):
if start_event.isinstance(torch.cuda.Event):
end_event.record()
end_event.synchronize()
return start_event.elapsed_time(end_event)
else:
torch.cuda.synchronize()
return time.perf_counter() - start_event
@staticmethod @staticmethod
def print(): def print():
print( print('time_count.time_forward={} time_count.time_backward={} time_count.time_memory_updater={} time_count.time_embedding={} time_count.time_local_update={} time_count.time_memory_sync={} time_count.time_sample_and_build={} time_count.time_memory_fetch={}\n'.format(
'time_forward = {} time_backward = {} time_sample = {} pre_batch = {} pre_input = {} pos_update = {} mem_update = {} time_zero = {} time_nbrs = {} time_attention = {}'.format( time_count.time_backward,
time_count.time_forward, time_count.time_memory_updater,
time_count.time_backward, time_count.time_embedding,
time_count.time_sample, time_count.time_local_update,
time_count.pre_batch, time_count.time_memory_sync,
time_count.pre_input, time_count.time_sample_and_build,
time_count.pos_update, time_count.time_memory_fetch ))
time_count.mem_update, \ No newline at end of file
time_count.time_zero,
time_count.time_nbrs,
time_count.time_attention,
)
)
\ No newline at end of file
...@@ -291,10 +291,10 @@ def load_from_speed(data,seed,top,sampler_graph_add_rev,device=torch.device('cud ...@@ -291,10 +291,10 @@ def load_from_speed(data,seed,top,sampler_graph_add_rev,device=torch.device('cud
reorder = '../../SPEED/partition/divided_nodes_seed_t2/{}/reorder.txt'.format(data) reorder = '../../SPEED/partition/divided_nodes_seed_t2/{}/reorder.txt'.format(data)
edge_i = '../../SPEED/partition/divided_nodes_seed_t2/{}/{}/{}_{}parts_top{}/edge_output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank) edge_i = '../../SPEED/partition/divided_nodes_seed_t2/{}/{}/{}_{}parts_top{}/edge_output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank)
elif partition == 'metis': elif partition == 'metis':
fnode_i = '../../SPEED/partition/divided_nodes_metis/{}/{}/{}_{}parts_top{}/output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank) fnode_i = '../../SPEED/partition/divided_nodes_metis_test/{}/{}/{}_{}parts_top{}/output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank)
fnode_share = '../../SPEED/partition/divided_nodes_metis/{}/{}/{}_{}parts_top{}/outputshared.txt'.format(data,seed,data,ctx.memory_group_size,top) fnode_share = '../../SPEED/partition/divided_nodes_metis_test/{}/{}/{}_{}parts_top{}/outputshared.txt'.format(data,seed,data,ctx.memory_group_size,top)
reorder = '../../SPEED/partition/divided_nodes_metis/{}/reorder.txt'.format(data) reorder = '../../SPEED/partition/divided_nodes_metis_test/{}/reorder.txt'.format(data)
edge_i = '../../SPEED/partition/divided_nodes_metis/{}/{}/{}_{}parts_top{}/edge_output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank) edge_i = '../../SPEED/partition/divided_nodes_metis_test/{}/{}/{}_{}parts_top{}/edge_output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank)
elif partition == 'ldg': elif partition == 'ldg':
fnode_i = '../../SPEED/partition/divided_nodes_ldg/{}/{}/{}_{}parts_top{}/output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank) fnode_i = '../../SPEED/partition/divided_nodes_ldg/{}/{}/{}_{}parts_top{}/output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank)
fnode_share = '../../SPEED/partition/divided_nodes_ldg/{}/{}/{}_{}parts_top{}/outputshared.txt'.format(data,seed,data,ctx.memory_group_size,top) fnode_share = '../../SPEED/partition/divided_nodes_ldg/{}/{}/{}_{}parts_top{}/outputshared.txt'.format(data,seed,data,ctx.memory_group_size,top)
......
...@@ -19,12 +19,15 @@ class LocalNegativeSampling(NegativeSampling): ...@@ -19,12 +19,15 @@ class LocalNegativeSampling(NegativeSampling):
amount: Union[int, float] = 1, amount: Union[int, float] = 1,
unique: bool = False, unique: bool = False,
src_node_list: torch.Tensor = None, src_node_list: torch.Tensor = None,
dst_node_list: torch.Tensor = None dst_node_list: torch.Tensor = None,
seed = False
): ):
super(LocalNegativeSampling,self).__init__(mode,amount,unique=unique) super(LocalNegativeSampling,self).__init__(mode,amount,unique=unique)
self.src_node_list = src_node_list.to('cpu') if src_node_list is not None else None self.src_node_list = src_node_list.to('cpu') if src_node_list is not None else None
self.dst_node_list = dst_node_list.to('cpu') if dst_node_list is not None else None self.dst_node_list = dst_node_list.to('cpu') if dst_node_list is not None else None
self.rdm = torch.Generator() self.rdm = torch.Generator()
if seed is True:
random.seed(seed)
seed = random.randint(0,100000) seed = random.randint(0,100000)
print('seed is',seed) print('seed is',seed)
ctx = DistributedContext.get_default_context() ctx = DistributedContext.get_default_context()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment