Commit 24a069d6 by zlj

fix boundary

parent 342421ab
sampling: sampling:
- layer: 1 - layer: 1
neighbor: neighbor:
- 20 - 10
strategy: 'recent' strategy: 'recent'
prop_time: False prop_time: False
history: 1 history: 1
...@@ -28,9 +28,9 @@ gnn: ...@@ -28,9 +28,9 @@ gnn:
dim_out: 100 dim_out: 100
train: train:
- epoch: 100 - epoch: 100
batch_size: 600 batch_size: 1000
# reorder: 16 # reorder: 16
lr: 0.0005 lr: 0.0004
dropout: 0.2 dropout: 0.2
att_dropout: 0.2 att_dropout: 0.2
all_on_gpu: True all_on_gpu: True
...@@ -289,23 +289,25 @@ void ParallelSampler :: neighbor_sample_from_nodes_with_before_layer( ...@@ -289,23 +289,25 @@ void ParallelSampler :: neighbor_sample_from_nodes_with_before_layer(
TimeStampType delta = end_index-1>=0?(rtts - tnb.timestamp[node][end_index-1])*fanout:0; TimeStampType delta = end_index-1>=0?(rtts - tnb.timestamp[node][end_index-1])*fanout:0;
for(int cid = end_index-1;cid>=0;cid--){ for(int cid = end_index-1;cid>=0;cid--){
cal_cnt++; cal_cnt++;
if(cal_cnt>2*fanout)break; if(cal_cnt>fanout)break;
if(part[tnb.eid[node][cid]] != local_part|| node_part[tnb.neighbors[node][cid]]!= local_part){ if(part[tnb.eid[node][cid]] != local_part|| node_part[tnb.neighbors[node][cid]]!= local_part){
double ep = exp((double)(tnb.timestamp[node][cid]-rtts)/(delta)); double ep = exp((double)(tnb.timestamp[node][cid]-rtts)/(delta));
sum_p+=ep;pr[cal_cnt-1]=ep; sum_p+=ep;pr[cal_cnt-1]=ep;
sum_1++; sum_1++;
} }
} }
if(sum_p<1e-6)sum_p=1;
cal_cnt = 0; cal_cnt = 0;
for(int cid = end_index-1;cid>=0;cid--){ for(int cid = end_index-1;cid>=0;cid--){
cal_cnt++; cal_cnt++;
if(cal_cnt > 2*fanout)break; if(cal_cnt > fanout)break;
int eid = tnb.eid[node][cid]; int eid = tnb.eid[node][cid];
if(part[tnb.eid[node][cid]] != local_part|| node_part[tnb.neighbors[node][cid]]!= local_part){ if(part[tnb.eid[node][cid]] != local_part|| node_part[tnb.neighbors[node][cid]]!= local_part){
double p0 = (double)rand_r(&loc_seeds[tid]) / (RAND_MAX + 1.0); double p0 = (double)rand_r(&loc_seeds[tid]) / (RAND_MAX + 1.0);
double ep = boundery_probility*pr[cal_cnt-1]/sum_p*sum_1; double ep = boundery_probility*pr[cal_cnt-1]/sum_p*sum_1;
if(p0 > ep)continue; if(p0 > ep)continue;
//cout<<"in"<<endl;
} }
tgb_i[tid].src_index.emplace_back(i); tgb_i[tid].src_index.emplace_back(i);
tgb_i[tid].sample_nodes.emplace_back(tnb.neighbors[node][cid]); tgb_i[tid].sample_nodes.emplace_back(tnb.neighbors[node][cid]);
......
LOCAL RANK 0, RANK0
in
local rank is 0 world_size is 1 memory group is 0 memory rank is 0 memory group size is 1
[0]
memory used is torch.Size([9228, 172]) torch.float32 0.005912840366363525
dist rank is 0 after node feats defination:
dist rank is 0 after node feats defination:
local node num 9228 ,local edge num 157474
num nodes is tensor([157474])
init 0
cpu
tensor([ 0, 1, 2, ..., 157471, 157472, 157473])
Total GPU memory: 44.3516845703125
Current GPU memory allocated: 4.76837158203125e-07
Current GPU memory reserved: 0.001953125
Max GPU memory allocated during this session: 9728
Max GPU memory reserved during this session: 2097152
cpu 0
Total GPU memory: 44.3516845703125
Current GPU memory allocated: 4.76837158203125e-07
Current GPU memory reserved: 0.001953125
Max GPU memory allocated during this session: 9728
Max GPU memory reserved during this session: 2097152
torch.Size([157474, 172])
Total GPU memory: 44.3516845703125
Current GPU memory allocated: 4.76837158203125e-07
Current GPU memory reserved: 0.001953125
Max GPU memory allocated during this session: 9728
Max GPU memory reserved during this session: 2097152
init data loader
Total GPU memory: 44.3516845703125
Current GPU memory allocated: 4.76837158203125e-07
Current GPU memory reserved: 0.001953125
Max GPU memory allocated during this session: 9728
Max GPU memory reserved during this session: 2097152
tensor([], device='cuda:0', dtype=torch.int32)
100
get_neighbors consume: 0.0111879s
0 tensor([0, 0, 0, ..., 0, 0, 0])
get_neighbors consume: 0.0105944s
0 tensor([0, 0, 0, ..., 0, 0, 0])
part tensor([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]])
ts tensor([ 0, 36, 77, ..., 1862639, 1862645, 1862652],
device='cuda:0') tensor([ 0, 36, 77, ..., 1862639, 1862645, 1862652],
device='cuda:0') tensor([2218300, 2218303, 2218304, ..., 2678293, 2678333, 2678373],
device='cuda:0') tensor([1862653, 1862659, 1862666, ..., 2218282, 2218288, 2218288],
device='cuda:0')
seed is 69328
tensor([8228, 8229, 8230, 8231, 8232, 8233, 8234, 8235, 8236, 8237, 8238, 8239,
8240, 8241, 8242, 8243, 8244, 8245, 8246, 8247, 8248, 8249, 8250, 8251,
8252, 8253, 8254, 8255, 8256, 8257, 8258, 8259, 8260, 8261, 8262, 8263,
8264, 8265, 8266, 8267, 8268, 8269, 8270, 8271, 8272, 8273, 8274, 8275,
8276, 8277, 8278, 8279, 8280, 8281, 8282, 8283, 8284, 8285, 8286, 8287,
8288, 8289, 8290, 8291, 8292, 8293, 8294, 8295, 8296, 8297, 8298, 8299,
8300, 8301, 8302, 8303, 8304, 8305, 8306, 8307, 8308, 8309, 8310, 8311,
8312, 8313, 8314, 8315, 8316, 8317, 8318, 8319, 8320, 8321, 8322, 8323,
8324, 8325, 8326, 8327, 8328, 8329, 8330, 8331, 8332, 8333, 8334, 8335,
8336, 8337, 8338, 8339, 8340, 8341, 8342, 8343, 8344, 8345, 8346, 8347,
8348, 8349, 8350, 8351, 8352, 8353, 8354, 8355, 8356, 8357, 8358, 8359,
8360, 8361, 8362, 8363, 8364, 8365, 8366, 8367, 8368, 8369, 8370, 8371,
8372, 8373, 8374, 8375, 8376, 8377, 8378, 8379, 8380, 8381, 8382, 8383,
8384, 8385, 8386, 8387, 8388, 8389, 8390, 8391, 8392, 8393, 8394, 8395,
8396, 8397, 8398, 8399, 8400, 8401, 8402, 8403, 8404, 8405, 8406, 8407,
8408, 8409, 8410, 8411, 8412, 8413, 8414, 8415, 8416, 8417, 8418, 8419,
8420, 8421, 8422, 8423, 8424, 8425, 8426, 8427, 8428, 8429, 8430, 8431,
8432, 8433, 8434, 8435, 8436, 8437, 8438, 8439, 8440, 8441, 8442, 8443,
8444, 8445, 8446, 8447, 8448, 8449, 8450, 8451, 8452, 8453, 8454, 8455,
8456, 8457, 8458, 8459, 8460, 8461, 8462, 8463, 8464, 8465, 8466, 8467,
8468, 8469, 8470, 8471, 8472, 8473, 8474, 8475, 8476, 8477, 8478, 8479,
8480, 8481, 8482, 8483, 8484, 8485, 8486, 8487, 8488, 8489, 8490, 8491,
8492, 8493, 8494, 8495, 8496, 8497, 8498, 8499, 8500, 8501, 8502, 8503,
8504, 8505, 8506, 8507, 8508, 8509, 8510, 8511, 8512, 8513, 8514, 8515,
8516, 8517, 8518, 8519, 8520, 8521, 8522, 8523, 8524, 8525, 8526, 8527,
8528, 8529, 8530, 8531, 8532, 8533, 8534, 8535, 8536, 8537, 8538, 8539,
8540, 8541, 8542, 8543, 8544, 8545, 8546, 8547, 8548, 8549, 8550, 8551,
8552, 8553, 8554, 8555, 8556, 8557, 8558, 8559, 8560, 8561, 8562, 8563,
8564, 8565, 8566, 8567, 8568, 8569, 8570, 8571, 8572, 8573, 8574, 8575,
8576, 8577, 8578, 8579, 8580, 8581, 8582, 8583, 8584, 8585, 8586, 8587,
8588, 8589, 8590, 8591, 8592, 8593, 8594, 8595, 8596, 8597, 8598, 8599,
8600, 8601, 8602, 8603, 8604, 8605, 8606, 8607, 8608, 8609, 8610, 8611,
8612, 8613, 8614, 8615, 8616, 8617, 8618, 8619, 8620, 8621, 8622, 8623,
8624, 8625, 8626, 8627, 8628, 8629, 8630, 8631, 8632, 8633, 8634, 8635,
8636, 8637, 8638, 8639, 8640, 8641, 8642, 8643, 8644, 8645, 8646, 8647,
8648, 8649, 8650, 8651, 8652, 8653, 8654, 8655, 8656, 8657, 8658, 8659,
8660, 8661, 8662, 8663, 8664, 8665, 8666, 8667, 8668, 8669, 8670, 8671,
8672, 8673, 8674, 8675, 8676, 8677, 8678, 8679, 8680, 8681, 8682, 8683,
8684, 8685, 8686, 8687, 8688, 8689, 8690, 8691, 8692, 8693, 8694, 8695,
8696, 8697, 8698, 8699, 8700, 8701, 8702, 8703, 8704, 8705, 8706, 8707,
8708, 8709, 8710, 8711, 8712, 8713, 8714, 8715, 8716, 8717, 8718, 8719,
8720, 8721, 8722, 8723, 8724, 8725, 8726, 8727, 8728, 8729, 8730, 8731,
8732, 8733, 8734, 8735, 8736, 8737, 8738, 8739, 8740, 8741, 8742, 8743,
8744, 8745, 8746, 8747, 8748, 8749, 8750, 8751, 8752, 8753, 8754, 8755,
8756, 8757, 8758, 8759, 8760, 8761, 8762, 8763, 8764, 8765, 8766, 8767,
8768, 8769, 8770, 8771, 8772, 8773, 8774, 8775, 8776, 8777, 8778, 8779,
8780, 8781, 8782, 8783, 8784, 8785, 8786, 8787, 8788, 8789, 8790, 8791,
8792, 8793, 8794, 8795, 8796, 8797, 8798, 8799, 8800, 8801, 8802, 8803,
8804, 8805, 8806, 8807, 8808, 8809, 8810, 8811, 8812, 8813, 8814, 8815,
8816, 8817, 8818, 8819, 8820, 8821, 8822, 8823, 8824, 8825, 8826, 8827,
8828, 8829, 8830, 8831, 8832, 8833, 8834, 8835, 8836, 8837, 8838, 8839,
8840, 8841, 8842, 8843, 8844, 8845, 8846, 8847, 8848, 8849, 8850, 8851,
8852, 8853, 8854, 8855, 8856, 8857, 8858, 8859, 8860, 8861, 8862, 8863,
8864, 8865, 8866, 8867, 8868, 8869, 8870, 8871, 8872, 8873, 8874, 8875,
8876, 8877, 8878, 8879, 8880, 8881, 8882, 8883, 8884, 8885, 8886, 8887,
8888, 8889, 8890, 8891, 8892, 8893, 8894, 8895, 8896, 8897, 8898, 8899,
8900, 8901, 8902, 8903, 8904, 8905, 8906, 8907, 8908, 8909, 8910, 8911,
8912, 8913, 8914, 8915, 8916, 8917, 8918, 8919, 8920, 8921, 8922, 8923,
8924, 8925, 8926, 8927, 8928, 8929, 8930, 8931, 8932, 8933, 8934, 8935,
8936, 8937, 8938, 8939, 8940, 8941, 8942, 8943, 8944, 8945, 8946, 8947,
8948, 8949, 8950, 8951, 8952, 8953, 8954, 8955, 8956, 8957, 8958, 8959,
8960, 8961, 8962, 8963, 8964, 8965, 8966, 8967, 8968, 8969, 8970, 8971,
8972, 8973, 8974, 8975, 8976, 8977, 8978, 8979, 8980, 8981, 8982, 8983,
8984, 8985, 8986, 8987, 8988, 8989, 8990, 8991, 8992, 8993, 8994, 8995,
8996, 8997, 8998, 8999, 9000, 9001, 9002, 9003, 9004, 9005, 9006, 9007,
9008, 9009, 9010, 9011, 9012, 9013, 9014, 9015, 9016, 9017, 9018, 9019,
9020, 9021, 9022, 9023, 9024, 9025, 9026, 9027, 9028, 9029, 9030, 9031,
9032, 9033, 9034, 9035, 9036, 9037, 9038, 9039, 9040, 9041, 9042, 9043,
9044, 9045, 9046, 9047, 9048, 9049, 9050, 9051, 9052, 9053, 9054, 9055,
9056, 9057, 9058, 9059, 9060, 9061, 9062, 9063, 9064, 9065, 9066, 9067,
9068, 9069, 9070, 9071, 9072, 9073, 9074, 9075, 9076, 9077, 9078, 9079,
9080, 9081, 9082, 9083, 9084, 9085, 9086, 9087, 9088, 9089, 9090, 9091,
9092, 9093, 9094, 9095, 9096, 9097, 9098, 9099, 9100, 9101, 9102, 9103,
9104, 9105, 9106, 9107, 9108, 9109, 9110, 9111, 9112, 9113, 9114, 9115,
9116, 9117, 9118, 9119, 9120, 9121, 9122, 9123, 9124, 9125, 9126, 9127,
9128, 9129, 9130, 9131, 9132, 9133, 9134, 9135, 9136, 9137, 9138, 9139,
9140, 9141, 9142, 9143, 9144, 9145, 9146, 9147, 9148, 9149, 9150, 9151,
9152, 9153, 9154, 9155, 9156, 9157, 9158, 9159, 9160, 9161, 9162, 9163,
9164, 9165, 9166, 9167, 9168, 9169, 9170, 9171, 9172, 9173, 9174, 9175,
9176, 9177, 9178, 9179, 9180, 9181, 9182, 9183, 9184, 9185, 9186, 9187,
9188, 9189, 9190, 9191, 9192, 9193, 9194, 9195, 9196, 9197, 9198, 9199,
9200, 9201, 9202, 9203, 9204, 9205, 9206, 9207, 9208, 9209, 9210, 9211,
9212, 9213, 9214, 9215, 9216, 9217, 9218, 9219, 9220, 9221, 9222, 9223,
9224, 9225, 9226, 9227])
seed is 46802
tensor(36, device='cuda:0') 110232
tensor([[3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000.,
3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000.,
3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000.,
3000., 3000., 3000., 3000., 3000., 3000., 2232.]])
average: 1.0
max average: 2979.2431640625
min average: 2979.2431640625
all_cross_edge:tensor([0]) local edges num: 110232
tensor(36, device='cuda:0') 110232
tensor([[3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000.,
3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000.,
3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000., 3000.,
3000., 3000., 3000., 3000., 3000., 3000., 2232.]])
average: 1.0
max average: 2979.2431640625
min average: 2979.2431640625
all_cross_edge:tensor([0]) local edges num: 110232
tensor(7, device='cuda:0') 23621
tensor([[3000., 3000., 3000., 3000., 3000., 3000., 3000., 2621.]])
average: 1.0
max average: 2952.625
min average: 2952.625
all_cross_edge:tensor([0]) local edges num: 23621
tensor(7, device='cuda:0') 23621
tensor([[3000., 3000., 3000., 3000., 3000., 3000., 3000., 2621.]])
average: 1.0
max average: 2952.625
min average: 2952.625
all_cross_edge:tensor([0]) local edges num: 23621
init dataloader
The model has 1.2405433654785156 trainable parameters
Epoch 0:
train time:1.25s
tensor(117481, device='cuda:0')
local node number tensor([117481]) remote node number tensor([0]) local edge tensor([631632]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.5162225598469377 time_backward = 0.5316816458944231 time_sample = 0 pre_batch = 0 pre_input = 0.19834021909628063 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.009082171716727316 time_attention = 0.14380833425093442
mode: val tensor([0.8996]) tensor([0.8919])
mode: test tensor([0.8854]) tensor([0.8750])
train loss:39.3417 train ap: nan val ap:0.899578 val auc:0.891941 test ap 0.885384 test auc0.874995
total time:1.71s prep time:1.25s
Epoch 1:
train time:0.93s
tensor(119297, device='cuda:0')
local node number tensor([119297]) remote node number tensor([0]) local edge tensor([648507]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.33275268971920013 time_backward = 0.4789364457828924 time_sample = 0 pre_batch = 0 pre_input = 0.1096803741529584 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008531142957508564 time_attention = 0.0745205981656909
mode: val tensor([0.9011]) tensor([0.8921])
mode: test tensor([0.8868]) tensor([0.8738])
train loss:30.7512 train ap: nan val ap:0.901060 val auc:0.892088 test ap 0.886778 test auc0.873774
total time:1.33s prep time:0.93s
Epoch 2:
train time:0.93s
tensor(119258, device='cuda:0')
local node number tensor([119258]) remote node number tensor([0]) local edge tensor([648615]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3358315115328878 time_backward = 0.47259313764516264 time_sample = 0 pre_batch = 0 pre_input = 0.11430373543407768 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008510725339874625 time_attention = 0.07622448226902634
mode: val tensor([0.9154]) tensor([0.9093])
mode: test tensor([0.8961]) tensor([0.8869])
train loss:28.2729 train ap: nan val ap:0.915394 val auc:0.909341 test ap 0.896142 test auc0.886938
total time:1.33s prep time:0.93s
Epoch 3:
train time:0.92s
tensor(119292, device='cuda:0')
local node number tensor([119292]) remote node number tensor([0]) local edge tensor([648859]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.332613394013606 time_backward = 0.4307080403668806 time_sample = 0 pre_batch = 0 pre_input = 0.15153192111756653 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008439739933237433 time_attention = 0.07364114979282022
mode: val tensor([0.9314]) tensor([0.9279])
mode: test tensor([0.9201]) tensor([0.9148])
train loss:27.2176 train ap: nan val ap:0.931445 val auc:0.927941 test ap 0.920075 test auc0.914771
total time:1.32s prep time:0.92s
Epoch 4:
train time:0.93s
tensor(119377, device='cuda:0')
local node number tensor([119377]) remote node number tensor([0]) local edge tensor([648636]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3325000318000093 time_backward = 0.44885063578840345 time_sample = 0 pre_batch = 0 pre_input = 0.13642385101411492 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008459722972474992 time_attention = 0.07424444344360381
mode: val tensor([0.9447]) tensor([0.9432])
mode: test tensor([0.9347]) tensor([0.9314])
train loss:24.5895 train ap: nan val ap:0.944669 val auc:0.943231 test ap 0.934749 test auc0.931430
total time:1.33s prep time:0.93s
Epoch 5:
train time:0.93s
tensor(119388, device='cuda:0')
local node number tensor([119388]) remote node number tensor([0]) local edge tensor([648852]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.32934694492723793 time_backward = 0.45787950325757265 time_sample = 0 pre_batch = 0 pre_input = 0.1311466050101444 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008425442734733224 time_attention = 0.07362530985847116
mode: val tensor([0.9486]) tensor([0.9478])
mode: test tensor([0.9400]) tensor([0.9373])
train loss:22.7186 train ap: nan val ap:0.948616 val auc:0.947762 test ap 0.939990 test auc0.937335
total time:1.33s prep time:0.93s
Epoch 6:
train time:0.93s
tensor(119251, device='cuda:0')
local node number tensor([119251]) remote node number tensor([0]) local edge tensor([648799]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3296502644661814 time_backward = 0.4680681542959064 time_sample = 0 pre_batch = 0 pre_input = 0.12141763977706432 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008361523854546249 time_attention = 0.07355450722388923
mode: val tensor([0.9575]) tensor([0.9550])
mode: test tensor([0.9497]) tensor([0.9463])
train loss:22.6257 train ap: nan val ap:0.957484 val auc:0.955028 test ap 0.949730 test auc0.946280
total time:1.33s prep time:0.93s
Epoch 7:
train time:0.92s
tensor(119277, device='cuda:0')
local node number tensor([119277]) remote node number tensor([0]) local edge tensor([648776]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3338864166289568 time_backward = 0.44001385651063174 time_sample = 0 pre_batch = 0 pre_input = 0.14206511317752302 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008449721965007484 time_attention = 0.07399340544361621
mode: val tensor([0.9656]) tensor([0.9629])
mode: test tensor([0.9580]) tensor([0.9545])
train loss:20.5010 train ap: nan val ap:0.965612 val auc:0.962899 test ap 0.958044 test auc0.954479
total time:1.32s prep time:0.92s
Epoch 8:
train time:0.93s
tensor(119249, device='cuda:0')
local node number tensor([119249]) remote node number tensor([0]) local edge tensor([648369]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.33578664518427104 time_backward = 0.4269245610339567 time_sample = 0 pre_batch = 0 pre_input = 0.154373285244219 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008378260536119342 time_attention = 0.07443791208788753
mode: val tensor([0.9678]) tensor([0.9651])
mode: test tensor([0.9624]) tensor([0.9585])
train loss:18.9889 train ap: nan val ap:0.967802 val auc:0.965149 test ap 0.962384 test auc0.958544
total time:1.33s prep time:0.93s
Epoch 9:
train time:0.93s
tensor(119295, device='cuda:0')
local node number tensor([119295]) remote node number tensor([0]) local edge tensor([648887]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.347587923752144 time_backward = 0.3889702036976814 time_sample = 0 pre_batch = 0 pre_input = 0.18681017577182502 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008404633379541337 time_attention = 0.07437894865870476
mode: val tensor([0.9721]) tensor([0.9702])
mode: test tensor([0.9675]) tensor([0.9645])
train loss:17.9763 train ap: nan val ap:0.972146 val auc:0.970216 test ap 0.967490 test auc0.964478
total time:1.34s prep time:0.93s
Epoch 10:
train time:0.92s
tensor(119326, device='cuda:0')
local node number tensor([119326]) remote node number tensor([0]) local edge tensor([648962]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3354337838245556 time_backward = 0.38386718300171196 time_sample = 0 pre_batch = 0 pre_input = 0.18955101375468075 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008398966281674802 time_attention = 0.07468449766747653
mode: val tensor([0.9729]) tensor([0.9706])
mode: test tensor([0.9671]) tensor([0.9640])
train loss:17.1501 train ap: nan val ap:0.972866 val auc:0.970649 test ap 0.967064 test auc0.963955
total time:1.32s prep time:0.92s
Epoch 11:
train time:0.92s
tensor(119276, device='cuda:0')
local node number tensor([119276]) remote node number tensor([0]) local edge tensor([648907]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3342382105765864 time_backward = 0.3991917232051492 time_sample = 0 pre_batch = 0 pre_input = 0.17684688128065318 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.0083418880822137 time_attention = 0.07429628912359476
mode: val tensor([0.9745]) tensor([0.9726])
mode: test tensor([0.9698]) tensor([0.9669])
train loss:16.3822 train ap: nan val ap:0.974546 val auc:0.972600 test ap 0.969842 test auc0.966880
total time:1.32s prep time:0.92s
Epoch 12:
train time:0.92s
tensor(119081, device='cuda:0')
local node number tensor([119081]) remote node number tensor([0]) local edge tensor([647286]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3341143539873883 time_backward = 0.42863147507887334 time_sample = 0 pre_batch = 0 pre_input = 0.15269588702358305 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008519083610735834 time_attention = 0.07500944682396948
mode: val tensor([0.9769]) tensor([0.9746])
mode: test tensor([0.9705]) tensor([0.9675])
train loss:16.1246 train ap: nan val ap:0.976921 val auc:0.974639 test ap 0.970455 test auc0.967546
total time:1.33s prep time:0.92s
Epoch 13:
train time:0.92s
tensor(119328, device='cuda:0')
local node number tensor([119328]) remote node number tensor([0]) local edge tensor([648707]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3364475581329316 time_backward = 0.38464420218952 time_sample = 0 pre_batch = 0 pre_input = 0.1897556931944564 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008665439207106829 time_attention = 0.07490635523572564
mode: val tensor([0.9791]) tensor([0.9771])
mode: test tensor([0.9723]) tensor([0.9696])
train loss:15.4998 train ap: nan val ap:0.979124 val auc:0.977097 test ap 0.972275 test auc0.969606
total time:1.32s prep time:0.92s
Epoch 14:
train time:0.92s
tensor(119268, device='cuda:0')
local node number tensor([119268]) remote node number tensor([0]) local edge tensor([648277]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.33370649884454906 time_backward = 0.42611016042064875 time_sample = 0 pre_batch = 0 pre_input = 0.15458859503269196 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008416186203248799 time_attention = 0.07499780738726258
mode: val tensor([0.9790]) tensor([0.9766])
mode: test tensor([0.9731]) tensor([0.9699])
train loss:15.0516 train ap: nan val ap:0.978998 val auc:0.976639 test ap 0.973110 test auc0.969935
total time:1.32s prep time:0.92s
Epoch 15:
train time:0.92s
tensor(119402, device='cuda:0')
local node number tensor([119402]) remote node number tensor([0]) local edge tensor([649428]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3383830221137032 time_backward = 0.4082292983075604 time_sample = 0 pre_batch = 0 pre_input = 0.1682694231858477 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.008468184154480696 time_attention = 0.07482897094450891
mode: val tensor([0.9777]) tensor([0.9761])
mode: test tensor([0.9726]) tensor([0.9697])
train loss:14.6274 train ap: nan val ap:0.977698 val auc:0.976067 test ap 0.972628 test auc0.969700
total time:1.33s prep time:0.92s
Epoch 16:
train time:0.93s
tensor(119363, device='cuda:0')
local node number tensor([119363]) remote node number tensor([0]) local edge tensor([649219]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.342100684880279 time_backward = 0.4223173810169101 time_sample = 0 pre_batch = 0 pre_input = 0.15758785407524556 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.009801693260669708 time_attention = 0.0757779193809256
mode: val tensor([0.9794]) tensor([0.9774])
mode: test tensor([0.9744]) tensor([0.9717])
train loss:14.2517 train ap: nan val ap:0.979376 val auc:0.977353 test ap 0.974385 test auc0.971685
total time:1.34s prep time:0.93s
Epoch 17:
train time:0.95s
tensor(119286, device='cuda:0')
local node number tensor([119286]) remote node number tensor([0]) local edge tensor([648490]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34202918445225805 time_backward = 0.5509289090987295 time_sample = 0 pre_batch = 0 pre_input = 0.049267802853137255 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.016561739495955408 time_attention = 0.07720356027130038
mode: val tensor([0.9804]) tensor([0.9785])
mode: test tensor([0.9753]) tensor([0.9726])
train loss:13.8701 train ap: nan val ap:0.980375 val auc:0.978506 test ap 0.975348 test auc0.972632
total time:1.36s prep time:0.95s
Epoch 18:
train time:0.95s
tensor(119152, device='cuda:0')
local node number tensor([119152]) remote node number tensor([0]) local edge tensor([648028]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34078817255795 time_backward = 0.5598571858135983 time_sample = 0 pre_batch = 0 pre_input = 0.04400515847373754 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.016431471100077033 time_attention = 0.07700221717823297
mode: val tensor([0.9821]) tensor([0.9803])
mode: test tensor([0.9763]) tensor([0.9735])
train loss:13.3467 train ap: nan val ap:0.982058 val auc:0.980295 test ap 0.976265 test auc0.973481
total time:1.36s prep time:0.95s
Epoch 19:
train time:0.95s
tensor(119270, device='cuda:0')
local node number tensor([119270]) remote node number tensor([0]) local edge tensor([648959]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34164636861532927 time_backward = 0.5451775507535785 time_sample = 0 pre_batch = 0 pre_input = 0.054701198590919375 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01695081579964608 time_attention = 0.07749861106276512
mode: val tensor([0.9815]) tensor([0.9795])
mode: test tensor([0.9748]) tensor([0.9724])
train loss:13.4622 train ap: nan val ap:0.981470 val auc:0.979522 test ap 0.974832 test auc0.972390
total time:1.35s prep time:0.95s
Epoch 20:
train time:0.95s
tensor(119248, device='cuda:0')
local node number tensor([119248]) remote node number tensor([0]) local edge tensor([648423]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34080718306358904 time_backward = 0.5503961594076827 time_sample = 0 pre_batch = 0 pre_input = 0.05070258106570691 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017195376800373197 time_attention = 0.07729481114074588
mode: val tensor([0.9802]) tensor([0.9784])
mode: test tensor([0.9741]) tensor([0.9714])
train loss:13.2978 train ap: nan val ap:0.980217 val auc:0.978400 test ap 0.974100 test auc0.971387
total time:1.35s prep time:0.95s
Epoch 21:
train time:0.95s
tensor(119320, device='cuda:0')
local node number tensor([119320]) remote node number tensor([0]) local edge tensor([649095]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34298782143741846 time_backward = 0.5449241640744731 time_sample = 0 pre_batch = 0 pre_input = 0.05215074634179473 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01692443701904267 time_attention = 0.0773102946113795
mode: val tensor([0.9820]) tensor([0.9807])
mode: test tensor([0.9765]) tensor([0.9738])
train loss:12.9960 train ap: nan val ap:0.982003 val auc:0.980662 test ap 0.976478 test auc0.973790
total time:1.35s prep time:0.95s
Epoch 22:
train time:0.95s
tensor(119239, device='cuda:0')
local node number tensor([119239]) remote node number tensor([0]) local edge tensor([648405]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3450987961841747 time_backward = 0.5491135996999219 time_sample = 0 pre_batch = 0 pre_input = 0.052711950964294374 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017411798588000238 time_attention = 0.07751001499127597
mode: val tensor([0.9822]) tensor([0.9806])
mode: test tensor([0.9769]) tensor([0.9744])
train loss:12.7027 train ap: nan val ap:0.982188 val auc:0.980581 test ap 0.976870 test auc0.974398
total time:1.36s prep time:0.95s
Epoch 23:
train time:0.95s
tensor(119378, device='cuda:0')
local node number tensor([119378]) remote node number tensor([0]) local edge tensor([648240]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3452421477995813 time_backward = 0.5390177856897935 time_sample = 0 pre_batch = 0 pre_input = 0.0565835825400427 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017625993816182017 time_attention = 0.07707663392648101
mode: val tensor([0.9831]) tensor([0.9816])
mode: test tensor([0.9775]) tensor([0.9754])
train loss:12.3543 train ap: nan val ap:0.983065 val auc:0.981607 test ap 0.977524 test auc0.975359
total time:1.36s prep time:0.95s
Epoch 24:
train time:0.96s
tensor(119448, device='cuda:0')
local node number tensor([119448]) remote node number tensor([0]) local edge tensor([649041]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3626376132015139 time_backward = 0.5006735629867762 time_sample = 0 pre_batch = 0 pre_input = 0.08839153102599084 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.016956130508333445 time_attention = 0.07790239271707833
mode: val tensor([0.9831]) tensor([0.9813])
mode: test tensor([0.9776]) tensor([0.9749])
train loss:12.1990 train ap: nan val ap:0.983074 val auc:0.981300 test ap 0.977597 test auc0.974865
total time:1.37s prep time:0.96s
Epoch 25:
train time:0.95s
tensor(119277, device='cuda:0')
local node number tensor([119277]) remote node number tensor([0]) local edge tensor([648323]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34278130857273936 time_backward = 0.553080213139765 time_sample = 0 pre_batch = 0 pre_input = 0.04923592531122267 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01747087435796857 time_attention = 0.07763282326050103
mode: val tensor([0.9834]) tensor([0.9819])
mode: test tensor([0.9767]) tensor([0.9748])
train loss:12.1612 train ap: nan val ap:0.983410 val auc:0.981872 test ap 0.976738 test auc0.974824
total time:1.36s prep time:0.95s
Epoch 26:
train time:0.95s
tensor(119356, device='cuda:0')
local node number tensor([119356]) remote node number tensor([0]) local edge tensor([648680]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3416846259497106 time_backward = 0.5507900801021606 time_sample = 0 pre_batch = 0 pre_input = 0.04809858009684831 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017090636771172285 time_attention = 0.07780991494655609
mode: val tensor([0.9839]) tensor([0.9824])
mode: test tensor([0.9778]) tensor([0.9756])
train loss:11.9506 train ap: nan val ap:0.983882 val auc:0.982390 test ap 0.977783 test auc0.975639
total time:1.35s prep time:0.95s
Epoch 27:
train time:0.97s
tensor(119280, device='cuda:0')
local node number tensor([119280]) remote node number tensor([0]) local edge tensor([648301]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.35982254473492503 time_backward = 0.5581902038538828 time_sample = 0 pre_batch = 0 pre_input = 0.04036963905673474 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01755107450298965 time_attention = 0.08050818706396967
mode: val tensor([0.9835]) tensor([0.9820])
mode: test tensor([0.9780]) tensor([0.9754])
train loss:11.8376 train ap: nan val ap:0.983460 val auc:0.981956 test ap 0.978000 test auc0.975392
total time:1.38s prep time:0.97s
Epoch 28:
train time:0.96s
tensor(119295, device='cuda:0')
local node number tensor([119295]) remote node number tensor([0]) local edge tensor([648768]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.347457638126798 time_backward = 0.5592087459517643 time_sample = 0 pre_batch = 0 pre_input = 0.04261839797254652 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.018026720266789198 time_attention = 0.08068907330743968
mode: val tensor([0.9834]) tensor([0.9821])
mode: test tensor([0.9795]) tensor([0.9772])
train loss:11.7316 train ap: nan val ap:0.983403 val auc:0.982100 test ap 0.979455 test auc0.977188
total time:1.36s prep time:0.96s
Epoch 29:
train time:0.95s
tensor(119324, device='cuda:0')
local node number tensor([119324]) remote node number tensor([0]) local edge tensor([648990]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34383319586049765 time_backward = 0.531271665240638 time_sample = 0 pre_batch = 0 pre_input = 0.06677867460530251 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01735282759182155 time_attention = 0.07825628505088389
mode: val tensor([0.9835]) tensor([0.9822])
mode: test tensor([0.9791]) tensor([0.9769])
train loss:11.7245 train ap: nan val ap:0.983545 val auc:0.982210 test ap 0.979084 test auc0.976873
total time:1.36s prep time:0.95s
Epoch 30:
train time:0.95s
tensor(119323, device='cuda:0')
local node number tensor([119323]) remote node number tensor([0]) local edge tensor([648623]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34352220210712403 time_backward = 0.5586602225666866 time_sample = 0 pre_batch = 0 pre_input = 0.04339289083145559 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017157140071503818 time_attention = 0.07775286072865129
mode: val tensor([0.9838]) tensor([0.9826])
mode: test tensor([0.9788]) tensor([0.9771])
train loss:11.4552 train ap: nan val ap:0.983839 val auc:0.982574 test ap 0.978790 test auc0.977101
total time:1.36s prep time:0.95s
Epoch 31:
train time:0.95s
tensor(119210, device='cuda:0')
local node number tensor([119210]) remote node number tensor([0]) local edge tensor([648221]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34196133306249976 time_backward = 0.5591293250909075 time_sample = 0 pre_batch = 0 pre_input = 0.041276065981946886 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017227028030902147 time_attention = 0.07764211669564247
mode: val tensor([0.9848]) tensor([0.9834])
mode: test tensor([0.9799]) tensor([0.9774])
train loss:11.2347 train ap: nan val ap:0.984829 val auc:0.983405 test ap 0.979930 test auc0.977359
total time:1.36s prep time:0.95s
Epoch 32:
train time:0.95s
tensor(119438, device='cuda:0')
local node number tensor([119438]) remote node number tensor([0]) local edge tensor([649446]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.343736870912835 time_backward = 0.5601736486423761 time_sample = 0 pre_batch = 0 pre_input = 0.04005975800100714 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01796495425514877 time_attention = 0.07848992524668574
mode: val tensor([0.9841]) tensor([0.9828])
mode: test tensor([0.9799]) tensor([0.9780])
train loss:11.1548 train ap: nan val ap:0.984132 val auc:0.982835 test ap 0.979914 test auc0.977990
total time:1.36s prep time:0.95s
Epoch 33:
train time:0.95s
tensor(119236, device='cuda:0')
local node number tensor([119236]) remote node number tensor([0]) local edge tensor([648148]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34337377769406885 time_backward = 0.5570919638266787 time_sample = 0 pre_batch = 0 pre_input = 0.04119874502066523 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01766906923148781 time_attention = 0.07826034002937376
mode: val tensor([0.9840]) tensor([0.9825])
mode: test tensor([0.9798]) tensor([0.9776])
train loss:11.1020 train ap: nan val ap:0.984042 val auc:0.982467 test ap 0.979820 test auc0.977645
total time:1.35s prep time:0.95s
Epoch 34:
train time:0.95s
tensor(119372, device='cuda:0')
local node number tensor([119372]) remote node number tensor([0]) local edge tensor([648901]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.34410411852877587 time_backward = 0.5385550218634307 time_sample = 0 pre_batch = 0 pre_input = 0.057488588034175336 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01736996171530336 time_attention = 0.07862102310173213
mode: val tensor([0.9848]) tensor([0.9833])
mode: test tensor([0.9792]) tensor([0.9776])
train loss:10.9194 train ap: nan val ap:0.984812 val auc:0.983250 test ap 0.979217 test auc0.977595
total time:1.35s prep time:0.95s
Epoch 35:
train time:0.95s
tensor(119295, device='cuda:0')
local node number tensor([119295]) remote node number tensor([0]) local edge tensor([648214]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3435489946277812 time_backward = 0.5511866604210809 time_sample = 0 pre_batch = 0 pre_input = 0.04667527589481324 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.01792132097762078 time_attention = 0.0790077920537442
mode: val tensor([0.9840]) tensor([0.9826])
mode: test tensor([0.9806]) tensor([0.9786])
train loss:10.9772 train ap: nan val ap:0.984041 val auc:0.982637 test ap 0.980644 test auc0.978567
total time:1.35s prep time:0.95s
Epoch 36:
train time:0.95s
tensor(119329, device='cuda:0')
local node number tensor([119329]) remote node number tensor([0]) local edge tensor([648744]) remote edgetensor([0])
memory comm tensor([0]) shared comm tensor([0])
time_forward = 0.3426324164029211 time_backward = 0.5523476478410885 time_sample = 0 pre_batch = 0 pre_input = 0.04714676144067198 pos_update = 0 mem_update = 0 time_zero = 0 time_nbrs = 0.017416485701687634 time_attention = 0.07795663003344089
mode: val tensor([0.9847]) tensor([0.9832])
mode: test tensor([0.9803]) tensor([0.9784])
Early stopping at epoch 36
Loading the best model at epoch 31
35.07431387901306
best test AP:0.979930 test auc0.977359
mode: train tensor([0.9898]) tensor([0.9897])
mode: val tensor([0.9833]) tensor([0.9818])
mode: test tensor([0.9787]) tensor([0.9761])
val ap:0.983326 val auc:0.981776 test AP:0.978683 test AUC:0.976127
test_dataset 23621 avg_time 0.9479544291625152
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
import matplotlib.pyplot as plt
import numpy as np
import torch

# Plot test AP and remote-communication volume versus the boundary-sampling
# probability, then the validation-AP convergence curves, for each dataset.
# Result files follow:
#   all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out
ssim_values = [0, 0.1, 0.2, 0.3, 0.4, 2]  # ssim sweep (kept for reference; not used below)
probability_values = [1, 0.5, 0.1, 0.05, 0.01, 0]
data_values = ['WIKI']  # datasets whose result files are read
partition = 'ours_shared'
partitions = 4
topk = 0.01
mem = 'all_update'  # 'historical'
for data in data_values:
    ap_list = []
    comm_list = []
    for p in probability_values:
        path = '{}/{}-{}-{}-{}-boundery_recent_decay-{}.out'.format(
            data, partitions, partition, topk, mem, p)
        prefix = 'best test AP:'
        marker = 'remote node number tensor'
        cnt = 0
        total_comm = 0  # NOTE(review): accumulated but unused, as in the original; possibly total_comm/cnt was intended below
        ap = float('nan')   # stays NaN if the run produced no "best test AP:" line
        comm = 0
        with open(path, 'r') as fh:
            for line in fh:
                if line.startswith(prefix):
                    # The original used line.lstrip(prefix), which strips a
                    # character *set* rather than the literal prefix; slice
                    # off the prefix explicitly instead.
                    ap = float(line[len(prefix):].split(' ')[0])
                # Search every line for the marker. The original nested this
                # under the prefix branch, whose lines never contain the
                # marker, leaving cnt == 0 and comm unbound.
                pos = line.find(marker)
                if pos != -1:
                    posr = line.find(']', pos + 2 + len(marker))
                    comm = int(line[pos + 2 + len(marker):posr])
                    total_comm += comm
                    cnt += 1
        ap_list.append(ap)
        # Guard the division: a file with no marker lines previously crashed.
        comm_list.append(comm / cnt * 4 if cnt else 0)
    # Bar charts: AP and communication volume per probability value.
    bar_width = 0.4
    xs = list(range(len(probability_values)))  # original sized this from ssim_values (same length)
    plt.bar(xs, ap_list, width=bar_width)
    plt.ylim([0.9, 1])
    plt.xticks(xs, probability_values)
    plt.xlabel('probability')
    plt.ylabel('Test AP')
    plt.title('{}({} partitions)'.format(data, partitions))
    plt.savefig('boundary_AP_{}.png'.format(data))
    plt.clf()
    plt.bar(xs, comm_list, width=bar_width)
    plt.xticks(xs, probability_values)
    plt.xlabel('probability')
    plt.ylabel('Communication volume')
    plt.title('{}({} partitions)'.format(data, partitions))
    plt.savefig('boundary_comm_{}.png'.format(data))
    plt.clf()
    # Convergence curves: per-epoch validation AP for each probability.
    partition0 = 'ours' if partition == 'ours_shared' else partition
    for p in probability_values:
        path = '{}/val_{}_{}_{}_0_boundery_recent_decay_{}_all_update_2.pt'.format(
            data, partition0, topk, partitions, float(p))
        val_ap = torch.tensor(torch.load(path))
        epoch = torch.arange(val_ap.shape[0])
        plt.plot(epoch, val_ap, label='probability={}'.format(p))
    plt.xlabel('Epoch')
    plt.ylabel('Val AP')
    plt.title('{}({} partitions)'.format(data, partitions))
    # plt.grid(True)
    plt.legend()
    plt.savefig('{}_boundary_Convergence_rate.png'.format(data))
    plt.clf()
import matplotlib.pyplot as plt
import numpy as np
import torch

# Plot test AP and shared-memory communication volume versus the SSIM
# threshold, then the validation-AP convergence curves, for each dataset.
# Result files follow:
#   all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out
ssim_values = [0, 0.1, 0.2, 0.3, 0.4, 2]  # ssim == 2 denotes the purely-local baseline
data_values = ['WikiTalk']
partition = 'ours_shared'
partitions = 8
topk = 0.01
mem = 'historical'
for data in data_values:
    ap_list = []
    comm_list = []
    for ssim in ssim_values:
        if ssim == 2:
            path = '{}/{}-{}-{}-local-recent.out'.format(data, partitions, partition, topk)
        else:
            path = '{}/{}-{}-{}-{}-{}-recent.out'.format(data, partitions, partition, topk, mem, ssim)
        prefix = 'best test AP:'
        marker = 'shared comm tensor'
        ap = float('nan')   # stays NaN if the run produced no "best test AP:" line
        comm = 0
        with open(path, 'r') as fh:
            for line in fh:
                if line.startswith(prefix):
                    # The original used line.lstrip(prefix), which strips a
                    # character *set* rather than the literal prefix.
                    ap = float(line[len(prefix):].split(' ')[0])
                # Search every line for the marker. The original nested this
                # under the prefix branch, whose lines never contain it.
                pos = line.find(marker)
                if pos != -1:
                    # slice between "tensor([" and the trailing "])\n"
                    comm = int(line[pos + 2 + len(marker):len(line) - 3])
        # Append once per result file. The original appended inside the
        # per-line loop, raising NameError before the first prefix match.
        ap_list.append(ap)
        comm_list.append(comm)
    # Bar charts: AP and communication volume per SSIM threshold.
    bar_width = 0.4
    xs = list(range(len(ssim_values)))
    plt.bar(xs, ap_list, width=bar_width)
    plt.ylim([0.9, 1])
    plt.xticks(xs, ssim_values)
    plt.xlabel('SSIM threshold Values')
    plt.ylabel('Test AP')
    plt.title('{}({} partitions)'.format(data, partitions))
    plt.savefig('ssim_{}.png'.format(data))
    plt.clf()
    plt.bar(xs, comm_list, width=bar_width)
    plt.xticks(xs, ssim_values)
    plt.xlabel('SSIM threshold Values')
    plt.ylabel('Communication volume')
    plt.title('{}({} partitions)'.format(data, partitions))
    plt.savefig('comm_{}.png'.format(data))
    plt.clf()
    # Convergence curves: per-epoch validation AP for each SSIM threshold.
    partition0 = 'ours' if partition == 'ours_shared' else partition
    for ssim in ssim_values:
        if ssim == 2:
            path = '{}/val_{}_{}_{}_0_recent_0.1_local_2.pt'.format(data, partition0, topk, partitions)
        else:
            path = '{}/val_{}_{}_{}_0_recent_0.1_{}_{}.pt'.format(data, partition0, topk, partitions, mem, float(ssim))
        val_ap = torch.tensor(torch.load(path))
        epoch = torch.arange(val_ap.shape[0])
        plt.plot(epoch, val_ap, label='ssim={}'.format(ssim))
    plt.xlabel('Epoch')
    plt.ylabel('Val AP')
    plt.title('{}({} partitions)'.format(data, partitions))
    # plt.grid(True)
    plt.legend()
    plt.savefig('{}_ssim_Convergence_rate.png'.format(data))
    plt.clf()
import matplotlib.pyplot as plt
import numpy as np
import torch

# Plot test AP versus the SSIM threshold, then the validation-AP convergence
# curves, for each dataset.
# Result files follow:
#   all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out
ssim_values = [0, 0.1, 0.2, 0.3, 0.4, 2]  # ssim == 2 denotes the purely-local baseline
data_values = ['WIKI', 'REDDIT']
partition = 'ours_shared'
partitions = 4
topk = 0.01
mem = 'historical'
for data in data_values:
    ap_list = []
    for ssim in ssim_values:
        if ssim == 2:
            path = '{}/{}-{}-{}-local-recent.out'.format(data, partitions, partition, topk)
        else:
            path = '{}/{}-{}-{}-{}-{}-recent.out'.format(data, partitions, partition, topk, mem, ssim)
        prefix = 'best test AP:'
        ap = float('nan')   # stays NaN if the run produced no "best test AP:" line
        with open(path, 'r') as fh:
            for line in fh:
                if line.startswith(prefix):
                    # The original used line.lstrip(prefix), which strips a
                    # character *set* rather than the literal prefix; slice
                    # off the prefix explicitly instead.
                    ap = float(line[len(prefix):].split(' ')[0])
        ap_list.append(ap)
    # Bar chart: AP per SSIM threshold.
    bar_width = 0.4
    xs = list(range(len(ssim_values)))
    plt.bar(xs, ap_list, width=bar_width)
    plt.ylim([0.8, 1])
    plt.xticks(xs, ssim_values)
    plt.xlabel('SSIM threshold Values')
    plt.ylabel('Test AP')
    plt.title('{}({} partitions)'.format(data, partitions))
    plt.savefig('ssim_{}.png'.format(data))
    plt.clf()
    # Convergence curves: per-epoch validation AP for each SSIM threshold.
    partition0 = 'ours' if partition == 'ours_shared' else partition
    for ssim in ssim_values:
        if ssim == 2:
            path = '{}/val_{}_{}_{}_0_recent_0.1_local_2.pt'.format(data, partition0, topk, partitions)
        else:
            path = '{}/val_{}_{}_{}_0_recent_0.1_{}_{}.pt'.format(data, partition0, topk, partitions, mem, float(ssim))
        val_ap = torch.tensor(torch.load(path))
        epoch = torch.arange(val_ap.shape[0])
        plt.plot(epoch, val_ap, label='ssim={}'.format(ssim))
    plt.xlabel('Epoch')
    plt.ylabel('Val AP')
    plt.title('{}({} partitions)'.format(data, partitions))
    # plt.grid(True)
    plt.legend()
    plt.ylim([0.98, 0.99])
    plt.savefig('{}_ssim_Convergence_rate.png'.format(data))
    plt.clf()
...@@ -2,26 +2,35 @@ ...@@ -2,26 +2,35 @@
# 定义数组变量 # 定义数组变量
addr="192.168.1.107" addr="192.168.1.107"
partition_params=("ours" "metis" "ldg" "random") partition_params=("ours")
#"metis" "ldg" "random")
#("ours" "metis" "ldg" "random") #("ours" "metis" "ldg" "random")
partitions="16" partitions="4"
nnodes="4" node_per="4"
nnodes="1"
node_rank="0" node_rank="0"
probability_params=("1" "0.5" "0.1" "0.05" "0.01" "0") probability_params=("1" "0.5" "0.1" "0.05" "0.01" "0")
#sample_type_params=("recent") #"boundery_recent_decay" "boundery_recent_uniform")
sample_type_params=("recent" "boundery_recent_decay" "boundery_recent_uniform") sample_type_params=("recent" "boundery_recent_decay" "boundery_recent_uniform")
#sample_type_params=("recent") #sample_type_params=("recent")
#memory_type=("all_update" "p2p" "all_reduce" "historical" "local") #memory_type=("all_update" "p2p" "all_reduce" "historical" "local")
#memory_type=("all_update" "local" "historical") memory_type=("all_update")
memory_type=("local" "all_update" "historical" "all_reduce") #memory_type=("local" "all_update" "historical" "all_reduce")
shared_memory_ssim=("0" "0.1" "0.2" "0.3" "0.4" ) shared_memory_ssim=("0" "0.1" "0.2" "0.3" "0.4" )
#data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk") #data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk")
data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk" "StackOverflow") data_param=("DGraphFin" "WikiTalk")
#data_param=("WIKI" "REDDIT" "LASTFM" "WikiTalk" "StackOverflow")
#data_param=("REDDIT" "WikiTalk") #data_param=("REDDIT" "WikiTalk")
# 创建输出目录 # 创建输出目录
mkdir -p all mkdir -p all
# 遍历数组并执行命令 # 遍历数组并执行命令
for data in "${data_param[@]}"; do for data in "${data_param[@]}"; do
model="TGN_large"
if [ "$data" = "WIKI" ] || [ "$data" = "REDDIT" ] || [ "$data" = "LASTFM" ]; then
model="TGN"
fi
mkdir all/"$data" mkdir all/"$data"
mkdir all/"$data"/comm mkdir all/"$data"/comm
#torchrun --nnodes "$nnodes" --node_rank 0 --nproc-per-node 1 --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition ours --memory_type local --sample_type recent --topk 0 > all/"$data"/1.out & #torchrun --nnodes "$nnodes" --node_rank 0 --nproc-per-node 1 --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition ours --memory_type local --sample_type recent --topk 0 > all/"$data"/1.out &
...@@ -33,20 +42,20 @@ for data in "${data_param[@]}"; do ...@@ -33,20 +42,20 @@ for data in "${data_param[@]}"; do
if [ "$mem" = "historical" ]; then if [ "$mem" = "historical" ]; then
for ssim in "${shared_memory_ssim[@]}"; do for ssim in "${shared_memory_ssim[@]}"; do
if [ "$partition" = "ours" ]; then if [ "$partition" = "ours" ]; then
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out &
wait wait
fi fi
done done
elif [ "$mem" = "all_reduce" ]; then elif [ "$mem" = "all_reduce" ]; then
if [ "$partition" = "ours" ]; then if [ "$partition" = "ours" ]; then
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample".out &
wait wait
fi fi
else else
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$partitions"-"$partition"-0-"$mem"-"$sample".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$partitions"-"$partition"-0-"$mem"-"$sample".out &
wait wait
if [ "$partition" = "ours" ]; then if [ "$partition" = "ours" ]; then
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample".out &
wait wait
fi fi
fi fi
...@@ -57,20 +66,20 @@ for data in "${data_param[@]}"; do ...@@ -57,20 +66,20 @@ for data in "${data_param[@]}"; do
if [ "$mem" = "historical" ]; then if [ "$mem" = "historical" ]; then
for ssim in "${shared_memory_ssim[@]}"; do for ssim in "${shared_memory_ssim[@]}"; do
if [ "$partition" = "ours" ]; then if [ "$partition" = "ours" ]; then
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$partitions"-ours_shared-0.01"$mem"-"$ssim"-"$sample"-"$pro".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" --shared_memory_ssim "$ssim" > all/"$data"/"$partitions"-ours_shared-0.01"$mem"-"$ssim"-"$sample"-"$pro".out &
wait wait
fi fi
done done
elif [ "$mem" = "all_reduce" ]; then elif [ "$mem" = "all_reduce" ]; then
if [ "$partition" = "ours" ]; then if [ "$partition" = "ours" ]; then
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample"-"$pro".out& torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample"-"$pro".out&
wait wait
fi fi
else else
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$partitions"-"$partition"-0-"$mem"-"$sample"-"$pro".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$partitions"-"$partition"-0-"$mem"-"$sample"-"$pro".out &
wait wait
if [ "$partition" = "ours" ]; then if [ "$partition" = "ours" ]; then
torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$partitions" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode TGN_large --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample"-"$pro".out & torchrun --nnodes "$nnodes" --node_rank "$node_rank" --nproc-per-node "$node_per" --master-addr "$addr" --master-port 9445 train_boundery.py --dataname "$data" --mode "$model" --partition "$partition" --topk 0.01 --sample_type "$sample" --probability "$pro" --memory_type "$mem" > all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$sample"-"$pro".out &
wait wait
fi fi
fi fi
......
LOCAL RANK 0, RANK0 LOCAL RANK 0, RANK0
in
local rank is 0 world_size is 1 memory group is 0 memory rank is 0 memory group size is 1
[0]
use cuda on 0 use cuda on 0
638486
get_neighbors consume: 4.12395s
Epoch 0:
LOCAL RANK 0, RANK0
LOCAL RANK 2, RANK2
LOCAL RANK 1, RANK1 LOCAL RANK 1, RANK1
LOCAL RANK 3, RANK3 LOCAL RANK 3, RANK3
use cuda on 3 LOCAL RANK 0, RANK0
use cuda on 0 LOCAL RANK 2, RANK2
use cuda on 2
use cuda on 1
638486
638486
638486
638486
get_neighbors consume: 3.42567s
get_neighbors consume: 3.42812s
num_batchs: tensor([7069], device='cuda:2')
num_batchs: tensor([6015], device='cuda:0')
get_neighbors consume: 3.68743s
num_batchs: tensor([6948], device='cuda:1')
get_neighbors consume: 4.58464s
num_batchs: tensor([6576], device='cuda:3')
num_batchs: num_batchs: tensor([1254], device='cuda:0')
tensor([1642], device='cuda:3')
num_batchs: num_batchs: tensor([1331], device='cuda:2')
tensor([1478], device='cuda:1')
num_batchs:num_batchs:num_batchs: num_batchs: tensor([1227], device='cuda:0')
tensor([1625], device='cuda:3')tensor([1412], device='cuda:1')tensor([1440], device='cuda:2')
Epoch 0:
Epoch 0:
Epoch 0:
Epoch 0:
train loss:3025.4560 train ap:0.964935 val ap:0.973583 val auc:0.969748
train loss:2842.4385 train ap:0.968786 val ap:0.973583 val auc:0.969748
train loss:3149.4863 train ap:0.960053 val ap:0.973583 val auc:0.969748
train loss:2905.2378 train ap:0.966912 val ap:0.973583 val auc:0.969748
total time:109.11s prep time:90.95s
total time:109.11s prep time:90.95s
total time:109.11s prep time:90.95s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
total time:109.10s prep time:90.95s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 1:
Epoch 1:
Epoch 1:
Epoch 1:
train loss:2756.1783 train ap:0.969414 val ap:0.976812 val auc:0.973716
train loss:2786.7234 train ap:0.970014 val ap:0.976812 val auc:0.973716
train loss:2600.9249 train ap:0.973055 val ap:0.976812 val auc:0.973716
train loss:2561.2065 train ap:0.974347 val ap:0.976812 val auc:0.973716
total time:107.65s prep time:89.41s
total time:107.65s prep time:89.41s
total time:107.65s prep time:89.41s
total time:107.65s prep time:89.41s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 2:
Epoch 2:
Epoch 2:
Epoch 2:
train loss:2430.7610 train ap:0.976709 val ap:0.979544 val auc:0.976893
train loss:2616.8964 train ap:0.972457 val ap:0.979544 val auc:0.976893
train loss:2666.5888 train ap:0.972383 val ap:0.979544 val auc:0.976893
train loss:2477.5472 train ap:0.975493 val ap:0.979544 val auc:0.976893
total time:107.73s prep time:89.82s
total time:107.73s prep time:89.82s
total time:107.73s prep time:89.82s
total time:107.73s prep time:89.82s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 3:Epoch 3:Epoch 3:
Epoch 3:
train loss:2404.6129 train ap:0.977177 val ap:0.979526 val auc:0.976748
train loss:2652.1562 train ap:0.972664 val ap:0.979526 val auc:0.976748
train loss:2561.0276 train ap:0.973517 val ap:0.979526 val auc:0.976748
train loss:2431.4974 train ap:0.976369 val ap:0.979526 val auc:0.976748
total time:107.16s prep time:89.15s
total time:107.16s prep time:89.15s
total time:107.16s prep time:89.15s
total time:107.16s prep time:89.15s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 4:
Epoch 4:Epoch 4:Epoch 4:
train loss:2599.0614 train ap:0.973655 val ap:0.980024 val auc:0.977120
train loss:2343.8141 train ap:0.978188 val ap:0.980024 val auc:0.977120
train loss:2382.7643 train ap:0.977246 val ap:0.980024 val auc:0.977120
total time:107.70s prep time:89.60s
total time:107.70s prep time:89.60s
fetch time:0.00s write back time:0.00s
total time:107.70s prep time:89.60s
train loss:2503.4472 train ap:0.974597 val ap:0.980024 val auc:0.977120
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
total time:107.70s prep time:89.60s
fetch time:0.00s write back time:0.00s
Epoch 5:Epoch 5:
Epoch 5:Epoch 5:
train loss:2377.6717 train ap:0.977300 val ap:0.981272 val auc:0.978760
total time:108.36s prep time:89.99s
train loss:2586.4221 train ap:0.973873 val ap:0.981272 val auc:0.978760
train loss:2510.5564 train ap:0.974502 val ap:0.981272 val auc:0.978760
train loss:2345.5698 train ap:0.978154 val ap:0.981272 val auc:0.978760
fetch time:0.00s write back time:0.00s
total time:108.36s prep time:89.99s
total time:108.36s prep time:89.99s
total time:108.36s prep time:89.99s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 6:
Epoch 6:Epoch 6:
Epoch 6:
train loss:2287.1365 train ap:0.979113 val ap:0.981768 val auc:0.979250
train loss:2541.0882 train ap:0.974732 val ap:0.981768 val auc:0.979250
train loss:2441.7481 train ap:0.975795 val ap:0.981768 val auc:0.979250
total time:108.29s prep time:90.22s
train loss:2313.8948 train ap:0.978471 val ap:0.981768 val auc:0.979250
total time:108.29s prep time:90.22s
total time:108.29s prep time:90.22s
fetch time:0.00s write back time:0.00s
total time:108.29s prep time:90.22s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 7:
Epoch 7:
Epoch 7:
Epoch 7:
train loss:2321.0527 train ap:0.978335 val ap:0.980500 val auc:0.978016
train loss:2558.9959 train ap:0.974414 val ap:0.980500 val auc:0.978016
train loss:2289.0225 train ap:0.979144 val ap:0.980500 val auc:0.978016
train loss:2436.1819 train ap:0.975923 val ap:0.980500 val auc:0.978016
total time:107.98s prep time:90.08s
total time:107.98s prep time:90.08s
total time:107.98s prep time:90.08s
total time:107.98s prep time:90.08s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 8:
Epoch 8:Epoch 8:
Epoch 8:
train loss:2422.3653 train ap:0.976156 val ap:0.982765 val auc:0.980566
train loss:2250.0465 train ap:0.979720 val ap:0.982765 val auc:0.980566
total time:107.98s prep time:89.73s
train loss:2517.5717 train ap:0.975174 val ap:0.982765 val auc:0.980566
total time:107.98s prep time:89.73s
fetch time:0.00s write back time:0.00s
train loss:2284.2223 train ap:0.978957 val ap:0.982765 val auc:0.980566
fetch time:0.00s write back time:0.00s
total time:107.98s prep time:89.73s
total time:107.98s prep time:89.73s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 9:
Epoch 9:Epoch 9:
Epoch 9:
train loss:2495.3455 train ap:0.975555 val ap:0.980162 val auc:0.977624
train loss:2268.7504 train ap:0.979202 val ap:0.980162 val auc:0.977624
train loss:2243.5499 train ap:0.979831 val ap:0.980162 val auc:0.977624
train loss:2392.5389 train ap:0.976669 val ap:0.980162 val auc:0.977624
total time:108.06s prep time:89.87s
total time:108.06s prep time:89.87s
total time:108.06s prep time:89.87s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
total time:108.06s prep time:89.87s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 10:
Epoch 10:
Epoch 10:
Epoch 10:
train loss:2356.5620 train ap:0.977277 val ap:0.983905 val auc:0.981905
train loss:2475.2578 train ap:0.975923 val ap:0.983905 val auc:0.981905
total time:108.50s prep time:90.58s
train loss:2218.5262 train ap:0.980230 val ap:0.983905 val auc:0.981905
train loss:2249.7741 train ap:0.979533 val ap:0.983905 val auc:0.981905
fetch time:0.00s write back time:0.00s
total time:108.50s prep time:90.58s
total time:108.50s prep time:90.58s
total time:108.50s prep time:90.58s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 11:
Epoch 11:Epoch 11:
Epoch 11:
train loss:2371.3061 train ap:0.977063 val ap:0.981130 val auc:0.978457
train loss:2215.4943 train ap:0.980281 val ap:0.981130 val auc:0.978457
train loss:2469.7190 train ap:0.975983 val ap:0.981130 val auc:0.978457
train loss:2243.8975 train ap:0.979617 val ap:0.981130 val auc:0.978457
total time:107.79s prep time:89.81s
total time:107.79s prep time:89.81s
total time:107.79s prep time:89.81s
total time:107.79s prep time:89.81s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 12:
Epoch 12:
Epoch 12:
Epoch 12:
train loss:2454.1705 train ap:0.976276 val ap:0.983270 val auc:0.981183
train loss:2225.2349 train ap:0.979939 val ap:0.983270 val auc:0.981183
train loss:2337.4529 train ap:0.977606 val ap:0.983270 val auc:0.981183
train loss:2189.4448 train ap:0.980683 val ap:0.983270 val auc:0.981183
total time:108.64s prep time:90.63s
total time:108.64s prep time:90.63s
total time:108.64s prep time:90.63s
total time:108.64s prep time:90.63s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 13:
Epoch 13:
Epoch 13:
Epoch 13:
train loss:2374.7256 train ap:0.977024 val ap:0.981550 val auc:0.979260
train loss:2221.0432 train ap:0.980189 val ap:0.981550 val auc:0.979260
train loss:2471.9543 train ap:0.975953 val ap:0.981550 val auc:0.979260
train loss:2241.1903 train ap:0.979649 val ap:0.981550 val auc:0.979260
total time:108.69s prep time:90.62s
total time:108.69s prep time:90.62s
total time:108.69s prep time:90.62s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
total time:108.69s prep time:90.62s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 14:
Epoch 14:
Epoch 14:
Epoch 14:
train loss:2358.8334 train ap:0.977242 val ap:0.981721 val auc:0.979185
train loss:2208.2876 train ap:0.980383 val ap:0.981721 val auc:0.979185
train loss:2227.2542 train ap:0.979885 val ap:0.981721 val auc:0.979185
train loss:2460.0171 train ap:0.976178 val ap:0.981721 val auc:0.979185
total time:107.77s prep time:89.81s
total time:107.77s prep time:89.81s
total time:107.77s prep time:89.81s
total time:107.77s prep time:89.81s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
fetch time:0.00s write back time:0.00s
Epoch 15:
Epoch 15:Epoch 15:
Epoch 15:
Early stopping at epoch 15
Early stopping at epoch 15
Early stopping at epoch 15
Early stopping at epoch 15
Loading the best model at epoch 10
Loading the best model at epoch 10
Loading the best model at epoch 10
Loading the best model at epoch 10
0.9546157717704773 0.9452952742576599
0.9546157717704773 0.9452952742576599
0.9546157717704773 0.9452952742576599
0.9546157717704773 0.9452952742576599
0.9489824175834656 0.9380446672439575
0.9489824175834656 0.9380446672439575
0.9489824175834656 0.9380446672439575
0.9489824175834656 0.9380446672439575
test AP:0.944646 test AUC:0.934612
test AP:0.944646 test AUC:0.934612
test AP:0.944646 test AUC:0.934612
test AP:0.944646 test AUC:0.934612
test_dataset 798529 avg_time 28.8176681804657
test_dataset 752056 avg_time 28.81766140937805
test_dataset 984603 avg_time 28.817663559913637
test_dataset 886223 avg_time 28.817657227516175
...@@ -50,7 +50,7 @@ parser.add_argument('--rank', default=0, type=int, metavar='W', ...@@ -50,7 +50,7 @@ parser.add_argument('--rank', default=0, type=int, metavar='W',
help='name of dataset') help='name of dataset')
parser.add_argument('--local_rank', default=0, type=int, metavar='W', parser.add_argument('--local_rank', default=0, type=int, metavar='W',
help='name of dataset') help='name of dataset')
parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping') parser.add_argument('--patience', type=int, default=20, help='Patience for early stopping')
parser.add_argument('--world_size', default=1, type=int, metavar='W', parser.add_argument('--world_size', default=1, type=int, metavar='W',
help='number of negative samples') help='number of negative samples')
parser.add_argument('--dataname', default="WIKI", type=str, metavar='W', parser.add_argument('--dataname', default="WIKI", type=str, metavar='W',
...@@ -73,6 +73,8 @@ parser.add_argument('--shared_memory_ssim', default=2, type=float, metavar='W', ...@@ -73,6 +73,8 @@ parser.add_argument('--shared_memory_ssim', default=2, type=float, metavar='W',
help='name of model') help='name of model')
parser.add_argument('--neg_samples', default=1, type=int, metavar='W', parser.add_argument('--neg_samples', default=1, type=int, metavar='W',
help='name of model') help='name of model')
parser.add_argument('--eval_neg_samples', default=1, type=int, metavar='W',
help='name of model')
parser.add_argument('--memory_type', default='all_update', type=str, metavar='W', parser.add_argument('--memory_type', default='all_update', type=str, metavar='W',
help='name of model') help='name of model')
#boundery_recent_uniform boundery_recent_decay #boundery_recent_uniform boundery_recent_decay
...@@ -104,6 +106,7 @@ if not 'MASTER_PORT' in os.environ: ...@@ -104,6 +106,7 @@ if not 'MASTER_PORT' in os.environ:
os.environ["MASTER_PORT"] = '9337' os.environ["MASTER_PORT"] = '9337'
os.environ["NCCL_IB_DISABLE"]='1' os.environ["NCCL_IB_DISABLE"]='1'
os.environ['NCCL_SOCKET_IFNAME']=matching_interfaces[0] os.environ['NCCL_SOCKET_IFNAME']=matching_interfaces[0]
print('rank {}'.format(int(os.environ["LOCAL_RANK"])))
torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
local_rank = int(os.environ["LOCAL_RANK"]) local_rank = int(os.environ["LOCAL_RANK"])
def seed_everything(seed=42): def seed_everything(seed=42):
...@@ -219,7 +222,7 @@ def main(): ...@@ -219,7 +222,7 @@ def main():
else: else:
mailbox = None mailbox = None
sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=num_layers, fanout=fanout,graph_data=sample_graph, workers=1,policy = policy_train, graph_name = "train",local_part=dist.get_rank(),edge_part=DistIndex(graph.eids_mapper).part,node_part=DistIndex(graph.nids_mapper).part,probability=args.probability) sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=num_layers, fanout=fanout,graph_data=sample_graph, workers=10,policy = policy_train, graph_name = "train",local_part=dist.get_rank(),edge_part=DistIndex(graph.eids_mapper).part,node_part=DistIndex(graph.nids_mapper).part,probability=args.probability)
eval_sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=num_layers, fanout=fanout,graph_data=eval_sample_graph, workers=10,policy = 'recent', graph_name = "eval",local_part=dist.get_rank(),edge_part=DistIndex(graph.eids_mapper).part,node_part=DistIndex(graph.nids_mapper).part,probability=args.probability) eval_sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=num_layers, fanout=fanout,graph_data=eval_sample_graph, workers=10,policy = 'recent', graph_name = "eval",local_part=dist.get_rank(),edge_part=DistIndex(graph.eids_mapper).part,node_part=DistIndex(graph.nids_mapper).part,probability=args.probability)
train_data = torch.masked_select(graph.edge_index,train_mask.to(graph.edge_index.device)).reshape(2,-1) train_data = torch.masked_select(graph.edge_index,train_mask.to(graph.edge_index.device)).reshape(2,-1)
...@@ -244,16 +247,16 @@ def main(): ...@@ -244,16 +247,16 @@ def main():
val_data = DataSet(edges = val_data,ts = val_ts,eids = val_mask.nonzero().reshape(-1)) val_data = DataSet(edges = val_data,ts = val_ts,eids = val_mask.nonzero().reshape(-1))
print('ts {} {} {} {}'.format(train_data.ts,eval_train_data.ts,test_data.ts,val_data.ts)) print('ts {} {} {} {}'.format(train_data.ts,eval_train_data.ts,test_data.ts,val_data.ts))
neg_samples = args.neg_samples neg_samples = args.eval_neg_samples
mask = DistIndex(graph.nids_mapper[graph.edge_index[1,:]].to('cpu')).part == dist.get_rank() mask = DistIndex(graph.nids_mapper[graph.edge_index[1,:]].to('cpu')).part == dist.get_rank()
if args.local_neg_sample: if args.local_neg_sample:
print('dst len {} origin len {}'.format(graph.edge_index[1,mask].unique().shape[0],full_dst.unique().shape[0])) print('dst len {} origin len {}'.format(graph.edge_index[1,mask].unique().shape[0],full_dst.unique().shape[0]))
train_neg_sampler = LocalNegativeSampling('triplet',amount = neg_samples,dst_node_list = graph.edge_index[1,mask].unique()) train_neg_sampler = LocalNegativeSampling('triplet',amount = args.neg_samples,dst_node_list = graph.edge_index[1,mask].unique())
else: else:
train_neg_sampler = LocalNegativeSampling('triplet',amount = neg_samples,dst_node_list = full_dst.unique()) train_neg_sampler = LocalNegativeSampling('triplet',amount = args.neg_samples,dst_node_list = full_dst.unique())
print(train_neg_sampler.dst_node_list) print(train_neg_sampler.dst_node_list)
neg_sampler = LocalNegativeSampling('triplet',amount= neg_samples,dst_node_list = full_dst.unique()) neg_sampler = LocalNegativeSampling('triplet',amount= neg_samples,dst_node_list = full_dst.unique(),seed=12357)
trainloader = DistributedDataLoader(graph,eval_train_data,sampler = sampler, trainloader = DistributedDataLoader(graph,eval_train_data,sampler = sampler,
sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES, sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES,
...@@ -354,6 +357,7 @@ def main(): ...@@ -354,6 +357,7 @@ def main():
y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0) y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0)
aps.append(average_precision_score(y_true, y_pred.detach().numpy())) aps.append(average_precision_score(y_true, y_pred.detach().numpy()))
aucs_mrrs.append(roc_auc_score(y_true, y_pred)) aucs_mrrs.append(roc_auc_score(y_true, y_pred))
if mailbox is not None: if mailbox is not None:
src = metadata['src_pos_index'] src = metadata['src_pos_index']
dst = metadata['dst_pos_index'] dst = metadata['dst_pos_index']
...@@ -425,8 +429,6 @@ def main(): ...@@ -425,8 +429,6 @@ def main():
for e in range(train_param['epoch']): for e in range(train_param['epoch']):
model.module.memory_updater.empty_cache() model.module.memory_updater.empty_cache()
tt._zero() tt._zero()
count_empty()
time_count.set_zero()
torch.cuda.synchronize() torch.cuda.synchronize()
epoch_start_time = time.time() epoch_start_time = time.time()
epoch_cnt = epoch_cnt + 1 epoch_cnt = epoch_cnt + 1
...@@ -440,8 +442,6 @@ def main(): ...@@ -440,8 +442,6 @@ def main():
model.module.memory_updater.last_updated_nid = None model.module.memory_updater.last_updated_nid = None
model.module.memory_updater.last_updated_memory = None model.module.memory_updater.last_updated_memory = None
model.module.memory_updater.last_updated_ts = None model.module.memory_updater.last_updated_ts = None
t0 = time.time()
t_s = tt.start()
sum_local_comm = 0 sum_local_comm = 0
sum_remote_comm = 0 sum_remote_comm = 0
sum_local_edge_comm = 0 sum_local_edge_comm = 0
...@@ -470,10 +470,6 @@ def main(): ...@@ -470,10 +470,6 @@ def main():
sum_remote_comm +=remote_comm[b_cnt-1] sum_remote_comm +=remote_comm[b_cnt-1]
sum_local_edge_comm +=local_edge_comm[b_cnt-1] sum_local_edge_comm +=local_edge_comm[b_cnt-1]
sum_remote_edge_comm +=remote_edge_comm[b_cnt-1] sum_remote_edge_comm +=remote_edge_comm[b_cnt-1]
tt.pre_input += tt.elapsed(t_s)
t_prep_s = time.time()
t1 = time.time()
t_s = tt.start()
if mailbox is not None: if mailbox is not None:
if(graph.efeat.device.type != 'cpu'): if(graph.efeat.device.type != 'cpu'):
edge_feats = graph.get_local_efeat(graph.eids_mapper[roots.eids.to('cpu')]).to('cuda') edge_feats = graph.get_local_efeat(graph.eids_mapper[roots.eids.to('cpu')]).to('cuda')
...@@ -490,9 +486,7 @@ def main(): ...@@ -490,9 +486,7 @@ def main():
model.train() model.train()
optimizer.zero_grad() optimizer.zero_grad()
pred_pos, pred_neg = model(mfgs,metadata,neg_samples=neg_samples,async_param = param) pred_pos, pred_neg = model(mfgs,metadata,neg_samples=args.neg_samples,async_param = param)
tt.time_forward += tt.elapsed(t_s)
t_s = tt.start()
if memory_param['historical_fix'] == True: if memory_param['historical_fix'] == True:
loss = creterion(pred_pos, torch.ones_like(pred_pos)) + 0.1*inner_prod(model.module.memory_updater.update_memory,model.module.memory_updater.prev_memory) loss = creterion(pred_pos, torch.ones_like(pred_pos)) + 0.1*inner_prod(model.module.memory_updater.update_memory,model.module.memory_updater.prev_memory)
else: else:
...@@ -502,12 +496,9 @@ def main(): ...@@ -502,12 +496,9 @@ def main():
#mailbox.handle_last_async() #mailbox.handle_last_async()
#trainloader.async_feature() #trainloader.async_feature()
#torch.cuda.synchronize() #torch.cuda.synchronize()
t2 = time.time()
loss.backward() loss.backward()
optimizer.step() optimizer.step()
tt.time_backward += tt.elapsed(t_s)
#torch.cuda.synchronize() #torch.cuda.synchronize()
t3 = time.time()
## train aps ## train aps
#y_pred = torch.cat([pred_pos, pred_neg], dim=0).sigmoid().cpu() #y_pred = torch.cat([pred_pos, pred_neg], dim=0).sigmoid().cpu()
#y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0) #y_true = torch.cat([torch.ones(pred_pos.size(0)), torch.zeros(pred_neg.size(0))], dim=0)
...@@ -515,8 +506,6 @@ def main(): ...@@ -515,8 +506,6 @@ def main():
#torch.cuda.synchronize() #torch.cuda.synchronize()
mailbox.update_shared() mailbox.update_shared()
mailbox.update_p2p() mailbox.update_p2p()
t4 = time.time()
t_s = tt.start()
""" """
if mailbox is not None: if mailbox is not None:
#src = metadata['src_pos_index'] #src = metadata['src_pos_index']
...@@ -579,7 +568,6 @@ def main(): ...@@ -579,7 +568,6 @@ def main():
print('memory comm {} shared comm {}\n'.format(tot_comm_count,tot_shared_count)) print('memory comm {} shared comm {}\n'.format(tot_comm_count,tot_shared_count))
if(e==0): if(e==0):
torch.save((local_access,remote_access,local_edge_access,remote_edge_access,local_comm,remote_comm,local_edge_comm,remote_edge_comm),'all/{}/comm/comm_{}_{}_{}_{}_{}_{}_{}_{}.pt'.format(args.dataname,args.partition,args.topk,dist.get_world_size(),dist.get_rank(),args.sample_type,args.probability,args.memory_type,args.shared_memory_ssim)) torch.save((local_access,remote_access,local_edge_access,remote_edge_access,local_comm,remote_comm,local_edge_comm,remote_edge_comm),'all/{}/comm/comm_{}_{}_{}_{}_{}_{}_{}_{}.pt'.format(args.dataname,args.partition,args.topk,dist.get_world_size(),dist.get_rank(),args.sample_type,args.probability,args.memory_type,args.shared_memory_ssim))
tt.print()
ap = 0 ap = 0
auc = 0 auc = 0
ap, auc = eval('val') ap, auc = eval('val')
...@@ -625,17 +613,6 @@ def main(): ...@@ -625,17 +613,6 @@ def main():
best_model_path = get_checkpoint_path(early_stopper.best_epoch) best_model_path = get_checkpoint_path(early_stopper.best_epoch)
model.module.load_state_dict(torch.load(best_model_path)) model.module.load_state_dict(torch.load(best_model_path))
print('best test AP:{:4f} test auc{:4f}'.format(*test_ap_list[early_stopper.best_epoch])) print('best test AP:{:4f} test auc{:4f}'.format(*test_ap_list[early_stopper.best_epoch]))
if mailbox is not None:
mailbox.reset()
model.module.memory_updater.last_updated_nid = None
ap,auc = eval('train')
val_ap,val_auc = eval('val')
ap, auc = eval('test')
eval_neg_samples = 1
if eval_neg_samples > 1:
print('\tval AP:{:4f} val MRR:{:4f} test AP:{:4f} test MRR:{:4f}\n'.format(val_ap,val_auc,ap, auc))
else:
print('\tval ap:{:4f} val auc:{:4f} test AP:{:4f} test AUC:{:4f}\n'.format(val_ap,val_auc,ap, auc))
val_list = torch.tensor(val_list) val_list = torch.tensor(val_list)
loss_list = torch.tensor(loss_list) loss_list = torch.tensor(loss_list)
print('test_dataset {} avg_time {} \n'.format(test_data.edges.shape[1],avg_time/epoch_cnt)) print('test_dataset {} avg_time {} \n'.format(test_data.edges.shape[1],avg_time/epoch_cnt))
......
...@@ -209,7 +209,6 @@ class TransfomerAttentionLayer(torch.nn.Module): ...@@ -209,7 +209,6 @@ class TransfomerAttentionLayer(torch.nn.Module):
self.layer_norm = torch.nn.LayerNorm(dim_out) self.layer_norm = torch.nn.LayerNorm(dim_out)
def forward(self, b): def forward(self, b):
t_s = tt.start()
assert(self.dim_time + self.dim_node_feat + self.dim_edge_feat > 0) assert(self.dim_time + self.dim_node_feat + self.dim_edge_feat > 0)
self.device = b.device self.device = b.device
if b.num_edges() == 0: if b.num_edges() == 0:
...@@ -217,8 +216,6 @@ class TransfomerAttentionLayer(torch.nn.Module): ...@@ -217,8 +216,6 @@ class TransfomerAttentionLayer(torch.nn.Module):
if self.dim_time > 0: if self.dim_time > 0:
time_feat = self.time_enc(b.edata['dt']) time_feat = self.time_enc(b.edata['dt'])
zero_time_feat = self.time_enc(torch.zeros(b.num_dst_nodes(), dtype=torch.float32, device=self.device)) zero_time_feat = self.time_enc(torch.zeros(b.num_dst_nodes(), dtype=torch.float32, device=self.device))
tt.time_nbrs += tt.elapsed(t_s)
t_s = tt.start()
if self.combined: if self.combined:
Q = torch.zeros((b.num_edges(), self.dim_out), device=self.device) Q = torch.zeros((b.num_edges(), self.dim_out), device=self.device)
K = torch.zeros((b.num_edges(), self.dim_out), device=self.device) K = torch.zeros((b.num_edges(), self.dim_out), device=self.device)
...@@ -301,7 +298,6 @@ class TransfomerAttentionLayer(torch.nn.Module): ...@@ -301,7 +298,6 @@ class TransfomerAttentionLayer(torch.nn.Module):
rst = b.dstdata['h'] rst = b.dstdata['h']
rst = self.w_out(rst) rst = self.w_out(rst)
rst = torch.nn.functional.relu(self.dropout(rst)) rst = torch.nn.functional.relu(self.dropout(rst))
tt.time_attention+= tt.elapsed(t_s)
return self.layer_norm(rst) return self.layer_norm(rst)
class IdentityNormLayer(torch.nn.Module): class IdentityNormLayer(torch.nn.Module):
......
...@@ -10,7 +10,7 @@ def parse_config(f): ...@@ -10,7 +10,7 @@ def parse_config(f):
return sample_param, memory_param, gnn_param, train_param return sample_param, memory_param, gnn_param, train_param
class EarlyStopMonitor(object): class EarlyStopMonitor(object):
def __init__(self, max_round=3, higher_better=True, tolerance=1e-10): def __init__(self, max_round=10, higher_better=True, tolerance=1e-10):
self.max_round = max_round self.max_round = max_round
self.num_round = 0 self.num_round = 0
......
...@@ -286,6 +286,7 @@ def to_block(graph,data, sample_out,device = torch.device('cuda'),unique = True) ...@@ -286,6 +286,7 @@ def to_block(graph,data, sample_out,device = torch.device('cuda'),unique = True)
idx = block_node_list[0,b.srcnodes()].to(torch.long) idx = block_node_list[0,b.srcnodes()].to(torch.long)
e_idx = eid_inv[col_len:col_len+elen] e_idx = eid_inv[col_len:col_len+elen]
b.srcdata['__ID'] = idx b.srcdata['__ID'] = idx
if sample_out[r].delta_ts().shape[0] > 0: if sample_out[r].delta_ts().shape[0] > 0:
b.edata['dt'] = sample_out[r].delta_ts().to(device) b.edata['dt'] = sample_out[r].delta_ts().to(device)
b.srcdata['ts'] = block_node_list[1,b.srcnodes()].to(torch.float) b.srcdata['ts'] = block_node_list[1,b.srcnodes()].to(torch.float)
......
...@@ -2,139 +2,53 @@ import os ...@@ -2,139 +2,53 @@ import os
import time import time
import torch import torch
class time_count: class time_count:
total_sample_time = 0
total_next_batch_time = 0
total_sample_core_time = 0
total_fetch_prepare_time = 0
total_comm_time = 0
total_build_time = 0
total_prepare_input_time = 0
total_build_block_time = 0
forward_embedding = 0
forward_all_to_all = 0
backward_all_to_all = 0
memory_historical = 0
memory_update = 0
memory_get = 0
memory_enc = 0
memory_historical_count = 0
time_forward = 0 time_forward = 0
time_backward = 0 time_backward = 0
time_sample = 0 time_memory_updater = 0
pre_batch = 0 time_embedding = 0
pre_input = 0 time_local_update = 0
pos_update = 0 time_memory_sync = 0
mem_update = 0 time_sample_and_build = 0
time_zero = 0 time_memory_fetch = 0
time_nbrs = 0
time_attention = 0
@staticmethod
def add_memory_count(t1,t2,t3,t4):
time_count.memory_update += t1
time_count.memory_get += t2
time_count.memory_enc += t3
time_count.memory_historical_count += t4
@staticmethod
def add_train_forward_embedding(t1):
time_count.forward_embedding += t1
@staticmethod
def add_train_foward_all_to_all(t1):
time_count.forward_all_to_all += t1
@staticmethod
def add_backward_all_to_all(t1):
time_count.backward_all_to_all += t1
@staticmethod
def add_next(t1,t2):
time_count.total_sample_time += t2
time_count.total_next_batch_time +=t1
@staticmethod
def add_batch(t1,t2,t3,t4) :
time_count.total_fetch_prepare_time +=t1
time_count.total_comm_time+=t2
time_count.total_build_time+=t3
time_count.total_prepare_input_time+=t4
@staticmethod
def add_build_block(t1,t2) :
time_count.total_sample_core_time += t1
time_count.total_build_block_time+=t2
@staticmethod
def set_zero():
time_count.total_sample_time =0
time_count.total_next_batch_time=0
time_count.total_sample_core_time =0
time_count.total_fetch_prepare_time=0
time_count.total_comm_time =0
time_count.total_build_time =0
time_count.total_prepare_input_time =0
time_count.total_build_block_time=0
time_count.forward_embedding = 0
time_count.forward_all_to_all = 0
time_count.backward_all_to_all = 0
time_count.memory_update = 0
time_count.memory_get = 0
time_count.memory_enc = 0
time_count.memory_historical_count = 0
@staticmethod
def query():
return {
"total_sample_time":time_count.total_sample_time,
"total_next_batch_time":time_count.total_next_batch_time,
"total_sample_core_time":time_count.total_sample_core_time,
"total_fetch_prepare_time":time_count.total_fetch_prepare_time,
"total_comm_time":time_count.total_comm_time,
"total_build_time":time_count.total_build_time,
"total_prepare_input_time":time_count.total_prepare_input_time,
"total_build_block_time":time_count.total_build_block_time,
"forward_embedding":time_count.forward_embedding,
"forward_all_to_all":time_count.forward_all_to_all,
"backward_all_to_all":time_count.backward_all_to_all,
"memory_update":time_count.memory_update ,
"memory_get":time_count.memory_get ,
"memory_enc":time_count.memory_enc ,
"memory_historical_count":time_count.memory_historical_count ,
}
@staticmethod @staticmethod
def _zero(): def _zero():
time_count.time_forward = 0 time_count.time_forward = 0
time_count.time_backward = 0 time_count.time_backward = 0
time_count.time_sample = 0 time_count.time_memory_updater = 0
time_count.pre_batch = 0 time_count.time_embedding = 0
time_count.pre_input = 0 time_count.time_local_update = 0
time_count.pos_update = 0 time_count.time_memory_sync = 0
time_count.mem_update = 0 time_count.time_sample_and_build = 0
time_count.time_zero = 0 time_count.time_memory_fetch = 0
time_count.time_nbrs = 0 @staticmethod
time_count.time_attention = 0 def start_gpu():
@staticmethod
def start():
# Uncomment for better breakdown timings # Uncomment for better breakdown timings
#torch.cuda.synchronize() #torch.cuda.synchronize()
return time.perf_counter() start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
return start_event,end_event
@staticmethod @staticmethod
def elapsed(start): def start():
# Uncomment for better breakdown timings return time.perf_counter(),0
#torch.cuda.synchronize() @staticmethod
return time.perf_counter() - start def elapsed_event(start_event,end_event):
if start_event.isinstance(torch.cuda.Event):
end_event.record()
end_event.synchronize()
return start_event.elapsed_time(end_event)
else:
torch.cuda.synchronize()
return time.perf_counter() - start_event
@staticmethod @staticmethod
def print(): def print():
print( print('time_count.time_forward={} time_count.time_backward={} time_count.time_memory_updater={} time_count.time_embedding={} time_count.time_local_update={} time_count.time_memory_sync={} time_count.time_sample_and_build={} time_count.time_memory_fetch={}\n'.format(
'time_forward = {} time_backward = {} time_sample = {} pre_batch = {} pre_input = {} pos_update = {} mem_update = {} time_zero = {} time_nbrs = {} time_attention = {}'.format( time_count.time_backward,
time_count.time_forward, time_count.time_memory_updater,
time_count.time_backward, time_count.time_embedding,
time_count.time_sample, time_count.time_local_update,
time_count.pre_batch, time_count.time_memory_sync,
time_count.pre_input, time_count.time_sample_and_build,
time_count.pos_update, time_count.time_memory_fetch ))
time_count.mem_update, \ No newline at end of file
time_count.time_zero,
time_count.time_nbrs,
time_count.time_attention,
)
)
\ No newline at end of file
...@@ -291,10 +291,10 @@ def load_from_speed(data,seed,top,sampler_graph_add_rev,device=torch.device('cud ...@@ -291,10 +291,10 @@ def load_from_speed(data,seed,top,sampler_graph_add_rev,device=torch.device('cud
reorder = '../../SPEED/partition/divided_nodes_seed_t2/{}/reorder.txt'.format(data) reorder = '../../SPEED/partition/divided_nodes_seed_t2/{}/reorder.txt'.format(data)
edge_i = '../../SPEED/partition/divided_nodes_seed_t2/{}/{}/{}_{}parts_top{}/edge_output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank) edge_i = '../../SPEED/partition/divided_nodes_seed_t2/{}/{}/{}_{}parts_top{}/edge_output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank)
elif partition == 'metis': elif partition == 'metis':
fnode_i = '../../SPEED/partition/divided_nodes_metis/{}/{}/{}_{}parts_top{}/output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank) fnode_i = '../../SPEED/partition/divided_nodes_metis_test/{}/{}/{}_{}parts_top{}/output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank)
fnode_share = '../../SPEED/partition/divided_nodes_metis/{}/{}/{}_{}parts_top{}/outputshared.txt'.format(data,seed,data,ctx.memory_group_size,top) fnode_share = '../../SPEED/partition/divided_nodes_metis_test/{}/{}/{}_{}parts_top{}/outputshared.txt'.format(data,seed,data,ctx.memory_group_size,top)
reorder = '../../SPEED/partition/divided_nodes_metis/{}/reorder.txt'.format(data) reorder = '../../SPEED/partition/divided_nodes_metis_test/{}/reorder.txt'.format(data)
edge_i = '../../SPEED/partition/divided_nodes_metis/{}/{}/{}_{}parts_top{}/edge_output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank) edge_i = '../../SPEED/partition/divided_nodes_metis_test/{}/{}/{}_{}parts_top{}/edge_output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank)
elif partition == 'ldg': elif partition == 'ldg':
fnode_i = '../../SPEED/partition/divided_nodes_ldg/{}/{}/{}_{}parts_top{}/output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank) fnode_i = '../../SPEED/partition/divided_nodes_ldg/{}/{}/{}_{}parts_top{}/output{}.txt'.format(data,seed,data,ctx.memory_group_size,top,ctx.memory_group_rank)
fnode_share = '../../SPEED/partition/divided_nodes_ldg/{}/{}/{}_{}parts_top{}/outputshared.txt'.format(data,seed,data,ctx.memory_group_size,top) fnode_share = '../../SPEED/partition/divided_nodes_ldg/{}/{}/{}_{}parts_top{}/outputshared.txt'.format(data,seed,data,ctx.memory_group_size,top)
......
...@@ -19,12 +19,15 @@ class LocalNegativeSampling(NegativeSampling): ...@@ -19,12 +19,15 @@ class LocalNegativeSampling(NegativeSampling):
amount: Union[int, float] = 1, amount: Union[int, float] = 1,
unique: bool = False, unique: bool = False,
src_node_list: torch.Tensor = None, src_node_list: torch.Tensor = None,
dst_node_list: torch.Tensor = None dst_node_list: torch.Tensor = None,
seed = False
): ):
super(LocalNegativeSampling,self).__init__(mode,amount,unique=unique) super(LocalNegativeSampling,self).__init__(mode,amount,unique=unique)
self.src_node_list = src_node_list.to('cpu') if src_node_list is not None else None self.src_node_list = src_node_list.to('cpu') if src_node_list is not None else None
self.dst_node_list = dst_node_list.to('cpu') if dst_node_list is not None else None self.dst_node_list = dst_node_list.to('cpu') if dst_node_list is not None else None
self.rdm = torch.Generator() self.rdm = torch.Generator()
if seed is True:
random.seed(seed)
seed = random.randint(0,100000) seed = random.randint(0,100000)
print('seed is',seed) print('seed is',seed)
ctx = DistributedContext.get_default_context() ctx = DistributedContext.get_default_context()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment