Commit 4fa85d33 by xxx

synchronization

parent ab4e56d0
......@@ -2,53 +2,89 @@ import matplotlib.pyplot as plt
import numpy as np
import torch
# 读取文件内容
probability_values = [0.1]#[0.1,0.05,0.01,0]
data_values = ['WikiTalk'] # 存储从文件中读取的数据
seed = ['12357']#,'12347','63377','53473','54763']
import os
probability_values = [0.1]#][0.1,0.05,0.01,0]
data_values = ['WIKI','LASTFM','WikiTalk','StackOverflow','GDELT'] # 存储从文件中读取的数据
seed = ['13357','12347','53473','54763','12347','63377','53473','54763']
partition = 'ours_shared'
# 从文件中读取数据,假设数据存储在文件 data.txt 中
#all/"$data"/"$partitions"-ours_shared-0.01-"$mem"-"$ssim"-"$sample".out
partitions=4
partitions=12
topk=0.01
mem='historical-0.3'#'historical'
model0='APAN'
model0='TGN'
def average(l):
return sum(l)/len(l)
for data in data_values:
ap_list = []
comm_list = []
for sd in seed :
for p in probability_values:
for p in probability_values:
for data in data_values:
ap_list = []
comm_list = []
test_time = []
train_time = []
total_communication = []
shared_synchronize = []
for sd in seed :
if data == 'WIKI' or data =='LASTFM':
model = model0
else:
model = model0+'_large'
if p == 1:
file = 'all_{}/{}/{}/{}-{}-{}-{}-recent.out'.format(sd,data,model,partitions,partition,topk,mem)
file = '../examples/all_{}/{}/{}/{}-{}-{}-{}-recent.out'.format(sd,data,model,partitions,partition,topk,mem)
else:
file = 'all_{}/{}/{}/{}-{}-{}-{}-boundery_recent_decay-{}.out'.format(sd,data,model,partitions,partition,topk,mem,p)
file = '../examples/all_{}/{}/{}/{}-{}-{}-{}-boundery_recent_decay-{}.out'.format(sd,data,model,partitions,partition,topk,mem,p)
#print(file)
prefix = "val ap:"
max_val_ap = 0
test_ap = 0
with open(file, 'r') as file:
for line in file:
if line.find('Epoch 50:')!=-1:
break
if line.find(prefix)!=-1:
pos = line.find(prefix)+len(prefix)
posr = line.find(' ',pos)
#print(line[pos:posr])
val_ap = float(line[pos:posr])
pos = line.find("test ap ")+len("test ap ")
posr = line.find(' ',pos)
#print(line[pos:posr])
_test_ap = float(line[pos:posr])
if(val_ap>max_val_ap):
max_val_ap = val_ap
test_ap = _test_ap
ap_list.append(test_ap)
print('data {} model {} ap: {}'.format(data,model,ap_list))
#if
#print(file)
if os.path.exists(file):
with open(file, 'r') as file:
_total_communication = []
_shared_synchronize = []
for line in file:
#if line.find('Epoch 50:')!=-1:
# break
if line.find(prefix)!=-1:
pos = line.find(prefix)+len(prefix)
posr = line.find(' ',pos)
#print(line[pos:posr])
val_ap = float(line[pos:posr])
pos = line.find("test ap ")+len("test ap ")
posr = line.find(' ',pos)
#print(line[pos:posr])
_test_ap = float(line[pos:posr])
if(val_ap>max_val_ap):
max_val_ap = val_ap
test_ap = _test_ap
elif line.find('avg_time ')!=-1:
pl = line.find('avg_time ') + len('avg_time ')
pr = line.find(' test time')
train_time.append(float(line[pl:pr]))
test_time.append(float(line[pr+len(' test time '):]))
#print(line)
ap_list.append(test_ap)
total_communication.append(average(_total_communication))
shared_synchronize.append(average(_shared_synchronize))
elif line.find('remote node number tensor([')!=-1:
pl = line.find('remote node number tensor([')+len('remote node number tensor([')
pr = line.find('])',pl)
_total_communication.append(int(line[pl:pr]))
#if(p==0):
#print(file)
#print(line)
elif line.find('shared comm tensor([')!=-1:
pl = line.find('shared comm tensor([')+len('shared comm tensor([')
pr = line.find('])',pl)
_shared_synchronize.append(int(line[pl:pr]))
else:
print(file)
if len(ap_list) > 0:
#print('prob {} data {} model {} remote volume : {} synchronize volume : {}'.format(p,data,model,average(total_communication),average(shared_synchronize)))
print('prob {} data {} model {} ap: {} train_time: {} eval time: {} remote volume : {} synchronize volume : {}'.format(p,data,model,average(ap_list),average(train_time),average(test_time),average(total_communication),average(shared_synchronize)))
......@@ -2,14 +2,14 @@
#跑了4卡的TaoBao
# 定义数组变量
seed=$1
addr="192.168.1.106"
addr="192.168.1.107"
partition_params=("ours" )
#"metis" "ldg" "random")
#("ours" "metis" "ldg" "random")
partitions="4"
partitions="12"
node_per="4"
nnodes="1"
node_rank="0"
nnodes="3"
node_rank="1"
probability_params=("0.1")
sample_type_params=("boundery_recent_decay")
#sample_type_params=("recent" "boundery_recent_decay") #"boundery_recent_uniform")
......@@ -163,11 +163,11 @@ for data in "${data_param[@]}"; do
done
done
done
data_param=("StackOverflow" "GDELT")
for data in "${data_param[@]}"; do
model="TGN"
model="TGN_large"
if [ "$data" = "WIKI" ] || [ "$data" = "REDDIT" ] || [ "$data" = "LASTFM" ]; then
model="TGN_large"
model="TGN"
#continue
fi
#model="APAN"
......
......@@ -110,7 +110,7 @@ if not 'WORLD_SIZE' in os.environ:
os.environ["WORLD_SIZE"] = str(args.world_size)
os.environ["LOCAL_RANK"] = str(args.local_rank)
if not 'MASTER_ADDR' in os.environ:
os.environ["MASTER_ADDR"] = '192.168.2.107'
os.environ["MASTER_ADDR"] = '192.168.1.107'
if not 'MASTER_PORT' in os.environ:
os.environ["MASTER_PORT"] = '9337'
......@@ -379,7 +379,10 @@ def main():
with torch.no_grad():
total_loss = 0
signal = torch.tensor([0],dtype = int,device = device)
batch_cnt = 0
for roots,mfgs,metadata in loader:
print(batch_cnt)
batch_cnt = batch_cnt+1
"""
if ctx.memory_group == 0:
pred_pos, pred_neg = model(mfgs,metadata,neg_samples=neg_samples)
......@@ -450,15 +453,15 @@ def main():
"""
ap = torch.empty([1])
auc_mrr = torch.empty([1])
if(ctx.memory_group==0):
world_size = dist.get_world_size()
ap[0] = torch.tensor(aps).mean()
auc_mrr[0] = torch.tensor(aucs_mrrs).mean()#float(aucs_mrrs.clone().mean())
print('mode: {} {} {}'.format(mode,ap,auc_mrr))
dist.all_reduce(ap,group = ctx.gloo_group)
ap/=ctx.memory_group_size
dist.all_reduce(auc_mrr,group=ctx.gloo_group)
auc_mrr/=ctx.memory_group_size
#if(ctx.memory_group==0):
world_size = dist.get_world_size()
ap[0] = torch.tensor(aps).mean()
auc_mrr[0] = torch.tensor(aucs_mrrs).mean()#float(aucs_mrrs.clone().mean())
print('mode: {} {} {}\n'.format(mode,ap,auc_mrr))
dist.all_reduce(ap,group = ctx.gloo_group)
ap/=ctx.memory_group_size
dist.all_reduce(auc_mrr,group=ctx.gloo_group)
auc_mrr/=ctx.memory_group_size
dist.broadcast(ap,0,group=ctx.gloo_group)
dist.broadcast(auc_mrr,0,group=ctx.gloo_group)
return ap.item(), auc_mrr.item()
......@@ -641,10 +644,13 @@ def main():
tt.weight_count_remote=0
tt.ssim_cnt=0
ap, auc = eval('val')
torch.cuda.synchronize()
print('finish val')
#torch.cuda.synchronize()
print('start')
t_test = time.time()
test_ap,test_auc = eval('test')
torch.cuda.synchronize()
print('test')
test_ap,test_auc = 0,0#eval('test')
#torch.cuda.synchronize()
t_test = time.time() - t_test
total_test_time += t_test
test_ap_list.append((test_ap,test_auc))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment