Commit 4bb7a33b by zlj

fix some bugs

parent 85e4b04c
ERROR:root:cannot import name 'libstarrygl' from 'starrygl.lib' (unknown location)
ERROR:root:unable to import libstarrygl.so, some features may not be available.
ERROR:root:cannot import name 'libstarrygl_sampler' from 'starrygl.lib' (unknown location)
ERROR:root:unable to import libstarrygl_sampler.so, some features may not be available.
libibverbs: Warning: couldn't load driver 'libsiw-rdmav34.so': libsiw-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav34.so': libbnxt_re-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libhns-rdmav34.so': libhns-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libqedr-rdmav34.so': libqedr-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libmthca-rdmav34.so': libmthca-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav34.so': libcxgb4-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav34.so': libhfi1verbs-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav34.so': libmlx4-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libefa-rdmav34.so': libefa-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav34.so': libipathverbs-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav34.so': libocrdma-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'librxe-rdmav34.so': librxe-rdmav34.so: cannot open shared object file: No such file or directory
get_neighbors consume: 14.4084us
use cuda on 0
33336461 7121936 9088019
torch.Size([2, 33336461]) 33336461
torch.Size([2, 9088019]) 9088019
torch.Size([2, 7121936]) 7121936
tensor([33336], device='cuda:0')
Epoch 0:
torch.Size([2, 1000]) 1000
Traceback (most recent call last):
  File "train_tgnn.py", line 290, in <module>
    main()
  File "train_tgnn.py", line 215, in main
    for roots,mfgs,metadata,sample_time in trainloader:
  File "/home/zlj/starrygl/starrygl/sample/data_loader.py", line 146, in __next__
    batch_data = graph_sample(self.graph,
  File "/home/zlj/starrygl/starrygl/sample/batch_data.py", line 141, in graph_sample
    return to_block(graph,data,out,mailbox,device)
  File "/home/zlj/starrygl/starrygl/sample/batch_data.py", line 81, in to_block
    ind_dict = graph.edge_attr.all_to_all_ind2ptr(dist_eid,group = group)
  File "/home/zlj/starrygl/starrygl/distributed/utils.py", line 195, in all_to_all_ind2ptr
    send_ptr = torch.ops.torch_sparse.ind2ptr(dist_index.part, self.num_parts())
TypeError: 'int' object is not callable
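
The failing call is `self.num_parts()` in all_to_all_ind2ptr: num_parts is evidently exposed as a plain int (an attribute or a property), so adding parentheses tries to call the integer itself, hence `TypeError: 'int' object is not callable`. The first three hunks below drop the parentheses at every call site. A minimal sketch of the failure mode; whether StarryGL uses a property or a bare attribute is an assumption, the observable error is the same:

```python
class DistributedTensor:
    """Minimal sketch: num_parts as a read-only property returning an int."""
    def __init__(self, num_parts: int):
        self._num_parts = num_parts

    @property
    def num_parts(self) -> int:
        # Must be read without parentheses; it already yields the int.
        return self._num_parts

t = DistributedTensor(4)
print(t.num_parts)   # 4
try:
    t.num_parts()    # calling the returned int reproduces the crash
except TypeError as e:
    print(e)         # 'int' object is not callable
```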
@@ -192,7 +192,7 @@ class DistributedTensor:
     def all_to_all_ind2ptr(self, dist_index: Union[Tensor, DistIndex],group = None) -> Dict[str, Union[List[int], Tensor]]:
         if isinstance(dist_index, Tensor):
             dist_index = DistIndex(dist_index)
-        send_ptr = torch.ops.torch_sparse.ind2ptr(dist_index.part, self.num_parts())
+        send_ptr = torch.ops.torch_sparse.ind2ptr(dist_index.part, self.num_parts)
         send_sizes = send_ptr[1:] - send_ptr[:-1]
         recv_sizes = torch.empty_like(send_sizes)
@@ -282,7 +282,7 @@ class DistributedTensor:
         index = dist_index.loc
         futs: List[torch.futures.Future] = []
-        for i in range(self.num_parts()):
+        for i in range(self.num_parts):
             mask = part_idx == i
             f = self.accessor.async_index_copy_(0, index[mask], source[mask], self.rrefs[i])
             futs.append(f)
@@ -296,7 +296,7 @@ class DistributedTensor:
         index = dist_index.loc
         futs: List[torch.futures.Future] = []
-        for i in range(self.num_parts()):
+        for i in range(self.num_parts):
             mask = part_idx == i
             f = self.accessor.async_index_add_(0, index[mask], source[mask], self.rrefs[i])
             futs.append(f)
@@ -67,8 +67,8 @@ class DistributedDataLoader:
             self._get_expected_idx(self.dataset.len)
         else:
             if torch.distributed.get_rank() == 0:
-                #self._get_expected_idx(self.dataset.len,op = dist.ReduceOp.MAX)
-                self.expected_idx = int(math.ceil(self.dataset.len/self.batch_size))
+                self._get_expected_idx(self.dataset.len,op = dist.ReduceOp.MAX)
+                #self.expected_idx = int(math.ceil(self.dataset.len/self.batch_size))
             else:
                 self.expected_idx = 0
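
Why flip to ReduceOp.MAX: with per-rank datasets of different sizes, each rank would compute a different number of batches, and a rank that exits its epoch loop early stops participating in collectives, deadlocking the rest. Reducing the per-rank batch count with MAX keeps every rank iterating the same number of times. A minimal sketch, assuming an initialized process group; get_expected_idx is a hypothetical stand-in for _get_expected_idx:

```python
import math
import torch
import torch.distributed as dist

def get_expected_idx(local_len: int, batch_size: int, op=dist.ReduceOp.MAX) -> int:
    # Each rank computes its own batch count, then all ranks agree on the
    # max so nobody leaves the loop (and its collectives) early.
    expected = torch.tensor(math.ceil(local_len / batch_size))
    dist.all_reduce(expected, op=op)
    return int(expected.item())
```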
@@ -152,8 +152,9 @@ class DistributedDataLoader:
                 self.recv_idxs += 1
                 assert batch_data is not None
                 end_event.record()
+                torch.cuda.synchronize()
                 sample_time = start_event.elapsed_time(end_event)
-                return batch_data,sample_time
+                return *batch_data,sample_time
             else :
                 raise StopIteration
         if self.queue_size > 0 :
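
The starred return matches the consumer in train_tgnn.py, which unpacks four values per iteration (`for roots,mfgs,metadata,sample_time in trainloader`); `return batch_data,sample_time` would hand back a nested pair instead. A small sketch of the difference (the three field names are placeholders, and `return *x, y` needs Python 3.8+):

```python
def fetch_nested():
    batch_data = ('roots', 'mfgs', 'metadata')   # placeholder fields
    return batch_data, 1.23    # -> (('roots', 'mfgs', 'metadata'), 1.23): 2 items

def fetch_flat():
    batch_data = ('roots', 'mfgs', 'metadata')
    return *batch_data, 1.23   # -> ('roots', 'mfgs', 'metadata', 1.23): 4 items

roots, mfgs, metadata, sample_time = fetch_flat()  # matches the training loop
```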
@@ -16,6 +16,7 @@ class DistributedGraphStore:
         self.sample_graph = pdata.sample_graph
         self.nids_mapper = build_mapper(nids=pdata.ids.to(device)).dist.to('cpu')
         self.eids_mapper = build_mapper(nids=pdata.eids.to(device)).dist.to('cpu')
+        torch.cuda.empty_cache()
         if all_on_gpu:
             self.nids_mapper = self.nids_mapper.to(device)
             self.eids_mapper = self.eids_mapper.to(device)
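
torch.cuda.empty_cache() is added right after the mappers are built on the GPU and copied back to CPU: tensors freed on device stay in PyTorch's caching allocator rather than being returned to CUDA, which matters when other processes share the card. A minimal illustration:

```python
import torch

x = torch.empty(1024, 1024, device='cuda')  # allocator reserves a block from CUDA
del x                                       # block returns to PyTorch's cache, not the driver
print(torch.cuda.memory_reserved())         # still non-zero: the cache keeps the block
torch.cuda.empty_cache()                    # release unused cached blocks back to CUDA
print(torch.cuda.memory_reserved())         # now (close to) zero
```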
@@ -45,7 +45,7 @@ class SharedMailBox():
         if self._ctx._use_rpc is True:
             self.rref = rpc.RRef(self)
             self.rrefs = self._ctx.all_gather_remote_objects(self.rref)
-            self.partptr = torch.tensor([ ((i & 0xFFFF)<<48) for i in range(self.num_parts+1) ],device = device)
+            self.partptr = torch.tensor([ ((i & 0xFFFF)<<48) for i in range(self.num_parts+1) ],device = device)
     def reset(self):
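
The partptr hunk appears to be a whitespace-only change as rendered, but the line itself documents the id layout this commit relies on: a partition id packed into the top 16 bits of a 64-bit id, matching the `dist_index.part` / `dist_index.loc` accessors in the DistributedTensor hunks above. A sketch of that encoding under the assumed 16/48-bit split (the exact DistIndex layout is inferred from this diff, not confirmed):

```python
import torch

LOC_BITS = 48  # assumed: high 16 bits = partition id, low 48 bits = local index

def pack(part: torch.Tensor, loc: torch.Tensor) -> torch.Tensor:
    # Mirrors ((i & 0xFFFF) << 48) from the partptr line above.
    return ((part & 0xFFFF) << LOC_BITS) | loc

def unpack(dist_ids: torch.Tensor):
    part = dist_ids >> LOC_BITS             # cf. DistIndex.part
    loc = dist_ids & ((1 << LOC_BITS) - 1)  # cf. DistIndex.loc
    return part, loc

ids = pack(torch.tensor([0, 1, 3]), torch.tensor([7, 8, 9]))
print(unpack(ids))  # (tensor([0, 1, 3]), tensor([7, 8, 9]))
```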
@@ -79,7 +79,7 @@ def main():
     val_ts = torch.masked_select(graph.edge_ts,pdata.val_mask.to(graph.edge_index.device))
     test_data = torch.masked_select(graph.edge_index,pdata.test_mask.to(graph.edge_index.device)).reshape(2,-1)
     test_ts = torch.masked_select(graph.edge_ts,pdata.test_mask.to(graph.edge_index.device))
-    print(train_data.shape[1],val_data.shape[1],test_data.shape[1])
+    #print(train_data.shape[1],val_data.shape[1],test_data.shape[1])
     train_data = DataSet(edges = train_data,ts =train_ts,eids = torch.nonzero(pdata.train_mask).view(-1))
     test_data = DataSet(edges = test_data,ts =test_ts,eids = torch.nonzero(pdata.test_mask).view(-1))
     val_data = DataSet(edges = val_data,ts = val_ts,eids = torch.nonzero(pdata.val_mask).view(-1))
@@ -259,6 +259,7 @@ def main():
             start_event.record()
             mailbox.set_mailbox_all_to_all(index,memory,memory_ts,mail,mail_ts,reduce_Op = 'max')
             end_event.record()
+            torch.cuda.synchronize()
             write_back_time += start_event.elapsed_time(end_event)/1000
             torch.cuda.synchronize()
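
This synchronize() and the one added in the data loader exist for the same reason: CUDA events are recorded asynchronously, and Event.elapsed_time() is only valid once both events have actually completed on the stream; reading it right after record() can fail with an event-not-ready error. A minimal sketch:

```python
import torch

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
x = torch.randn(4096, 4096, device='cuda')

start.record()            # enqueued on the current stream, not yet executed
y = x @ x                 # asynchronous kernel launch
end.record()
torch.cuda.synchronize()  # wait until both events have fired
print(start.elapsed_time(end), 'ms')  # safe to read only after the sync
```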
@@ -270,7 +271,7 @@ def main():
         ap, auc = eval('val')
         print('\ttrain loss:{:.4f} train ap:{:4f} val ap:{:4f} val auc:{:4f}'.format(total_loss,train_ap, ap, auc))
         print('\ttotal time:{:.2f}s prep time:{:.2f}s'.format(time.time()-epoch_start_time, time_prep))
-        print('\t fetch time:{:.2f}s write back time:{:.2f}s',format(fetch_time,write_back_time))
+        print('\t fetch time:{:.2f}s write back time:{:.2f}s'.format(fetch_time,write_back_time))
         model.eval()
         if mailbox is not None:
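
The last hunk is a one-character fix. With a comma before format, print() received two arguments, the second being a call to the builtin format(); that builtin requires a format-spec string as its second argument, so passing write_back_time (a float) raises a TypeError and the placeholders are never filled. With a dot it becomes str.format on the template:

```python
fetch_time, write_back_time = 1.5, 0.25

# Buggy form: two arguments to print(), and builtin format(1.5, 0.25)
# raises TypeError (argument 2 must be str, not float).
# print('fetch time:{:.2f}s write back time:{:.2f}s', format(fetch_time, write_back_time))

# Fixed form: str.format substitutes the placeholders.
print('fetch time:{:.2f}s write back time:{:.2f}s'.format(fetch_time, write_back_time))
```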