Commit 4bb7a33b by zlj

fix some bugs

parent 85e4b04c
ERROR:root:cannot import name 'libstarrygl' from 'starrygl.lib' (unknown location)
ERROR:root:unable to import libstarrygl.so, some features may not be available.
ERROR:root:cannot import name 'libstarrygl_sampler' from 'starrygl.lib' (unknown location)
ERROR:root:unable to import libstarrygl_sampler.so, some features may not be available.
libibverbs: Warning: couldn't load driver 'libsiw-rdmav34.so': libsiw-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libbnxt_re-rdmav34.so': libbnxt_re-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libhns-rdmav34.so': libhns-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libqedr-rdmav34.so': libqedr-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libmthca-rdmav34.so': libmthca-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libcxgb4-rdmav34.so': libcxgb4-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libhfi1verbs-rdmav34.so': libhfi1verbs-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libmlx4-rdmav34.so': libmlx4-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libefa-rdmav34.so': libefa-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libipathverbs-rdmav34.so': libipathverbs-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libocrdma-rdmav34.so': libocrdma-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory
libibverbs: Warning: couldn't load driver 'librxe-rdmav34.so': librxe-rdmav34.so: cannot open shared object file: No such file or directory
get_neighbors consume: 14.4084us
use cuda on 0
33336461 7121936 9088019
torch.Size([2, 33336461]) 33336461
torch.Size([2, 9088019]) 9088019
torch.Size([2, 7121936]) 7121936
tensor([33336], device='cuda:0')
Epoch 0:
torch.Size([2, 1000]) 1000
Traceback (most recent call last):
  File "train_tgnn.py", line 290, in <module>
    main()
  File "train_tgnn.py", line 215, in main
    for roots,mfgs,metadata,sample_time in trainloader:
  File "/home/zlj/starrygl/starrygl/sample/data_loader.py", line 146, in __next__
    batch_data = graph_sample(self.graph,
  File "/home/zlj/starrygl/starrygl/sample/batch_data.py", line 141, in graph_sample
    return to_block(graph,data,out,mailbox,device)
  File "/home/zlj/starrygl/starrygl/sample/batch_data.py", line 81, in to_block
    ind_dict = graph.edge_attr.all_to_all_ind2ptr(dist_eid,group = group)
  File "/home/zlj/starrygl/starrygl/distributed/utils.py", line 195, in all_to_all_ind2ptr
    send_ptr = torch.ops.torch_sparse.ind2ptr(dist_index.part, self.num_parts())
TypeError: 'int' object is not callable
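
The failing call is `self.num_parts()` in all_to_all_ind2ptr: num_parts is evidently exposed as a plain int (an attribute or a property), so adding parentheses tries to call the integer itself, hence `TypeError: 'int' object is not callable`. The first three hunks below drop the parentheses at every call site. A minimal sketch of the failure mode; whether StarryGL uses a property or a bare attribute is an assumption, the observable error is the same:

```python
class DistributedTensor:
    """Minimal sketch: num_parts as a read-only property returning an int."""
    def __init__(self, num_parts: int):
        self._num_parts = num_parts

    @property
    def num_parts(self) -> int:
        # Must be read without parentheses; it already yields the int.
        return self._num_parts

t = DistributedTensor(4)
print(t.num_parts)   # 4
try:
    t.num_parts()    # calling the returned int reproduces the crash
except TypeError as e:
    print(e)         # 'int' object is not callable
```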
@@ -192,7 +192,7 @@ class DistributedTensor:
     def all_to_all_ind2ptr(self, dist_index: Union[Tensor, DistIndex],group = None) -> Dict[str, Union[List[int], Tensor]]:
         if isinstance(dist_index, Tensor):
             dist_index = DistIndex(dist_index)
-        send_ptr = torch.ops.torch_sparse.ind2ptr(dist_index.part, self.num_parts())
+        send_ptr = torch.ops.torch_sparse.ind2ptr(dist_index.part, self.num_parts)
         send_sizes = send_ptr[1:] - send_ptr[:-1]
         recv_sizes = torch.empty_like(send_sizes)
@@ -282,7 +282,7 @@ class DistributedTensor:
         index = dist_index.loc
         futs: List[torch.futures.Future] = []
-        for i in range(self.num_parts()):
+        for i in range(self.num_parts):
             mask = part_idx == i
             f = self.accessor.async_index_copy_(0, index[mask], source[mask], self.rrefs[i])
             futs.append(f)
@@ -296,7 +296,7 @@ class DistributedTensor:
         index = dist_index.loc
         futs: List[torch.futures.Future] = []
-        for i in range(self.num_parts()):
+        for i in range(self.num_parts):
             mask = part_idx == i
             f = self.accessor.async_index_add_(0, index[mask], source[mask], self.rrefs[i])
             futs.append(f)
@@ -67,8 +67,8 @@ class DistributedDataLoader:
             self._get_expected_idx(self.dataset.len)
         else:
             if torch.distributed.get_rank() == 0:
-                #self._get_expected_idx(self.dataset.len,op = dist.ReduceOp.MAX)
-                self.expected_idx = int(math.ceil(self.dataset.len/self.batch_size))
+                self._get_expected_idx(self.dataset.len,op = dist.ReduceOp.MAX)
+                #self.expected_idx = int(math.ceil(self.dataset.len/self.batch_size))
             else:
                 self.expected_idx = 0
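
Why flip to ReduceOp.MAX: with per-rank datasets of different sizes, each rank would compute a different number of batches, and a rank that exits its epoch loop early stops participating in collectives, deadlocking the rest. Reducing the per-rank batch count with MAX keeps every rank iterating the same number of times. A minimal sketch, assuming an initialized process group; get_expected_idx is a hypothetical stand-in for _get_expected_idx:

```python
import math
import torch
import torch.distributed as dist

def get_expected_idx(local_len: int, batch_size: int, op=dist.ReduceOp.MAX) -> int:
    # Each rank computes its own batch count, then all ranks agree on the
    # max so nobody leaves the loop (and its collectives) early.
    expected = torch.tensor(math.ceil(local_len / batch_size))
    dist.all_reduce(expected, op=op)
    return int(expected.item())
```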
@@ -152,8 +152,9 @@ class DistributedDataLoader:
                 self.recv_idxs += 1
                 assert batch_data is not None
                 end_event.record()
+                torch.cuda.synchronize()
                 sample_time = start_event.elapsed_time(end_event)
-                return batch_data,sample_time
+                return *batch_data,sample_time
             else :
                 raise StopIteration
         if self.queue_size > 0 :
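
The starred return matches the consumer in train_tgnn.py, which unpacks four values per iteration (`for roots,mfgs,metadata,sample_time in trainloader`); `return batch_data,sample_time` would hand back a nested pair instead. A small sketch of the difference (the three field names are placeholders, and `return *x, y` needs Python 3.8+):

```python
def fetch_nested():
    batch_data = ('roots', 'mfgs', 'metadata')   # placeholder fields
    return batch_data, 1.23    # -> (('roots', 'mfgs', 'metadata'), 1.23): 2 items

def fetch_flat():
    batch_data = ('roots', 'mfgs', 'metadata')
    return *batch_data, 1.23   # -> ('roots', 'mfgs', 'metadata', 1.23): 4 items

roots, mfgs, metadata, sample_time = fetch_flat()  # matches the training loop
```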
@@ -16,6 +16,7 @@ class DistributedGraphStore:
         self.sample_graph = pdata.sample_graph
         self.nids_mapper = build_mapper(nids=pdata.ids.to(device)).dist.to('cpu')
         self.eids_mapper = build_mapper(nids=pdata.eids.to(device)).dist.to('cpu')
+        torch.cuda.empty_cache()
         if all_on_gpu:
             self.nids_mapper = self.nids_mapper.to(device)
             self.eids_mapper = self.eids_mapper.to(device)
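
torch.cuda.empty_cache() is added right after the mappers are built on the GPU and copied back to CPU: tensors freed on device stay in PyTorch's caching allocator rather than being returned to CUDA, which matters when other processes share the card. A minimal illustration:

```python
import torch

x = torch.empty(1024, 1024, device='cuda')  # allocator reserves a block from CUDA
del x                                       # block returns to PyTorch's cache, not the driver
print(torch.cuda.memory_reserved())         # still non-zero: the cache keeps the block
torch.cuda.empty_cache()                    # release unused cached blocks back to CUDA
print(torch.cuda.memory_reserved())         # now (close to) zero
```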
@@ -45,7 +45,7 @@ class SharedMailBox():
         if self._ctx._use_rpc is True:
             self.rref = rpc.RRef(self)
             self.rrefs = self._ctx.all_gather_remote_objects(self.rref)
-            self.partptr = torch.tensor([ ((i & 0xFFFF)<<48) for i in range(self.num_parts+1) ],device = device)
+            self.partptr = torch.tensor([ ((i & 0xFFFF)<<48) for i in range(self.num_parts+1) ],device = device)
     def reset(self):
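
The partptr hunk appears to be a whitespace-only change as rendered, but the line itself documents the id layout this commit relies on: a partition id packed into the top 16 bits of a 64-bit id, matching the `dist_index.part` / `dist_index.loc` accessors in the DistributedTensor hunks above. A sketch of that encoding under the assumed 16/48-bit split (the exact DistIndex layout is inferred from this diff, not confirmed):

```python
import torch

LOC_BITS = 48  # assumed: high 16 bits = partition id, low 48 bits = local index

def pack(part: torch.Tensor, loc: torch.Tensor) -> torch.Tensor:
    # Mirrors ((i & 0xFFFF) << 48) from the partptr line above.
    return ((part & 0xFFFF) << LOC_BITS) | loc

def unpack(dist_ids: torch.Tensor):
    part = dist_ids >> LOC_BITS             # cf. DistIndex.part
    loc = dist_ids & ((1 << LOC_BITS) - 1)  # cf. DistIndex.loc
    return part, loc

ids = pack(torch.tensor([0, 1, 3]), torch.tensor([7, 8, 9]))
print(unpack(ids))  # (tensor([0, 1, 3]), tensor([7, 8, 9]))
```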
@@ -79,7 +79,7 @@ def main():
     val_ts = torch.masked_select(graph.edge_ts,pdata.val_mask.to(graph.edge_index.device))
     test_data = torch.masked_select(graph.edge_index,pdata.test_mask.to(graph.edge_index.device)).reshape(2,-1)
     test_ts = torch.masked_select(graph.edge_ts,pdata.test_mask.to(graph.edge_index.device))
-    print(train_data.shape[1],val_data.shape[1],test_data.shape[1])
+    #print(train_data.shape[1],val_data.shape[1],test_data.shape[1])
     train_data = DataSet(edges = train_data,ts =train_ts,eids = torch.nonzero(pdata.train_mask).view(-1))
     test_data = DataSet(edges = test_data,ts =test_ts,eids = torch.nonzero(pdata.test_mask).view(-1))
     val_data = DataSet(edges = val_data,ts = val_ts,eids = torch.nonzero(pdata.val_mask).view(-1))
@@ -259,6 +259,7 @@ def main():
             start_event.record()
             mailbox.set_mailbox_all_to_all(index,memory,memory_ts,mail,mail_ts,reduce_Op = 'max')
             end_event.record()
+            torch.cuda.synchronize()
             write_back_time += start_event.elapsed_time(end_event)/1000
             torch.cuda.synchronize()
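
This synchronize() and the one added in the data loader exist for the same reason: CUDA events are recorded asynchronously, and Event.elapsed_time() is only valid once both events have actually completed on the stream; reading it right after record() can fail with an event-not-ready error. A minimal sketch:

```python
import torch

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
x = torch.randn(4096, 4096, device='cuda')

start.record()            # enqueued on the current stream, not yet executed
y = x @ x                 # asynchronous kernel launch
end.record()
torch.cuda.synchronize()  # wait until both events have fired
print(start.elapsed_time(end), 'ms')  # safe to read only after the sync
```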
@@ -270,7 +271,7 @@ def main():
         ap, auc = eval('val')
         print('\ttrain loss:{:.4f} train ap:{:4f} val ap:{:4f} val auc:{:4f}'.format(total_loss,train_ap, ap, auc))
         print('\ttotal time:{:.2f}s prep time:{:.2f}s'.format(time.time()-epoch_start_time, time_prep))
-        print('\t fetch time:{:.2f}s write back time:{:.2f}s',format(fetch_time,write_back_time))
+        print('\t fetch time:{:.2f}s write back time:{:.2f}s'.format(fetch_time,write_back_time))
         model.eval()
         if mailbox is not None:
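
The last hunk is a one-character fix. With a comma before format, print() received two arguments, the second being a call to the builtin format(); that builtin requires a format-spec string as its second argument, so passing write_back_time (a float) raises a TypeError and the placeholders are never filled. With a dot it becomes str.format on the template:

```python
fetch_time, write_back_time = 1.5, 0.25

# Buggy form: two arguments to print(), and builtin format(1.5, 0.25)
# raises TypeError (argument 2 must be str, not float).
# print('fetch time:{:.2f}s write back time:{:.2f}s', format(fetch_time, write_back_time))

# Fixed form: str.format substitutes the placeholders.
print('fetch time:{:.2f}s write back time:{:.2f}s'.format(fetch_time, write_back_time))
```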