temp

d8158bb9 · zlj · b792c909 · d8158bb9 · d8158bb9 · d8158bb9
Commit d8158bb9 authored Jan 08, 2024 by zlj
76 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -174,3 +174,7 @@ cython_debug/
 /dataset
 /test_*
 /*.ipynb
+saved_models/
+saved_checkpoints/
+# Timestamped editor snapshot copies (e.g. .history/CMakeLists_20240108192330.txt); not source
+.history/
--- a/.history/.gitignore_20240108192321
+++ b/.history/.gitignore_20240108192321
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# IDE temporary files (generated by IDEs like CLion, etc.)
+.idea/
+cmake-build-*/
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+*.pt
+/*.out
+/a.out
+/third_party
+/.vscode
+/run_route.py
+/dataset
+/test_*
+/*.ipynb
--- a/.history/.gitignore_20240108195137
+++ b/.history/.gitignore_20240108195137
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# IDE temporary files (generated by IDEs like CLion, etc.)
+.idea/
+cmake-build-*/
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+*.pt
+/*.out
+/a.out
+/third_party
+/.vscode
+/run_route.py
+/dataset
+/test_*
+/*.ipynb
+saved_models/
+saved_checkpoints/
\ No newline at end of file
--- a/.history/.gitignore_20240108195138
+++ b/.history/.gitignore_20240108195138
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# IDE temporary files (generated by IDEs like CLion, etc.)
+.idea/
+cmake-build-*/
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+*.pt
+/*.out
+/a.out
+/third_party
+/.vscode
+/run_route.py
+/dataset
+/test_*
+/*.ipynb
+saved_models/
+saved_checkpoints/
\ No newline at end of file
--- a/.history/.gitmodules_20240108192330
+++ b/.history/.gitmodules_20240108192330
+# Merge conflict resolved in favour of the third_party/ layout.
+[submodule "third_party/ldg_partition"]
+	path = third_party/ldg_partition
+	url = https://gitee.com/onlynagesha/graph-partition-v4
+[submodule "third_party/METIS"]
+	path = third_party/METIS
+	url = https://github.com/KarypisLab/METIS
+	branch = v5.1.1-DistDGL-v0.5
--- a/.history/.gitmodules_20240108192843
+++ b/.history/.gitmodules_20240108192843
+[submodule "third_party/ldg_partition"]
+	path = third_party/ldg_partition
+	url = https://gitee.com/onlynagesha/graph-partition-v4
+[submodule "third_party/METIS"]
+	path = third_party/METIS
+	url = https://github.com/KarypisLab/METIS
+	branch = v5.1.1-DistDGL-v0.5
--- a/.history/CMakeLists_20240108192330.txt
+++ b/.history/CMakeLists_20240108192330.txt
+cmake_minimum_required(VERSION 3.15)
+project(starrygl VERSION 0.1)
+# Optional components; each WITH_* switch gates the matching if() block below.
+option(WITH_PYTHON "Link to Python when building" ON)
+option(WITH_CUDA "Link to CUDA when building" ON)
+option(WITH_METIS "Link to METIS when building" ON)
+# Off by default: the mt-metis block links prebuilt lib*.a archives that must
+# already exist under third_party/mt-metis/lib.
+option(WITH_MTMETIS "Link to multi-threaded METIS when building" OFF)
+option(WITH_LDG "Link to (multi-threaded optionally) LDG when building" ON)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+find_package(OpenMP REQUIRED)
+link_libraries(OpenMP::OpenMP_CXX)
+find_package(Torch REQUIRED)
+include_directories(${TORCH_INCLUDE_DIRS})
+add_compile_options(${TORCH_CXX_FLAGS})
+if(WITH_PYTHON)
+    add_definitions(-DWITH_PYTHON)
+    find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+    include_directories(${Python3_INCLUDE_DIRS})
+endif()
+if(WITH_CUDA)
+    add_definitions(-DWITH_CUDA)
+    add_definitions(-DWITH_UVM)
+    find_package(CUDA REQUIRED)
+    include_directories(${CUDA_INCLUDE_DIRS})
+    set(CUDA_LIBRARIES "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart.so")
+    file(GLOB_RECURSE UVM_SRCS "csrc/uvm/*.cpp")
+    add_library(uvm_ops SHARED ${UVM_SRCS})
+    target_link_libraries(uvm_ops PRIVATE ${TORCH_LIBRARIES})
+endif()
+if(WITH_METIS)
+    # METIS and its bundled GKlib are compiled from the sources under
+    # third_party/METIS directly into metis_partition, so no prebuilt
+    # lib*.a archives have to be located or linked.
+    add_definitions(-DWITH_METIS)
+    set(METIS_DIR "${CMAKE_SOURCE_DIR}/third_party/METIS")
+    set(METIS_GKLIB_DIR "${METIS_DIR}/GKlib")
+    file(GLOB METIS_SRCS "${METIS_DIR}/libmetis/*.c")
+    file(GLOB METIS_GKLIB_SRCS "${METIS_GKLIB_DIR}/*.c")
+    if (MSVC)
+        # GKlib keeps its Windows-only sources in a win32/ subdirectory.
+        file(GLOB METIS_GKLIB_WIN32_SRCS "${METIS_GKLIB_DIR}/win32/*.c")
+        set(METIS_GKLIB_SRCS ${METIS_GKLIB_SRCS} ${METIS_GKLIB_WIN32_SRCS})
+    endif()
+    add_library(metis_partition SHARED
+        "csrc/partition/metis.cpp"
+        ${METIS_SRCS} ${METIS_GKLIB_SRCS}
+    )
+    target_include_directories(metis_partition PRIVATE "${METIS_DIR}/include")
+    target_include_directories(metis_partition PRIVATE "${METIS_GKLIB_DIR}")
+    if (MSVC)
+        target_include_directories(metis_partition PRIVATE "${METIS_GKLIB_DIR}/win32")
+    endif()
+    # Type-width macros consumed by the METIS sources (64-bit indices, 32-bit reals).
+    target_compile_definitions(metis_partition PRIVATE -DIDXTYPEWIDTH=64)
+    target_compile_definitions(metis_partition PRIVATE -DREALTYPEWIDTH=32)
+    target_compile_options(metis_partition PRIVATE -O3)
+    target_link_libraries(metis_partition PRIVATE ${TORCH_LIBRARIES})
+    if (UNIX)
+        # The C math library is a separate link dependency on UNIX.
+        target_link_libraries(metis_partition PRIVATE m)
+    endif()
+endif()
+if(WITH_MTMETIS)
+    add_definitions(-DWITH_MTMETIS)
+    set(MTMETIS_DIR "${CMAKE_SOURCE_DIR}/third_party/mt-metis")
+    set(MTMETIS_INCLUDE_DIRS "${MTMETIS_DIR}/include")
+    file(GLOB_RECURSE MTMETIS_LIBRARIES "${MTMETIS_DIR}/lib/lib*.a")
+    include_directories(${MTMETIS_INCLUDE_DIRS})
+    add_library(mtmetis_partition SHARED "csrc/partition/mtmetis.cpp")
+    target_link_libraries(mtmetis_partition PRIVATE ${TORCH_LIBRARIES})
+    target_link_libraries(mtmetis_partition PRIVATE ${MTMETIS_LIBRARIES})
+    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_VERTICES)
+    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_EDGES)
+    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_WEIGHTS)
+    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_PARTITIONS)
+endif()
+if (WITH_LDG)
+    # Imports neighbor-clustering based (e.g. LDG algorithm) graph partitioning implementation
+    add_definitions(-DWITH_LDG)
+    # Location of the partitioning sources, added below as a CMake subdirectory.
+    set(LDG_DIR "third_party/ldg_partition")
+    add_library(ldg_partition SHARED "csrc/partition/ldg.cpp")
+    target_link_libraries(ldg_partition PRIVATE ${TORCH_LIBRARIES})
+    # The subdirectory provides the ldg-vertex-partition target linked below.
+    add_subdirectory(${LDG_DIR})
+    target_include_directories(ldg_partition PRIVATE ${LDG_DIR})
+    target_link_libraries(ldg_partition PRIVATE ldg-vertex-partition)
+endif ()
+include_directories("csrc/include")
+add_library(${PROJECT_NAME} SHARED csrc/export.cpp)
+target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})
+target_compile_definitions(${PROJECT_NAME} PRIVATE -DTORCH_EXTENSION_NAME=lib${PROJECT_NAME})
+if(WITH_PYTHON)
+    find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
+    target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_PYTHON_LIBRARY})
+endif()
+if (WITH_CUDA)
+    target_link_libraries(${PROJECT_NAME} PRIVATE uvm_ops)
+endif()
+if (WITH_METIS)
+    message(STATUS "Current project '${PROJECT_NAME}' uses METIS graph partitioning algorithm.")
+    target_link_libraries(${PROJECT_NAME} PRIVATE metis_partition)
+endif()
+if (WITH_MTMETIS)
+    message(STATUS "Current project '${PROJECT_NAME}' uses multi-threaded METIS graph partitioning algorithm.")
+    target_link_libraries(${PROJECT_NAME} PRIVATE mtmetis_partition)
+endif()
+if (WITH_LDG)
+    message(STATUS "Current project '${PROJECT_NAME}' uses LDG graph partitioning algorithm.")
+    target_link_libraries(${PROJECT_NAME} PRIVATE ldg_partition)
+endif()
+# Sampler extension: a separate shared library named "${PROJECT_NAME}_sampler",
+# built from every .cpp file found under csrc/sampler.
+set(SAMPLER_TARGET "${PROJECT_NAME}_sampler")
+# set(BOOST_INCLUDE_DIRS "${CMAKE_SOURCE_DIR}/third_party/boost_1_83_0")
+# include_directories(${BOOST_INCLUDE_DIRS})
+file(GLOB_RECURSE SAMPLER_SRCS "csrc/sampler/*.cpp")
+add_library(${SAMPLER_TARGET} SHARED ${SAMPLER_SRCS})
+target_include_directories(${SAMPLER_TARGET} PRIVATE "csrc/sampler/include")
+target_compile_options(${SAMPLER_TARGET} PRIVATE -O3)
+target_link_libraries(${SAMPLER_TARGET} PRIVATE ${TORCH_LIBRARIES})
+# The extension module name handed to the sources is the target name with a "lib" prefix.
+target_compile_definitions(${SAMPLER_TARGET} PRIVATE -DTORCH_EXTENSION_NAME=lib${SAMPLER_TARGET})
+if(WITH_PYTHON)
+    find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
+    target_link_libraries(${SAMPLER_TARGET} PRIVATE ${TORCH_PYTHON_LIBRARY})
+endif()
--- a/.history/CMakeLists_20240108192828.txt
+++ b/.history/CMakeLists_20240108192828.txt
+cmake_minimum_required(VERSION 3.15)
+project(starrygl VERSION 0.1)
+option(WITH_PYTHON "Link to Python when building" ON)
+option(WITH_CUDA "Link to CUDA when building" ON)
+option(WITH_METIS "Link to METIS when building" ON)
+option(WITH_MTMETIS "Link to multi-threaded METIS when building" OFF)
+option(WITH_LDG "Link to (multi-threaded optionally) LDG when building" ON)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+find_package(OpenMP REQUIRED)
+link_libraries(OpenMP::OpenMP_CXX)
+find_package(Torch REQUIRED)
+include_directories(${TORCH_INCLUDE_DIRS})
+add_compile_options(${TORCH_CXX_FLAGS})
+if(WITH_PYTHON)
+    add_definitions(-DWITH_PYTHON)
+    find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+    include_directories(${Python3_INCLUDE_DIRS})
+endif()
+if(WITH_CUDA)
+    add_definitions(-DWITH_CUDA)
+    add_definitions(-DWITH_UVM)
+    find_package(CUDA REQUIRED)
+    include_directories(${CUDA_INCLUDE_DIRS})
+    set(CUDA_LIBRARIES "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart.so")
+    file(GLOB_RECURSE UVM_SRCS "csrc/uvm/*.cpp")
+    add_library(uvm_ops SHARED ${UVM_SRCS})
+    target_link_libraries(uvm_ops PRIVATE ${TORCH_LIBRARIES})
+endif()
+if(WITH_METIS)
+    # add_definitions(-DWITH_METIS)
+    # set(GKLIB_DIR "${CMAKE_SOURCE_DIR}/third_party/GKlib")
+    # set(METIS_DIR "${CMAKE_SOURCE_DIR}/third_party/METIS")
+    # set(GKLIB_INCLUDE_DIRS "${GKLIB_DIR}/include")
+    # file(GLOB_RECURSE GKLIB_LIBRARIES "${GKLIB_DIR}/lib/lib*.a")
+    # set(METIS_INCLUDE_DIRS "${METIS_DIR}/include")
+    # file(GLOB_RECURSE METIS_LIBRARIES "${METIS_DIR}/lib/lib*.a")
+    # include_directories(${METIS_INCLUDE_DIRS})
+    # add_library(metis_partition SHARED "csrc/partition/metis.cpp")
+    # target_link_libraries(metis_partition PRIVATE ${TORCH_LIBRARIES})
+    # target_link_libraries(metis_partition PRIVATE ${GKLIB_LIBRARIES})
+    # target_link_libraries(metis_partition PRIVATE ${METIS_LIBRARIES})
+    add_definitions(-DWITH_METIS)
+    set(METIS_DIR "${CMAKE_SOURCE_DIR}/third_party/METIS")
+    set(METIS_GKLIB_DIR "${METIS_DIR}/GKlib")
+    file(GLOB METIS_SRCS "${METIS_DIR}/libmetis/*.c")
+    file(GLOB METIS_GKLIB_SRCS "${METIS_GKLIB_DIR}/*.c")
+    if (MSVC)
+        file(GLOB METIS_GKLIB_WIN32_SRCS "${METIS_GKLIB_DIR}/win32/*.c")
+        set(METIS_GKLIB_SRCS ${METIS_GKLIB_SRCS} ${METIS_GKLIB_WIN32_SRCS})
+    endif()
+    add_library(metis_partition SHARED
+        "csrc/partition/metis.cpp"
+        ${METIS_SRCS} ${METIS_GKLIB_SRCS}
+    )
+    target_include_directories(metis_partition PRIVATE "${METIS_DIR}/include")
+    target_include_directories(metis_partition PRIVATE "${METIS_GKLIB_DIR}")
+    if (MSVC)
+        target_include_directories(metis_partition PRIVATE "${METIS_GKLIB_DIR}/win32")
+    endif()
+    target_compile_definitions(metis_partition PRIVATE -DIDXTYPEWIDTH=64)
+    target_compile_definitions(metis_partition PRIVATE -DREALTYPEWIDTH=32)
+    target_compile_options(metis_partition PRIVATE -O3)
+    target_link_libraries(metis_partition PRIVATE ${TORCH_LIBRARIES})
+    if (UNIX)
+        target_link_libraries(metis_partition PRIVATE m)
+    endif()
+endif()
+if(WITH_MTMETIS)
+    add_definitions(-DWITH_MTMETIS)
+    set(MTMETIS_DIR "${CMAKE_SOURCE_DIR}/third_party/mt-metis")
+    set(MTMETIS_INCLUDE_DIRS "${MTMETIS_DIR}/include")
+    file(GLOB_RECURSE MTMETIS_LIBRARIES "${MTMETIS_DIR}/lib/lib*.a")
+    include_directories(${MTMETIS_INCLUDE_DIRS})
+    add_library(mtmetis_partition SHARED "csrc/partition/mtmetis.cpp")
+    target_link_libraries(mtmetis_partition PRIVATE ${TORCH_LIBRARIES})
+    target_link_libraries(mtmetis_partition PRIVATE ${MTMETIS_LIBRARIES})
+    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_VERTICES)
+    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_EDGES)
+    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_WEIGHTS)
+    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_PARTITIONS)
+endif()
+include_directories("csrc/include")
+add_library(${PROJECT_NAME} SHARED csrc/export.cpp)
+target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})
+target_compile_definitions(${PROJECT_NAME} PRIVATE -DTORCH_EXTENSION_NAME=lib${PROJECT_NAME})
+if(WITH_PYTHON)
+    find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
+    target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_PYTHON_LIBRARY})
+endif()
+if (WITH_CUDA)
+    target_link_libraries(${PROJECT_NAME} PRIVATE uvm_ops)
+endif()
+if (WITH_METIS)
+    message(STATUS "Current project '${PROJECT_NAME}' uses METIS graph partitioning algorithm.")
+    target_link_libraries(${PROJECT_NAME} PRIVATE metis_partition)
+endif()
+if (WITH_MTMETIS)
+    message(STATUS "Current project '${PROJECT_NAME}' uses multi-threaded METIS graph partitioning algorithm.")
+    target_link_libraries(${PROJECT_NAME} PRIVATE mtmetis_partition)
+endif()
+if (WITH_LDG)
+    # Define the ldg_partition target before linking against it; without a
+    # target of that name the link line would look for a system library
+    # called ldg_partition and fail.
+    add_definitions(-DWITH_LDG)
+    set(LDG_DIR "third_party/ldg_partition")
+    add_library(ldg_partition SHARED "csrc/partition/ldg.cpp")
+    target_link_libraries(ldg_partition PRIVATE ${TORCH_LIBRARIES})
+    # The subdirectory provides the ldg-vertex-partition target linked below.
+    add_subdirectory(${LDG_DIR})
+    target_include_directories(ldg_partition PRIVATE ${LDG_DIR})
+    target_link_libraries(ldg_partition PRIVATE ldg-vertex-partition)
+    message(STATUS "Current project '${PROJECT_NAME}' uses LDG graph partitioning algorithm.")
+    target_link_libraries(${PROJECT_NAME} PRIVATE ldg_partition)
+endif()
+# Sampler extension: a separate shared library named "${PROJECT_NAME}_sampler",
+# built from every .cpp file found under csrc/sampler.
+set(SAMPLER_TARGET "${PROJECT_NAME}_sampler")
+# set(BOOST_INCLUDE_DIRS "${CMAKE_SOURCE_DIR}/third_party/boost_1_83_0")
+# include_directories(${BOOST_INCLUDE_DIRS})
+file(GLOB_RECURSE SAMPLER_SRCS "csrc/sampler/*.cpp")
+add_library(${SAMPLER_TARGET} SHARED ${SAMPLER_SRCS})
+target_include_directories(${SAMPLER_TARGET} PRIVATE "csrc/sampler/include")
+target_compile_options(${SAMPLER_TARGET} PRIVATE -O3)
+target_link_libraries(${SAMPLER_TARGET} PRIVATE ${TORCH_LIBRARIES})
+# The extension module name handed to the sources is the target name with a "lib" prefix.
+target_compile_definitions(${SAMPLER_TARGET} PRIVATE -DTORCH_EXTENSION_NAME=lib${SAMPLER_TARGET})
+if(WITH_PYTHON)
+    find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
+    target_link_libraries(${SAMPLER_TARGET} PRIVATE ${TORCH_PYTHON_LIBRARY})
+endif()
--- a/.history/csrc/export_20240108192330.cpp
+++ b/.history/csrc/export_20240108192330.cpp
+#include "extension.h"
+#include "uvm.h"
+#include "partition.h"
+// Python bindings of the native extension. Each group of functions is
+// registered only when its WITH_* macro is defined at compile time.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    #ifdef WITH_CUDA
+    // Unified-virtual-memory storage helpers plus the cudaMemoryAdvise enum
+    // accepted by uvm_storage_advise.
+    m.def("uvm_storage_new", &uvm_storage_new, "return storage of unified virtual memory");
+    m.def("uvm_storage_to_cuda", &uvm_storage_to_cuda, "share uvm storage with another cuda device");
+    m.def("uvm_storage_to_cpu", &uvm_storage_to_cpu, "share uvm storage with cpu");
+    m.def("uvm_storage_advise", &uvm_storage_advise, "apply cudaMemAdvise() to uvm storage");
+    m.def("uvm_storage_prefetch", &uvm_storage_prefetch, "apply cudaMemPrefetchAsync() to uvm storage");
+    py::enum_<cudaMemoryAdvise>(m, "cudaMemoryAdvise")
+        .value("cudaMemAdviseSetAccessedBy", cudaMemoryAdvise::cudaMemAdviseSetAccessedBy)
+        .value("cudaMemAdviseUnsetAccessedBy", cudaMemoryAdvise::cudaMemAdviseUnsetAccessedBy)
+        .value("cudaMemAdviseSetPreferredLocation", cudaMemoryAdvise::cudaMemAdviseSetPreferredLocation)
+        .value("cudaMemAdviseUnsetPreferredLocation", cudaMemoryAdvise::cudaMemAdviseUnsetPreferredLocation)
+        .value("cudaMemAdviseSetReadMostly", cudaMemoryAdvise::cudaMemAdviseSetReadMostly)
+        .value("cudaMemAdviseUnsetReadMostly", cudaMemoryAdvise::cudaMemAdviseUnsetReadMostly);
+    #endif
+    #ifdef WITH_METIS
+    m.def("metis_partition", &metis_partition, "metis graph partition");
+    m.def("metis_cache_friendly_reordering", &metis_cache_friendly_reordering, "metis cache-friendly reordering");
+    #endif
+    #ifdef WITH_MTMETIS
+    m.def("mt_metis_partition", &mt_metis_partition, "multi-threaded metis graph partition");
+    #endif
+    // The guard has to spell WITH_LDG, the macro the CMake build defines;
+    // any other spelling silently drops this binding.
+    #ifdef WITH_LDG
+    // Note: the switch WITH_MULTITHREADING=ON shall be triggered during compilation
+    // to enable multi-threading functionality.
+    m.def("ldg_partition", &ldg_partition, "(multi-threaded optionally) LDG graph partition");
+    #endif
+}
--- a/.history/csrc/export_20240108193538.cpp
+++ b/.history/csrc/export_20240108193538.cpp
+#include "extension.h"
+#include "uvm.h"
+#include "partition.h"
+// Python bindings of the native extension. Each group of functions is
+// registered only when its WITH_* macro is defined at compile time.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    #ifdef WITH_CUDA
+    // Unified-virtual-memory storage helpers plus the cudaMemoryAdvise enum
+    // accepted by uvm_storage_advise.
+    m.def("uvm_storage_new", &uvm_storage_new, "return storage of unified virtual memory");
+    m.def("uvm_storage_to_cuda", &uvm_storage_to_cuda, "share uvm storage with another cuda device");
+    m.def("uvm_storage_to_cpu", &uvm_storage_to_cpu, "share uvm storage with cpu");
+    m.def("uvm_storage_advise", &uvm_storage_advise, "apply cudaMemAdvise() to uvm storage");
+    m.def("uvm_storage_prefetch", &uvm_storage_prefetch, "apply cudaMemPrefetchAsync() to uvm storage");
+    py::enum_<cudaMemoryAdvise>(m, "cudaMemoryAdvise")
+        .value("cudaMemAdviseSetAccessedBy", cudaMemoryAdvise::cudaMemAdviseSetAccessedBy)
+        .value("cudaMemAdviseUnsetAccessedBy", cudaMemoryAdvise::cudaMemAdviseUnsetAccessedBy)
+        .value("cudaMemAdviseSetPreferredLocation", cudaMemoryAdvise::cudaMemAdviseSetPreferredLocation)
+        .value("cudaMemAdviseUnsetPreferredLocation", cudaMemoryAdvise::cudaMemAdviseUnsetPreferredLocation)
+        .value("cudaMemAdviseSetReadMostly", cudaMemoryAdvise::cudaMemAdviseSetReadMostly)
+        .value("cudaMemAdviseUnsetReadMostly", cudaMemoryAdvise::cudaMemAdviseUnsetReadMostly);
+    #endif
+    #ifdef WITH_METIS
+    m.def("metis_partition", &metis_partition, "metis graph partition");
+    m.def("metis_cache_friendly_reordering", &metis_cache_friendly_reordering, "metis cache-friendly reordering");
+    #endif
+    #ifdef WITH_MTMETIS
+    m.def("mt_metis_partition", &mt_metis_partition, "multi-threaded metis graph partition");
+    #endif
+    // The guard has to spell WITH_LDG, the macro the CMake build defines;
+    // any other spelling silently drops this binding.
+    #ifdef WITH_LDG
+    // Note: the switch WITH_MULTITHREADING=ON shall be triggered during compilation
+    // to enable multi-threading functionality.
+    m.def("ldg_partition", &ldg_partition, "(multi-threaded optionally) LDG graph partition");
+    #endif
+}
--- a/.history/csrc/export_20240108193640.cpp
+++ b/.history/csrc/export_20240108193640.cpp
+#include "extension.h"
+#include "uvm.h"
+#include "partition.h"
+// Python bindings of the native extension. Each group of functions is
+// registered only when its WITH_* macro is defined at compile time.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    // Exactly one #ifdef per #endif: a second opening guard here would leave
+    // the conditional unterminated at the end of the file.
+    #ifdef WITH_CUDA
+    // Unified-virtual-memory storage helpers plus the cudaMemoryAdvise enum
+    // accepted by uvm_storage_advise.
+    m.def("uvm_storage_new", &uvm_storage_new, "return storage of unified virtual memory");
+    m.def("uvm_storage_to_cuda", &uvm_storage_to_cuda, "share uvm storage with another cuda device");
+    m.def("uvm_storage_to_cpu", &uvm_storage_to_cpu, "share uvm storage with cpu");
+    m.def("uvm_storage_advise", &uvm_storage_advise, "apply cudaMemAdvise() to uvm storage");
+    m.def("uvm_storage_prefetch", &uvm_storage_prefetch, "apply cudaMemPrefetchAsync() to uvm storage");
+    py::enum_<cudaMemoryAdvise>(m, "cudaMemoryAdvise")
+        .value("cudaMemAdviseSetAccessedBy", cudaMemoryAdvise::cudaMemAdviseSetAccessedBy)
+        .value("cudaMemAdviseUnsetAccessedBy", cudaMemoryAdvise::cudaMemAdviseUnsetAccessedBy)
+        .value("cudaMemAdviseSetPreferredLocation", cudaMemoryAdvise::cudaMemAdviseSetPreferredLocation)
+        .value("cudaMemAdviseUnsetPreferredLocation", cudaMemoryAdvise::cudaMemAdviseUnsetPreferredLocation)
+        .value("cudaMemAdviseSetReadMostly", cudaMemoryAdvise::cudaMemAdviseSetReadMostly)
+        .value("cudaMemAdviseUnsetReadMostly", cudaMemoryAdvise::cudaMemAdviseUnsetReadMostly);
+    #endif
+    #ifdef WITH_METIS
+    m.def("metis_partition", &metis_partition, "metis graph partition");
+    m.def("metis_cache_friendly_reordering", &metis_cache_friendly_reordering, "metis cache-friendly reordering");
+    #endif
+    #ifdef WITH_MTMETIS
+    m.def("mt_metis_partition", &mt_metis_partition, "multi-threaded metis graph partition");
+    #endif
+    // The guard has to spell WITH_LDG, the macro the CMake build defines;
+    // any other spelling silently drops this binding.
+    #ifdef WITH_LDG
+    // Note: the switch WITH_MULTITHREADING=ON shall be triggered during compilation
+    // to enable multi-threading functionality.
+    m.def("ldg_partition", &ldg_partition, "(multi-threaded optionally) LDG graph partition");
+    #endif
+}
--- a/.history/docs/source/advanced/index_20240108192330.rst
+++ b/.history/docs/source/advanced/index_20240108192330.rst
+Advanced Concepts
+=================
+.. toctree::
+    sampling_parallel/index
+    partition_parallel/index
+    timeline_parallel/index
--- a/.history/docs/source/advanced/index_20240108193709.rst
+++ b/.history/docs/source/advanced/index_20240108193709.rst
+Advanced Concepts
+=================
+.. toctree::
+    sampling_parallel/index
+    partition_parallel/index
+    timeline_parallel/index
--- a/.history/docs/source/api/python/index_20240108193847.rst
+++ b/.history/docs/source/api/python/index_20240108193847.rst
+Package References
+==================
+.. toctree::
+    distributed
+    neighbor_sampler
+    memory
+    data_loader
+    graph_core
+    cache
--- a/.history/docs/source/conf_20240108193653.py
+++ b/.history/docs/source/conf_20240108193653.py
+# Sphinx configuration for the StarryGL documentation.
+import os
+import sys
+# Prepend the directory two levels above the working directory to sys.path so
+# the starrygl import below resolves (presumably the repository root - confirm).
+sys.path.insert(0, os.path.abspath("../.."))
+import starrygl
+# Project metadata; version and release are both read from the package itself.
+project = 'StarryGL'
+copyright = '2023, StarryGL Team'
+author = 'StarryGL Team'
+version = starrygl.__version__
+release = starrygl.__version__
+# Sphinx extensions enabled for this build.
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.duration",
+    "sphinx.ext.viewcode",
+]
+templates_path = ['_templates']
+exclude_patterns = []
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+html_theme = 'sphinx_rtd_theme'
+html_static_path = ['_static']
--- a/.history/install_20240108164047.sh
+++ b/.history/install_20240108164047.sh
+#!/bin/bash
+mkdir -p build && cd build
+cmake .. \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+    -DCMAKE_PREFIX_PATH="/home/hwj/.miniconda3/envs/sgl/lib/python3.10/site-packages" \
+    -DPython3_ROOT_DIR="/home/hwj/.miniconda3/envs/sgl" \
+    -DCUDA_TOOLKIT_ROOT_DIR="/home/hwj/.local/cuda-11.8" \
+&& make -j32 \
+&& rm -rf ../starrygl/lib \
+&& mkdir ../starrygl/lib \
+&& cp lib*.so ../starrygl/lib/ \
+&& patchelf --set-rpath '$ORIGIN:$ORIGIN/lib' --force-rpath ../starrygl/lib/*.so
--- a/.history/install_20240108164221.sh
+++ b/.history/install_20240108164221.sh
+#!/bin/bash
+mkdir -p build && cd build
+cmake .. \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+    -DCMAKE_PREFIX_PATH="/home/zlj/.miniconda3/envs/dgnn/lib/python3.8/site-packages" \
+    -DPython3_ROOT_DIR="/home/zlj/.miniconda3/envs/dgnn" \
+    -DCUDA_TOOLKIT_ROOT_DIR="/home/zlj/local/cuda-12.2" \
+&& make -j32 \
+&& rm -rf ../starrygl/lib \
+&& mkdir ../starrygl/lib \
+&& cp lib*.so ../starrygl/lib/ \
+&& patchelf --set-rpath '$ORIGIN:$ORIGIN/lib' --force-rpath ../starrygl/lib/*.so
--- a/.history/install_20240108192354.sh
+++ b/.history/install_20240108192354.sh
+#!/bin/bash
+mkdir -p build && cd build
+cmake .. \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+    -DCMAKE_PREFIX_PATH="/home/zlj/.miniconda3/envs/sgl/lib/python3.10/site-packages" \
+    -DPython3_ROOT_DIR="/home/zlj/.miniconda3/envs/sgl" \
+    -DCUDA_TOOLKIT_ROOT_DIR="/home/zlj/.local/cuda-11.8" \
+&& make -j32 \
+&& rm -rf ../starrygl/lib \
+&& mkdir ../starrygl/lib \
+&& cp lib*.so ../starrygl/lib/ \
+&& patchelf --set-rpath '$ORIGIN:$ORIGIN/lib' --force-rpath ../starrygl/lib/*.so
--- a/.history/install_20240108194515.sh
+++ b/.history/install_20240108194515.sh
+#!/bin/bash
+mkdir -p build && cd build
+cmake .. \
+    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+    -DCMAKE_PREFIX_PATH="/home/zlj/.miniconda3/envs/sgl/lib/python3.10/site-packages" \
+    -DPython3_ROOT_DIR="/home/zlj/.miniconda3/envs/sgl" \
+    -DCUDA_TOOLKIT_ROOT_DIR="/home/zlj/.local/cuda-11.8" \
+&& make -j32 \
+&& rm -rf ../starrygl/lib \
+&& mkdir ../starrygl/lib \
+&& cp lib*.so ../starrygl/lib/ \
+&& patchelf --set-rpath '$ORIGIN:$ORIGIN/lib' --force-rpath ../starrygl/lib/*.so
--- a/.history/requirements_20240108192330.txt
+++ b/.history/requirements_20240108192330.txt
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch==2.1.1+cu118
+torchvision==0.16.1+cu118
+torchaudio==2.1.1+cu118
+--extra-index-url https://data.pyg.org/whl/torch-2.1.0+cu118.html
+torch_geometric==2.4.0
+pyg_lib==0.3.1+pt21cu118
+torch_scatter==2.1.2+pt21cu118
+torch_sparse==0.6.18+pt21cu118
+torch_cluster==1.6.3+pt21cu118
+torch_spline_conv==1.2.2+pt21cu118
+ogb
+tqdm
+networkx
--- a/.history/requirements_20240108192856.txt
+++ b/.history/requirements_20240108192856.txt
+--extra-index-url https://download.pytorch.org/whl/cu118
+torch==2.1.1+cu118
+torchvision==0.16.1+cu118
+torchaudio==2.1.1+cu118
+--extra-index-url https://data.pyg.org/whl/torch-2.1.0+cu118.html
+torch_geometric==2.4.0
+pyg_lib==0.3.1+pt21cu118
+torch_scatter==2.1.2+pt21cu118
+torch_sparse==0.6.18+pt21cu118
+torch_cluster==1.6.3+pt21cu118
+torch_spline_conv==1.2.2+pt21cu118
+ogb
+tqdm
+networkx
--- a/.history/starrygl/data/graph_20240108192330.py
+++ b/.history/starrygl/data/graph_20240108192330.py
--- a/.history/starrygl/data/graph_20240108193955.py
+++ b/.history/starrygl/data/graph_20240108193955.py
--- a/.history/starrygl/distributed/utils_20240108192330.py
+++ b/.history/starrygl/distributed/utils_20240108192330.py
--- a/.history/starrygl/distributed/utils_20240108194019.py
+++ b/.history/starrygl/distributed/utils_20240108194019.py
--- a/.history/starrygl/sample/batch_data_20240108192330.py
+++ b/.history/starrygl/sample/batch_data_20240108192330.py
+from typing import List, Tuple
+import torch
+import torch.distributed as dist
+from starrygl.distributed.utils import DistributedTensor
+from starrygl.module.memorys import MailBox
+from starrygl.sample.cache.fetch_cache import FetchFeatureCache
+from starrygl.sample.graph_core import DataSet
+from starrygl.sample.graph_core import DistributedGraphStore
+from starrygl.sample.sample_core.base import BaseSampler, NegativeSampling
+import dgl
+from starrygl.sample.stream_manager import PipelineManager, getPipelineManger
+"""
+The input arguments are unchanged; the outputs become:
+sample_from_nodes:
+node: list[tensor, tensor, tensor...]
+eid: list[tensor, tensor, tensor...]
+src_index: list[tensor, tensor, tensor...]
+sample_from_edges:
+node
+eid: list[tensor, tensor, tensor...]
+src_index: list[tensor, tensor, tensor...]
+delta_ts: list[tensor, tensor, tensor...]
+metadata
+"""
+def prepare_input(node_feat, edge_feat, mem_embedding,mfgs,dist_nid,dist_eid):
+    for mfg in mfgs:
+        for i,b in enumerate(mfg):
+            e_idx = b.edata['ID']
+            idx = b.srcdata['ID']
+            b.edata['ID'] = dist_eid[e_idx]
+            b.srcdata['ID'] = dist_nid[idx]
+            if edge_feat is not None:
+                b.edata['f'] = edge_feat[e_idx]
+            if i == 0:
+                if node_feat is not None:
+                    b.srcdata['h'] = node_feat[idx]
+                if mem_embedding is not None:
+                    node_memory,node_memory_ts,mailbox,mailbox_ts = mem_embedding
+                    b.srcdata['mem'] = node_memory[idx]
+                    b.srcdata['mem_ts'] = node_memory_ts[idx]
+                    b.srcdata['mem_input'] = mailbox[idx].reshape(b.srcdata['ID'].shape[0], -1)
+                    b.srcdata['mail_ts'] = mailbox_ts[idx]
+                    #print(idx.shape[0],b.srcdata['mem_ts'].shape)
+        return mfgs
+def to_block(graph: DistributedGraphStore, data, sample_out, mailbox:MailBox = None,device = torch.device('cuda'),group = None):
+<<<<<<< HEAD
+=======
+>>>>>>> cmy_dev
+    if len(sample_out) > 1:
+        sample_out,metadata = sample_out
+    else:
+        metadata = None
+    eid = [ret.eid() for ret in sample_out]
+    eid_len = [e.shape[0] for e in eid ]
+    eid_mapper: torch.Tensor = graph.eids_mapper
+    nid_mapper: torch.Tensor = graph.nids_mapper
+    eid_tensor = torch.cat(eid,dim = 0).to(eid_mapper.device)
+    dist_eid = eid_mapper[eid_tensor].to(device)
+    dist_eid,eid_inv = dist_eid.unique(return_inverse=True)
+    src_node = graph.sample_graph['edge_index'][0,eid_tensor*2].to(graph.nids_mapper.device)
+    src_ts = None  
+    if metadata is None:
+        root_node = data.nodes.to(graph.nids_mapper.device)
+        root_len = [root_node.shape[0]]
+        if hasattr(data,'ts'):
+            src_ts = torch.cat([data.ts,
+                                graph.sample_graph['ts'][eid_tensor*2].to(device)])
+    elif 'seed' in metadata:
+        root_node = metadata.pop('seed').to(graph.nids_mapper.device)
+        root_len = root_node.shape[0]
+        if 'seed_ts' in metadata:
+            src_ts = torch.cat([metadata.pop('seed_ts').to(device),\
+                                graph.sample_graph['ts'][eid_tensor*2].to(device)])
+        for k in metadata:
+            metadata[k] = metadata[k].to(device)
+    nid_tensor = torch.cat([root_node,src_node],dim = 0)
+    dist_nid = nid_mapper[nid_tensor].to(device)
+    dist_nid,nid_inv = dist_nid.unique(return_inverse = True)
+    fetchCache = FetchFeatureCache.getFetchCache()
+    if fetchCache is None:
+        if isinstance(graph.edge_attr,DistributedTensor):
+            ind_dict = graph.edge_attr.all_to_all_ind2ptr(dist_eid,group = group)
+            edge_feat = graph.edge_attr.all_to_all_get(group = group,**ind_dict)
+        else:
+            edge_feat = graph._get_edge_attr(dist_eid)
+        ind_dict = None
+        if isinstance(graph.x,DistributedTensor):
+            ind_dict = graph.x.all_to_all_ind2ptr(dist_nid,group = group)
+            node_feat = graph.x.all_to_all_get(group = group,**ind_dict)
+        else:
+            node_feat = graph._get_node_attr(dist_nid)
+        if mailbox is not None:
+            if torch.distributed.get_world_size() > 1:
+                if node_feat is None:
+                    ind_dict = mailbox.node_memory.all_to_all_ind2ptr(dist_nid,group = group)
+                mem = mailbox.gather_memory(**ind_dict)
+            else:
+                mem = mailbox.get_memory(dist_nid)
+        else:
+            mem = None
+    else:
+        raw_nid = torch.empty_like(dist_nid)
+        raw_eid = torch.empty_like(dist_eid)
+        nid_tensor = nid_tensor.to(device)
+        eid_tensor = eid_tensor.to(device)
+        raw_nid[nid_inv] = nid_tensor
+        raw_eid[eid_inv] = eid_tensor
+        node_feat,edge_feat,mem = fetchCache.fetch_feature(raw_nid,
+                                 dist_nid,raw_eid,
+                                 dist_eid)
+    def build_block():
+        mfgs = list()
+        col = torch.arange(0,root_len,device = device)
+        col_len = 0
+        row_len = root_len
+        for r in range(len(eid_len)):
+            elen = eid_len[r]
+            row = torch.arange(row_len,row_len+elen,device = device)
+            b = dgl.create_block((row,col[sample_out[r].src_index().to(device)]),
+                                 num_src_nodes = row_len + elen,
+                                 num_dst_nodes = row_len,
+                                 device = device)
+            idx = nid_inv[0:row_len + elen]
+            e_idx = eid_inv[col_len:col_len+elen]
+            b.srcdata['ID'] = idx
+            if sample_out[r].delta_ts().shape[0] > 0:
+                b.edata['dt'] = sample_out[r].delta_ts().to(device)
+            if src_ts is not None:
+                b.srcdata['ts'] = src_ts[0:row_len + eid_len[r]]
+            b.edata['ID'] = e_idx
+            col = row
+            col_len += eid_len[r]
+            row_len += eid_len[r]
+            mfgs.append(b)
+        mfgs = list(map(list, zip(*[iter(mfgs)])))
+        mfgs.reverse()
+        return data,mfgs,metadata
+    data,mfgs,metadata = build_block()
+    mfgs = prepare_input(node_feat,edge_feat,mem,mfgs,dist_nid,dist_eid)
+        #return build_block(node_feat,edge_feat,mem)#data,mfgs,metadata
+    return (data,mfgs,metadata)
+def graph_sample(graph, sampler:BaseSampler,
+                      sample_fn, data, 
+                      neg_sampling = None,
+                      mailbox = None,
+                      device = torch.device('cuda'),
+                      async_op = False):
+    out = sample_fn(sampler,data,neg_sampling)
+    if async_op == False:
+        return to_block(graph,data,out,mailbox,device)
+    else:
+        manger =  getPipelineManger()
+        future = manger.submit('lookup',to_block,{'graph':graph,'data':data,\
+                                                  'sample_out':out,\
+                                                  'mailbox':mailbox,\
+                                                  'device':device})
+        return future
+def sample_from_nodes(sampler:BaseSampler,  data:DataSet, **kwargs):
+    out = sampler.sample_from_nodes(nodes=data.nodes.reshape(-1))
+    #out.metadata = None
+    return out
+def sample_from_edges(sampler:BaseSampler,  
+                      data:DataSet, 
+                      neg_sampling:NegativeSampling = None):
+    edge_label = data.labels if hasattr(data,'labels') else None
+    out = sampler.sample_from_edges(edges = data.edges, 
+                                    neg_sampling=neg_sampling)
+    return out
+def sample_from_temporal_nodes(sampler:BaseSampler,data:DataSet,
+                               **kwargs):
+    out = sampler.sample_from_nodes(nodes=data.nodes.reshape(-1),
+                                    ts = data.ts.reshape(-1))
+    #out.metadata = None
+    return out
+def sample_from_temporal_edges(sampler:BaseSampler, data:DataSet,
+                               neg_sampling: NegativeSampling = None):
+    edge_label = data.labels if hasattr(data,'labels') else None
+    out = sampler.sample_from_edges(edges=data.edges.to('cpu'),
+                                    ets=data.ts.to('cpu'),
+                                    neg_sampling = neg_sampling
+                                    )
+    return out
+class SAMPLE_TYPE:
+    SAMPLE_FROM_NODES = sample_from_nodes,
+    SAMPLE_FROM_EDGES = sample_from_edges,
+    SAMPLE_FROM_TEMPORAL_NODES = sample_from_temporal_nodes,
+    SAMPLE_FROM_TEMPORAL_EDGES = sample_from_temporal_edges
\ No newline at end of file
--- a/.history/starrygl/sample/batch_data_20240108194032.py
+++ b/.history/starrygl/sample/batch_data_20240108194032.py
+from typing import List, Tuple
+import torch
+import torch.distributed as dist
+from starrygl.distributed.utils import DistributedTensor
+from starrygl.module.memorys import MailBox
+from starrygl.sample.cache.fetch_cache import FetchFeatureCache
+from starrygl.sample.graph_core import DataSet
+from starrygl.sample.graph_core import DistributedGraphStore
+from starrygl.sample.sample_core.base import BaseSampler, NegativeSampling
+import dgl
+from starrygl.sample.stream_manager import PipelineManager, getPipelineManger
+"""
+The input arguments are unchanged; the outputs become:
+sample_from_nodes:
+node: list[tensor, tensor, tensor...]
+eid: list[tensor, tensor, tensor...]
+src_index: list[tensor, tensor, tensor...]
+sample_from_edges:
+node
+eid: list[tensor, tensor, tensor...]
+src_index: list[tensor, tensor, tensor...]
+delta_ts: list[tensor, tensor, tensor...]
+metadata
+"""
+def prepare_input(node_feat, edge_feat, mem_embedding,mfgs,dist_nid,dist_eid):
+    for mfg in mfgs:
+        for i,b in enumerate(mfg):
+            e_idx = b.edata['ID']
+            idx = b.srcdata['ID']
+            b.edata['ID'] = dist_eid[e_idx]
+            b.srcdata['ID'] = dist_nid[idx]
+            if edge_feat is not None:
+                b.edata['f'] = edge_feat[e_idx]
+            if i == 0:
+                if node_feat is not None:
+                    b.srcdata['h'] = node_feat[idx]
+                if mem_embedding is not None:
+                    node_memory,node_memory_ts,mailbox,mailbox_ts = mem_embedding
+                    b.srcdata['mem'] = node_memory[idx]
+                    b.srcdata['mem_ts'] = node_memory_ts[idx]
+                    b.srcdata['mem_input'] = mailbox[idx].reshape(b.srcdata['ID'].shape[0], -1)
+                    b.srcdata['mail_ts'] = mailbox_ts[idx]
+                    #print(idx.shape[0],b.srcdata['mem_ts'].shape)
+        return mfgs
+def to_block(graph: DistributedGraphStore, data, sample_out, mailbox:MailBox = None,device = torch.device('cuda'),group = None):
+    if len(sample_out) > 1:
+        sample_out,metadata = sample_out
+    else:
+        metadata = None
+    eid = [ret.eid() for ret in sample_out]
+    eid_len = [e.shape[0] for e in eid ]
+    eid_mapper: torch.Tensor = graph.eids_mapper
+    nid_mapper: torch.Tensor = graph.nids_mapper
+    eid_tensor = torch.cat(eid,dim = 0).to(eid_mapper.device)
+    dist_eid = eid_mapper[eid_tensor].to(device)
+    dist_eid,eid_inv = dist_eid.unique(return_inverse=True)
+    src_node = graph.sample_graph['edge_index'][0,eid_tensor*2].to(graph.nids_mapper.device)
+    src_ts = None  
+    if metadata is None:
+        root_node = data.nodes.to(graph.nids_mapper.device)
+        root_len = [root_node.shape[0]]
+        if hasattr(data,'ts'):
+            src_ts = torch.cat([data.ts,
+                                graph.sample_graph['ts'][eid_tensor*2].to(device)])
+    elif 'seed' in metadata:
+        root_node = metadata.pop('seed').to(graph.nids_mapper.device)
+        root_len = root_node.shape[0]
+        if 'seed_ts' in metadata:
+            src_ts = torch.cat([metadata.pop('seed_ts').to(device),\
+                                graph.sample_graph['ts'][eid_tensor*2].to(device)])
+        for k in metadata:
+            metadata[k] = metadata[k].to(device)
+    nid_tensor = torch.cat([root_node,src_node],dim = 0)
+    dist_nid = nid_mapper[nid_tensor].to(device)
+    dist_nid,nid_inv = dist_nid.unique(return_inverse = True)
+    fetchCache = FetchFeatureCache.getFetchCache()
+    if fetchCache is None:
+        if isinstance(graph.edge_attr,DistributedTensor):
+            ind_dict = graph.edge_attr.all_to_all_ind2ptr(dist_eid,group = group)
+            edge_feat = graph.edge_attr.all_to_all_get(group = group,**ind_dict)
+        else:
+            edge_feat = graph._get_edge_attr(dist_eid)
+        ind_dict = None
+        if isinstance(graph.x,DistributedTensor):
+            ind_dict = graph.x.all_to_all_ind2ptr(dist_nid,group = group)
+            node_feat = graph.x.all_to_all_get(group = group,**ind_dict)
+        else:
+            node_feat = graph._get_node_attr(dist_nid)
+        if mailbox is not None:
+            if torch.distributed.get_world_size() > 1:
+                if node_feat is None:
+                    ind_dict = mailbox.node_memory.all_to_all_ind2ptr(dist_nid,group = group)
+                mem = mailbox.gather_memory(**ind_dict)
+            else:
+                mem = mailbox.get_memory(dist_nid)
+        else:
+            mem = None
+    else:
+        raw_nid = torch.empty_like(dist_nid)
+        raw_eid = torch.empty_like(dist_eid)
+        nid_tensor = nid_tensor.to(device)
+        eid_tensor = eid_tensor.to(device)
+        raw_nid[nid_inv] = nid_tensor
+        raw_eid[eid_inv] = eid_tensor
+        node_feat,edge_feat,mem = fetchCache.fetch_feature(raw_nid,
+                                 dist_nid,raw_eid,
+                                 dist_eid)
+    def build_block():
+        mfgs = list()
+        col = torch.arange(0,root_len,device = device)
+        col_len = 0
+        row_len = root_len
+        for r in range(len(eid_len)):
+            elen = eid_len[r]
+            row = torch.arange(row_len,row_len+elen,device = device)
+            b = dgl.create_block((row,col[sample_out[r].src_index().to(device)]),
+                                 num_src_nodes = row_len + elen,
+                                 num_dst_nodes = row_len,
+                                 device = device)
+            idx = nid_inv[0:row_len + elen]
+            e_idx = eid_inv[col_len:col_len+elen]
+            b.srcdata['ID'] = idx
+            if sample_out[r].delta_ts().shape[0] > 0:
+                b.edata['dt'] = sample_out[r].delta_ts().to(device)
+            if src_ts is not None:
+                b.srcdata['ts'] = src_ts[0:row_len + eid_len[r]]
+            b.edata['ID'] = e_idx
+            col = row
+            col_len += eid_len[r]
+            row_len += eid_len[r]
+            mfgs.append(b)
+        mfgs = list(map(list, zip(*[iter(mfgs)])))
+        mfgs.reverse()
+        return data,mfgs,metadata
+    data,mfgs,metadata = build_block()
+    mfgs = prepare_input(node_feat,edge_feat,mem,mfgs,dist_nid,dist_eid)
+        #return build_block(node_feat,edge_feat,mem)#data,mfgs,metadata
+    return (data,mfgs,metadata)
+def graph_sample(graph, sampler:BaseSampler,
+                      sample_fn, data, 
+                      neg_sampling = None,
+                      mailbox = None,
+                      device = torch.device('cuda'),
+                      async_op = False):
+    out = sample_fn(sampler,data,neg_sampling)
+    if async_op == False:
+        return to_block(graph,data,out,mailbox,device)
+    else:
+        manger =  getPipelineManger()
+        future = manger.submit('lookup',to_block,{'graph':graph,'data':data,\
+                                                  'sample_out':out,\
+                                                  'mailbox':mailbox,\
+                                                  'device':device})
+        return future
+def sample_from_nodes(sampler:BaseSampler,  data:DataSet, **kwargs):
+    out = sampler.sample_from_nodes(nodes=data.nodes.reshape(-1))
+    #out.metadata = None
+    return out
+def sample_from_edges(sampler:BaseSampler,  
+                      data:DataSet, 
+                      neg_sampling:NegativeSampling = None):
+    edge_label = data.labels if hasattr(data,'labels') else None
+    out = sampler.sample_from_edges(edges = data.edges, 
+                                    neg_sampling=neg_sampling)
+    return out
+def sample_from_temporal_nodes(sampler:BaseSampler,data:DataSet,
+                               **kwargs):
+    out = sampler.sample_from_nodes(nodes=data.nodes.reshape(-1),
+                                    ts = data.ts.reshape(-1))
+    #out.metadata = None
+    return out
+def sample_from_temporal_edges(sampler:BaseSampler, data:DataSet,
+                               neg_sampling: NegativeSampling = None):
+    edge_label = data.labels if hasattr(data,'labels') else None
+    out = sampler.sample_from_edges(edges=data.edges.to('cpu'),
+                                    ets=data.ts.to('cpu'),
+                                    neg_sampling = neg_sampling
+                                    )
+    return out
+class SAMPLE_TYPE:
+    SAMPLE_FROM_NODES = sample_from_nodes,
+    SAMPLE_FROM_EDGES = sample_from_edges,
+    SAMPLE_FROM_TEMPORAL_NODES = sample_from_temporal_nodes,
+    SAMPLE_FROM_TEMPORAL_EDGES = sample_from_temporal_edges
\ No newline at end of file
--- a/.history/starrygl/sample/data_loader_20240108194042.py
+++ b/.history/starrygl/sample/data_loader_20240108194042.py
+from collections import deque
+from enum import Enum
+import queue
+import torch
+import sys
+from os.path import abspath, join, dirname
+import numpy as np
+from starrygl.sample.batch_data import graph_sample
+from starrygl.sample.sample_core.PreNegSampling import PreNegativeSampling
+sys.path.insert(0, join(abspath(dirname(__file__))))
+from typing import Deque, Optional
+import torch.distributed as dist
+from torch_geometric.data import Data
+import os.path as osp
+import math
+class DistributedDataLoader:
+    ''' 
+    We will perform feature fetch in the data loader.
+    you can simply define a data loader for use, while starrygl assisting in fetching node or edge features:
+    Args:
+        graph: distributed graph store
+        data: the graph data
+        sampler: a parallel sampler like `NeighborSampler` above
+        sampler_fn: sample type
+        neg_sampler: negative sampler
+        batch_size: batch size
+        mailbox: APAN's mailbox and TGN's memory implemented by starrygl
+    Examples:
+        .. code-block:: python
+            import torch
+            from starrygl.sample.data_loader import DistributedDataLoader
+            from starrygl.sample.part_utils.partition_tgnn import partition_load
+            from starrygl.sample.graph_core import DataSet, DistributedGraphStore, TemporalNeighborSampleGraph
+            from starrygl.sample.memory.shared_mailbox import SharedMailBox
+            from starrygl.sample.sample_core.neighbor_sampler import NeighborSampler
+            from starrygl.sample.sample_core.base import NegativeSampling
+            from starrygl.sample.batch_data import SAMPLE_TYPE
+            pdata = partition_load("PATH/{}".format(dataname), algo="metis_for_tgnn")    
+            graph = DistributedGraphStore(pdata = pdata, uvm_edge = False, uvm_node = False)
+            sample_graph = TemporalNeighborSampleGraph(sample_graph = pdata.sample_graph,mode = 'full')
+            mailbox = SharedMailBox(pdata.ids.shape[0], memory_param, dim_edge_feat=pdata.edge_attr.shape[1] if pdata.  edge_attr is not None else 0)
+            sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=1, fanout=[10], graph_data=sample_graph,    workers=15,policy = 'recent',graph_name = "wiki_train")
+            neg_sampler = NegativeSampling('triplet')
+            train_data = torch.masked_select(graph.edge_index, pdata.train_mask.to(graph.edge_index.device)).reshape    (2, -1)
+            trainloader = DistributedDataLoader(graph, train_data, sampler=sampler, sampler_fn=SAMPLE_TYPE. SAMPLE_FROM_TEMPORAL_EDGES,neg_sampler=neg_sampler, batch_size=1000, shuffle=False, drop_last=True, chunk_size = None,train=True, mailbox=mailbox )
+    In the data loader, we will call the `graph_sample`, sourced from `starrygl.sample.batch_data`.
+    And the `to_block` function in the `graph_sample` will implement feature fetching.
+    If cache is not used, we will directly fetch node or edge features from the graph data, 
+    otherwise we will call `fetch_data` for feature fetching.     
+    '''
+    def __init__(
+            self,
+            graph,
+            dataset = None,
+            sampler = None,
+            sampler_fn = None,
+            neg_sampler = None,
+            batch_size: Optional[int]=None,
+            drop_last = False,
+            device: torch.device  = torch.device('cuda'),
+            shuffle:bool = True,
+            chunk_size = None,
+            train = False,
+            queue_size = 10,
+            mailbox = None,
+            is_pipeline = False,
+            **kwargs
+    ):
+        assert sampler is not None
+        self.chunk_size = chunk_size
+        self.batch_size = batch_size
+        self.queue_size = queue_size
+        self.num_pending = 0
+        self.current_pos = 0
+        self.recv_idxs = 0
+        self.drop_last = drop_last
+        self.result_queue = deque(maxlen = self.queue_size)
+        self.shuffle = shuffle
+        self.is_closed = False
+        self.sampler = sampler
+        self.sampler_fn = sampler_fn
+        self.neg_sampler = neg_sampler
+        self.graph = graph
+        self.shuffle=shuffle
+        self.dataset = dataset
+        self.mailbox = mailbox
+        self.device =  device
+        self.is_pipeline = is_pipeline
+        if train is True:
+            self._get_expected_idx(self.dataset.len)
+        else:
+            self._get_expected_idx(self.dataset.len,op = dist.ReduceOp.MAX)
+            #self.expected_idx = int(math.ceil(self.dataset.len/self.batch_size))
+        torch.distributed.barrier()
+    def __iter__(self):
+        if self.chunk_size is None:
+            if self.shuffle:
+                self.input_dataset = self.dataset.shuffle()
+            else:
+                self.input_dataset = self.dataset
+            self.recv_idxs = 0
+            self.current_pos = 0
+            self.num_pending = 0
+            self.submitted = 0
+        else:
+            self.input_dataset = self.dataset
+            self.recv_idxs = 0
+            self.num_pending = 0
+            self.submitted = 0
+            if dist.get_rank == 0:
+                self.current_pos = int(
+                    math.floor(
+                        np.random.uniform(0,self.batch_size/self.chunk_size)
+                    )*self.chunk_size
+                )
+            else:
+                self.current_pos = 0
+            current_pos = torch.tensor([self.current_pos],dtype = torch.long,device=self.device) 
+            dist.broadcast(current_pos, src = 0)
+            self.current_pos = int(current_pos.item())
+            self._get_expected_idx(self.dataset.len-self.current_pos)
+        if self.neg_sampler is not None \
+            and isinstance(self.neg_sampler,PreNegativeSampling):
+            self.neg_sampler.set_next_pos(self.current_pos)
+        return self
+    def _get_expected_idx(self,data_size,op = dist.ReduceOp.MIN):
+        world_size = dist.get_world_size()
+        self.expected_idx = data_size // self.batch_size if self.drop_last is True else int(math.ceil(data_size/self.batch_size))
+        if dist.get_world_size() > 1:
+            num_epochs = torch.tensor([self.expected_idx],dtype = torch.long,device=self.device) 
+            print(num_epochs)
+            dist.all_reduce(num_epochs, op=op)
+            self.expected_idx = int(num_epochs.item())
+    def _next_data(self):   
+        if self.current_pos >= self.dataset.len:
+            return self.input_dataset._get_empty()
+        if self.current_pos + self.batch_size > self.input_dataset.len:
+            if self.drop_last:
+                return None
+            else:
+                next_data = self.input_dataset.get_next(
+                    slice(self.current_pos,None,None)
+                )
+                self.current_pos = 0
+        else:
+            next_data = self.input_dataset.get_next(
+                slice(self.current_pos,self.current_pos + self.batch_size,None)
+            )
+            self.current_pos += self.batch_size
+        return next_data
+    def __next__(self):
+<<<<<<< HEAD
+        if self.is_pipeline is False:
+=======
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+        if(dist.get_world_size() > 0):
+>>>>>>> cmy_dev
+            if self.recv_idxs < self.expected_idx:
+                data = self._next_data()
+                batch_data = graph_sample(self.graph,
+                                          self.sampler,
+                                          self.sampler_fn,
+                                          data,self.neg_sampler,
+                                          self.mailbox,
+                                          self.device)
+                self.recv_idxs += 1
+                assert batch_data is not None
+                end_event.record()
+                torch.cuda.synchronize()
+                sample_time = start_event.elapsed_time(end_event)
+                return *batch_data,sample_time
+            else :
+                raise StopIteration
+        else:
+            if self.recv_idxs == 0:
+                data = self._next_data()
+                batch_data = graph_sample(self.graph,
+                                          self.sampler,
+                                          self.sampler_fn,
+                                          data,self.neg_sampler,
+                                          self.mailbox,
+                                          self.device)
+                self.recv_idxs += 1
+            else:
+                if(self.recv_idxs < self.expected_idx):
+                    assert len(self.result_queue) > 0
+                    result= self.result_queue[0]
+                    self.result_queue.popleft()
+                    batch_data = result.result()
+                    self.recv_idxs += 1
+                else:
+                    raise StopIteration
+            if(self.recv_idxs+1<=self.expected_idx):
+                data = self._next_data()
+                next_batch =  graph_sample(self.graph,
+                                          self.sampler,
+                                          self.sampler_fn,
+                                          data,self.neg_sampler,
+                                          self.mailbox,
+                                          self.device,
+                                          async_op=True)
+                self.result_queue.append(next_batch)     
+        return batch_data
--- a/.history/starrygl/sample/data_loader_20240108194127.py
+++ b/.history/starrygl/sample/data_loader_20240108194127.py
+from collections import deque
+from enum import Enum
+import queue
+import torch
+import sys
+from os.path import abspath, join, dirname
+import numpy as np
+from starrygl.sample.batch_data import graph_sample
+from starrygl.sample.sample_core.PreNegSampling import PreNegativeSampling
+sys.path.insert(0, join(abspath(dirname(__file__))))
+from typing import Deque, Optional
+import torch.distributed as dist
+from torch_geometric.data import Data
+import os.path as osp
+import math
+class DistributedDataLoader:
+    ''' 
+    We will perform feature fetch in the data loader.
+    you can simply define a data loader for use, while starrygl assisting in fetching node or edge features:
+    Args:
+        graph: distributed graph store
+        data: the graph data
+        sampler: a parallel sampler like `NeighborSampler` above
+        sampler_fn: sample type
+        neg_sampler: negative sampler
+        batch_size: batch size
+        mailbox: APAN's mailbox and TGN's memory implemented by starrygl
+    Examples:
+        .. code-block:: python
+            import torch
+            from starrygl.sample.data_loader import DistributedDataLoader
+            from starrygl.sample.part_utils.partition_tgnn import partition_load
+            from starrygl.sample.graph_core import DataSet, DistributedGraphStore, TemporalNeighborSampleGraph
+            from starrygl.sample.memory.shared_mailbox import SharedMailBox
+            from starrygl.sample.sample_core.neighbor_sampler import NeighborSampler
+            from starrygl.sample.sample_core.base import NegativeSampling
+            from starrygl.sample.batch_data import SAMPLE_TYPE
+            pdata = partition_load("PATH/{}".format(dataname), algo="metis_for_tgnn")    
+            graph = DistributedGraphStore(pdata = pdata, uvm_edge = False, uvm_node = False)
+            sample_graph = TemporalNeighborSampleGraph(sample_graph = pdata.sample_graph,mode = 'full')
+            mailbox = SharedMailBox(pdata.ids.shape[0], memory_param, dim_edge_feat=pdata.edge_attr.shape[1] if pdata.  edge_attr is not None else 0)
+            sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=1, fanout=[10], graph_data=sample_graph,    workers=15,policy = 'recent',graph_name = "wiki_train")
+            neg_sampler = NegativeSampling('triplet')
+            train_data = torch.masked_select(graph.edge_index, pdata.train_mask.to(graph.edge_index.device)).reshape    (2, -1)
+            trainloader = DistributedDataLoader(graph, train_data, sampler=sampler, sampler_fn=SAMPLE_TYPE. SAMPLE_FROM_TEMPORAL_EDGES,neg_sampler=neg_sampler, batch_size=1000, shuffle=False, drop_last=True, chunk_size = None,train=True, mailbox=mailbox )
+    In the data loader, we will call the `graph_sample`, sourced from `starrygl.sample.batch_data`.
+    And the `to_block` function in the `graph_sample` will implement feature fetching.
+    If cache is not used, we will directly fetch node or edge features from the graph data, 
+    otherwise we will call `fetch_data` for feature fetching.     
+    '''
+    def __init__(
+            self,
+            graph,
+            dataset = None,
+            sampler = None,
+            sampler_fn = None,
+            neg_sampler = None,
+            batch_size: Optional[int]=None,
+            drop_last = False,
+            device: torch.device  = torch.device('cuda'),
+            shuffle:bool = True,
+            chunk_size = None,
+            train = False,
+            queue_size = 10,
+            mailbox = None,
+            is_pipeline = False,
+            **kwargs
+    ):
+        '''
+        Store the loader configuration and agree with the other ranks on the batch count.
+        Args:
+            graph: graph store forwarded to `graph_sample` for every batch
+            dataset: sample source; its `len` attribute is read here
+            sampler: neighbor sampler, must not be None
+            sampler_fn: sample type forwarded to `graph_sample`
+            neg_sampler: negative sampler forwarded to `graph_sample`
+            batch_size: number of samples per batch
+            drop_last: whether a trailing partial batch is dropped
+            device: device used for collective tensors and forwarded to `graph_sample`
+            shuffle: whether `__iter__` shuffles the dataset when `chunk_size` is None
+            chunk_size: when not None, `__iter__` starts from a random chunk-aligned offset
+            train: if True the batch count is reduced with MIN across ranks, otherwise with MAX
+            queue_size: maximum length of the prefetch queue
+            mailbox: mailbox/memory object forwarded to `graph_sample`
+            is_pipeline: whether `__next__` prefetches the following batch asynchronously
+            **kwargs: accepted and ignored
+        '''
+        assert sampler is not None
+        # Components forwarded to graph_sample on every batch.
+        self.graph = graph
+        self.sampler = sampler
+        self.sampler_fn = sampler_fn
+        self.neg_sampler = neg_sampler
+        self.mailbox = mailbox
+        self.device =  device
+        # Batching configuration.
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.chunk_size = chunk_size
+        self.drop_last = drop_last
+        self.shuffle = shuffle
+        self.is_pipeline = is_pipeline
+        # Iteration bookkeeping; __iter__ resets the counters at the start of every epoch.
+        self.current_pos = 0
+        self.recv_idxs = 0
+        self.num_pending = 0
+        self.is_closed = False
+        self.queue_size = queue_size
+        self.result_queue = deque(maxlen = queue_size)
+        # Training stops with the shortest rank, evaluation runs as long as the longest one.
+        reduce_op = dist.ReduceOp.MIN if train is True else dist.ReduceOp.MAX
+        self._get_expected_idx(self.dataset.len,op = reduce_op)
+        dist.barrier()
+    def __iter__(self):
+        '''
+        Reset the per-epoch state and return the loader as its own iterator.
+        Without `chunk_size` the dataset is optionally shuffled and reading starts
+        at position 0. With `chunk_size` the dataset is read in order from a random
+        offset (a multiple of `chunk_size`, below `batch_size`) that rank 0 draws and
+        broadcasts to all ranks; `expected_idx` is then recomputed for the shortened range.
+        '''
+        self.recv_idxs = 0
+        self.num_pending = 0
+        self.submitted = 0
+        if self.chunk_size is None:
+            self.input_dataset = self.dataset.shuffle() if self.shuffle else self.dataset
+            self.current_pos = 0
+        else:
+            self.input_dataset = self.dataset
+            # get_rank has to be called: comparing the function object itself with 0
+            # is always False, which made every epoch start at offset 0.
+            if dist.get_rank() == 0:
+                self.current_pos = int(
+                    math.floor(
+                        np.random.uniform(0,self.batch_size/self.chunk_size)
+                    )*self.chunk_size
+                )
+            else:
+                self.current_pos = 0
+            # Every rank adopts the offset chosen by rank 0.
+            current_pos = torch.tensor([self.current_pos],dtype = torch.long,device=self.device)
+            dist.broadcast(current_pos, src = 0)
+            self.current_pos = int(current_pos.item())
+            self._get_expected_idx(self.dataset.len-self.current_pos)
+        if self.neg_sampler is not None \
+            and isinstance(self.neg_sampler,PreNegativeSampling):
+            # Keep pre-generated negatives aligned with the starting position.
+            self.neg_sampler.set_next_pos(self.current_pos)
+        return self
+    def _get_expected_idx(self,data_size,op = dist.ReduceOp.MIN):
+        '''
+        Compute the number of batches per epoch and store it in `self.expected_idx`.
+        Args:
+            data_size: number of local samples to cover
+            op: reduction used to agree on a single count across ranks (MIN by default)
+        '''
+        # Truthiness (not `is True`) so the count matches the drop_last test in _next_data.
+        if self.drop_last:
+            num_batches = data_size // self.batch_size
+        else:
+            num_batches = int(math.ceil(data_size/self.batch_size))
+        self.expected_idx = num_batches
+        if dist.get_world_size() > 1:
+            # All ranks must run the same number of iterations, so reduce the local counts.
+            agreed = torch.tensor([num_batches],dtype = torch.long,device=self.device)
+            dist.all_reduce(agreed, op=op)
+            self.expected_idx = int(agreed.item())
+    def _next_data(self):
+        '''
+        Return the next raw slice of `input_dataset` and advance `current_pos`.
+        Returns:
+            `input_dataset._get_empty()` once the position has reached the end,
+            None when the remaining tail is shorter than a batch and `drop_last` is set,
+            otherwise the result of `input_dataset.get_next` for the next slice.
+        '''
+        if self.current_pos >= self.dataset.len:
+            return self.input_dataset._get_empty()
+        batch_end = self.current_pos + self.batch_size
+        if batch_end <= self.input_dataset.len:
+            next_data = self.input_dataset.get_next(
+                slice(self.current_pos,batch_end,None)
+            )
+            self.current_pos = batch_end
+            return next_data
+        if self.drop_last:
+            return None
+        next_data = self.input_dataset.get_next(
+            slice(self.current_pos,None,None)
+        )
+        # Park the cursor at the end rather than rewinding to 0: a rank that is asked
+        # for further batches must get empty data, not the start of the dataset again.
+        self.current_pos = self.input_dataset.len
+        return next_data
+    def __next__(self):
+        '''
+        Return the next sampled batch, or raise StopIteration after `expected_idx` batches.
+        In non-pipeline mode the batch is sampled synchronously and returned as a tuple
+        with the elapsed sampling time (CUDA event time, milliseconds) appended.
+        In pipeline mode the first batch is sampled synchronously, later ones are taken
+        from `result_queue`, and the following batch is submitted with `async_op=True`
+        before returning; no timing is appended in this mode.
+        '''
+        # One stop check for both modes; pipeline mode used to sample a first batch
+        # even when expected_idx was 0.
+        if self.recv_idxs >= self.expected_idx:
+            raise StopIteration
+        if self.is_pipeline is False:
+            # The timing events were referenced but never created, which raised NameError.
+            start_event = torch.cuda.Event(enable_timing=True)
+            end_event = torch.cuda.Event(enable_timing=True)
+            start_event.record()
+            data = self._next_data()
+            batch_data = graph_sample(self.graph,
+                                      self.sampler,
+                                      self.sampler_fn,
+                                      data,self.neg_sampler,
+                                      self.mailbox,
+                                      self.device)
+            self.recv_idxs += 1
+            assert batch_data is not None
+            end_event.record()
+            # elapsed_time is only valid once both events have completed.
+            torch.cuda.synchronize()
+            sample_time = start_event.elapsed_time(end_event)
+            return (*batch_data, sample_time)
+        if self.recv_idxs == 0:
+            # Nothing has been prefetched yet, so the first batch is sampled inline.
+            data = self._next_data()
+            batch_data = graph_sample(self.graph,
+                                      self.sampler,
+                                      self.sampler_fn,
+                                      data,self.neg_sampler,
+                                      self.mailbox,
+                                      self.device)
+        else:
+            assert len(self.result_queue) > 0
+            # presumably a future-like handle from graph_sample(async_op=True) - confirm
+            batch_data = self.result_queue.popleft().result()
+        self.recv_idxs += 1
+        if self.recv_idxs < self.expected_idx:
+            # Submit the following batch now so it overlaps with the caller's work.
+            data = self._next_data()
+            next_batch =  graph_sample(self.graph,
+                                       self.sampler,
+                                       self.sampler_fn,
+                                       data,self.neg_sampler,
+                                       self.mailbox,
+                                       self.device,
+                                       async_op=True)
+            self.result_queue.append(next_batch)
+        return batch_data
--- a/.history/starrygl/sample/data_loader_20240108195550.py
+++ b/.history/starrygl/sample/data_loader_20240108195550.py
+from collections import deque
+from enum import Enum
+import queue
+import torch
+import sys
+from os.path import abspath, join, dirname
+import numpy as np
+from starrygl.sample.batch_data import graph_sample
+from starrygl.sample.sample_core.PreNegSampling import PreNegativeSampling
+sys.path.insert(0, join(abspath(dirname(__file__))))
+from typing import Deque, Optional
+import torch.distributed as dist
+from torch_geometric.data import Data
+import os.path as osp
+import math
+class DistributedDataLoader:
+    ''' 
+    We will perform feature fetch in the data loader.
+    you can simply define a data loader for use, while starrygl assisting in fetching node or edge features:
+    Args:
+        graph: distributed graph store
+        data: the graph data
+        sampler: a parallel sampler like `NeighborSampler` above
+        sampler_fn: sample type
+        neg_sampler: negative sampler
+        batch_size: batch size
+        mailbox: APAN's mailbox and TGN's memory implemented by starrygl
+    Examples:
+        .. code-block:: python
+            import torch
+            from starrygl.sample.data_loader import DistributedDataLoader
+            from starrygl.sample.part_utils.partition_tgnn import partition_load
+            from starrygl.sample.graph_core import DataSet, DistributedGraphStore, TemporalNeighborSampleGraph
+            from starrygl.sample.memory.shared_mailbox import SharedMailBox
+            from starrygl.sample.sample_core.neighbor_sampler import NeighborSampler
+            from starrygl.sample.sample_core.base import NegativeSampling
+            from starrygl.sample.batch_data import SAMPLE_TYPE
+            pdata = partition_load("PATH/{}".format(dataname), algo="metis_for_tgnn")    
+            graph = DistributedGraphStore(pdata = pdata, uvm_edge = False, uvm_node = False)
+            sample_graph = TemporalNeighborSampleGraph(sample_graph = pdata.sample_graph,mode = 'full')
+            mailbox = SharedMailBox(pdata.ids.shape[0], memory_param, dim_edge_feat=pdata.edge_attr.shape[1] if pdata.  edge_attr is not None else 0)
+            sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=1, fanout=[10], graph_data=sample_graph,    workers=15,policy = 'recent',graph_name = "wiki_train")
+            neg_sampler = NegativeSampling('triplet')
+            train_data = torch.masked_select(graph.edge_index, pdata.train_mask.to(graph.edge_index.device)).reshape    (2, -1)
+            trainloader = DistributedDataLoader(graph, train_data, sampler=sampler, sampler_fn=SAMPLE_TYPE. SAMPLE_FROM_TEMPORAL_EDGES,neg_sampler=neg_sampler, batch_size=1000, shuffle=False, drop_last=True, chunk_size = None,train=True, mailbox=mailbox )
+    In the data loader, we will call the `graph_sample`, sourced from `starrygl.sample.batch_data`.
+    And the `to_block` function in the `graph_sample` will implement feature fetching.
+    If cache is not used, we will directly fetch node or edge features from the graph data, 
+    otherwise we will call `fetch_data` for feature fetching.     
+    '''
+    def __init__(
+            self,
+            graph,
+            dataset = None,
+            sampler = None,
+            sampler_fn = None,
+            neg_sampler = None,
+            batch_size: Optional[int]=None,
+            drop_last = False,
+            device: torch.device  = torch.device('cuda'),
+            shuffle:bool = True,
+            chunk_size = None,
+            train = False,
+            queue_size = 10,
+            mailbox = None,
+            is_pipeline = False,
+            **kwargs
+    ):
+        '''
+        Store the loader configuration and agree with the other ranks on the batch count.
+        Args:
+            graph: graph store forwarded to `graph_sample` for every batch
+            dataset: sample source; its `len` attribute is read here
+            sampler: neighbor sampler, must not be None
+            sampler_fn: sample type forwarded to `graph_sample`
+            neg_sampler: negative sampler forwarded to `graph_sample`
+            batch_size: number of samples per batch
+            drop_last: whether a trailing partial batch is dropped
+            device: device used for collective tensors and forwarded to `graph_sample`
+            shuffle: whether `__iter__` shuffles the dataset when `chunk_size` is None
+            chunk_size: when not None, `__iter__` starts from a random chunk-aligned offset
+            train: if True the batch count is reduced with MIN across ranks, otherwise with MAX
+            queue_size: maximum length of the prefetch queue
+            mailbox: mailbox/memory object forwarded to `graph_sample`
+            is_pipeline: whether `__next__` prefetches the following batch asynchronously
+            **kwargs: accepted and ignored
+        '''
+        assert sampler is not None
+        # Components forwarded to graph_sample on every batch.
+        self.graph = graph
+        self.sampler = sampler
+        self.sampler_fn = sampler_fn
+        self.neg_sampler = neg_sampler
+        self.mailbox = mailbox
+        self.device =  device
+        # Batching configuration.
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.chunk_size = chunk_size
+        self.drop_last = drop_last
+        self.shuffle = shuffle
+        self.is_pipeline = is_pipeline
+        # Iteration bookkeeping; __iter__ resets the counters at the start of every epoch.
+        self.current_pos = 0
+        self.recv_idxs = 0
+        self.num_pending = 0
+        self.is_closed = False
+        self.queue_size = queue_size
+        self.result_queue = deque(maxlen = queue_size)
+        # Training stops with the shortest rank, evaluation runs as long as the longest one.
+        reduce_op = dist.ReduceOp.MIN if train is True else dist.ReduceOp.MAX
+        self._get_expected_idx(self.dataset.len,op = reduce_op)
+        dist.barrier()
+    def __iter__(self):
+        '''
+        Reset the per-epoch state and return the loader as its own iterator.
+        Without `chunk_size` the dataset is optionally shuffled and reading starts
+        at position 0. With `chunk_size` the dataset is read in order from a random
+        offset (a multiple of `chunk_size`, below `batch_size`) that rank 0 draws and
+        broadcasts to all ranks; `expected_idx` is then recomputed for the shortened range.
+        '''
+        self.recv_idxs = 0
+        self.num_pending = 0
+        self.submitted = 0
+        if self.chunk_size is None:
+            self.input_dataset = self.dataset.shuffle() if self.shuffle else self.dataset
+            self.current_pos = 0
+        else:
+            self.input_dataset = self.dataset
+            # get_rank has to be called: comparing the function object itself with 0
+            # is always False, which made every epoch start at offset 0.
+            if dist.get_rank() == 0:
+                self.current_pos = int(
+                    math.floor(
+                        np.random.uniform(0,self.batch_size/self.chunk_size)
+                    )*self.chunk_size
+                )
+            else:
+                self.current_pos = 0
+            # Every rank adopts the offset chosen by rank 0.
+            current_pos = torch.tensor([self.current_pos],dtype = torch.long,device=self.device)
+            dist.broadcast(current_pos, src = 0)
+            self.current_pos = int(current_pos.item())
+            self._get_expected_idx(self.dataset.len-self.current_pos)
+        if self.neg_sampler is not None \
+            and isinstance(self.neg_sampler,PreNegativeSampling):
+            # Keep pre-generated negatives aligned with the starting position.
+            self.neg_sampler.set_next_pos(self.current_pos)
+        return self
+    def _get_expected_idx(self,data_size,op = dist.ReduceOp.MIN):
+        '''
+        Compute the number of batches per epoch and store it in `self.expected_idx`.
+        Args:
+            data_size: number of local samples to cover
+            op: reduction used to agree on a single count across ranks (MIN by default)
+        '''
+        # Truthiness (not `is True`) so the count matches the drop_last test in _next_data.
+        if self.drop_last:
+            num_batches = data_size // self.batch_size
+        else:
+            num_batches = int(math.ceil(data_size/self.batch_size))
+        self.expected_idx = num_batches
+        if dist.get_world_size() > 1:
+            # All ranks must run the same number of iterations, so reduce the local counts.
+            agreed = torch.tensor([num_batches],dtype = torch.long,device=self.device)
+            dist.all_reduce(agreed, op=op)
+            self.expected_idx = int(agreed.item())
+    def _next_data(self):
+        '''
+        Return the next raw slice of `input_dataset` and advance `current_pos`.
+        Returns:
+            `input_dataset._get_empty()` once the position has reached the end,
+            None when the remaining tail is shorter than a batch and `drop_last` is set,
+            otherwise the result of `input_dataset.get_next` for the next slice.
+        '''
+        if self.current_pos >= self.dataset.len:
+            return self.input_dataset._get_empty()
+        batch_end = self.current_pos + self.batch_size
+        if batch_end <= self.input_dataset.len:
+            next_data = self.input_dataset.get_next(
+                slice(self.current_pos,batch_end,None)
+            )
+            self.current_pos = batch_end
+            return next_data
+        if self.drop_last:
+            return None
+        next_data = self.input_dataset.get_next(
+            slice(self.current_pos,None,None)
+        )
+        # Park the cursor at the end rather than rewinding to 0: a rank that is asked
+        # for further batches must get empty data, not the start of the dataset again.
+        self.current_pos = self.input_dataset.len
+        return next_data
+    def __next__(self):
+        '''
+        Return the next sampled batch, or raise StopIteration after `expected_idx` batches.
+        In non-pipeline mode every batch is sampled synchronously.
+        In pipeline mode the first batch is sampled synchronously, later ones are taken
+        from `result_queue`, and the following batch is submitted with `async_op=True`
+        before returning.
+        '''
+        # One stop check for both modes; pipeline mode used to sample a first batch
+        # even when expected_idx was 0.
+        if self.recv_idxs >= self.expected_idx:
+            raise StopIteration
+        if self.is_pipeline is False:
+            data = self._next_data()
+            batch_data = graph_sample(self.graph,
+                                      self.sampler,
+                                      self.sampler_fn,
+                                      data,self.neg_sampler,
+                                      self.mailbox,
+                                      self.device)
+            self.recv_idxs += 1
+            assert batch_data is not None
+            # A bare `return *batch_data` is a SyntaxError; return the batch object itself.
+            return batch_data
+        if self.recv_idxs == 0:
+            # Nothing has been prefetched yet, so the first batch is sampled inline.
+            data = self._next_data()
+            batch_data = graph_sample(self.graph,
+                                      self.sampler,
+                                      self.sampler_fn,
+                                      data,self.neg_sampler,
+                                      self.mailbox,
+                                      self.device)
+        else:
+            assert len(self.result_queue) > 0
+            # presumably a future-like handle from graph_sample(async_op=True) - confirm
+            batch_data = self.result_queue.popleft().result()
+        self.recv_idxs += 1
+        if self.recv_idxs < self.expected_idx:
+            # Submit the following batch now so it overlaps with the caller's work.
+            data = self._next_data()
+            next_batch =  graph_sample(self.graph,
+                                       self.sampler,
+                                       self.sampler_fn,
+                                       data,self.neg_sampler,
+                                       self.mailbox,
+                                       self.device,
+                                       async_op=True)
+            self.result_queue.append(next_batch)
+        return batch_data
--- a/.history/starrygl/sample/data_loader_20240108195552.py
+++ b/.history/starrygl/sample/data_loader_20240108195552.py
+from collections import deque
+from enum import Enum
+import queue
+import torch
+import sys
+from os.path import abspath, join, dirname
+import numpy as np
+from starrygl.sample.batch_data import graph_sample
+from starrygl.sample.sample_core.PreNegSampling import PreNegativeSampling
+sys.path.insert(0, join(abspath(dirname(__file__))))
+from typing import Deque, Optional
+import torch.distributed as dist
+from torch_geometric.data import Data
+import os.path as osp
+import math
+class DistributedDataLoader:
+    ''' 
+    We will perform feature fetch in the data loader.
+    you can simply define a data loader for use, while starrygl assisting in fetching node or edge features:
+    Args:
+        graph: distributed graph store
+        data: the graph data
+        sampler: a parallel sampler like `NeighborSampler` above
+        sampler_fn: sample type
+        neg_sampler: negative sampler
+        batch_size: batch size
+        mailbox: APAN's mailbox and TGN's memory implemented by starrygl
+    Examples:
+        .. code-block:: python
+            import torch
+            from starrygl.sample.data_loader import DistributedDataLoader
+            from starrygl.sample.part_utils.partition_tgnn import partition_load
+            from starrygl.sample.graph_core import DataSet, DistributedGraphStore, TemporalNeighborSampleGraph
+            from starrygl.sample.memory.shared_mailbox import SharedMailBox
+            from starrygl.sample.sample_core.neighbor_sampler import NeighborSampler
+            from starrygl.sample.sample_core.base import NegativeSampling
+            from starrygl.sample.batch_data import SAMPLE_TYPE
+            pdata = partition_load("PATH/{}".format(dataname), algo="metis_for_tgnn")    
+            graph = DistributedGraphStore(pdata = pdata, uvm_edge = False, uvm_node = False)
+            sample_graph = TemporalNeighborSampleGraph(sample_graph = pdata.sample_graph,mode = 'full')
+            mailbox = SharedMailBox(pdata.ids.shape[0], memory_param, dim_edge_feat=pdata.edge_attr.shape[1] if pdata.  edge_attr is not None else 0)
+            sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=1, fanout=[10], graph_data=sample_graph,    workers=15,policy = 'recent',graph_name = "wiki_train")
+            neg_sampler = NegativeSampling('triplet')
+            train_data = torch.masked_select(graph.edge_index, pdata.train_mask.to(graph.edge_index.device)).reshape    (2, -1)
+            trainloader = DistributedDataLoader(graph, train_data, sampler=sampler, sampler_fn=SAMPLE_TYPE. SAMPLE_FROM_TEMPORAL_EDGES,neg_sampler=neg_sampler, batch_size=1000, shuffle=False, drop_last=True, chunk_size = None,train=True, mailbox=mailbox )
+    In the data loader, we will call the `graph_sample`, sourced from `starrygl.sample.batch_data`.
+    And the `to_block` function in the `graph_sample` will implement feature fetching.
+    If cache is not used, we will directly fetch node or edge features from the graph data, 
+    otherwise we will call `fetch_data` for feature fetching.     
+    '''
+    def __init__(
+            self,
+            graph,
+            dataset = None,
+            sampler = None,
+            sampler_fn = None,
+            neg_sampler = None,
+            batch_size: Optional[int]=None,
+            drop_last = False,
+            device: torch.device  = torch.device('cuda'),
+            shuffle:bool = True,
+            chunk_size = None,
+            train = False,
+            queue_size = 10,
+            mailbox = None,
+            is_pipeline = False,
+            **kwargs
+    ):
+        '''
+        Store the loader configuration and agree with the other ranks on the batch count.
+        Args:
+            graph: graph store forwarded to `graph_sample` for every batch
+            dataset: sample source; its `len` attribute is read here
+            sampler: neighbor sampler, must not be None
+            sampler_fn: sample type forwarded to `graph_sample`
+            neg_sampler: negative sampler forwarded to `graph_sample`
+            batch_size: number of samples per batch
+            drop_last: whether a trailing partial batch is dropped
+            device: device used for collective tensors and forwarded to `graph_sample`
+            shuffle: whether `__iter__` shuffles the dataset when `chunk_size` is None
+            chunk_size: when not None, `__iter__` starts from a random chunk-aligned offset
+            train: if True the batch count is reduced with MIN across ranks, otherwise with MAX
+            queue_size: maximum length of the prefetch queue
+            mailbox: mailbox/memory object forwarded to `graph_sample`
+            is_pipeline: whether `__next__` prefetches the following batch asynchronously
+            **kwargs: accepted and ignored
+        '''
+        assert sampler is not None
+        # Components forwarded to graph_sample on every batch.
+        self.graph = graph
+        self.sampler = sampler
+        self.sampler_fn = sampler_fn
+        self.neg_sampler = neg_sampler
+        self.mailbox = mailbox
+        self.device =  device
+        # Batching configuration.
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.chunk_size = chunk_size
+        self.drop_last = drop_last
+        self.shuffle = shuffle
+        self.is_pipeline = is_pipeline
+        # Iteration bookkeeping; __iter__ resets the counters at the start of every epoch.
+        self.current_pos = 0
+        self.recv_idxs = 0
+        self.num_pending = 0
+        self.is_closed = False
+        self.queue_size = queue_size
+        self.result_queue = deque(maxlen = queue_size)
+        # Training stops with the shortest rank, evaluation runs as long as the longest one.
+        reduce_op = dist.ReduceOp.MIN if train is True else dist.ReduceOp.MAX
+        self._get_expected_idx(self.dataset.len,op = reduce_op)
+        dist.barrier()
+    def __iter__(self):
+        '''
+        Reset the per-epoch state and return the loader as its own iterator.
+        Without `chunk_size` the dataset is optionally shuffled and reading starts
+        at position 0. With `chunk_size` the dataset is read in order from a random
+        offset (a multiple of `chunk_size`, below `batch_size`) that rank 0 draws and
+        broadcasts to all ranks; `expected_idx` is then recomputed for the shortened range.
+        '''
+        self.recv_idxs = 0
+        self.num_pending = 0
+        self.submitted = 0
+        if self.chunk_size is None:
+            self.input_dataset = self.dataset.shuffle() if self.shuffle else self.dataset
+            self.current_pos = 0
+        else:
+            self.input_dataset = self.dataset
+            # get_rank has to be called: comparing the function object itself with 0
+            # is always False, which made every epoch start at offset 0.
+            if dist.get_rank() == 0:
+                self.current_pos = int(
+                    math.floor(
+                        np.random.uniform(0,self.batch_size/self.chunk_size)
+                    )*self.chunk_size
+                )
+            else:
+                self.current_pos = 0
+            # Every rank adopts the offset chosen by rank 0.
+            current_pos = torch.tensor([self.current_pos],dtype = torch.long,device=self.device)
+            dist.broadcast(current_pos, src = 0)
+            self.current_pos = int(current_pos.item())
+            self._get_expected_idx(self.dataset.len-self.current_pos)
+        if self.neg_sampler is not None \
+            and isinstance(self.neg_sampler,PreNegativeSampling):
+            # Keep pre-generated negatives aligned with the starting position.
+            self.neg_sampler.set_next_pos(self.current_pos)
+        return self
+    def _get_expected_idx(self,data_size,op = dist.ReduceOp.MIN):
+        '''
+        Compute the number of batches per epoch and store it in `self.expected_idx`.
+        Args:
+            data_size: number of local samples to cover
+            op: reduction used to agree on a single count across ranks (MIN by default)
+        '''
+        # Truthiness (not `is True`) so the count matches the drop_last test in _next_data.
+        if self.drop_last:
+            num_batches = data_size // self.batch_size
+        else:
+            num_batches = int(math.ceil(data_size/self.batch_size))
+        self.expected_idx = num_batches
+        if dist.get_world_size() > 1:
+            # All ranks must run the same number of iterations, so reduce the local counts.
+            agreed = torch.tensor([num_batches],dtype = torch.long,device=self.device)
+            dist.all_reduce(agreed, op=op)
+            self.expected_idx = int(agreed.item())
+    def _next_data(self):
+        '''
+        Return the next raw slice of `input_dataset` and advance `current_pos`.
+        Returns:
+            `input_dataset._get_empty()` once the position has reached the end,
+            None when the remaining tail is shorter than a batch and `drop_last` is set,
+            otherwise the result of `input_dataset.get_next` for the next slice.
+        '''
+        if self.current_pos >= self.dataset.len:
+            return self.input_dataset._get_empty()
+        batch_end = self.current_pos + self.batch_size
+        if batch_end <= self.input_dataset.len:
+            next_data = self.input_dataset.get_next(
+                slice(self.current_pos,batch_end,None)
+            )
+            self.current_pos = batch_end
+            return next_data
+        if self.drop_last:
+            return None
+        next_data = self.input_dataset.get_next(
+            slice(self.current_pos,None,None)
+        )
+        # Park the cursor at the end rather than rewinding to 0: a rank that is asked
+        # for further batches must get empty data, not the start of the dataset again.
+        self.current_pos = self.input_dataset.len
+        return next_data
+    def __next__(self):
+        '''
+        Return the next sampled batch, or raise StopIteration after `expected_idx` batches.
+        In non-pipeline mode every batch is sampled synchronously.
+        In pipeline mode the first batch is sampled synchronously, later ones are taken
+        from `result_queue`, and the following batch is submitted with `async_op=True`
+        before returning.
+        '''
+        # One stop check for both modes; pipeline mode used to sample a first batch
+        # even when expected_idx was 0.
+        if self.recv_idxs >= self.expected_idx:
+            raise StopIteration
+        if self.is_pipeline is False:
+            data = self._next_data()
+            batch_data = graph_sample(self.graph,
+                                      self.sampler,
+                                      self.sampler_fn,
+                                      data,self.neg_sampler,
+                                      self.mailbox,
+                                      self.device)
+            self.recv_idxs += 1
+            assert batch_data is not None
+            return batch_data
+        if self.recv_idxs == 0:
+            # Nothing has been prefetched yet, so the first batch is sampled inline.
+            data = self._next_data()
+            batch_data = graph_sample(self.graph,
+                                      self.sampler,
+                                      self.sampler_fn,
+                                      data,self.neg_sampler,
+                                      self.mailbox,
+                                      self.device)
+        else:
+            assert len(self.result_queue) > 0
+            # presumably a future-like handle from graph_sample(async_op=True) - confirm
+            batch_data = self.result_queue.popleft().result()
+        self.recv_idxs += 1
+        if self.recv_idxs < self.expected_idx:
+            # Submit the following batch now so it overlaps with the caller's work.
+            data = self._next_data()
+            next_batch =  graph_sample(self.graph,
+                                       self.sampler,
+                                       self.sampler_fn,
+                                       data,self.neg_sampler,
+                                       self.mailbox,
+                                       self.device,
+                                       async_op=True)
+            self.result_queue.append(next_batch)
+        return batch_data
--- a/.history/starrygl/sample/data_loader_20240108195557.py
+++ b/.history/starrygl/sample/data_loader_20240108195557.py
+from collections import deque
+from enum import Enum
+import queue
+import torch
+import sys
+from os.path import abspath, join, dirname
+import numpy as np
+from starrygl.sample.batch_data import graph_sample
+from starrygl.sample.sample_core.PreNegSampling import PreNegativeSampling
+sys.path.insert(0, join(abspath(dirname(__file__))))
+from typing import Deque, Optional
+import torch.distributed as dist
+from torch_geometric.data import Data
+import os.path as osp
+import math
+class DistributedDataLoader:
+    ''' 
+    We will perform feature fetch in the data loader.
+    you can simply define a data loader for use, while starrygl assisting in fetching node or edge features:
+    Args:
+        graph: distributed graph store
+        data: the graph data
+        sampler: a parallel sampler like `NeighborSampler` above
+        sampler_fn: sample type
+        neg_sampler: negative sampler
+        batch_size: batch size
+        mailbox: APAN's mailbox and TGN's memory implemented by starrygl
+    Examples:
+        .. code-block:: python
+            import torch
+            from starrygl.sample.data_loader import DistributedDataLoader
+            from starrygl.sample.part_utils.partition_tgnn import partition_load
+            from starrygl.sample.graph_core import DataSet, DistributedGraphStore, TemporalNeighborSampleGraph
+            from starrygl.sample.memory.shared_mailbox import SharedMailBox
+            from starrygl.sample.sample_core.neighbor_sampler import NeighborSampler
+            from starrygl.sample.sample_core.base import NegativeSampling
+            from starrygl.sample.batch_data import SAMPLE_TYPE
+            pdata = partition_load("PATH/{}".format(dataname), algo="metis_for_tgnn")    
+            graph = DistributedGraphStore(pdata = pdata, uvm_edge = False, uvm_node = False)
+            sample_graph = TemporalNeighborSampleGraph(sample_graph = pdata.sample_graph,mode = 'full')
+            mailbox = SharedMailBox(pdata.ids.shape[0], memory_param, dim_edge_feat=pdata.edge_attr.shape[1] if pdata.  edge_attr is not None else 0)
+            sampler = NeighborSampler(num_nodes=graph.num_nodes, num_layers=1, fanout=[10], graph_data=sample_graph,    workers=15,policy = 'recent',graph_name = "wiki_train")
+            neg_sampler = NegativeSampling('triplet')
+            train_data = torch.masked_select(graph.edge_index, pdata.train_mask.to(graph.edge_index.device)).reshape    (2, -1)
+            trainloader = DistributedDataLoader(graph, train_data, sampler=sampler, sampler_fn=SAMPLE_TYPE. SAMPLE_FROM_TEMPORAL_EDGES,neg_sampler=neg_sampler, batch_size=1000, shuffle=False, drop_last=True, chunk_size = None,train=True, mailbox=mailbox )
+    In the data loader, we will call the `graph_sample`, sourced from `starrygl.sample.batch_data`.
+    And the `to_block` function in the `graph_sample` will implement feature fetching.
+    If cache is not used, we will directly fetch node or edge features from the graph data, 
+    otherwise we will call `fetch_data` for feature fetching.     
+    '''
+    def __init__(
+            self,
+            graph,
+            dataset = None,
+            sampler = None,
+            sampler_fn = None,
+            neg_sampler = None,
+            batch_size: Optional[int]=None,
+            drop_last = False,
+            device: torch.device  = torch.device('cuda'),
+            shuffle:bool = True,
+            chunk_size = None,
+            train = False,
+            queue_size = 10,
+            mailbox = None,
+            is_pipeline = False,
+            **kwargs
+    ):
+        '''
+        Store the loader configuration and agree with the other ranks on the batch count.
+        Args:
+            graph: graph store forwarded to `graph_sample` for every batch
+            dataset: sample source; its `len` attribute is read here
+            sampler: neighbor sampler, must not be None
+            sampler_fn: sample type forwarded to `graph_sample`
+            neg_sampler: negative sampler forwarded to `graph_sample`
+            batch_size: number of samples per batch
+            drop_last: whether a trailing partial batch is dropped
+            device: device used for collective tensors and forwarded to `graph_sample`
+            shuffle: whether `__iter__` shuffles the dataset when `chunk_size` is None
+            chunk_size: when not None, `__iter__` starts from a random chunk-aligned offset
+            train: if True the batch count is reduced with MIN across ranks, otherwise with MAX
+            queue_size: maximum length of the prefetch queue
+            mailbox: mailbox/memory object forwarded to `graph_sample`
+            is_pipeline: whether `__next__` prefetches the following batch asynchronously
+            **kwargs: accepted and ignored
+        '''
+        assert sampler is not None
+        # Components forwarded to graph_sample on every batch.
+        self.graph = graph
+        self.sampler = sampler
+        self.sampler_fn = sampler_fn
+        self.neg_sampler = neg_sampler
+        self.mailbox = mailbox
+        self.device =  device
+        # Batching configuration.
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.chunk_size = chunk_size
+        self.drop_last = drop_last
+        self.shuffle = shuffle
+        self.is_pipeline = is_pipeline
+        # Iteration bookkeeping; __iter__ resets the counters at the start of every epoch.
+        self.current_pos = 0
+        self.recv_idxs = 0
+        self.num_pending = 0
+        self.is_closed = False
+        self.queue_size = queue_size
+        self.result_queue = deque(maxlen = queue_size)
+        # Training stops with the shortest rank, evaluation runs as long as the longest one.
+        reduce_op = dist.ReduceOp.MIN if train is True else dist.ReduceOp.MAX
+        self._get_expected_idx(self.dataset.len,op = reduce_op)
+        dist.barrier()
+    def __iter__(self):
+        if self.chunk_size is None:
+            if self.shuffle:
+                self.input_dataset = self.dataset.shuffle()
+            else:
+                self.input_dataset = self.dataset
+            self.recv_idxs = 0
+            self.current_pos = 0
+            self.num_pending = 0
+            self.submitted = 0
+        else:
+            self.input_dataset = self.dataset
+            self.recv_idxs = 0
+            self.num_pending = 0
+            self.submitted = 0
+            if dist.get_rank == 0:
+                self.current_pos = int(
+                    math.floor(
+                        np.random.uniform(0,self.batch_size/self.chunk_size)
+                    )*self.chunk_size
+                )
+            else:
+                self.current_pos = 0
+            current_pos = torch.tensor([self.current_pos],dtype = torch.long,device=self.device) 
+            dist.broadcast(current_pos, src = 0)
+            self.current_pos = int(current_pos.item())
+            self._get_expected_idx(self.dataset.len-self.current_pos)
+        if self.neg_sampler is not None \
+            and isinstance(self.neg_sampler,PreNegativeSampling):
+            self.neg_sampler.set_next_pos(self.current_pos)
+        return self
+    def _get_expected_idx(self,data_size,op = dist.ReduceOp.MIN):
+        world_size = dist.get_world_size()
+        self.expected_idx = data_size // self.batch_size if self.drop_last is True else int(math.ceil(data_size/self.batch_size))
+        if dist.get_world_size() > 1:
+            num_epochs = torch.tensor([self.expected_idx],dtype = torch.long,device=self.device) 
+            print(num_epochs)
+            dist.all_reduce(num_epochs, op=op)
+            self.expected_idx = int(num_epochs.item())
+    def _next_data(self):   
+        if self.current_pos >= self.dataset.len:
+            return self.input_dataset._get_empty()
+        if self.current_pos + self.batch_size > self.input_dataset.len:
+            if self.drop_last:
+                return None
+            else:
+                next_data = self.input_dataset.get_next(
+                    slice(self.current_pos,None,None)
+                )
+                self.current_pos = 0
+        else:
+            next_data = self.input_dataset.get_next(
+                slice(self.current_pos,self.current_pos + self.batch_size,None)
+            )
+            self.current_pos += self.batch_size
+        return next_data
+    def __next__(self):
+        '''
+        Return the next sampled batch, or raise StopIteration once
+        `self.expected_idx` batches have been delivered.
+        In non-pipeline mode each batch is sampled synchronously. In pipeline mode the
+        first batch is sampled synchronously, later batches are taken from
+        `self.result_queue`, and the following batch is submitted with `async_op=True`
+        before returning.
+        '''
+        if self.is_pipeline is False:
+            if self.recv_idxs < self.expected_idx:
+                data = self._next_data()
+                batch_data = graph_sample(self.graph,
+                                          self.sampler,
+                                          self.sampler_fn,
+                                          data,self.neg_sampler,
+                                          self.mailbox,
+                                          self.device)
+                self.recv_idxs += 1
+                assert batch_data is not None
+                return batch_data
+            else :
+                raise StopIteration
+        else:
+            if self.recv_idxs == 0:
+                # First batch of the epoch: nothing has been prefetched yet.
+                # NOTE(review): this branch does not check `expected_idx`, so a batch is
+                # sampled even when `expected_idx` is 0 - confirm that is acceptable.
+                data = self._next_data()
+                batch_data = graph_sample(self.graph,
+                                          self.sampler,
+                                          self.sampler_fn,
+                                          data,self.neg_sampler,
+                                          self.mailbox,
+                                          self.device)
+                self.recv_idxs += 1
+            else:
+                if(self.recv_idxs < self.expected_idx):
+                    # Take the oldest pending handle and obtain its batch via `.result()`.
+                    assert len(self.result_queue) > 0
+                    result= self.result_queue[0]
+                    self.result_queue.popleft()
+                    batch_data = result.result()
+                    self.recv_idxs += 1
+                else:
+                    raise StopIteration
+            if(self.recv_idxs+1<=self.expected_idx):
+                # Another batch remains: submit it asynchronously and queue its handle.
+                data = self._next_data()
+                next_batch =  graph_sample(self.graph,
+                                          self.sampler,
+                                          self.sampler_fn,
+                                          data,self.neg_sampler,
+                                          self.mailbox,
+                                          self.device,
+                                          async_op=True)
+                self.result_queue.append(next_batch)     
+        return batch_data
--- a/.history/starrygl/sample/graph_core/__init___20240108194326.py
+++ b/.history/starrygl/sample/graph_core/__init___20240108194326.py
+import starrygl
+from starrygl.distributed.context import DistributedContext
+from starrygl.distributed.utils import DistIndex, DistributedTensor
+from starrygl.sample.graph_core.utils import build_mapper
+import os.path as osp
+import torch
+import torch.distributed as dist
+from torch_geometric.data import Data
+class DistributedGraphStore:
+    '''
+    Initializes the DistributedGraphStore with distributed graph data.
+    Args:
+        pdata: Graph data object containing ids, eids, edge_index, edge_ts, sample_graph, x, and edge_attr.
+        device: Device to which tensors are moved (default is 'cuda').
+        uvm_node: If True, enables Unified Virtual Memory (UVM) for node data.
+        uvm_edge: If True, enables Unified Virtual Memory (UVM) for edge data.
+    '''
+    def __init__(self, pdata, device = torch.device('cuda'),
+                 uvm_node = False, 
+                 uvm_edge = False):
+        self.device = device
+        self.ids = pdata.ids.to(device)
+        self.eids = pdata.eids
+        self.edge_index = pdata.edge_index.to(device)
+        if hasattr(pdata,'edge_ts'):
+            self.edge_ts = pdata.edge_ts.to(device).to(torch.float)
+        else:
+            self.edge_ts = None
+        self.sample_graph = pdata.sample_graph
+        self.nids_mapper = build_mapper(nids=pdata.ids.to(device)).dist.to('cpu')
+        self.eids_mapper = build_mapper(nids=pdata.eids.to(device)).dist.to('cpu')
+        torch.cuda.empty_cache()
+        self.num_nodes = self.nids_mapper.data.shape[0]
+        self.num_edges = self.eids_mapper.data.shape[0]
+        world_size = dist.get_world_size()
+        self.uvm_node = uvm_node
+        self.uvm_edge = uvm_edge
+        if hasattr(pdata,'x') and pdata.x is not None:
+            pdata.x = pdata.x.to(torch.float)
+            if uvm_node == False :
+                x = pdata.x.to(self.device)
+            else:
+                if self.device.type == 'cuda':
+                    x = starrygl.utils.uvm.uvm_empty(*pdata.x.size(),
+                                    dtype=pdata.x.dtype,
+                                    device=ctx.device)
+                    starrygl.utils.uvm.uvm_share(x,device = ctx.device)
+                    starrygl.utils.uvm.uvm_advise(x,starrygl.utils.uvm.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
+                    starrygl.utils.uvm.uvm_prefetch(x)
+            if world_size > 1:
+                self.x = DistributedTensor(pdata.x.to(self.device).to(torch.float))
+            else:
+                self.x = x
+        else:
+            self.x = None
+        if hasattr(pdata,'edge_attr') and pdata.edge_attr is not None:
+            ctx = DistributedContext.get_default_context()
+            pdata.edge_attr = pdata.edge_attr.to(torch.float)
+            if uvm_edge == False :
+                edge_attr = pdata.edge_attr.to(self.device)
+            else:
+                if self.device.type == 'cuda':
+                    edge_attr = starrygl.utils.uvm.uvm_empty(*pdata.edge_attr.size(),
+                                    dtype=pdata.edge_attr.dtype,
+                                    device=ctx.device)
+                    starrygl.utils.uvm.uvm_share(edge_attr,device = ctx.device)
+                    starrygl.utils.uvm.uvm_advise(edge_attr,starrygl.utils.uvm.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
+                    starrygl.utils.uvm.uvm_prefetch(edge_attr)
+            if world_size > 1:
+                self.edge_attr = DistributedTensor(edge_attr)
+            else:
+                self.edge_attr = edge_attr
+        else:
+            self.edge_attr = None
+    def _get_node_attr(self,ids,asyncOp = False):
+        '''
+        Retrieves node attributes for the specified node IDs.
+        Args:
+            ids: Node IDs for which to retrieve attributes.
+            asyncOp: If True, performs asynchronous operation for distributed data.
+        '''
+        if self.x is None:
+            return None
+        elif dist.get_world_size() == 1:
+            return self.x[ids]
+        else:
+            if self.x.rrefs is None or asyncOp is False:
+                ids = self.x.all_to_all_ind2ptr(ids)
+                return self.x.all_to_all_get(**ids)
+            return self.x.index_select(ids)
+    def _get_edge_attr(self,ids,asyncOp = False):
+        '''
+        Retrieves edge attributes for the specified edge IDs.
+        Args:
+            ids: Edge IDs for which to retrieve attributes.
+            asyncOp: If True, performs asynchronous operation for distributed data.
+        '''
+        if self.edge_attr is None:
+            return None
+        elif dist.get_world_size() == 1:
+            return self.edge_attr[ids]
+        else:
+            if self.edge_attr.rrefs is None or asyncOp is False:
+                ids = self.edge_attr.all_to_all_ind2ptr(ids)
+                return self.edge_attr.all_to_all_get(**ids)
+            return self.edge_attr.index_select(ids)
+    def _get_dist_index(self,ind,mapper):
+        '''
+        Retrieves the distributed index for the specified local index using the provided mapper.
+        Args:
+            ind: Local index for which to retrieve the distributed index.
+            mapper: Mapper providing the distributed index.
+        '''
+        return mapper[ind.to(mapper.device)]
+class DataSet:
+    '''
+    Args:
+        nodes: Tensor representing nodes. If not None, it is moved to the specified device.
+        edges: Tensor representing edges. If not None, it is moved to the specified device.
+        labels: Optional parameter for labels.
+        ts: Tensor representing timestamps. If not None, it is moved to the specified device.
+        device: Device to which tensors are moved (default is 'cuda').
+    '''
+    def __init__(self,nodes = None,
+                 edges = None,
+                 labels = None, 
+                 ts = None, 
+                 device = torch.device('cuda'),**kwargs):
+        if nodes is not None:
+            self.nodes = nodes.to(device)
+        if edges is not None:
+            self.edges = edges.to(device)
+        if ts is not None:
+            self.ts = ts.to(device)
+        if labels is not None:
+            self.labels = labels
+        self.len = self.nodes.shape[0] if nodes is not None else self.edges.shape[1] 
+        for k, v in kwargs.items():
+            assert isinstance(v,torch.Tensor) and v.shape[0]==self.len
+            setattr(self, k, v.to(device))
+    def _get_empty(self):
+        '''
+        Creates an empty dataset with the same device and data types as the current instance.
+        '''
+        nodes = torch.empty([],dtype = self.nodes.dtype,device= self.nodes.device)if hasattr(self,'nodes') else None
+        edges = torch.empty([[],[]],dtype = self.edges.dtype,device= self.edge.device)if hasattr(self,'edges') else None
+        d = DataSet(nodes,edges)
+        for k,v in self.__dict__.items():
+            if k == 'edges' or k=='nodes' or k == 'len':
+                continue
+            else:
+                setattr(d,k,torch.empty([]))
+        return d
+    #@staticmethod
+    def get_next(self,indx):
+        '''
+        Retrieves the next dataset based on the provided index.
+        Args:
+            indx: Index specifying the dataset to retrieve.
+        '''
+        nodes = self.nodes[indx] if hasattr(self,'nodes') else None
+        edges = self.edges[:,indx] if hasattr(self,'edges') else None
+        d = DataSet(nodes,edges)
+        for k,v in self.__dict__.items():
+            if k == 'edges' or k=='nodes' or k == 'len':
+                continue
+            else:
+                setattr(d,k,v[indx])
+        return d
+    #@staticmethod
+    def shuffle(self):
+        '''
+        Shuffles the dataset and returns a new dataset with the same attributes.
+        '''
+        indx = torch.randperm(self.len)
+        nodes = self.nodes[indx] if hasattr(self,'nodes') else None
+        edges = self.edges[:,indx] if hasattr(self,'edges') else None
+        d = DataSet(nodes,edges)
+        for k,v in self.__dict__.items():
+            if k == 'edges' or k=='nodes' or k == 'len':
+                continue
+            else:
+                setattr(d,k,v[indx])
+        return d
+class TemporalGraphData(DistributedGraphStore):
+    def __init__(self,pdata,device):
+        super(DistributedTempoGraphData,self).__init__(pdata,device)
+    def _set_temporal_batch_cache(self,size,pin_size):
+        pass
+    def _load_feature_to_cuda(self,ids):
+        pass
+class TemporalNeighborSampleGraph(DistributedGraphStore):
+    '''
+    Args:
+        sample_graph: A dictionary containing graph structure information, including 'edge_index', 'ts' (edge timestamp), and 'eids' (edge identifiers).
+        mode: Specifies the dataset mode ('train', 'val', 'test', or 'full').
+        eids_mapper: Optional parameter for edge identifiers mapping.
+    '''
+    def __init__(self, sample_graph=None, mode='full', eids_mapper=None):
+        self.edge_index = sample_graph['edge_index']
+        self.num_edges = self.edge_index.shape[1]
+        if 'ts' in sample_graph:
+            self.edge_ts = sample_graph['ts']
+        else:
+            self.edge_ts = None
+        self.eid = sample_graph['eids']
+        if mode == 'train':
+            mask = sample_graph['train_mask']
+        if mode == 'val':
+            mask = sample_graph['val_mask']
+        if mode == 'test':
+            mask = sample_graph['test_mask']
+        if mode != 'full':
+            self.edge_index = self.edge_index[:, mask]
+            self.edge_ts = self.edge_ts[mask]
+            self.eid = self.eid[mask]  
--- a/.history/starrygl/sample/graph_core/__init___20240108195206.py
+++ b/.history/starrygl/sample/graph_core/__init___20240108195206.py
+import starrygl
+from starrygl.distributed.context import DistributedContext
+from starrygl.distributed.utils import DistIndex, DistributedTensor
+from starrygl.sample.graph_core.utils import build_mapper
+import os.path as osp
+import torch
+import torch.distributed as dist
+from torch_geometric.data import Data
+class DistributedGraphStore:
+    '''
+    Initializes the DistributedGraphStore with distributed graph data.
+    Args:
+        pdata: Graph data object containing ids, eids, edge_index, edge_ts, sample_graph, x, and edge_attr.
+        device: Device to which tensors are moved (default is 'cuda').
+        uvm_node: If True, enables Unified Virtual Memory (UVM) for node data.
+        uvm_edge: If True, enables Unified Virtual Memory (UVM) for edge data.
+    '''
+    def __init__(self, pdata, device = torch.device('cuda'),
+                 uvm_node = False, 
+                 uvm_edge = False):
+        self.device = device
+        self.ids = pdata.ids.to(device)
+        self.eids = pdata.eids
+        self.edge_index = pdata.edge_index.to(device)
+        if hasattr(pdata,'edge_ts'):
+            self.edge_ts = pdata.edge_ts.to(device).to(torch.float)
+        else:
+            self.edge_ts = None
+        self.sample_graph = pdata.sample_graph
+        self.nids_mapper = build_mapper(nids=pdata.ids.to(device)).dist.to('cpu')
+        self.eids_mapper = build_mapper(nids=pdata.eids.to(device)).dist.to('cpu')
+        torch.cuda.empty_cache()
+        self.num_nodes = self.nids_mapper.data.shape[0]
+        self.num_edges = self.eids_mapper.data.shape[0]
+        world_size = dist.get_world_size()
+        self.uvm_node = uvm_node
+        self.uvm_edge = uvm_edge
+        if hasattr(pdata,'x') and pdata.x is not None:
+            pdata.x = pdata.x.to(torch.float)
+            if uvm_node == False :
+                x = pdata.x.to(self.device)
+            else:
+                if self.device.type == 'cuda':
+                    x = starrygl.utils.uvm.uvm_empty(*pdata.x.size(),
+                                    dtype=pdata.x.dtype,
+                                    device=ctx.device)
+                    starrygl.utils.uvm.uvm_share(x,device = ctx.device)
+                    starrygl.utils.uvm.uvm_advise(x,starrygl.utils.uvm.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
+                    starrygl.utils.uvm.uvm_prefetch(x)
+            if world_size > 1:
+                self.x = DistributedTensor(pdata.x.to(self.device).to(torch.float))
+            else:
+                self.x = x
+        else:
+            self.x = None
+        if hasattr(pdata,'edge_attr') and pdata.edge_attr is not None:
+            ctx = DistributedContext.get_default_context()
+            pdata.edge_attr = pdata.edge_attr.to(torch.float)
+            if uvm_edge == False :
+                edge_attr = pdata.edge_attr.to(self.device)
+            else:
+                if self.device.type == 'cuda':
+                    edge_attr = starrygl.utils.uvm.uvm_empty(*pdata.edge_attr.size(),
+                                    dtype=pdata.edge_attr.dtype,
+                                    device=ctx.device)
+                    starrygl.utils.uvm.uvm_share(edge_attr,device = ctx.device)
+                    starrygl.utils.uvm.uvm_advise(edge_attr,starrygl.utils.uvm.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
+                    starrygl.utils.uvm.uvm_prefetch(edge_attr)
+            if world_size > 1:
+                self.edge_attr = DistributedTensor(edge_attr)
+            else:
+                self.edge_attr = edge_attr
+        else:
+            self.edge_attr = None
+    def _get_node_attr(self,ids,asyncOp = False):
+        '''
+        Retrieves node attributes for the specified node IDs.
+        Args:
+            ids: Node IDs for which to retrieve attributes.
+            asyncOp: If True, performs asynchronous operation for distributed data.
+        '''
+        if self.x is None:
+            return None
+        elif dist.get_world_size() == 1:
+            return self.x[ids]
+        else:
+            if self.x.rrefs is None or asyncOp is False:
+                ids = self.x.all_to_all_ind2ptr(ids)
+                return self.x.all_to_all_get(**ids)
+            return self.x.index_select(ids)
+    def _get_edge_attr(self,ids,asyncOp = False):
+        '''
+        Retrieves edge attributes for the specified edge IDs.
+        Args:
+            ids: Edge IDs for which to retrieve attributes.
+            asyncOp: If True, performs asynchronous operation for distributed data.
+        '''
+        if self.edge_attr is None:
+            return None
+        elif dist.get_world_size() == 1:
+            return self.edge_attr[ids]
+        else:
+            if self.edge_attr.rrefs is None or asyncOp is False:
+                ids = self.edge_attr.all_to_all_ind2ptr(ids)
+                return self.edge_attr.all_to_all_get(**ids)
+            return self.edge_attr.index_select(ids)
+    def _get_dist_index(self,ind,mapper):
+        '''
+        Retrieves the distributed index for the specified local index using the provided mapper.
+        Args:
+            ind: Local index for which to retrieve the distributed index.
+            mapper: Mapper providing the distributed index.
+        '''
+        return mapper[ind.to(mapper.device)]
+class DataSet:
+    '''
+    Args:
+        nodes: Tensor representing nodes. If not None, it is moved to the specified device.
+        edges: Tensor representing edges. If not None, it is moved to the specified device.
+        labels: Optional parameter for labels.
+        ts: Tensor representing timestamps. If not None, it is moved to the specified device.
+        device: Device to which tensors are moved (default is 'cuda').
+    '''
+    def __init__(self,nodes = None,
+                 edges = None,
+                 labels = None, 
+                 ts = None, 
+                 device = torch.device('cuda'),**kwargs):
+        if nodes is not None:
+            self.nodes = nodes.to(device)
+        if edges is not None:
+            self.edges = edges.to(device)
+        if ts is not None:
+            self.ts = ts.to(device)
+        if labels is not None:
+            self.labels = labels
+        self.len = self.nodes.shape[0] if nodes is not None else self.edges.shape[1] 
+        for k, v in kwargs.items():
+            assert isinstance(v,torch.Tensor) and v.shape[0]==self.len
+            setattr(self, k, v.to(device))
+    def _get_empty(self):
+        '''
+        Creates an empty dataset with the same device and data types as the current instance.
+        '''
+        nodes = torch.empty([],dtype = self.nodes.dtype,device= self.nodes.device)if hasattr(self,'nodes') else None
+        edges = torch.empty([[],[]],dtype = self.edges.dtype,device= self.edge.device)if hasattr(self,'edges') else None
+        d = DataSet(nodes,edges)
+        for k,v in self.__dict__.items():
+            if k == 'edges' or k=='nodes' or k == 'len':
+                continue
+            else:
+                setattr(d,k,torch.empty([]))
+        return d
+    #@staticmethod
+    def get_next(self,indx):
+        '''
+        Retrieves the next dataset based on the provided index.
+        Args:
+            indx: Index specifying the dataset to retrieve.
+        '''
+        nodes = self.nodes[indx] if hasattr(self,'nodes') else None
+        edges = self.edges[:,indx] if hasattr(self,'edges') else None
+        d = DataSet(nodes,edges)
+        for k,v in self.__dict__.items():
+            if k == 'edges' or k=='nodes' or k == 'len':
+                continue
+            else:
+                setattr(d,k,v[indx])
+        return d
+    #@staticmethod
+    def shuffle(self):
+        '''
+        Shuffles the dataset and returns a new dataset with the same attributes.
+        '''
+        indx = torch.randperm(self.len)
+        nodes = self.nodes[indx] if hasattr(self,'nodes') else None
+        edges = self.edges[:,indx] if hasattr(self,'edges') else None
+        d = DataSet(nodes,edges)
+        for k,v in self.__dict__.items():
+            if k == 'edges' or k=='nodes' or k == 'len':
+                continue
+            else:
+                setattr(d,k,v[indx])
+        return d
+class TemporalGraphData(DistributedGraphStore):
+    def __init__(self,pdata,device):
+        super(DistributedGraphData,self).__init__(pdata,device)
+    def _set_temporal_batch_cache(self,size,pin_size):
+        pass
+    def _load_feature_to_cuda(self,ids):
+        pass
+class TemporalNeighborSampleGraph(DistributedGraphStore):
+    '''
+    Args:
+        sample_graph: A dictionary containing graph structure information, including 'edge_index', 'ts' (edge timestamp), and 'eids' (edge identifiers).
+        mode: Specifies the dataset mode ('train', 'val', 'test', or 'full').
+        eids_mapper: Optional parameter for edge identifiers mapping.
+    '''
+    def __init__(self, sample_graph=None, mode='full', eids_mapper=None):
+        self.edge_index = sample_graph['edge_index']
+        self.num_edges = self.edge_index.shape[1]
+        if 'ts' in sample_graph:
+            self.edge_ts = sample_graph['ts']
+        else:
+            self.edge_ts = None
+        self.eid = sample_graph['eids']
+        if mode == 'train':
+            mask = sample_graph['train_mask']
+        if mode == 'val':
+            mask = sample_graph['val_mask']
+        if mode == 'test':
+            mask = sample_graph['test_mask']
+        if mode != 'full':
+            self.edge_index = self.edge_index[:, mask]
+            self.edge_ts = self.edge_ts[mask]
+            self.eid = self.eid[mask]  
--- a/.history/starrygl/sample/graph_core/__init___20240108195210.py
+++ b/.history/starrygl/sample/graph_core/__init___20240108195210.py
+import starrygl
+from starrygl.distributed.context import DistributedContext
+from starrygl.distributed.utils import DistIndex, DistributedTensor
+from starrygl.sample.graph_core.utils import build_mapper
+import os.path as osp
+import torch
+import torch.distributed as dist
+from torch_geometric.data import Data
+class DistributedGraphStore:
+    '''
+    Initializes the DistributedGraphStore with distributed graph data.
+    Args:
+        pdata: Graph data object containing ids, eids, edge_index, edge_ts, sample_graph, x, and edge_attr.
+        device: Device to which tensors are moved (default is 'cuda').
+        uvm_node: If True, enables Unified Virtual Memory (UVM) for node data.
+        uvm_edge: If True, enables Unified Virtual Memory (UVM) for edge data.
+    '''
+    def __init__(self, pdata, device = torch.device('cuda'),
+                 uvm_node = False, 
+                 uvm_edge = False):
+        self.device = device
+        self.ids = pdata.ids.to(device)
+        self.eids = pdata.eids
+        self.edge_index = pdata.edge_index.to(device)
+        if hasattr(pdata,'edge_ts'):
+            self.edge_ts = pdata.edge_ts.to(device).to(torch.float)
+        else:
+            self.edge_ts = None
+        self.sample_graph = pdata.sample_graph
+        self.nids_mapper = build_mapper(nids=pdata.ids.to(device)).dist.to('cpu')
+        self.eids_mapper = build_mapper(nids=pdata.eids.to(device)).dist.to('cpu')
+        torch.cuda.empty_cache()
+        self.num_nodes = self.nids_mapper.data.shape[0]
+        self.num_edges = self.eids_mapper.data.shape[0]
+        world_size = dist.get_world_size()
+        self.uvm_node = uvm_node
+        self.uvm_edge = uvm_edge
+        if hasattr(pdata,'x') and pdata.x is not None:
+            pdata.x = pdata.x.to(torch.float)
+            if uvm_node == False :
+                x = pdata.x.to(self.device)
+            else:
+                if self.device.type == 'cuda':
+                    x = starrygl.utils.uvm.uvm_empty(*pdata.x.size(),
+                                    dtype=pdata.x.dtype,
+                                    device=ctx.device)
+                    starrygl.utils.uvm.uvm_share(x,device = ctx.device)
+                    starrygl.utils.uvm.uvm_advise(x,starrygl.utils.uvm.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
+                    starrygl.utils.uvm.uvm_prefetch(x)
+            if world_size > 1:
+                self.x = DistributedTensor(pdata.x.to(self.device).to(torch.float))
+            else:
+                self.x = x
+        else:
+            self.x = None
+        if hasattr(pdata,'edge_attr') and pdata.edge_attr is not None:
+            ctx = DistributedContext.get_default_context()
+            pdata.edge_attr = pdata.edge_attr.to(torch.float)
+            if uvm_edge == False :
+                edge_attr = pdata.edge_attr.to(self.device)
+            else:
+                if self.device.type == 'cuda':
+                    edge_attr = starrygl.utils.uvm.uvm_empty(*pdata.edge_attr.size(),
+                                    dtype=pdata.edge_attr.dtype,
+                                    device=ctx.device)
+                    starrygl.utils.uvm.uvm_share(edge_attr,device = ctx.device)
+                    starrygl.utils.uvm.uvm_advise(edge_attr,starrygl.utils.uvm.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
+                    starrygl.utils.uvm.uvm_prefetch(edge_attr)
+            if world_size > 1:
+                self.edge_attr = DistributedTensor(edge_attr)
+            else:
+                self.edge_attr = edge_attr
+        else:
+            self.edge_attr = None
+    def _get_node_attr(self,ids,asyncOp = False):
+        '''
+        Retrieves node attributes for the specified node IDs.
+        Args:
+            ids: Node IDs for which to retrieve attributes.
+            asyncOp: If True, performs asynchronous operation for distributed data.
+        '''
+        if self.x is None:
+            return None
+        elif dist.get_world_size() == 1:
+            return self.x[ids]
+        else:
+            if self.x.rrefs is None or asyncOp is False:
+                ids = self.x.all_to_all_ind2ptr(ids)
+                return self.x.all_to_all_get(**ids)
+            return self.x.index_select(ids)
+    def _get_edge_attr(self,ids,asyncOp = False):
+        '''
+        Retrieves edge attributes for the specified edge IDs.
+        Args:
+            ids: Edge IDs for which to retrieve attributes.
+            asyncOp: If True, performs asynchronous operation for distributed data.
+        '''
+        if self.edge_attr is None:
+            return None
+        elif dist.get_world_size() == 1:
+            return self.edge_attr[ids]
+        else:
+            if self.edge_attr.rrefs is None or asyncOp is False:
+                ids = self.edge_attr.all_to_all_ind2ptr(ids)
+                return self.edge_attr.all_to_all_get(**ids)
+            return self.edge_attr.index_select(ids)
+    def _get_dist_index(self,ind,mapper):
+        '''
+        Retrieves the distributed index for the specified local index using the provided mapper.
+        Args:
+            ind: Local index for which to retrieve the distributed index.
+            mapper: Mapper providing the distributed index.
+        '''
+        return mapper[ind.to(mapper.device)]
+class DataSet:
+    '''
+    Container of aligned per-sample tensors (node ids or edge endpoints plus extra fields).
+    Args:
+        nodes: Tensor representing nodes. If not None, it is moved to the specified device.
+        edges: Tensor representing edges, indexed as ``edges[:, i]``. If not None, it is moved to the specified device.
+        labels: Optional parameter for labels. Stored as given (not moved to ``device``).
+        ts: Tensor representing timestamps. If not None, it is moved to the specified device.
+        device: Device to which tensors are moved (default is 'cuda').
+        **kwargs: Extra per-sample tensors whose first dimension equals the dataset length.
+    Raises:
+        ValueError: If neither ``nodes`` nor ``edges`` is given, or an extra field has the wrong length.
+        TypeError: If an extra field is not a tensor.
+    '''
+    # Attributes rebuilt explicitly when deriving a new DataSet; every other entry
+    # of __dict__ is treated as a per-sample field.
+    _STRUCT_KEYS = ('nodes', 'edges', 'len')
+    def __init__(self,nodes = None,
+                 edges = None,
+                 labels = None, 
+                 ts = None, 
+                 device = torch.device('cuda'),**kwargs):
+        if nodes is None and edges is None:
+            raise ValueError('DataSet needs at least one of `nodes` or `edges`')
+        if nodes is not None:
+            self.nodes = nodes.to(device)
+        if edges is not None:
+            self.edges = edges.to(device)
+        if ts is not None:
+            self.ts = ts.to(device)
+        if labels is not None:
+            # NOTE(review): labels stay on their original device; indexing them with a
+            # CUDA index in get_next() may fail if they live on CPU - confirm with callers.
+            self.labels = labels
+        self.len = self.nodes.shape[0] if nodes is not None else self.edges.shape[1] 
+        for k, v in kwargs.items():
+            if not isinstance(v,torch.Tensor):
+                raise TypeError('extra field {!r} must be a torch.Tensor'.format(k))
+            if v.shape[0] != self.len:
+                raise ValueError('extra field {!r} has length {}, expected {}'.format(k,v.shape[0],self.len))
+            setattr(self, k, v.to(device))
+    def _device(self):
+        '''Device of the structural tensor (nodes if present, otherwise edges).'''
+        return self.nodes.device if hasattr(self,'nodes') else self.edges.device
+    def _derive(self,nodes,edges,pick):
+        '''Builds a DataSet from nodes/edges and copies every other field through ``pick``.'''
+        # Reuse this dataset's device instead of the constructor default ('cuda'),
+        # so CPU datasets stay on CPU.
+        d = DataSet(nodes,edges,device = self._device())
+        for k,v in self.__dict__.items():
+            if k not in self._STRUCT_KEYS:
+                setattr(d,k,pick(v))
+        return d
+    def _get_empty(self):
+        '''
+        Creates an empty dataset with the same device and data types as the current instance.
+        '''
+        # Zero-length slices keep dtype, device and trailing dimensions; edges keep
+        # their leading dimension so ``edges.shape[1]`` is 0.
+        nodes = self.nodes[:0] if hasattr(self,'nodes') else None
+        edges = self.edges[:,:0] if hasattr(self,'edges') else None
+        return self._derive(nodes,edges,lambda v: v[:0])
+    #@staticmethod
+    def get_next(self,indx):
+        '''
+        Retrieves the sub-dataset selected by the provided index.
+        Args:
+            indx: Index (e.g. an index tensor or slice) applied to every per-sample field.
+        '''
+        nodes = self.nodes[indx] if hasattr(self,'nodes') else None
+        edges = self.edges[:,indx] if hasattr(self,'edges') else None
+        return self._derive(nodes,edges,lambda v: v[indx])
+    #@staticmethod
+    def shuffle(self):
+        '''
+        Shuffles the dataset and returns a new dataset with the same attributes.
+        '''
+        # Same permutation is applied to every field, so samples stay aligned.
+        return self.get_next(torch.randperm(self.len))
+class TemporalGraphData(DistributedGraphStore):
+    '''
+    DistributedGraphStore subclass with hooks for temporal batch caching and
+    feature loading; both hooks are currently no-ops.
+    Args:
+        pdata: Graph data object forwarded to DistributedGraphStore.
+        device: Device forwarded to DistributedGraphStore.
+    '''
+    def __init__(self,pdata,device):
+        # super(DistributedGraphStore, self) starts the lookup after the parent class,
+        # i.e. at object.__init__, which rejects the extra arguments and never
+        # initializes the store. Zero-argument super() reaches the parent.
+        super().__init__(pdata,device)
+    def _set_temporal_batch_cache(self,size,pin_size):
+        '''Placeholder; not implemented.'''
+        pass
+    def _load_feature_to_cuda(self,ids):
+        '''Placeholder; not implemented.'''
+        pass
+class TemporalNeighborSampleGraph(DistributedGraphStore):
+    '''
+    Edge list (with optional timestamps) used for temporal neighbor sampling.
+    Args:
+        sample_graph: A dictionary containing graph structure information, including 'edge_index', 'ts' (edge timestamp), and 'eids' (edge identifiers). For modes other than 'full' it must also hold the matching '<mode>_mask'.
+        mode: Specifies the dataset mode ('train', 'val', 'test', or 'full').
+        eids_mapper: Optional parameter for edge identifiers mapping (currently unused).
+    Raises:
+        TypeError: If ``sample_graph`` is None.
+        ValueError: If ``mode`` is not one of the supported values.
+    '''
+    _MASK_KEYS = {'train': 'train_mask', 'val': 'val_mask', 'test': 'test_mask'}
+    def __init__(self, sample_graph=None, mode='full', eids_mapper=None):
+        # NOTE(review): DistributedGraphStore.__init__ is intentionally not called here,
+        # matching the previous behavior - confirm that no parent state is needed.
+        if sample_graph is None:
+            raise TypeError('sample_graph is required')
+        if mode != 'full' and mode not in self._MASK_KEYS:
+            raise ValueError("mode must be 'train', 'val', 'test' or 'full', got {!r}".format(mode))
+        self.edge_index = sample_graph['edge_index']
+        # NOTE(review): num_edges is taken before the mode mask is applied, as before.
+        self.num_edges = self.edge_index.shape[1]
+        self.edge_ts = sample_graph['ts'] if 'ts' in sample_graph else None
+        self.eid = sample_graph['eids']
+        if mode != 'full':
+            mask = sample_graph[self._MASK_KEYS[mode]]
+            self.edge_index = self.edge_index[:, mask]
+            # Graphs without timestamps keep edge_ts = None instead of crashing.
+            if self.edge_ts is not None:
+                self.edge_ts = self.edge_ts[mask]
+            self.eid = self.eid[mask]
--- a/.history/starrygl/sample/graph_core/__init___20240108195211.py
+++ b/.history/starrygl/sample/graph_core/__init___20240108195211.py
+import starrygl
+from starrygl.distributed.context import DistributedContext
+from starrygl.distributed.utils import DistIndex, DistributedTensor
+from starrygl.sample.graph_core.utils import build_mapper
+import os.path as osp
+import torch
+import torch.distributed as dist
+from torch_geometric.data import Data
+class DistributedGraphStore:
+    '''
+    Initializes the DistributedGraphStore with distributed graph data.
+    Args:
+        pdata: Graph data object containing ids, eids, edge_index, sample_graph and, optionally, edge_ts, x, and edge_attr.
+        device: Device to which tensors are moved (default is 'cuda').
+        uvm_node: If True, enables Unified Virtual Memory (UVM) for node data (CUDA devices, single process only).
+        uvm_edge: If True, enables Unified Virtual Memory (UVM) for edge data (CUDA devices only).
+    '''
+    def __init__(self, pdata, device = torch.device('cuda'),
+                 uvm_node = False, 
+                 uvm_edge = False):
+        self.device = device
+        self.ids = pdata.ids.to(device)
+        # eids are stored as provided by pdata (not moved to `device`).
+        self.eids = pdata.eids
+        self.edge_index = pdata.edge_index.to(device)
+        if hasattr(pdata,'edge_ts'):
+            self.edge_ts = pdata.edge_ts.to(device).to(torch.float)
+        else:
+            self.edge_ts = None
+        self.sample_graph = pdata.sample_graph
+        # Mappers are built from ids on `device`, then kept on CPU.
+        self.nids_mapper = build_mapper(nids=pdata.ids.to(device)).dist.to('cpu')
+        self.eids_mapper = build_mapper(nids=pdata.eids.to(device)).dist.to('cpu')
+        torch.cuda.empty_cache()
+        self.num_nodes = self.nids_mapper.data.shape[0]
+        self.num_edges = self.eids_mapper.data.shape[0]
+        world_size = dist.get_world_size()
+        self.uvm_node = uvm_node
+        self.uvm_edge = uvm_edge
+        if hasattr(pdata,'x') and pdata.x is not None:
+            pdata.x = pdata.x.to(torch.float)
+            if world_size > 1:
+                # As before, the multi-process path wraps the plain device copy; the
+                # UVM buffer was never used here, so it is no longer allocated.
+                self.x = DistributedTensor(pdata.x.to(self.device))
+            else:
+                self.x = self._place_feature(pdata.x,uvm_node)
+        else:
+            self.x = None
+        if hasattr(pdata,'edge_attr') and pdata.edge_attr is not None:
+            pdata.edge_attr = pdata.edge_attr.to(torch.float)
+            edge_attr = self._place_feature(pdata.edge_attr,uvm_edge)
+            if world_size > 1:
+                self.edge_attr = DistributedTensor(edge_attr)
+            else:
+                self.edge_attr = edge_attr
+        else:
+            self.edge_attr = None
+    def _place_feature(self,feat,use_uvm):
+        '''
+        Returns ``feat`` on ``self.device``, or a UVM buffer of the same shape and dtype
+        when UVM is requested and the device is CUDA.
+        '''
+        # Without UVM, or on a non-CUDA device, fall back to a plain device copy so the
+        # result is always defined.
+        if not use_uvm or torch.device(self.device).type != 'cuda':
+            return feat.to(self.device)
+        # The context is fetched here so both node and edge UVM paths have it
+        # (the node path previously referenced `ctx` before it was assigned).
+        ctx = DistributedContext.get_default_context()
+        uvm = starrygl.utils.uvm
+        buf = uvm.uvm_empty(*feat.size(),
+                        dtype=feat.dtype,
+                        device=ctx.device)
+        uvm.uvm_share(buf,device = ctx.device)
+        uvm.uvm_advise(buf,uvm.cudaMemoryAdvise.cudaMemAdviseSetAccessedBy)
+        uvm.uvm_prefetch(buf)
+        # NOTE(review): nothing here copies `feat` into `buf`; confirm whether the UVM
+        # buffer is expected to be filled elsewhere.
+        return buf
+    def _lookup(self,feat,ids,asyncOp):
+        '''Shared lookup used by _get_node_attr and _get_edge_attr.'''
+        if feat is None:
+            return None
+        if dist.get_world_size() == 1:
+            return feat[ids]
+        if feat.rrefs is None or asyncOp is False:
+            ptr = feat.all_to_all_ind2ptr(ids)
+            return feat.all_to_all_get(**ptr)
+        return feat.index_select(ids)
+    def _get_node_attr(self,ids,asyncOp = False):
+        '''
+        Retrieves node attributes for the specified node IDs.
+        Args:
+            ids: Node IDs for which to retrieve attributes.
+            asyncOp: If True, performs asynchronous operation for distributed data.
+        Returns:
+            None when no node features are stored, otherwise the selected features.
+        '''
+        return self._lookup(self.x,ids,asyncOp)
+    def _get_edge_attr(self,ids,asyncOp = False):
+        '''
+        Retrieves edge attributes for the specified edge IDs.
+        Args:
+            ids: Edge IDs for which to retrieve attributes.
+            asyncOp: If True, performs asynchronous operation for distributed data.
+        Returns:
+            None when no edge features are stored, otherwise the selected features.
+        '''
+        return self._lookup(self.edge_attr,ids,asyncOp)
+    def _get_dist_index(self,ind,mapper):
+        '''
+        Retrieves the distributed index for the specified local index using the provided mapper.
+        Args:
+            ind: Local index for which to retrieve the distributed index.
+            mapper: Mapper providing the distributed index.
+        '''
+        return mapper[ind.to(mapper.device)]
+class DataSet:
+    '''
+    Container of aligned per-sample tensors (node ids or edge endpoints plus extra fields).
+    Args:
+        nodes: Tensor representing nodes. If not None, it is moved to the specified device.
+        edges: Tensor representing edges, indexed as ``edges[:, i]``. If not None, it is moved to the specified device.
+        labels: Optional parameter for labels. Stored as given (not moved to ``device``).
+        ts: Tensor representing timestamps. If not None, it is moved to the specified device.
+        device: Device to which tensors are moved (default is 'cuda').
+        **kwargs: Extra per-sample tensors whose first dimension equals the dataset length.
+    Raises:
+        ValueError: If neither ``nodes`` nor ``edges`` is given, or an extra field has the wrong length.
+        TypeError: If an extra field is not a tensor.
+    '''
+    # Attributes rebuilt explicitly when deriving a new DataSet; every other entry
+    # of __dict__ is treated as a per-sample field.
+    _STRUCT_KEYS = ('nodes', 'edges', 'len')
+    def __init__(self,nodes = None,
+                 edges = None,
+                 labels = None, 
+                 ts = None, 
+                 device = torch.device('cuda'),**kwargs):
+        if nodes is None and edges is None:
+            raise ValueError('DataSet needs at least one of `nodes` or `edges`')
+        if nodes is not None:
+            self.nodes = nodes.to(device)
+        if edges is not None:
+            self.edges = edges.to(device)
+        if ts is not None:
+            self.ts = ts.to(device)
+        if labels is not None:
+            # NOTE(review): labels stay on their original device; indexing them with a
+            # CUDA index in get_next() may fail if they live on CPU - confirm with callers.
+            self.labels = labels
+        self.len = self.nodes.shape[0] if nodes is not None else self.edges.shape[1] 
+        for k, v in kwargs.items():
+            if not isinstance(v,torch.Tensor):
+                raise TypeError('extra field {!r} must be a torch.Tensor'.format(k))
+            if v.shape[0] != self.len:
+                raise ValueError('extra field {!r} has length {}, expected {}'.format(k,v.shape[0],self.len))
+            setattr(self, k, v.to(device))
+    def _device(self):
+        '''Device of the structural tensor (nodes if present, otherwise edges).'''
+        return self.nodes.device if hasattr(self,'nodes') else self.edges.device
+    def _derive(self,nodes,edges,pick):
+        '''Builds a DataSet from nodes/edges and copies every other field through ``pick``.'''
+        # Reuse this dataset's device instead of the constructor default ('cuda'),
+        # so CPU datasets stay on CPU.
+        d = DataSet(nodes,edges,device = self._device())
+        for k,v in self.__dict__.items():
+            if k not in self._STRUCT_KEYS:
+                setattr(d,k,pick(v))
+        return d
+    def _get_empty(self):
+        '''
+        Creates an empty dataset with the same device and data types as the current instance.
+        '''
+        # Zero-length slices keep dtype, device and trailing dimensions; edges keep
+        # their leading dimension so ``edges.shape[1]`` is 0.
+        nodes = self.nodes[:0] if hasattr(self,'nodes') else None
+        edges = self.edges[:,:0] if hasattr(self,'edges') else None
+        return self._derive(nodes,edges,lambda v: v[:0])
+    #@staticmethod
+    def get_next(self,indx):
+        '''
+        Retrieves the sub-dataset selected by the provided index.
+        Args:
+            indx: Index (e.g. an index tensor or slice) applied to every per-sample field.
+        '''
+        nodes = self.nodes[indx] if hasattr(self,'nodes') else None
+        edges = self.edges[:,indx] if hasattr(self,'edges') else None
+        return self._derive(nodes,edges,lambda v: v[indx])
+    #@staticmethod
+    def shuffle(self):
+        '''
+        Shuffles the dataset and returns a new dataset with the same attributes.
+        '''
+        # Same permutation is applied to every field, so samples stay aligned.
+        return self.get_next(torch.randperm(self.len))
+class TemporalGraphData(DistributedGraphStore):
+    '''
+    DistributedGraphStore subclass with hooks for temporal batch caching and
+    feature loading; both hooks are currently no-ops.
+    Args:
+        pdata: Graph data object forwarded to DistributedGraphStore.
+        device: Device forwarded to DistributedGraphStore.
+    '''
+    def __init__(self,pdata,device):
+        # super(DistributedGraphStore, self) starts the lookup after the parent class,
+        # i.e. at object.__init__, which rejects the extra arguments and never
+        # initializes the store. Zero-argument super() reaches the parent.
+        super().__init__(pdata,device)
+    def _set_temporal_batch_cache(self,size,pin_size):
+        '''Placeholder; not implemented.'''
+        pass
+    def _load_feature_to_cuda(self,ids):
+        '''Placeholder; not implemented.'''
+        pass
+class TemporalNeighborSampleGraph(DistributedGraphStore):
+    '''
+    Edge list (with optional timestamps) used for temporal neighbor sampling.
+    Args:
+        sample_graph: A dictionary containing graph structure information, including 'edge_index', 'ts' (edge timestamp), and 'eids' (edge identifiers). For modes other than 'full' it must also hold the matching '<mode>_mask'.
+        mode: Specifies the dataset mode ('train', 'val', 'test', or 'full').
+        eids_mapper: Optional parameter for edge identifiers mapping (currently unused).
+    Raises:
+        TypeError: If ``sample_graph`` is None.
+        ValueError: If ``mode`` is not one of the supported values.
+    '''
+    _MASK_KEYS = {'train': 'train_mask', 'val': 'val_mask', 'test': 'test_mask'}
+    def __init__(self, sample_graph=None, mode='full', eids_mapper=None):
+        # NOTE(review): DistributedGraphStore.__init__ is intentionally not called here,
+        # matching the previous behavior - confirm that no parent state is needed.
+        if sample_graph is None:
+            raise TypeError('sample_graph is required')
+        if mode != 'full' and mode not in self._MASK_KEYS:
+            raise ValueError("mode must be 'train', 'val', 'test' or 'full', got {!r}".format(mode))
+        self.edge_index = sample_graph['edge_index']
+        # NOTE(review): num_edges is taken before the mode mask is applied, as before.
+        self.num_edges = self.edge_index.shape[1]
+        self.edge_ts = sample_graph['ts'] if 'ts' in sample_graph else None
+        self.eid = sample_graph['eids']
+        if mode != 'full':
+            mask = sample_graph[self._MASK_KEYS[mode]]
+            self.edge_index = self.edge_index[:, mask]
+            # Graphs without timestamps keep edge_ts = None instead of crashing.
+            if self.edge_ts is not None:
+                self.edge_ts = self.edge_ts[mask]
+            self.eid = self.eid[mask]
--- a/.history/starrygl/sample/memory/shared_mailbox_20240108194336.py
+++ b/.history/starrygl/sample/memory/shared_mailbox_20240108194336.py
--- a/.history/starrygl/sample/part_utils/partition_tgnn_20240108192330.py
+++ b/.history/starrygl/sample/part_utils/partition_tgnn_20240108192330.py
--- a/.history/starrygl/sample/part_utils/partition_tgnn_20240108194351.py
+++ b/.history/starrygl/sample/part_utils/partition_tgnn_20240108194351.py
--- a/.history/starrygl/sample/sample_core/neighbor_sampler_20240108194422.py
+++ b/.history/starrygl/sample/sample_core/neighbor_sampler_20240108194422.py
--- a/.history/train_tgnn_20240108192330.py
+++ b/.history/train_tgnn_20240108192330.py
--- a/.history/train_tgnn_20240108192928.py
+++ b/.history/train_tgnn_20240108192928.py
--- a/.history/train_tgnn_20240108192929.py
+++ b/.history/train_tgnn_20240108192929.py
--- a/.history/train_tgnn_20240108193157.py
+++ b/.history/train_tgnn_20240108193157.py
--- a/.history/train_tgnn_20240108193302.py
+++ b/.history/train_tgnn_20240108193302.py
--- a/.history/train_tgnn_20240108193340.py
+++ b/.history/train_tgnn_20240108193340.py
--- a/.history/train_tgnn_20240108193341.py
+++ b/.history/train_tgnn_20240108193341.py
--- a/.history/train_tgnn_20240108193345.py
+++ b/.history/train_tgnn_20240108193345.py
--- a/.history/train_tgnn_20240108193352.py
+++ b/.history/train_tgnn_20240108193352.py
--- a/.history/train_tgnn_20240108193417.py
+++ b/.history/train_tgnn_20240108193417.py
--- a/.history/train_tgnn_20240108193418.py
+++ b/.history/train_tgnn_20240108193418.py
--- a/.history/train_tgnn_20240108193419.py
+++ b/.history/train_tgnn_20240108193419.py
--- a/.history/train_tgnn_20240108193510.py
+++ b/.history/train_tgnn_20240108193510.py
--- a/.history/train_tgnn_20240108193517.py
+++ b/.history/train_tgnn_20240108193517.py
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -110,19 +110,6 @@ if(WITH_MTMETIS)
    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_PARTITIONS)
 endif()
-if (WITH_LDG)
-    # Imports neighbor-clustering based (e.g. LDG algorithm) graph partitioning implementation
-    add_definitions(-DWITH_LDG)
-    set(LDG_DIR "third_party/ldg_partition")
-    add_library(ldg_partition SHARED "csrc/partition/ldg.cpp")
-    target_link_libraries(ldg_partition PRIVATE ${TORCH_LIBRARIES})
-    add_subdirectory(${LDG_DIR})
-    target_include_directories(ldg_partition PRIVATE ${LDG_DIR})
-    target_link_libraries(ldg_partition PRIVATE ldg-vertex-partition)
-endif ()
 include_directories("csrc/include")
 add_library(${PROJECT_NAME} SHARED csrc/export.cpp)

--- a/CMakeLists_.txt
+++ b/CMakeLists_.txt
+cmake_minimum_required(VERSION 3.15)
+project(starrygl VERSION 0.1)
+option(WITH_PYTHON "Link to Python when building" ON)
+option(WITH_CUDA "Link to CUDA when building" ON)
+option(WITH_METIS "Link to METIS when building" ON)
+option(WITH_MTMETIS "Link to multi-threaded METIS when building" ON)
+option(WITH_LDG "Link to (multi-threaded optionally) LDG when building" ON)
+message("third_party dir is ${CMAKE_SOURCE_DIR}")
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+find_package(OpenMP REQUIRED)
+link_libraries(OpenMP::OpenMP_CXX)
+find_package(Torch REQUIRED)
+include_directories(${TORCH_INCLUDE_DIRS})
+add_compile_options(${TORCH_CXX_FLAGS})
+if(WITH_PYTHON)
+    add_definitions(-DWITH_PYTHON)
+    find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
+    include_directories(${Python3_INCLUDE_DIRS})
+endif()
+if(WITH_CUDA)
+    add_definitions(-DWITH_CUDA)
+    add_definitions(-DWITH_UVM)
+    find_package(CUDA REQUIRED)
+    include_directories(${CUDA_INCLUDE_DIRS})
+    set(CUDA_LIBRARIES "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcudart.so")
+    file(GLOB_RECURSE UVM_SRCS "csrc/uvm/*.cpp")
+    add_library(uvm_ops SHARED ${UVM_SRCS})
+    target_link_libraries(uvm_ops PRIVATE ${TORCH_LIBRARIES})
+endif()
+if(WITH_METIS)
+    add_definitions(-DWITH_METIS)
+    set(GKLIB_DIR "${CMAKE_SOURCE_DIR}/third_party/GKlib")
+    set(METIS_DIR "${CMAKE_SOURCE_DIR}/third_party/METIS")
+    set(GKLIB_INCLUDE_DIRS "${GKLIB_DIR}/include")
+    file(GLOB_RECURSE GKLIB_LIBRARIES "${GKLIB_DIR}/lib/lib*.a")
+    set(METIS_INCLUDE_DIRS "${METIS_DIR}/include")
+    file(GLOB_RECURSE METIS_LIBRARIES "${METIS_DIR}/lib/lib*.a")
+    include_directories(${METIS_INCLUDE_DIRS})
+    add_library(metis_partition SHARED "csrc/partition/metis.cpp")
+    target_link_libraries(metis_partition PRIVATE ${TORCH_LIBRARIES})
+    target_link_libraries(metis_partition PRIVATE ${GKLIB_LIBRARIES})
+    target_link_libraries(metis_partition PRIVATE ${METIS_LIBRARIES})
+endif()
+if(WITH_MTMETIS)
+    add_definitions(-DWITH_MTMETIS)
+    set(MTMETIS_DIR "${CMAKE_SOURCE_DIR}/third_party/mt-metis")
+    set(MTMETIS_INCLUDE_DIRS "${MTMETIS_DIR}/include")
+    file(GLOB_RECURSE MTMETIS_LIBRARIES "${MTMETIS_DIR}/lib/lib*.a")
+    include_directories(${MTMETIS_INCLUDE_DIRS})
+    add_library(mtmetis_partition SHARED "csrc/partition/mtmetis.cpp")
+    target_link_libraries(mtmetis_partition PRIVATE ${TORCH_LIBRARIES})
+    target_link_libraries(mtmetis_partition PRIVATE ${MTMETIS_LIBRARIES})
+    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_VERTICES)
+    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_EDGES)
+    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_WEIGHTS)
+    target_compile_definitions(mtmetis_partition PRIVATE -DMTMETIS_64BIT_PARTITIONS)
+endif()
+if (WITH_LDG)
+    # Imports neighbor-clustering based (e.g. LDG algorithm) graph partitioning implementation
+    add_definitions(-DWITH_LDG)
+    # set(LDG_DIR "csrc/partition/neighbor_clustering")
+    set(LDG_DIR "third_party/ldg_partition")
+    add_library(ldg_partition SHARED "csrc/partition/ldg.cpp")
+    target_link_libraries(ldg_partition PRIVATE ${TORCH_LIBRARIES})
+    # add_subdirectory(${LDG_DIR})
+    target_include_directories(ldg_partition PRIVATE ${LDG_DIR})
+    target_link_libraries(ldg_partition PRIVATE ldg-vertex-partition)
+endif ()
+include_directories("csrc/include")
+add_library(${PROJECT_NAME} SHARED csrc/export.cpp)
+target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_LIBRARIES})
+target_compile_definitions(${PROJECT_NAME} PRIVATE -DTORCH_EXTENSION_NAME=lib${PROJECT_NAME})
+if(WITH_PYTHON)
+    find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
+    target_link_libraries(${PROJECT_NAME} PRIVATE ${TORCH_PYTHON_LIBRARY})
+endif()
+if (WITH_CUDA)
+    target_link_libraries(${PROJECT_NAME} PRIVATE uvm_ops)
+endif()
+if (WITH_METIS)
+    message(STATUS "Current project '${PROJECT_NAME}' uses METIS graph partitioning algorithm.")
+    target_link_libraries(${PROJECT_NAME} PRIVATE metis_partition)
+endif()
+if (WITH_MTMETIS)
+    message(STATUS "Current project '${PROJECT_NAME}' uses multi-threaded METIS graph partitioning algorithm.")
+    target_link_libraries(${PROJECT_NAME} PRIVATE mtmetis_partition)
+endif()
+if (WITH_LDG)
+    message(STATUS "Current project '${PROJECT_NAME}' uses LDG graph partitioning algorithm.")
+    target_link_libraries(${PROJECT_NAME} PRIVATE ldg_partition)
+endif()
+# add libsampler.so
+set(SAMLPER_NAME "${PROJECT_NAME}_sampler")
+set(BOOST_INCLUDE_DIRS "${CMAKE_SOURCE_DIR}/third_party/boost_1_83_0")
+include_directories(${BOOST_INCLUDE_DIRS})
+file(GLOB_RECURSE SAMPLER_SRCS "csrc/sampler/*.cpp")
+add_library(${SAMLPER_NAME} SHARED ${SAMPLER_SRCS})
+target_include_directories(${SAMLPER_NAME} PRIVATE "csrc/sampler/include")
+target_compile_options(${SAMLPER_NAME} PRIVATE -O3)
+target_link_libraries(${SAMLPER_NAME} PRIVATE ${TORCH_LIBRARIES})
+target_compile_definitions(${SAMLPER_NAME} PRIVATE -DTORCH_EXTENSION_NAME=lib${SAMLPER_NAME})
+if(WITH_PYTHON)
+    find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
+    target_link_libraries(${SAMLPER_NAME} PRIVATE ${TORCH_PYTHON_LIBRARY})
+endif()
--- a/config/DyRep.yml
+++ b/config/DyRep.yml
+sampling:
+  - layer: 1
+    neighbor: 
+      - 10
+    strategy: 'recent'
+    prop_time: False
+    history: 1
+    duration: 0
+    num_thread: 32
+memory: 
+  - type: 'node'
+    dim_time: 100
+    deliver_to: 'self'
+    mail_combine: 'last'
+    memory_update: 'gru'
+    mailbox_size: 1
+    combine_node_feature: True
+    dim_out: 100
+gnn:
+  - arch: 'transformer_attention'
+    use_src_emb: True
+    use_dst_emb: True
+    layer: 1
+    att_head: 2
+    dim_time: 100
+    dim_out: 100
+train:
+  - epoch: 50
+    batch_size: 100
+    # reorder: 16
+    lr: 0.0001
+    dropout: 0.1
+    att_dropout: 0.2
+    all_on_gpu: True
\ No newline at end of file
--- a/config/TGN.yml
+++ b/config/TGN.yml
@@ -18,13 +18,15 @@ memory:
    dim_out: 100
 gnn:
  - arch: 'transformer_attention'
+    use_src_emb: False
+    use_dst_emb: False
    layer: 1
    att_head: 2
    dim_time: 100
    dim_out: 100
 train:
-  - epoch: 5
+  - epoch: 20
-    #batch_size: 100
+    batch_size: 200
    # reorder: 16
    lr: 0.0001
    dropout: 0.2

--- a/config/TIGE.yml
+++ b/config/TIGE.yml
+sampling:
+  - layer: 1
+    neighbor: 
+      - 10
+    strategy: 'recent'
+    prop_time: False
+    history: 1
+    duration: 0
+    num_thread: 32
+memory: 
+  - type: 'node'
+    dim_time: 100
+    deliver_to: 'self'
+    mail_combine: 'last'
+    memory_update: 'gru'
+    mailbox_size: 1
+    combine_node_feature: True
+    dim_out: 100
+gnn:
+  - arch: 'transformer_attention'
+    use_src_emb: True
+    use_dst_emb: True
+    layer: 1
+    att_head: 2
+    dim_time: 100
+    dim_out: 100
+train:
+  - epoch: 20
+    batch_size: 200
+    # reorder: 16
+    lr: 0.0001
+    dropout: 0.2
+    att_dropout: 0.2
+    all_on_gpu: True
\ No newline at end of file
--- a/csrc/export.cpp
+++ b/csrc/export.cpp
@@ -5,6 +5,7 @@
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    #ifdef WITH_CUDA
+    #ifdef WITH_CUDA
    m.def("uvm_storage_new", &uvm_storage_new, "return storage of unified virtual memory");
    m.def("uvm_storage_to_cuda", &uvm_storage_to_cuda, "share uvm storage with another cuda device");
    m.def("uvm_storage_to_cpu", &uvm_storage_to_cpu, "share uvm storage with cpu");

--- a/data_maker.py
+++ b/data_maker.py
@@ -114,11 +114,15 @@ edge_weight_dict = {}
 edge_weight_dict['edata'] = 2*neg_nums
 edge_weight_dict['sample_data'] = 1*neg_nums
 edge_weight_dict['neg_data'] = 1
-partition_save('./dataset/here/'+data_name, data, 1, 'metis_for_tgnn',
+#partition_save('./dataset/here/'+data_name, data, 1, 'metis_for_tgnn',
-               edge_weight_dict=edge_weight_dict)
+#               edge_weight_dict=edge_weight_dict)
-partition_save('./dataset/here/'+data_name, data, 2, 'metis_for_tgnn',
+#partition_save('./dataset/here/'+data_name, data, 2, 'metis_for_tgnn',
-               edge_weight_dict=edge_weight_dict)
+#               edge_weight_dict=edge_weight_dict)
-partition_save('./dataset/here/'+data_name, data, 4, 'metis_for_tgnn',
+#partition_save('./dataset/here/'+data_name, data, 4, 'metis_for_tgnn',
+#               edge_weight_dict=edge_weight_dict)
+#partition_save('./dataset/here/'+data_name, data, 8, 'metis_for_tgnn',
+#               edge_weight_dict=edge_weight_dict)
+partition_save('./dataset/here/'+data_name, data, 16, 'metis_for_tgnn',
               edge_weight_dict=edge_weight_dict)
 #
 # partition_save('./dataset/here/'+data_name, data, 4, 'metis_for_tgnn',

--- a/docs/source/advanced/data_proc.rst
+++ b/docs/source/advanced/data_proc.rst
+Advanced Data Preprocessing
+===========================
+.. note::
+    详细介绍一下StarryGL几种数据管理类，例如GraphData，的使用细节，内部索引结构的设计和底层操作。
\ No newline at end of file
--- a/docs/source/advanced/index.rst
+++ b/docs/source/advanced/index.rst
--- a/docs/source/advanced/pp_training.rst
+++ b/docs/source/advanced/pp_training.rst
+Distributed Partition Parallel
+==============================
+.. note::
+    分布式分区并行训练部分
\ No newline at end of file
--- a/docs/source/advanced/tp_training.rst
+++ b/docs/source/advanced/tp_training.rst
+Distributed Timeline Parallel
+=============================
+.. note::
+    分布式时序并行
\ No newline at end of file
--- a/docs/source/advanced/ts_sampling.rst
+++ b/docs/source/advanced/ts_sampling.rst
+Distributed Temporal Sampling
+=============================
+.. note::
+    基于分布式时序图采样的训练模式
\ No newline at end of file
--- a/docs/source/api/python/index.rst
+++ b/docs/source/api/python/index.rst
--- a/requirements.txt
+++ b/requirements.txt
--- a/starrygl/distributed/utils.py
+++ b/starrygl/distributed/utils.py
@@ -294,7 +294,7 @@ class DistributedTensor:
        index = dist_index.loc
        futs: List[torch.futures.Future] = []
-        for i in range(self.num_parts()):
+        for i in range(self.num_parts):
            mask = part_idx == i
            f = self.accessor.async_index_copy_(0, index[mask], source[mask], self.rrefs[i])
            futs.append(f)
@@ -308,7 +308,7 @@ class DistributedTensor:
        index = dist_index.loc
        futs: List[torch.futures.Future] = []
-        for i in range(self.num_parts()):
+        for i in range(self.num_parts):
            mask = part_idx == i
            f = self.accessor.async_index_add_(0, index[mask], source[mask], self.rrefs[i])
            futs.append(f)

--- a/starrygl/evaluation/evaluate.py
+++ b/starrygl/evaluation/evaluate.py
--- a/starrygl/module/modules.py
+++ b/starrygl/module/modules.py
@@ -68,8 +68,14 @@ class GeneralModel(torch.nn.Module):
            out = torch.stack(out, dim=0)
            out = self.combiner(out)[0][-1, :, :]
        #metadata需要在前面去重的时候记一下id
+        if self.gnn_param['use_src_emb'] or self.gnn_param['use_dst_emb']:
+            self.embedding = out.detach().clone()
+        else:
+            self.embedding = None
        if metadata is not None:
            #out = torch.cat((out[metadata['dst_pos_pos']],out[metadata['src_id_pos']],out[metadata['dst_neg_pos']]),0)
+            if self.gnn_param['dyrep']:
+                out = self.memory_updater.last_updated_memory
            out = torch.cat((out[metadata['src_pos_index']],out[metadata['dst_pos_index']],out[metadata['src_neg_index']]),0)
        return self.edge_predictor(out, neg_samples=neg_samples)

--- a/starrygl/module/utils.py
+++ b/starrygl/module/utils.py
 import yaml
+import numpy as np
 def parse_config(f):
    conf = yaml.safe_load(open(f, 'r'))
@@ -8,3 +8,31 @@ def parse_config(f):
    gnn_param = conf['gnn'][0]
    train_param = conf['train'][0]
    return sample_param, memory_param, gnn_param, train_param
+class EarlyStopMonitor(object):
+    '''
+    Tracks a validation metric across epochs and signals when training should stop.
+    Args:
+        max_round: Number of consecutive non-improving checks after which early_stop_check returns True.
+        higher_better: If False, the metric is negated so that lower raw values count as better.
+        tolerance: Minimum relative improvement over the best value needed to count as progress.
+    '''
+    def __init__(self, max_round=3, higher_better=True, tolerance=1e-10):
+        self.max_round = max_round
+        self.num_round = 0
+        self.epoch_count = 0
+        self.best_epoch = 0
+        self.last_best = None
+        self.higher_better = higher_better
+        self.tolerance = tolerance
+    def early_stop_check(self, curr_val):
+        '''
+        Records the metric of the current epoch.
+        Args:
+            curr_val: Metric value for the epoch that just finished.
+        Returns:
+            True once ``max_round`` consecutive checks brought no improvement.
+        '''
+        if not self.higher_better:
+            # Negate into a new value; an in-place ``*=`` would modify a caller-owned
+            # array or tensor.
+            curr_val = -curr_val
+        if self.last_best is None:
+            self.last_best = curr_val
+        # Same relative test as (curr - best) / |best| > tolerance, written without the
+        # division so a best value of 0 does not divide by zero.
+        elif curr_val - self.last_best > self.tolerance * abs(self.last_best):
+            self.last_best = curr_val
+            self.num_round = 0
+            self.best_epoch = self.epoch_count
+        else:
+            self.num_round += 1
+        self.epoch_count += 1
+        return self.num_round >= self.max_round
\ No newline at end of file
--- a/starrygl/sample/graph_core/__init__.py
+++ b/starrygl/sample/graph_core/__init__.py
@@ -166,6 +166,7 @@ class DataSet:
        if labels is not None:
            self.labels = labels
        self.len = self.nodes.shape[0] if nodes is not None else self.edges.shape[1] 
        for k, v in kwargs.items():
            assert isinstance(v,torch.Tensor) and v.shape[0]==self.len
            setattr(self, k, v.to(device))
@@ -222,7 +223,7 @@ class DataSet:
 class TemporalGraphData(DistributedGraphStore):
    def __init__(self,pdata,device):
-        super(TemporalGraphData,self).__init__(pdata,device)
+        # Zero-argument super() resolves to DistributedGraphStore.__init__; naming the
+        # parent class in super() would skip it and call object.__init__ with extra arguments.
+        super().__init__(pdata,device)
    def _set_temporal_batch_cache(self,size,pin_size):
        pass
    def _load_feature_to_cuda(self,ids):

--- a/starrygl/sample/memory/shared_mailbox.py
+++ b/starrygl/sample/memory/shared_mailbox.py
@@ -299,7 +299,7 @@ class SharedMailBox():
    def get_update_mail(self,dist_indx_mapper,
                 src,dst,ts,edge_feats,
-                 memory):
+                 memory,embedding=None,use_src_emb=False,use_dst_emb=False):
        if edge_feats is not None:
            edge_feats = edge_feats.to(self.device).to(self.mailbox.dtype)
        src = src.to(self.device)
@@ -309,12 +309,14 @@ class SharedMailBox():
        mem_src = memory[src]
        mem_dst = memory[dst]
+        if embedding is not None:
+            emb_src = embedding[src]
+            emb_dst = embedding[dst]
+        src_mail = torch.cat([emb_src if use_src_emb else mem_src, emb_dst if use_dst_emb else mem_dst], dim=1)
+        dst_mail = torch.cat([emb_dst if use_src_emb else mem_dst, emb_src if use_dst_emb else mem_src], dim=1)
        if edge_feats is not None:
-            src_mail = torch.cat([mem_src, mem_dst, edge_feats], dim=1)
+            src_mail = torch.cat([src_mail, edge_feats], dim=1)
-            dst_mail = torch.cat([mem_dst, mem_src, edge_feats], dim=1)
+            dst_mail = torch.cat([dst_mail, edge_feats], dim=1)
-        else:
-            src_mail = torch.cat([mem_src, mem_dst], dim=1)
-            dst_mail = torch.cat([mem_dst, mem_src], dim=1)
        mail = torch.cat([src_mail, dst_mail], dim=1).reshape(-1, src_mail.shape[1])
        mail_ts = torch.cat((ts,ts),-1).to(self.device).to(self.mailbox_ts.dtype)
        unq_index,inv = torch.unique(index,return_inverse = True)
@@ -324,7 +326,6 @@ class SharedMailBox():
        index = unq_index
        return index,mail,mail_ts
    def get_update_memory(self,index,memory,memory_ts):
        unq_index,inv = torch.unique(index,return_inverse = True)
        max_ts,idx = torch_scatter.scatter_max(memory_ts,inv,0)

--- a/starrygl/sample/part_utils/partition_tgnn.py
+++ b/starrygl/sample/part_utils/partition_tgnn.py
 from torch_sparse import SparseTensor
 from torch_geometric.data import Data
 from torch_geometric.utils import degree
 import os.path as osp
 import os
 import shutil

--- a/train_tgnn.py
+++ b/train_tgnn.py
@@ -5,10 +5,14 @@ from os.path import abspath, join, dirname
 from starrygl.distributed.context import DistributedContext
 from starrygl.distributed.utils import DistIndex
 from starrygl.module.modules import GeneralModel
+from pathlib import Path
 from starrygl.module.utils import parse_config
 from starrygl.sample.cache.fetch_cache import FetchFeatureCache
 from starrygl.sample.graph_core import DataSet, DistributedGraphStore, TemporalNeighborSampleGraph
+from starrygl.module.utils import parse_config, EarlyStopMonitor
+from starrygl.sample.graph_core import DataSet, DistributedGraphStore, TemporalNeighborSampleGraph
 from starrygl.sample.memory.shared_mailbox import SharedMailBox
 from starrygl.sample.sample_core.base import NegativeSampling
 from starrygl.sample.sample_core.neighbor_sampler import NeighborSampler
@@ -34,10 +38,13 @@ parser = argparse.ArgumentParser(
 )
 parser.add_argument('--rank', default=0, type=int, metavar='W',
                    help='name of dataset')
+parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping')
 parser.add_argument('--world_size', default=1, type=int, metavar='W',
                    help='number of negative samples')
 parser.add_argument('--dataname', default=1, type=str, metavar='W',
-                    help='number of negative samples')
+                    help='name of dataset')
+parser.add_argument('--model', default='TGN', type=str, metavar='W',
+                    help='name of model')
 args = parser.parse_args()
 from sklearn.metrics import average_precision_score, roc_auc_score
 import torch
@@ -66,7 +73,7 @@ seed_everything(1234)
 def main():   
    print('main')
    use_cuda = True
-    sample_param, memory_param, gnn_param, train_param = parse_config('./config/TGN.yml')
+    sample_param, memory_param, gnn_param, train_param = parse_config('./config/{}.yml'.format(args.model))
    torch.set_num_threads(12)
    ctx = DistributedContext.init(backend="nccl", use_gpu=True)
    device_id = torch.cuda.current_device()
@@ -83,7 +90,7 @@ def main():
    val_ts = torch.masked_select(graph.edge_ts,pdata.val_mask.to(graph.edge_index.device))
    test_data = torch.masked_select(graph.edge_index,pdata.test_mask.to(graph.edge_index.device)).reshape(2,-1)
    test_ts = torch.masked_select(graph.edge_ts,pdata.test_mask.to(graph.edge_index.device)) 
-    print(train_data.shape[1],val_data.shape[1],test_data.shape[1])
+    #print(train_data.shape[1],val_data.shape[1],test_data.shape[1])
    train_data = DataSet(edges = train_data,ts =train_ts,eids = torch.nonzero(pdata.train_mask).view(-1))
    #if dist.get_rank() == 0:
    test_data = DataSet(edges = test_data,ts =test_ts,eids = torch.nonzero(pdata.test_mask).view(-1))
@@ -100,7 +107,7 @@ def main():
    trainloader = DistributedDataLoader(graph,train_data,sampler = sampler,
                                        sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES,
                                        neg_sampler=neg_sampler,
-                                        batch_size = 1000,
+                                        batch_size = train_param['batch_size'],
                                        shuffle=False,
                                        drop_last=True,
                                        chunk_size = None,
@@ -111,7 +118,7 @@ def main():
    testloader = DistributedDataLoader(graph,test_data,sampler = sampler,
                                        sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES,
                                        neg_sampler=neg_sampler,
-                                        batch_size = 1000,
+                                        batch_size = train_param['batch_size'],
                                        shuffle=False,
                                        drop_last=False,
                                        chunk_size = None,
@@ -121,7 +128,7 @@ def main():
    valloader = DistributedDataLoader(graph,val_data,sampler = sampler,
                                        sampler_fn = SAMPLE_TYPE.SAMPLE_FROM_TEMPORAL_EDGES,
                                        neg_sampler=neg_sampler,
-                                        batch_size = 1000,
+                                        batch_size = train_param['batch_size'],
                                        shuffle=False,
                                        drop_last=False,
                                        chunk_size = None,
@@ -194,6 +201,8 @@ def main():
                        index, mail, mail_ts = mailbox.get_update_mail(dist_index_mapper,
                                                     src,dst,ts,edge_feats,
                                                     model.module.memory_updater.last_updated_memory,
+                                                     model.module.embedding,use_src_emb,
+                                                     use_dst_emb,
                                                     )
                        mailbox.set_mailbox_all_to_all(index,memory,memory_ts,mail,mail_ts,reduce_Op = 'max')
@@ -212,10 +221,13 @@ def main():
        auc_mrr = float(torch.tensor(auc_mrr).mean())
        return ap, auc_mrr
    creterion = torch.nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=train_param['lr'])
+    early_stopper = EarlyStopMonitor(max_round=args.patience)
+    MODEL_SAVE_PATH = f'./saved_models/{args.model}-{args.dataname}.pth'
    for e in range(train_param['epoch']):
        torch.cuda.synchronize()
+        write_back_time = 0
+        fetch_time = 0
        epoch_start_time = time.time()
        train_aps = list()
        print('Epoch {:d}:'.format(e))
@@ -227,7 +239,8 @@ def main():
            model.module.memory_updater.last_updated_nid = None
            model.module.memory_updater.last_updated_memory = None
            model.module.memory_updater.last_updated_ts = None
-        for roots,mfgs,metadata in trainloader:
+        for roots,mfgs,metadata,sample_time in trainloader:
+            fetch_time +=sample_time/1000
            t_prep_s = time.time()
            with torch.cuda.stream(train_stream):
@@ -270,13 +283,13 @@ def main():
                    index, mail, mail_ts = mailbox.get_update_mail(dist_index_mapper,
                                                src,dst,ts,edge_feats,
                                                model.module.memory_updater.last_updated_memory, 
+                                                model.module.embedding,use_src_emb,use_dst_emb,
                                                )
+                    start_event = torch.cuda.Event(enable_timing=True)
+                    end_event = torch.cuda.Event(enable_timing=True)
+                    start_event.record()
                    mailbox.set_mailbox_all_to_all(index,memory,memory_ts,mail,mail_ts,reduce_Op = 'max')
-                    #end_event.record()
-                    #torch.cuda.synchronize()
-                    #write_back_time += start_event.elapsed_time(end_event)/1000
        torch.cuda.synchronize()
        time_prep = time.time() - epoch_start_time
        avg_time += time.time() - epoch_start_time
@@ -288,9 +301,19 @@ def main():
        #if cache.node_cache is not None:
        #    print('hit {}'.format(cache.node_cache.hit_/ cache.node_cache.hit_sum))
        ap, auc = eval('val')
+        early_stop = early_stopper.early_stop_check(ap)
+        if early_stop:
+            print("Early stopping at epoch {:d}".format(e))
+            print(f"Loading the best model at epoch {early_stopper.best_epoch}")
+            best_model_path = get_checkpoint_path(early_stopper.best_epoch)
+            model.load_state_dict(torch.load(best_model_path))
+            break
+        else:
            print('\ttrain loss:{:.4f}  train ap:{:4f}  val ap:{:4f}  val auc:{:4f}'.format(total_loss,train_ap, ap, auc))
            print('\ttotal time:{:.2f}s  prep time:{:.2f}s'.format(time.time()-epoch_start_time, time_prep))    
-        #print('\t fetch time:{:.2f}s write back time:{:.2f}s'.format(fetch_time,write_back_time))
+            print('\t fetch time:{:.2f}s write back time:{:.2f}s'.format(fetch_time,write_back_time))
+            torch.save(model.state_dict(), get_checkpoint_path(e))
    model.eval()
    if mailbox is not None:
        mailbox.reset()
@@ -304,6 +327,7 @@ def main():
    else:
        print('\ttest AP:{:4f}  test AUC:{:4f}'.format(ap, auc))    
    print('test_dataset',test_data.edges.shape[1],'avg_time',avg_time/train_param['epoch'])
+    torch.save(model.state_dict(), MODEL_SAVE_PATH)
    ctx.shutdown()
 if __name__ == "__main__":
    main()