From 2b7da92d56ff318083a1c69574dbacad4ea13751 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 16 Jun 2023 07:31:01 +0000 Subject: [PATCH] Add host hashing unit tests --- src/communicator.cc | 3 +- src/utils_internal.cc | 6 ++-- test/mp_unit/CMakeLists.txt | 2 +- test/mp_unit/communicator_tests.cu | 4 --- test/mp_unit/mp_unit_tests.cc | 49 ++++++++++++++++++++++++++++++ test/mp_unit/mp_unit_tests.hpp | 8 ++--- test/unit/CMakeLists.txt | 1 + test/unit/utils_internal_tests.cc | 17 +++++++++++ 8 files changed, 75 insertions(+), 15 deletions(-) create mode 100644 test/unit/utils_internal_tests.cc diff --git a/src/communicator.cc b/src/communicator.cc index 2ec3f664..d5c8db6e 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -98,8 +98,7 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connectOnSetup(int rem if (pimpl->rankToHash_[remoteRank] != pimpl->rankToHash_[pimpl->bootstrap_->getRank()]) { std::stringstream ss; ss << "Cuda IPC connection can only be made within a node: " << remoteRank << "(" << std::hex - << pimpl->rankToHash_[pimpl->bootstrap_->getRank()] << ")" - << " != " << pimpl->bootstrap_->getRank() << "(" << std::hex + << pimpl->rankToHash_[remoteRank] << ") != " << pimpl->bootstrap_->getRank() << "(" << std::hex << pimpl->rankToHash_[pimpl->bootstrap_->getRank()] << ")"; throw mscclpp::Error(ss.str(), ErrorCode::InvalidUsage); } diff --git a/src/utils_internal.cc b/src/utils_internal.cc index 930c841f..273ecbe1 100644 --- a/src/utils_internal.cc +++ b/src/utils_internal.cc @@ -10,22 +10,20 @@ #include "debug.h" -namespace { constexpr char HOSTID_FILE[32] = "/proc/sys/kernel/random/boot_id"; -bool matchIf(const char* string, const char* ref, bool matchExact) { +static bool matchIf(const char* string, const char* ref, bool matchExact) { // Make sure to include '\0' in the exact case int matchLen = matchExact ? strlen(string) + 1 : strlen(ref); return strncmp(string, ref, matchLen) == 0; } -bool matchPort(const int port1, const int port2) { +static bool matchPort(const int port1, const int port2) { if (port1 == -1) return true; if (port2 == -1) return true; if (port1 == port2) return true; return false; } -} // namespace namespace mscclpp { std::string int64ToBusId(int64_t id) { diff --git a/test/mp_unit/CMakeLists.txt b/test/mp_unit/CMakeLists.txt index 78080970..1a9e43d9 100644 --- a/test/mp_unit/CMakeLists.txt +++ b/test/mp_unit/CMakeLists.txt @@ -1,8 +1,8 @@ target_sources(mp_unit_tests PRIVATE + mp_unit_tests.cc bootstrap_tests.cc ib_tests.cu communicator_tests.cu device_channel_tests.cu direct_channel_tests.cu - mp_unit_tests.cc ) diff --git a/test/mp_unit/communicator_tests.cu b/test/mp_unit/communicator_tests.cu index 1a7f49ad..bda61591 100644 --- a/test/mp_unit/communicator_tests.cu +++ b/test/mp_unit/communicator_tests.cu @@ -37,10 +37,6 @@ void CommunicatorTestBase::TearDown() { void CommunicatorTestBase::setNumRanksToUse(int num) { numRanksToUse = num; } -int CommunicatorTestBase::rankToLocalRank(int rank) const { return rank % gEnv->nRanksPerNode; } - -int CommunicatorTestBase::rankToNode(int rank) const { return rank / gEnv->nRanksPerNode; } - void CommunicatorTestBase::connectMesh(bool useIbOnly) { for (int i = 0; i < numRanksToUse; i++) { if (i != gEnv->rank) { diff --git a/test/mp_unit/mp_unit_tests.cc b/test/mp_unit/mp_unit_tests.cc index cde8439f..7a88f61c 100644 --- a/test/mp_unit/mp_unit_tests.cc +++ b/test/mp_unit/mp_unit_tests.cc @@ -2,11 +2,24 @@ #include +#include #include +#include "utils_internal.hpp" + const char gDefaultIpPort[] = "127.0.0.1:50053"; MultiProcessTestEnv* gEnv = nullptr; +int rankToLocalRank(int rank) { + if (gEnv == nullptr) throw std::runtime_error("rankToLocalRank is called before gEnv is initialized"); + return rank % gEnv->nRanksPerNode; +} + +int rankToNode(int rank) { + if (gEnv == nullptr) throw std::runtime_error("rankToNode is called before gEnv is initialized"); + return rank / gEnv->nRanksPerNode; +} + mscclpp::Transport ibIdToTransport(int id) { mscclpp::Transport IBs[] = {mscclpp::Transport::IB0, mscclpp::Transport::IB1, mscclpp::Transport::IB2, mscclpp::Transport::IB3, mscclpp::Transport::IB4, mscclpp::Transport::IB5, @@ -81,3 +94,39 @@ TEST_F(MultiProcessTest, Prelim) { // Test to make sure the MPI environment is set up correctly ASSERT_GE(gEnv->worldSize, 2); } +#include +TEST_F(MultiProcessTest, HostName) { + const size_t maxNameLen = 1024; + std::vector buffer(gEnv->worldSize * maxNameLen, '\0'); + std::string hostName = mscclpp::getHostName(maxNameLen, '\0'); + // Copy hostName to buffer + memcpy(buffer.data() + gEnv->rank * maxNameLen, hostName.c_str(), hostName.size()); + + MPI_Allgather(MPI_IN_PLACE, 0, MPI_BYTE, buffer.data(), maxNameLen, MPI_BYTE, MPI_COMM_WORLD); + + for (int rank = 0; rank < gEnv->worldSize; rank++) { + char rankHostName[maxNameLen + 1]; + strncpy(rankHostName, buffer.data() + rank * maxNameLen, maxNameLen); + if (rankToNode(rank) == rankToNode(gEnv->rank)) { + ASSERT_EQ(std::string(rankHostName), hostName); + } else { + ASSERT_NE(std::string(rankHostName), hostName); + } + } +} + +TEST_F(MultiProcessTest, HostHash) { + std::vector buffer(gEnv->worldSize, 0); + uint64_t hostHash = mscclpp::getHostHash(); + buffer[gEnv->rank] = hostHash; + + MPI_Allgather(MPI_IN_PLACE, 0, MPI_BYTE, buffer.data(), sizeof(hostHash), MPI_BYTE, MPI_COMM_WORLD); + + for (int rank = 0; rank < gEnv->worldSize; rank++) { + if (rankToNode(rank) == rankToNode(gEnv->rank)) { + ASSERT_EQ(buffer[rank], hostHash); + } else { + ASSERT_NE(buffer[rank], hostHash); + } + } +} diff --git a/test/mp_unit/mp_unit_tests.hpp b/test/mp_unit/mp_unit_tests.hpp index b8dc8d7c..967e043c 100644 --- a/test/mp_unit/mp_unit_tests.hpp +++ b/test/mp_unit/mp_unit_tests.hpp @@ -26,6 +26,10 @@ class MultiProcessTestEnv : public ::testing::Environment { extern MultiProcessTestEnv* gEnv; +mscclpp::Transport ibIdToTransport(int id); +int rankToLocalRank(int rank); +int rankToNode(int rank); + class MultiProcessTest : public ::testing::Test { protected: void TearDown() override; @@ -77,16 +81,12 @@ class IbPeerToPeerTest : public IbTestBase { std::array mrInfo; }; -mscclpp::Transport ibIdToTransport(int id); - class CommunicatorTestBase : public MultiProcessTest { protected: void SetUp() override; void TearDown() override; void setNumRanksToUse(int num); - int rankToLocalRank(int rank) const; - int rankToNode(int rank) const; void connectMesh(bool useIbOnly = false); // Register a local memory and receive corresponding remote memories diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index b74b2f60..ecc91c26 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -6,4 +6,5 @@ target_sources(unit_tests PRIVATE numa_tests.cc socket_tests.cc utils_tests.cc + utils_internal_tests.cc ) diff --git a/test/unit/utils_internal_tests.cc b/test/unit/utils_internal_tests.cc new file mode 100644 index 00000000..6ae04561 --- /dev/null +++ b/test/unit/utils_internal_tests.cc @@ -0,0 +1,17 @@ +#include + +#include + +#include "utils_internal.hpp" + +TEST(UtilsInternalTest, getHostHash) { + uint64_t hash1 = mscclpp::getHostHash(); + uint64_t hash2; + + std::thread th([&hash2]() { hash2 = mscclpp::getHostHash(); }); + + ASSERT_TRUE(th.joinable()); + th.join(); + + EXPECT_EQ(hash1, hash2); +}