From 6973c4701a9c01e9fbd6b74a5db457749cef85b6 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Mon, 12 Aug 2024 14:50:38 -0700 Subject: [PATCH] fs/ufs: change default locking protocol The fs/ufs component by default disabled all file locking before read/write operations (except for NFS file systems). This was based on the assumption, that the file system itself performs the required locking operation and hence we don't have to add to it. This assumption is incorrect when using data sieving. In data sieving, the code 'ignore' small gaps when we write to a file, and perform instead a read-modify-write sequence ourselves for performance reasons. The problem is however that even within a collective operation not all aggregators might want to use data sieving. Hence, enabling locking just for the data-sieving routines is insufficient, all processes have to perform the locking. Therefore, our two options are: a) either disable write data-sieving by default, or b) enable range-locking by default. After some testing, I think enabling range-locking be default is the safer and better approach. It doesn't seem to show any significant performance impact on my test systems. Note, that on Lustre file systems, we can keep the default to no-locking as far as I can see, since the collective algorithm used by Lustre is unlikely to produce this pattern. I did add in however an mca parameter that allows us to control the locking algorithm used by the Lustre component as well, in case we need to change that for a particular use-case or platform. Fixes Issue #12718 Signed-off-by: Edgar Gabriel (cherry picked from commit c697f28d2ca027238016f62da8786b24038578c0) --- ompi/mca/fs/lustre/fs_lustre.h | 7 +++++++ ompi/mca/fs/lustre/fs_lustre_component.c | 11 +++++++++++ ompi/mca/fs/lustre/fs_lustre_file_open.c | 17 ++++++++++++++++- ompi/mca/fs/ufs/fs_ufs_file_open.c | 7 +------ 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/ompi/mca/fs/lustre/fs_lustre.h b/ompi/mca/fs/lustre/fs_lustre.h index 6ff2fac47f0..b499f59bc77 100644 --- a/ompi/mca/fs/lustre/fs_lustre.h +++ b/ompi/mca/fs/lustre/fs_lustre.h @@ -13,6 +13,7 @@ * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserverd. * $COPYRIGHT$ * * Additional copyrights may follow @@ -31,6 +32,12 @@ extern int mca_fs_lustre_priority; extern int mca_fs_lustre_stripe_size; extern int mca_fs_lustre_stripe_width; +extern int mca_fs_lustre_lock_algorithm; + +#define FS_LUSTRE_LOCK_AUTO 0 +#define FS_LUSTRE_LOCK_NEVER 1 +#define FS_LUSTRE_LOCK_ENTIRE_FILE 2 +#define FS_LUSTRE_LOCK_RANGES 3 BEGIN_C_DECLS diff --git a/ompi/mca/fs/lustre/fs_lustre_component.c b/ompi/mca/fs/lustre/fs_lustre_component.c index d8392af482d..011b99b23de 100644 --- a/ompi/mca/fs/lustre/fs_lustre_component.c +++ b/ompi/mca/fs/lustre/fs_lustre_component.c @@ -13,6 +13,7 @@ * Copyright (c) 2008-2011 University of Houston. All rights reserved. * Copyright (c) 2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserverd. * $COPYRIGHT$ * * Additional copyrights may follow @@ -45,6 +46,7 @@ int mca_fs_lustre_priority = 20; runtime also*/ int mca_fs_lustre_stripe_size = 0; int mca_fs_lustre_stripe_width = 0; +int mca_fs_lustre_lock_algorithm = 0; /* auto */ /* * Instantiate the public struct with all of our public information * and pointers to our public functions in it @@ -93,6 +95,15 @@ lustre_register(void) MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, &mca_fs_lustre_stripe_width); + mca_fs_lustre_lock_algorithm = 0; + (void) mca_base_component_var_register(&mca_fs_lustre_component.fsm_version, + "lock_algorithm", "Locking algorithm used by the fs ufs component. " + " 0: auto (default), 1: skip locking, 2: always lock entire file, " + "3: lock only specific ranges", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, + MCA_BASE_VAR_SCOPE_READONLY, + &mca_fs_lustre_lock_algorithm ); return OMPI_SUCCESS; } diff --git a/ompi/mca/fs/lustre/fs_lustre_file_open.c b/ompi/mca/fs/lustre/fs_lustre_file_open.c index 58c06056bf2..d7c551d9192 100644 --- a/ompi/mca/fs/lustre/fs_lustre_file_open.c +++ b/ompi/mca/fs/lustre/fs_lustre_file_open.c @@ -13,6 +13,7 @@ * Copyright (c) 2015-2020 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserverd. * $COPYRIGHT$ * * Additional copyrights may follow @@ -171,8 +172,22 @@ mca_fs_lustre_file_open (struct ompi_communicator_t *comm, fh->f_stripe_size = lump->lmm_stripe_size; fh->f_stripe_count = lump->lmm_stripe_count; fh->f_fs_block_size = lump->lmm_stripe_size; - fh->f_flags |= OMPIO_LOCK_NEVER; free(lump); + if (FS_LUSTRE_LOCK_AUTO == mca_fs_lustre_lock_algorithm || + FS_LUSTRE_LOCK_NEVER == mca_fs_lustre_lock_algorithm ) { + fh->f_flags |= OMPIO_LOCK_NEVER; + } + else if (FS_LUSTRE_LOCK_ENTIRE_FILE == mca_fs_lustre_lock_algorithm) { + fh->f_flags |= OMPIO_LOCK_ENTIRE_FILE; + } + else if (FS_LUSTRE_LOCK_RANGES == mca_fs_lustre_lock_algorithm) { + /* Nothing to be done. This is what the posix fbtl component would do + anyway without additional information . */ + } + else { + opal_output ( 1, "Invalid value for mca_fs_lustre_lock_algorithm %d", mca_fs_lustre_lock_algorithm ); + } + return OMPI_SUCCESS; } diff --git a/ompi/mca/fs/ufs/fs_ufs_file_open.c b/ompi/mca/fs/ufs/fs_ufs_file_open.c index 1159de993bb..273934b9748 100644 --- a/ompi/mca/fs/ufs/fs_ufs_file_open.c +++ b/ompi/mca/fs/ufs/fs_ufs_file_open.c @@ -13,6 +13,7 @@ * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserverd. * $COPYRIGHT$ * * Additional copyrights may follow @@ -105,12 +106,6 @@ mca_fs_ufs_file_open (struct ompi_communicator_t *comm, component. */ fh->f_flags |= OMPIO_LOCK_ENTIRE_FILE; } - else { - fh->f_flags |= OMPIO_LOCK_NEVER; - } - } - else { - fh->f_flags |= OMPIO_LOCK_NEVER; } free (fstype); }