From e87d296fbc63edc50a221439505e5f30cb3d1fc8 Mon Sep 17 00:00:00 2001 From: Christophe Haen Date: Thu, 22 Apr 2021 17:53:51 +0200 Subject: [PATCH 1/8] SiteDirector: for HTCondor, write the executable in the globally defined working directory --- WorkloadManagementSystem/Agent/SiteDirector.py | 12 +++++++++++- .../Agent/test/Test_Agent_SiteDirector.py | 8 ++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/WorkloadManagementSystem/Agent/SiteDirector.py b/WorkloadManagementSystem/Agent/SiteDirector.py index 0af1e8778fc..ea83ca5ce13 100644 --- a/WorkloadManagementSystem/Agent/SiteDirector.py +++ b/WorkloadManagementSystem/Agent/SiteDirector.py @@ -1075,7 +1075,17 @@ def getExecutable(self, queue, pilotsToSubmit, pilotsSubmitted = pilotsToSubmit pilotOptions = ' '.join(pilotOptions) self.log.verbose('pilotOptions: %s' % pilotOptions) - executable = self._writePilotScript(workingDirectory=self.workingDirectory, + + # if a global workingDirectory is defined for the CEType (like HTCondor) + # use it (otherwise the __cleanup done by HTCondor will be in the wrong folder !) 
+ # Note that this means that if you run multiple HTCondorCE + # in your machine, the executable files will be in the same place + # but it does not matter since they are very temporary + + ce = self.queueCECache[queue]['CE'] + workingDirectory = getattr(ce, 'workingDirectory', self.workingDirectory) + + executable = self._writePilotScript(workingDirectory=workingDirectory, pilotOptions=pilotOptions, proxy=proxy, pilotExecDir=jobExecDir, diff --git a/WorkloadManagementSystem/Agent/test/Test_Agent_SiteDirector.py b/WorkloadManagementSystem/Agent/test/Test_Agent_SiteDirector.py index 52b4ede12da..f7bdae1df72 100644 --- a/WorkloadManagementSystem/Agent/test/Test_Agent_SiteDirector.py +++ b/WorkloadManagementSystem/Agent/test/Test_Agent_SiteDirector.py @@ -172,6 +172,14 @@ def test__submitPilotsToQueue(mocker): 'Setup': 'LHCb-Production', 'Site': 'LCG.CERN.cern', 'SubmitPool': ''}}} + + # Create a MagicMock that does not have the workingDirectory + # attribute (https://cpython-test-docs.readthedocs.io/en/latest/library/unittest.mock.html#deleting-attributes) + # This is to use the SiteDirector's working directory, not the CE one + ceMock = MagicMock() + del ceMock.workingDirectory + + sd.queueCECache = {'aQueue': {'CE': ceMock}} sd.queueSlots = {'aQueue': {'AvailableSlots': 10}} res = sd._submitPilotsToQueue(1, MagicMock(), 'aQueue') assert res['OK'] is True From 2979e1c52521cce7f393d89359e30b0a127fb411 Mon Sep 17 00:00:00 2001 From: Andre Sailer Date: Fri, 23 Apr 2021 12:03:11 +0200 Subject: [PATCH 2/8] HTCondorCE: Limit cleanup to a single run per minute per SiteDirector --- .../Computing/HTCondorCEComputingElement.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Resources/Computing/HTCondorCEComputingElement.py b/Resources/Computing/HTCondorCEComputingElement.py index e511f39392d..dd04c86f7f1 100644 --- a/Resources/Computing/HTCondorCEComputingElement.py +++ b/Resources/Computing/HTCondorCEComputingElement.py @@ -45,7 +45,9 @@ import os import 
tempfile import commands +import datetime import errno +import threading from DIRAC import S_OK, S_ERROR, gConfig from DIRAC.Resources.Computing.ComputingElement import ComputingElement @@ -160,6 +162,10 @@ class HTCondorCEComputingElement(ComputingElement): implementing the functions jobSubmit, getJobOutput """ + # static variables to ensure single cleanup every minute + _lastCleanupTime = datetime.datetime.utcnow() + _cleanupLock = threading.Lock() + ############################################################################# def __init__(self, ceUniqueID): """ Standard constructor. @@ -516,6 +522,16 @@ def __cleanup(self): # FIXME: again some issue with the working directory... # workingDirectory = self.ceParameters.get( 'WorkingDirectory', DEFAULT_WORKINGDIRECTORY ) + if not self._cleanupLock.acquire(False): + return + + now = datetime.datetime.utcnow() + if (self._lastCleanupTime - now).total_seconds < 60: + self._cleanupLock.release() + return + + self._lastCleanupTime = now + self.log.debug("Cleaning working directory: %s" % self.workingDirectory) # remove all files older than 120 minutes starting with DIRAC_ Condor will @@ -534,3 +550,4 @@ def __cleanup(self): findPars) if status: self.log.error("Failure during HTCondorCE __cleanup", stdout) + self._cleanupLock.release() From 2e24b8ea1d4aba81c97495f53d5cf2a5950dbe3b Mon Sep 17 00:00:00 2001 From: Andre Sailer Date: Fri, 23 Apr 2021 12:03:19 +0200 Subject: [PATCH 3/8] HTCondorCE: optimize find for DIRAC_ executables --- Resources/Computing/HTCondorCEComputingElement.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Resources/Computing/HTCondorCEComputingElement.py b/Resources/Computing/HTCondorCEComputingElement.py index dd04c86f7f1..26646caf3aa 100644 --- a/Resources/Computing/HTCondorCEComputingElement.py +++ b/Resources/Computing/HTCondorCEComputingElement.py @@ -537,7 +537,8 @@ def __cleanup(self): # remove all files older than 120 minutes starting with DIRAC_ Condor will # push 
files on submission, but it takes at least a few seconds until this # happens so we can't directly unlink after condor_submit - status, stdout = commands.getstatusoutput('find %s -mmin +120 -name "DIRAC_*" -delete ' % self.workingDirectory) + status, stdout = commands.getstatusoutput('find -O3 %s -maxdepth 1 -mmin +120 -name "DIRAC_*" -delete ' % + self.workingDirectory) if status: self.log.error("Failure during HTCondorCE __cleanup", stdout) From 55faac003c6d8b418e6a30bd96b280d38220f12b Mon Sep 17 00:00:00 2001 From: Andre Sailer Date: Fri, 23 Apr 2021 12:03:25 +0200 Subject: [PATCH 4/8] HTCondorCE: Fix cleanup timing check --- Resources/Computing/HTCondorCEComputingElement.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Resources/Computing/HTCondorCEComputingElement.py b/Resources/Computing/HTCondorCEComputingElement.py index 26646caf3aa..456ce743340 100644 --- a/Resources/Computing/HTCondorCEComputingElement.py +++ b/Resources/Computing/HTCondorCEComputingElement.py @@ -526,7 +526,7 @@ def __cleanup(self): return now = datetime.datetime.utcnow() - if (self._lastCleanupTime - now).total_seconds < 60: + if (now - self._lastCleanupTime).total_seconds() < 60: self._cleanupLock.release() return From 937b2988d801da3da29d5891b6b77bac131d1178 Mon Sep 17 00:00:00 2001 From: Andre Sailer Date: Fri, 23 Apr 2021 12:03:37 +0200 Subject: [PATCH 5/8] HTCondorCE: Clean up log/out/err files for all CEs as we run only once and not for all CEs --- Resources/Computing/HTCondorCEComputingElement.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Resources/Computing/HTCondorCEComputingElement.py b/Resources/Computing/HTCondorCEComputingElement.py index 456ce743340..4bb697e7e05 100644 --- a/Resources/Computing/HTCondorCEComputingElement.py +++ b/Resources/Computing/HTCondorCEComputingElement.py @@ -542,9 +542,9 @@ def __cleanup(self): if status: self.log.error("Failure during HTCondorCE __cleanup", stdout) - # remove all out/err/log files 
older than "DaysToKeepLogs" days in the CE part of the working Directory - workDir = os.path.join(self.workingDirectory, self.ceName) - findPars = dict(workDir=workDir, days=self.daysToKeepLogs) + # remove all out/err/log files older than "DaysToKeepLogs" days in the working directory + # not running this for each CE so we do global cleanup + findPars = dict(workDir=self.workingDirectory, days=self.daysToKeepLogs) # remove all out/err/log files older than "DaysToKeepLogs" days status, stdout = commands.getstatusoutput( r'find %(workDir)s -mtime +%(days)s -type f \( -name "*.out" -o -name "*.err" -o -name "*.log" \) -delete ' % From e81d0a49c86851e7f57f466c3656ce20c1f683f5 Mon Sep 17 00:00:00 2001 From: Andre Sailer Date: Fri, 23 Apr 2021 12:03:40 +0200 Subject: [PATCH 6/8] HTCondorCE: fix use of static variables --- Resources/Computing/HTCondorCEComputingElement.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Resources/Computing/HTCondorCEComputingElement.py b/Resources/Computing/HTCondorCEComputingElement.py index 4bb697e7e05..e11da0a90c8 100644 --- a/Resources/Computing/HTCondorCEComputingElement.py +++ b/Resources/Computing/HTCondorCEComputingElement.py @@ -522,15 +522,15 @@ def __cleanup(self): # FIXME: again some issue with the working directory... 
# workingDirectory = self.ceParameters.get( 'WorkingDirectory', DEFAULT_WORKINGDIRECTORY ) - if not self._cleanupLock.acquire(False): + if not HTCondorCEComputingElement._cleanupLock.acquire(False): return now = datetime.datetime.utcnow() - if (now - self._lastCleanupTime).total_seconds() < 60: - self._cleanupLock.release() + if (now - HTCondorCEComputingElement._lastCleanupTime).total_seconds() < 60: + HTCondorCEComputingElement._cleanupLock.release() return - self._lastCleanupTime = now + HTCondorCEComputingElement._lastCleanupTime = now self.log.debug("Cleaning working directory: %s" % self.workingDirectory) From d7af92e4a36cdd2cfdb22e07f92919f4a1556fe4 Mon Sep 17 00:00:00 2001 From: Andrei Tsaregorodtsev Date: Wed, 28 Apr 2021 22:57:06 +0200 Subject: [PATCH 7/8] v7r0p56 notes and tags --- __init__.py | 2 +- release.notes | 8 ++++++++ setup.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/__init__.py b/__init__.py index e4478be06af..b0ee99c0888 100755 --- a/__init__.py +++ b/__init__.py @@ -95,7 +95,7 @@ else: majorVersion = 7 minorVersion = 0 - patchLevel = 55 + patchLevel = 56 preVersion = 0 version = "v%sr%s" % (majorVersion, minorVersion) diff --git a/release.notes b/release.notes index 25b6d42df33..aa8f621b1b2 100644 --- a/release.notes +++ b/release.notes @@ -1,3 +1,11 @@ +[v7r0p56] + +*Resources +FIX: (#5119) HTCondorCE: Limit calls to actual cleanup (find and delete files on disk) to + once per minute per SiteDirector, fixes #5118 +CHANGE: (#5119) HTCondorCE cleanup: Run the DIRAC_ executable purge with -O3 and -maxdepth + 1 to speed up the find + [v7r0p55] *TS diff --git a/setup.py b/setup.py index e1e1c02b514..344322717d9 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ setup( name="DIRAC", - version="7.0.55", + version="7.0.56", url="https://github.com/DIRACGRID/DIRAC", license="GPLv3", package_dir=package_dir, From e0d8b48df78925aee48788e750b41fe7c321dfaa Mon Sep 17 00:00:00 2001 From: Andrei Tsaregorodtsev Date: Wed, 28 
Apr 2021 23:12:38 +0200 Subject: [PATCH 8/8] v7r1p39 notes and tags --- __init__.py | 2 +- release.notes | 5 +++++ setup.py | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/__init__.py b/__init__.py index f22de1a5ed9..d99c3488c5c 100755 --- a/__init__.py +++ b/__init__.py @@ -95,7 +95,7 @@ else: majorVersion = 7 minorVersion = 1 - patchLevel = 38 + patchLevel = 39 preVersion = 0 version = "v%sr%s" % (majorVersion, minorVersion) diff --git a/release.notes b/release.notes index 753e6331663..65d1cf5d31b 100644 --- a/release.notes +++ b/release.notes @@ -1,3 +1,8 @@ +[v7r1p39] + +*WMS +CHANGE: (#5121) for HTCondor, the SiteDirector writes the executable in the globally defined working directory + [v7r1p38] FIX: fixes from v7r0p55 diff --git a/setup.py b/setup.py index ffe819e3e71..973a1de3ce2 100755 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ setup( name="DIRAC", - version="7.1.38", + version="7.1.39", url="https://github.com/DIRACGRID/DIRAC", license="GPLv3", package_dir=package_dir,