From b6d1e5323bebd10b21b1b9cdb29d2d671039d1c7 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 17 Jun 2024 16:06:33 -0400 Subject: [PATCH 01/20] Add connection timeout parameter to RemoteCommandExecutor. Set the a longer connection timeout to avoid paramiko.ssh_exception.SSHException: No existing session error. --- tests/integration-tests/configs/tmp_test.yaml | 13 +++++++++++++ tests/integration-tests/remote_command_executor.py | 11 ++++++++++- tests/integration-tests/tests/proxy/test_proxy.py | 2 +- 3 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 tests/integration-tests/configs/tmp_test.yaml diff --git a/tests/integration-tests/configs/tmp_test.yaml b/tests/integration-tests/configs/tmp_test.yaml new file mode 100644 index 0000000000..647f9b0bbd --- /dev/null +++ b/tests/integration-tests/configs/tmp_test.yaml @@ -0,0 +1,13 @@ +{%- import 'common.jinja2' as common with context -%} +{{- common.OSS_COMMERCIAL_ARM.append("centos7") or "" -}} +{{- common.OSS_COMMERCIAL_X86.append("rocky8") or "" -}} +{{- common.OSS_COMMERCIAL_X86.append("rocky9") or "" -}} +--- +test-suites: + proxy: + test_proxy.py::test_proxy: + dimensions: + - regions: ["us-east-1"] + instances: {{ common.INSTANCES_DEFAULT_X86 }} + oss: ["ubuntu2004"] + schedulers: ["slurm"] diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index 8c314fc3a8..9c71e46c56 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -29,7 +29,14 @@ class RemoteCommandExecutor: """Execute remote commands on the cluster head node.""" def __init__( - self, cluster, compute_node_ip=None, username=None, bastion=None, alternate_ssh_key=None, use_login_node=False + self, + cluster, + compute_node_ip=None, + username=None, + bastion=None, + alternate_ssh_key=None, + use_login_node=False, + connection_timeout=None, ): """ Initiate SSH connection @@ -74,6 +81,8 @@ def __init__( connection_kwargs["gateway"] = f"ssh -W %h:%p -A {bastion}" connection_kwargs["forward_agent"] = True connection_kwargs["connect_kwargs"]["banner_timeout"] = 60 + if connection_timeout: + connection_kwargs["connect_kwargs"]["timeout"] = connection_timeout logging.info( f"Connecting to {connection_kwargs['host']} as {connection_kwargs['user']} with " f"{connection_kwargs['connect_kwargs']['key_filename']}" diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index 7e310dab5d..a9982769a1 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -96,7 +96,7 @@ def test_proxy(pcluster_config_reader, clusters_factory, proxy_stack_factory, sc bastion = f"ubuntu@{proxy_public_ip}" - remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion) + remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=300) slurm_commands = SlurmCommands(remote_command_executor) _check_internet_access(remote_command_executor) From d427e3976f565a4e779895e358b1980d1d82b0ca Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 17 Jun 2024 16:51:06 -0400 Subject: [PATCH 02/20] eval ssh-agent and add ssh key to avoid Error reading SSH protocol banner issue --- tests/integration-tests/tests/proxy/test_proxy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index a9982769a1..8f3a33a9ae 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -16,7 +16,7 @@ from assertpy import assert_that from cfn_stacks_factory import CfnStack from remote_command_executor import RemoteCommandExecutor -from utils import generate_stack_name +from utils import generate_stack_name, run_command from tests.common.schedulers_common import SlurmCommands @@ -95,8 +95,8 @@ def test_proxy(pcluster_config_reader, clusters_factory, proxy_stack_factory, sc cluster = clusters_factory(cluster_config) bastion = f"ubuntu@{proxy_public_ip}" - - remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=300) + run_command(f"eval ssh-agent && ssh-add {cluster.ssh_key}", timeout=60, shell=True) + remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=200) slurm_commands = SlurmCommands(remote_command_executor) _check_internet_access(remote_command_executor) From 39d6130d78dda476086955b76f8fa44107a7624e Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 17 Jun 2024 20:43:05 -0400 Subject: [PATCH 03/20] Add finalizer to ensure proxy stack is deleted after cluster stacks, add set ssh environment variables logic. Split the ssh-agent and ssh-add commands and add log output to debug. --- .../tests/proxy/test_proxy.py | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index 8f3a33a9ae..13ee7e6b43 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -10,6 +10,7 @@ # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. import logging +import os import boto3 import pytest @@ -62,8 +63,14 @@ def proxy_stack_factory(region, request, cfn_stacks_factory): yield proxy_stack - if not request.config.getoption("no_delete") and not request.config.getoption("proxy_stack"): - cfn_stacks_factory.delete_stack(proxy_stack.name, region) + # Add finalizer to ensure proxy stack is deleted after cluster stacks + request.addfinalizer(lambda: delete_proxy_stack(proxy_stack, region, request, cfn_stacks_factory)) + + +def delete_proxy_stack(proxy_stack, region, request, cfn_stacks_factory): + if not request.config.getoption("no_delete") and not request.config.getoption("proxy_stack"): + logging.info(f"Deleting proxy stack {proxy_stack.name} in region {region}") + cfn_stacks_factory.delete_stack(proxy_stack.name, region) def get_instance_public_ip(instance_id, region): @@ -95,7 +102,20 @@ def test_proxy(pcluster_config_reader, clusters_factory, proxy_stack_factory, sc cluster = clusters_factory(cluster_config) bastion = f"ubuntu@{proxy_public_ip}" - run_command(f"eval ssh-agent && ssh-add {cluster.ssh_key}", timeout=60, shell=True) + + # Start the SSH agent and add SSH key + agent_result = run_command("eval `ssh-agent -s`", shell=True) + logging.info(f"SSH agent started with output: {agent_result.stdout}") + + # Set environment variables + for line in agent_result.stdout.splitlines(): + if "SSH_AUTH_SOCK" in line or "SSH_AGENT_PID" in line: + key, value = line.replace(";", "").split("=") + os.environ[key.strip()] = value.strip() + + add_key_result = run_command(f"ssh-add {cluster.ssh_key}", shell=True, env=os.environ) + logging.info(f"SSH key add result: {add_key_result.stdout}") + remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=200) slurm_commands = SlurmCommands(remote_command_executor) From 7c2c0fea4344055265ec60b40336fde4f41156b3 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 17 Jun 2024 23:34:31 -0400 Subject: [PATCH 04/20] Fix proxy stack teardown failed issue. --- tests/integration-tests/tests/proxy/test_proxy.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index 13ee7e6b43..54c59949cd 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -63,14 +63,8 @@ def proxy_stack_factory(region, request, cfn_stacks_factory): yield proxy_stack - # Add finalizer to ensure proxy stack is deleted after cluster stacks - request.addfinalizer(lambda: delete_proxy_stack(proxy_stack, region, request, cfn_stacks_factory)) - - -def delete_proxy_stack(proxy_stack, region, request, cfn_stacks_factory): - if not request.config.getoption("no_delete") and not request.config.getoption("proxy_stack"): - logging.info(f"Deleting proxy stack {proxy_stack.name} in region {region}") - cfn_stacks_factory.delete_stack(proxy_stack.name, region) + if not request.config.getoption("no_delete") and not request.config.getoption("proxy_stack"): + cfn_stacks_factory.delete_stack(proxy_stack.name, region) def get_instance_public_ip(instance_id, region): @@ -81,7 +75,7 @@ def get_instance_public_ip(instance_id, region): @pytest.mark.usefixtures("region", "os", "instance", "scheduler") -def test_proxy(pcluster_config_reader, clusters_factory, proxy_stack_factory, scheduler_commands_factory): +def test_proxy(pcluster_config_reader, proxy_stack_factory, scheduler_commands_factory, clusters_factory): """ Test the creation and functionality of a Cluster using a proxy environment. From 66c7c7311174d113ea160bf4253c96c7a1823895 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 17 Jun 2024 23:36:06 -0400 Subject: [PATCH 05/20] Remove ssh-agent and ssh-add codes for now, increase banner timeout to 200 seconds. --- tests/integration-tests/remote_command_executor.py | 2 +- tests/integration-tests/tests/proxy/test_proxy.py | 13 ------------- 2 files changed, 1 insertion(+), 14 deletions(-) diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index 9c71e46c56..e5c3994312 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -80,7 +80,7 @@ def __init__( ) connection_kwargs["gateway"] = f"ssh -W %h:%p -A {bastion}" connection_kwargs["forward_agent"] = True - connection_kwargs["connect_kwargs"]["banner_timeout"] = 60 + connection_kwargs["connect_kwargs"]["banner_timeout"] = 200 if connection_timeout: connection_kwargs["connect_kwargs"]["timeout"] = connection_timeout logging.info( diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index 54c59949cd..9a3f94367f 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -97,19 +97,6 @@ def test_proxy(pcluster_config_reader, proxy_stack_factory, scheduler_commands_f bastion = f"ubuntu@{proxy_public_ip}" - # Start the SSH agent and add SSH key - agent_result = run_command("eval `ssh-agent -s`", shell=True) - logging.info(f"SSH agent started with output: {agent_result.stdout}") - - # Set environment variables - for line in agent_result.stdout.splitlines(): - if "SSH_AUTH_SOCK" in line or "SSH_AGENT_PID" in line: - key, value = line.replace(";", "").split("=") - os.environ[key.strip()] = value.strip() - - add_key_result = run_command(f"ssh-add {cluster.ssh_key}", shell=True, env=os.environ) - logging.info(f"SSH key add result: {add_key_result.stdout}") - remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=200) slurm_commands = SlurmCommands(remote_command_executor) From 549dc04d06cda281971474be272db6505960b718 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 18 Jun 2024 12:14:43 -0400 Subject: [PATCH 06/20] Start the SSH agent and add SSH key, increase timeout and banner_timeout --- tests/integration-tests/remote_command_executor.py | 2 +- tests/integration-tests/tests/proxy/test_proxy.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index e5c3994312..b73bc6fdfa 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -80,7 +80,7 @@ def __init__( ) connection_kwargs["gateway"] = f"ssh -W %h:%p -A {bastion}" connection_kwargs["forward_agent"] = True - connection_kwargs["connect_kwargs"]["banner_timeout"] = 200 + connection_kwargs["connect_kwargs"]["banner_timeout"] = 360 if connection_timeout: connection_kwargs["connect_kwargs"]["timeout"] = connection_timeout logging.info( diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index 9a3f94367f..ed91ad1b04 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -11,6 +11,7 @@ # See the License for the specific language governing permissions and limitations under the License. import logging import os +from time import sleep import boto3 import pytest @@ -97,7 +98,14 @@ def test_proxy(pcluster_config_reader, proxy_stack_factory, scheduler_commands_f bastion = f"ubuntu@{proxy_public_ip}" - remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=200) + # Start the SSH agent and add SSH key + agent_result = run_command("eval `ssh-agent -s`", shell=True) + logging.info(f"SSH agent started with output: {agent_result.stdout}") + sleep(5) + add_key_result = run_command(f"ssh-add {cluster.ssh_key}", shell=True) + logging.info(f"SSH key add result: {add_key_result.stdout}") + + remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=300) slurm_commands = SlurmCommands(remote_command_executor) _check_internet_access(remote_command_executor) From 87eb3218bdc1127700abe3928b40de738687fdc3 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 18 Jun 2024 12:36:34 -0400 Subject: [PATCH 07/20] Remove Start the SSH agent and add SSH key logic for now to test --- tests/integration-tests/tests/proxy/test_proxy.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index ed91ad1b04..fd2cd687ad 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -98,13 +98,6 @@ def test_proxy(pcluster_config_reader, proxy_stack_factory, scheduler_commands_f bastion = f"ubuntu@{proxy_public_ip}" - # Start the SSH agent and add SSH key - agent_result = run_command("eval `ssh-agent -s`", shell=True) - logging.info(f"SSH agent started with output: {agent_result.stdout}") - sleep(5) - add_key_result = run_command(f"ssh-add {cluster.ssh_key}", shell=True) - logging.info(f"SSH key add result: {add_key_result.stdout}") - remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=300) slurm_commands = SlurmCommands(remote_command_executor) From 9c0407a008ff315a135057c380134dde52bef6f7 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 18 Jun 2024 14:56:21 -0400 Subject: [PATCH 08/20] Add a new parameter connection_allow_agent to RemoteCommandExecutor. Set it to False, to disable connecting to the SSH agent when using paramiko. --- tests/integration-tests/remote_command_executor.py | 3 +++ tests/integration-tests/tests/proxy/test_proxy.py | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index b73bc6fdfa..6920f36147 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -37,6 +37,7 @@ def __init__( alternate_ssh_key=None, use_login_node=False, connection_timeout=None, + connection_allow_agent=None, ): """ Initiate SSH connection @@ -83,6 +84,8 @@ def __init__( connection_kwargs["connect_kwargs"]["banner_timeout"] = 360 if connection_timeout: connection_kwargs["connect_kwargs"]["timeout"] = connection_timeout + if connection_allow_agent: + connection_kwargs["connect_kwargs"]["allow_agent"] = connection_allow_agent logging.info( f"Connecting to {connection_kwargs['host']} as {connection_kwargs['user']} with " f"{connection_kwargs['connect_kwargs']['key_filename']}" diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index fd2cd687ad..62a39ba716 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -98,7 +98,9 @@ def test_proxy(pcluster_config_reader, proxy_stack_factory, scheduler_commands_f bastion = f"ubuntu@{proxy_public_ip}" - remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=300) + remote_command_executor = RemoteCommandExecutor( + cluster=cluster, bastion=bastion, connection_timeout=300, connection_allow_agent=False + ) slurm_commands = SlurmCommands(remote_command_executor) _check_internet_access(remote_command_executor) From 884dbaf312767e36b9a2863e45ab6c73e0d2121b Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 18 Jun 2024 15:56:35 -0400 Subject: [PATCH 09/20] Comment out internet test, increase banner timeout to 600s. Add connect_timeout set to 360s --- tests/integration-tests/remote_command_executor.py | 5 ++++- tests/integration-tests/tests/proxy/test_proxy.py | 6 ++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index 6920f36147..6f69e9a53d 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -69,6 +69,7 @@ def __init__( "host": node_ip, "user": username, "forward_agent": False, + "connect_timeout": 360, "connect_kwargs": { "key_filename": [alternate_ssh_key if alternate_ssh_key else cluster.ssh_key], "look_for_keys": False, @@ -81,11 +82,13 @@ def __init__( ) connection_kwargs["gateway"] = f"ssh -W %h:%p -A {bastion}" connection_kwargs["forward_agent"] = True - connection_kwargs["connect_kwargs"]["banner_timeout"] = 360 + connection_kwargs["connect_kwargs"]["banner_timeout"] = 600 if connection_timeout: connection_kwargs["connect_kwargs"]["timeout"] = connection_timeout + logging.info(f"set timeout to {connection_timeout}") if connection_allow_agent: connection_kwargs["connect_kwargs"]["allow_agent"] = connection_allow_agent + logging.info(f"set allow_agent to {connection_allow_agent}") logging.info( f"Connecting to {connection_kwargs['host']} as {connection_kwargs['user']} with " f"{connection_kwargs['connect_kwargs']['key_filename']}" diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index 62a39ba716..69636a9824 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -10,15 +10,13 @@ # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. import logging -import os -from time import sleep import boto3 import pytest from assertpy import assert_that from cfn_stacks_factory import CfnStack from remote_command_executor import RemoteCommandExecutor -from utils import generate_stack_name, run_command +from utils import generate_stack_name from tests.common.schedulers_common import SlurmCommands @@ -103,7 +101,7 @@ def test_proxy(pcluster_config_reader, proxy_stack_factory, scheduler_commands_f ) slurm_commands = SlurmCommands(remote_command_executor) - _check_internet_access(remote_command_executor) + # _check_internet_access(remote_command_executor) job_id = slurm_commands.submit_command_and_assert_job_accepted( submit_command_args={"command": "srun sleep 1", "nodes": 1} From d79b901840c8328d7c479945800eae4221db9475 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 18 Jun 2024 16:57:10 -0400 Subject: [PATCH 10/20] Delete connection timeout from regular arg --- tests/integration-tests/remote_command_executor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index 6f69e9a53d..7ea37314df 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -69,7 +69,6 @@ def __init__( "host": node_ip, "user": username, "forward_agent": False, - "connect_timeout": 360, "connect_kwargs": { "key_filename": [alternate_ssh_key if alternate_ssh_key else cluster.ssh_key], "look_for_keys": False, From 150facfd6cbe8a58d34dc9243caa5cdbaaea2198 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 18 Jun 2024 19:58:53 -0400 Subject: [PATCH 11/20] Add ssh_agent and ssh_add --- tests/integration-tests/tests/proxy/test_proxy.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index 69636a9824..03cc14de22 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -16,7 +16,7 @@ from assertpy import assert_that from cfn_stacks_factory import CfnStack from remote_command_executor import RemoteCommandExecutor -from utils import generate_stack_name +from utils import generate_stack_name, run_command from tests.common.schedulers_common import SlurmCommands @@ -74,7 +74,7 @@ def get_instance_public_ip(instance_id, region): @pytest.mark.usefixtures("region", "os", "instance", "scheduler") -def test_proxy(pcluster_config_reader, proxy_stack_factory, scheduler_commands_factory, clusters_factory): +def test_proxy(pcluster_config_reader, request, proxy_stack_factory, scheduler_commands_factory, clusters_factory): """ Test the creation and functionality of a Cluster using a proxy environment. @@ -84,6 +84,12 @@ def test_proxy(pcluster_config_reader, proxy_stack_factory, scheduler_commands_f 3. Submit a sleep job to the cluster and verify it completes successfully. 4. Check Internet access by trying to access google.com """ + ssh_agent_result = run_command("eval `ssh-agent -s`", shell=True) + logging.info(f"SSH agent started with output: {ssh_agent_result.stdout}") + + ssh_add_result = run_command(f'ssh-add {request.config.getoption("key_path")}', shell=True) + logging.info(f"SSH key add result: {ssh_add_result.stderr}") + proxy_address = proxy_stack_factory.cfn_outputs["ProxyAddress"] subnet_with_proxy = proxy_stack_factory.cfn_outputs["PrivateSubnet"] proxy_instance_id = proxy_stack_factory.cfn_resources.get("Proxy") From dba6114bc8869e51f94d745ae9d3dd5933e7d63e Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 18 Jun 2024 20:36:27 -0400 Subject: [PATCH 12/20] Add ssh-agent and ssh-add, and set env variable for ssh-add --- .../tests/proxy/test_proxy.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index 03cc14de22..d7a6a95324 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -10,6 +10,7 @@ # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. import logging +import os import boto3 import pytest @@ -84,11 +85,26 @@ def test_proxy(pcluster_config_reader, request, proxy_stack_factory, scheduler_c 3. Submit a sleep job to the cluster and verify it completes successfully. 4. Check Internet access by trying to access google.com """ - ssh_agent_result = run_command("eval `ssh-agent -s`", shell=True) + # Start ssh-agent and capture the output + ssh_agent_result = run_command("ssh-agent -s", shell=True) logging.info(f"SSH agent started with output: {ssh_agent_result.stdout}") - ssh_add_result = run_command(f'ssh-add {request.config.getoption("key_path")}', shell=True) - logging.info(f"SSH key add result: {ssh_add_result.stderr}") + # Parse the ssh-agent output to set environment variables + for line in ssh_agent_result.stdout.splitlines(): + if line.startswith("SSH_AUTH_SOCK"): + key, value = line.split(";")[0].split("=") + os.environ[key] = value + elif line.startswith("SSH_AGENT_PID"): + key, value = line.split(";")[0].split("=") + os.environ[key] = value + + # Verify that the environment variables are set correctly + logging.info(f"SSH_AUTH_SOCK: {os.environ.get('SSH_AUTH_SOCK')}") + logging.info(f"SSH_AGENT_PID: {os.environ.get('SSH_AGENT_PID')}") + + # Add the SSH key using the ssh-add command, passing the environment variables + ssh_add_result = run_command(f'ssh-add {request.config.getoption("key_path")}', shell=True, env=os.environ.copy()) + logging.info(f"SSH key add result: {ssh_add_result.stdout}") proxy_address = proxy_stack_factory.cfn_outputs["ProxyAddress"] subnet_with_proxy = proxy_stack_factory.cfn_outputs["PrivateSubnet"] From cfef656cde3a3df587fc238ccb7f6996fe70cd75 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Tue, 18 Jun 2024 21:50:53 -0400 Subject: [PATCH 13/20] Remove connection_allow_agent from RemoteCommandExecutor, add custom_env to pass env vairables. Add logic to show SSH keys added --- tests/integration-tests/remote_command_executor.py | 12 ++++++------ tests/integration-tests/tests/proxy/test_proxy.py | 8 ++++++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index 7ea37314df..ad6929909b 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -37,7 +37,7 @@ def __init__( alternate_ssh_key=None, use_login_node=False, connection_timeout=None, - connection_allow_agent=None, + custom_env=None, ): """ Initiate SSH connection @@ -77,17 +77,17 @@ def __init__( if bastion: # Need to execute simple ssh command before using Connection to avoid Paramiko _check_banner error run_command( - f"ssh -i {cluster.ssh_key} -o StrictHostKeyChecking=no {bastion} hostname", timeout=30, shell=True + f"ssh -i {cluster.ssh_key} -o StrictHostKeyChecking=no {bastion} hostname", + timeout=30, + shell=True, + env=custom_env if custom_env else None, ) connection_kwargs["gateway"] = f"ssh -W %h:%p -A {bastion}" connection_kwargs["forward_agent"] = True - connection_kwargs["connect_kwargs"]["banner_timeout"] = 600 + connection_kwargs["connect_kwargs"]["banner_timeout"] = 300 if connection_timeout: connection_kwargs["connect_kwargs"]["timeout"] = connection_timeout logging.info(f"set timeout to {connection_timeout}") - if connection_allow_agent: - connection_kwargs["connect_kwargs"]["allow_agent"] = connection_allow_agent - logging.info(f"set allow_agent to {connection_allow_agent}") logging.info( f"Connecting to {connection_kwargs['host']} as {connection_kwargs['user']} with " f"{connection_kwargs['connect_kwargs']['key_filename']}" diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index d7a6a95324..fb6470b857 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -104,7 +104,11 @@ def test_proxy(pcluster_config_reader, request, proxy_stack_factory, scheduler_c # Add the SSH key using the ssh-add command, passing the environment variables ssh_add_result = run_command(f'ssh-add {request.config.getoption("key_path")}', shell=True, env=os.environ.copy()) - logging.info(f"SSH key add result: {ssh_add_result.stdout}") + logging.info(f"SSH key add result: {ssh_add_result.stderr}") + + # Confirm that the key has been added + added_keys = run_command("ssh-add -l", shell=True, env=os.environ.copy()) + logging.info(f"SSH keys added: {added_keys.stdout}") proxy_address = proxy_stack_factory.cfn_outputs["ProxyAddress"] subnet_with_proxy = proxy_stack_factory.cfn_outputs["PrivateSubnet"] @@ -119,7 +123,7 @@ def test_proxy(pcluster_config_reader, request, proxy_stack_factory, scheduler_c bastion = f"ubuntu@{proxy_public_ip}" remote_command_executor = RemoteCommandExecutor( - cluster=cluster, bastion=bastion, connection_timeout=300, connection_allow_agent=False + cluster=cluster, bastion=bastion, connection_timeout=300, custom_env=os.environ.copy() ) slurm_commands = SlurmCommands(remote_command_executor) From 95195c99267132d35cb080b01206c28bae1e6dc3 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Wed, 19 Jun 2024 00:00:00 -0400 Subject: [PATCH 14/20] Add inline_ssh_env to test --- tests/integration-tests/remote_command_executor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index ad6929909b..c4be09b601 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -69,6 +69,7 @@ def __init__( "host": node_ip, "user": username, "forward_agent": False, + "inline_ssh_env": True, "connect_kwargs": { "key_filename": [alternate_ssh_key if alternate_ssh_key else cluster.ssh_key], "look_for_keys": False, @@ -76,12 +77,13 @@ def __init__( } if bastion: # Need to execute simple ssh command before using Connection to avoid Paramiko _check_banner error - run_command( + ssh_command_result = run_command( f"ssh -i {cluster.ssh_key} -o StrictHostKeyChecking=no {bastion} hostname", timeout=30, shell=True, env=custom_env if custom_env else None, ) + logging.info(f"Command output: {ssh_command_result}") connection_kwargs["gateway"] = f"ssh -W %h:%p -A {bastion}" connection_kwargs["forward_agent"] = True connection_kwargs["connect_kwargs"]["banner_timeout"] = 300 From 892203e86347fe39f1324d7d17280bb61965bb97 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Wed, 19 Jun 2024 12:24:13 -0400 Subject: [PATCH 15/20] Add env_prefix. set env to os.environ when env is none. print os.environ. --- .../remote_command_executor.py | 2 -- .../tests/proxy/test_proxy.py | 32 ++++++++++++------- tests/integration-tests/utils.py | 3 ++ 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index c4be09b601..52b73cbd98 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -37,7 +37,6 @@ def __init__( alternate_ssh_key=None, use_login_node=False, connection_timeout=None, - custom_env=None, ): """ Initiate SSH connection @@ -81,7 +80,6 @@ def __init__( f"ssh -i {cluster.ssh_key} -o StrictHostKeyChecking=no {bastion} hostname", timeout=30, shell=True, - env=custom_env if custom_env else None, ) logging.info(f"Command output: {ssh_command_result}") connection_kwargs["gateway"] = f"ssh -W %h:%p -A {bastion}" diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index fb6470b857..faa591fd06 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -98,16 +98,18 @@ def test_proxy(pcluster_config_reader, request, proxy_stack_factory, scheduler_c key, value = line.split(";")[0].split("=") os.environ[key] = value + logging.info("Environment variables are: %s", dict(os.environ)) + # Verify that the environment variables are set correctly logging.info(f"SSH_AUTH_SOCK: {os.environ.get('SSH_AUTH_SOCK')}") logging.info(f"SSH_AGENT_PID: {os.environ.get('SSH_AGENT_PID')}") # Add the SSH key using the ssh-add command, passing the environment variables - ssh_add_result = run_command(f'ssh-add {request.config.getoption("key_path")}', shell=True, env=os.environ.copy()) + ssh_add_result = run_command(f'ssh-add {request.config.getoption("key_path")}', shell=True) logging.info(f"SSH key add result: {ssh_add_result.stderr}") # Confirm that the key has been added - added_keys = run_command("ssh-add -l", shell=True, env=os.environ.copy()) + added_keys = run_command("ssh-add -l", shell=True) logging.info(f"SSH keys added: {added_keys.stdout}") proxy_address = proxy_stack_factory.cfn_outputs["ProxyAddress"] @@ -122,23 +124,29 @@ def test_proxy(pcluster_config_reader, request, proxy_stack_factory, scheduler_c bastion = f"ubuntu@{proxy_public_ip}" + env_vars = { + "SSH_AUTH_SOCK": os.environ.get("SSH_AUTH_SOCK"), + "SSH_AGENT_PID": os.environ.get("SSH_AGENT_PID"), + } + env_prefix = " && ".join([f"export {key}={value}" for key, value in env_vars.items()]) + remote_command_executor = RemoteCommandExecutor( - cluster=cluster, bastion=bastion, connection_timeout=300, custom_env=os.environ.copy() + cluster=cluster, bastion=bastion, connection_timeout=300 ) - slurm_commands = SlurmCommands(remote_command_executor) + # slurm_commands = SlurmCommands(remote_command_executor) - # _check_internet_access(remote_command_executor) + _check_internet_access(remote_command_executor, env_prefix) - job_id = slurm_commands.submit_command_and_assert_job_accepted( - submit_command_args={"command": "srun sleep 1", "nodes": 1} - ) - slurm_commands.wait_job_completed(job_id) - slurm_commands.assert_job_succeeded(job_id) + # job_id = slurm_commands.submit_command_and_assert_job_accepted( + # submit_command_args={"command": "srun sleep 1", "nodes": 1} + # ) + # slurm_commands.wait_job_completed(job_id) + # slurm_commands.assert_job_succeeded(job_id) -def _check_internet_access(remote_command_executor): +def _check_internet_access(remote_command_executor, env_prefix): logging.info("Checking cluster has Internet access by trying to access google.com") internet_result = remote_command_executor.run_remote_command( - "curl --connect-timeout 10 -I https://google.com", raise_on_error=False + f"{env_prefix} && curl --connect-timeout 10 -I https://google.com", raise_on_error=False ) assert_that(internet_result.failed).is_false() diff --git a/tests/integration-tests/utils.py b/tests/integration-tests/utils.py index 9a92998c85..46159ec8fb 100644 --- a/tests/integration-tests/utils.py +++ b/tests/integration-tests/utils.py @@ -187,6 +187,9 @@ def run_command( command = shlex.split(command) log_command = command if isinstance(command, str) else " ".join(str(arg) for arg in command) logging.info("Executing command: {}".format(log_command)) + + env = env if env is not None else os.environ.copy() + try: result = subprocess.run( command, From 2512cf641ad4cb12f53fa4813a2e73db86c71723 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Wed, 19 Jun 2024 18:21:04 -0400 Subject: [PATCH 16/20] Reformat --- tests/integration-tests/tests/proxy/test_proxy.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index faa591fd06..06816e84a4 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -130,9 +130,7 @@ def test_proxy(pcluster_config_reader, request, proxy_stack_factory, scheduler_c } env_prefix = " && ".join([f"export {key}={value}" for key, value in env_vars.items()]) - remote_command_executor = RemoteCommandExecutor( - cluster=cluster, bastion=bastion, connection_timeout=300 - ) + remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=300) # slurm_commands = SlurmCommands(remote_command_executor) _check_internet_access(remote_command_executor, env_prefix) From 6e6828acaf306f3e15c392c997398bac14144c52 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 20 Jun 2024 11:33:38 -0400 Subject: [PATCH 17/20] Test protocol banner --- tests/integration-tests/tests/proxy/test_proxy.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index 06816e84a4..6ab58bf379 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -130,6 +130,10 @@ def test_proxy(pcluster_config_reader, request, proxy_stack_factory, scheduler_c } env_prefix = " && ".join([f"export {key}={value}" for key, value in env_vars.items()]) + headnode_instance_ip = cluster.head_node_ip + ssh_gateway_result = run_command(f"ssh -W {headnode_instance_ip}:22 -A {bastion}", shell=True, raise_on_error=False) + logging.info(f"SSH command output: {ssh_gateway_result}") + remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=300) # slurm_commands = SlurmCommands(remote_command_executor) From de5d2e51e03cd356625dc0bf537c75f56ba7b58c Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 20 Jun 2024 12:08:38 -0400 Subject: [PATCH 18/20] Add -vvv to debug, increase banner_timeout to 30mins to test --- tests/integration-tests/remote_command_executor.py | 2 +- tests/integration-tests/tests/proxy/test_proxy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/remote_command_executor.py b/tests/integration-tests/remote_command_executor.py index 52b73cbd98..e58e8d5529 100644 --- a/tests/integration-tests/remote_command_executor.py +++ b/tests/integration-tests/remote_command_executor.py @@ -84,7 +84,7 @@ def __init__( logging.info(f"Command output: {ssh_command_result}") connection_kwargs["gateway"] = f"ssh -W %h:%p -A {bastion}" connection_kwargs["forward_agent"] = True - connection_kwargs["connect_kwargs"]["banner_timeout"] = 300 + connection_kwargs["connect_kwargs"]["banner_timeout"] = 1800 if connection_timeout: connection_kwargs["connect_kwargs"]["timeout"] = connection_timeout logging.info(f"set timeout to {connection_timeout}") diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index 6ab58bf379..d64c2fe3e3 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -131,7 +131,7 @@ def test_proxy(pcluster_config_reader, request, proxy_stack_factory, scheduler_c env_prefix = " && ".join([f"export {key}={value}" for key, value in env_vars.items()]) headnode_instance_ip = cluster.head_node_ip - ssh_gateway_result = run_command(f"ssh -W {headnode_instance_ip}:22 -A {bastion}", shell=True, raise_on_error=False) + ssh_gateway_result = run_command(f"ssh -W {headnode_instance_ip}:22 -A {bastion} -vvv", shell=True, raise_on_error=False) logging.info(f"SSH command output: {ssh_gateway_result}") remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=300) From 2c4f7a848387f7a3916211109cb753d83489cb1c Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 20 Jun 2024 13:05:20 -0400 Subject: [PATCH 19/20] Run simple ssh command to avoid Host key verification failed --- tests/integration-tests/tests/proxy/test_proxy.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/integration-tests/tests/proxy/test_proxy.py b/tests/integration-tests/tests/proxy/test_proxy.py index d64c2fe3e3..45b998ba1b 100644 --- a/tests/integration-tests/tests/proxy/test_proxy.py +++ b/tests/integration-tests/tests/proxy/test_proxy.py @@ -131,6 +131,14 @@ def test_proxy(pcluster_config_reader, request, proxy_stack_factory, scheduler_c env_prefix = " && ".join([f"export {key}={value}" for key, value in env_vars.items()]) headnode_instance_ip = cluster.head_node_ip + + ssh_command_result = run_command( + f"ssh -i {cluster.ssh_key} -o StrictHostKeyChecking=no {bastion} hostname", + timeout=30, + shell=True, + ) + logging.info(f"Command output: {ssh_command_result}") + ssh_gateway_result = run_command(f"ssh -W {headnode_instance_ip}:22 -A {bastion} -vvv", shell=True, raise_on_error=False) logging.info(f"SSH command output: {ssh_gateway_result}") From 4ff0e32c654b63577fef7b21bbef020d96535dac Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Thu, 20 Jun 2024 14:59:12 -0400 Subject: [PATCH 20/20] Add AllowedIps: 0.0.0.0/0 in configuration --- .../tests/proxy/test_proxy/test_proxy/pcluster.config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration-tests/tests/proxy/test_proxy/test_proxy/pcluster.config.yaml b/tests/integration-tests/tests/proxy/test_proxy/test_proxy/pcluster.config.yaml index 4ab0a63a59..0e6e722ab2 100644 --- a/tests/integration-tests/tests/proxy/test_proxy/test_proxy/pcluster.config.yaml +++ b/tests/integration-tests/tests/proxy/test_proxy/test_proxy/pcluster.config.yaml @@ -4,6 +4,7 @@ HeadNode: InstanceType: {{ instance }} Ssh: KeyName: {{ key_name }} + AllowedIps: 0.0.0.0/0 Networking: SubnetId: {{ subnet_with_proxy }} Proxy: