Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Draft][Develop] Fix test_proxy integ test #6302

Open
wants to merge 21 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b6d1e53
Add connection timeout parameter to RemoteCommandExecutor. Set the a …
hehe7318 Jun 17, 2024
d427e39
eval ssh-agent and add ssh key to avoid Error reading SSH protocol ba…
hehe7318 Jun 17, 2024
39d6130
Add finalizer to ensure proxy stack is deleted after cluster stacks, …
hehe7318 Jun 18, 2024
7c2c0fe
Fix proxy stack teardown failed issue.
hehe7318 Jun 18, 2024
66c7c73
Remove ssh-agent and ssh-add codes for now, increase banner timeout t…
hehe7318 Jun 18, 2024
549dc04
Start the SSH agent and add SSH key, increase timeout and banner_timeout
hehe7318 Jun 18, 2024
87eb321
Remove the "start the SSH agent and add SSH key" logic for now to test
hehe7318 Jun 18, 2024
9c0407a
Add a new parameter connection_allow_agent to RemoteCommandExecutor. …
hehe7318 Jun 18, 2024
884dbaf
Comment out internet test, increase banner timeout to 600s. Add conne…
hehe7318 Jun 18, 2024
d79b901
Delete connection timeout from regular arg
hehe7318 Jun 18, 2024
150facf
Add ssh_agent and ssh_add
hehe7318 Jun 18, 2024
dba6114
Add ssh-agent and ssh-add, and set env variable for ssh-add
hehe7318 Jun 19, 2024
cfef656
Remove connection_allow_agent from RemoteCommandExecutor, add custom_…
hehe7318 Jun 19, 2024
95195c9
Add inline_ssh_env to test
hehe7318 Jun 19, 2024
892203e
Add env_prefix. set env to os.environ when env is none. print os.envi…
hehe7318 Jun 19, 2024
2512cf6
Reformat
hehe7318 Jun 19, 2024
6e6828a
Test protocol banner
hehe7318 Jun 20, 2024
ca14436
Merge branch 'develop' into wip/fix-inte-test-proxy
hehe7318 Jun 20, 2024
de5d2e5
Add -vvv to debug, increase banner_timeout to 30mins to test
hehe7318 Jun 20, 2024
2c4f7a8
Run a simple ssh command to avoid the "Host key verification failed" error
hehe7318 Jun 20, 2024
4ff0e32
Add AllowedIps: 0.0.0.0/0 in configuration
hehe7318 Jun 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions tests/integration-tests/configs/tmp_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{%- import 'common.jinja2' as common with context -%}
{{- common.OSS_COMMERCIAL_ARM.append("centos7") or "" -}}
{{- common.OSS_COMMERCIAL_X86.append("rocky8") or "" -}}
{{- common.OSS_COMMERCIAL_X86.append("rocky9") or "" -}}
---
test-suites:
proxy:
test_proxy.py::test_proxy:
dimensions:
- regions: ["us-east-1"]
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["ubuntu2004"]
schedulers: ["slurm"]
22 changes: 18 additions & 4 deletions tests/integration-tests/remote_command_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,14 @@ class RemoteCommandExecutor:
"""Execute remote commands on the cluster head node."""

def __init__(
self, cluster, compute_node_ip=None, username=None, bastion=None, alternate_ssh_key=None, use_login_node=False
self,
cluster,
compute_node_ip=None,
username=None,
bastion=None,
alternate_ssh_key=None,
use_login_node=False,
connection_timeout=None,
):
"""
Initiate SSH connection
Expand Down Expand Up @@ -61,19 +68,26 @@ def __init__(
"host": node_ip,
"user": username,
"forward_agent": False,
"inline_ssh_env": True,
"connect_kwargs": {
"key_filename": [alternate_ssh_key if alternate_ssh_key else cluster.ssh_key],
"look_for_keys": False,
},
}
if bastion:
# Need to execute simple ssh command before using Connection to avoid Paramiko _check_banner error
run_command(
f"ssh -i {cluster.ssh_key} -o StrictHostKeyChecking=no {bastion} hostname", timeout=30, shell=True
ssh_command_result = run_command(
f"ssh -i {cluster.ssh_key} -o StrictHostKeyChecking=no {bastion} hostname",
timeout=30,
shell=True,
)
logging.info(f"Command output: {ssh_command_result}")
connection_kwargs["gateway"] = f"ssh -W %h:%p -A {bastion}"
connection_kwargs["forward_agent"] = True
connection_kwargs["connect_kwargs"]["banner_timeout"] = 60
connection_kwargs["connect_kwargs"]["banner_timeout"] = 1800
if connection_timeout:
connection_kwargs["connect_kwargs"]["timeout"] = connection_timeout
logging.info(f"set timeout to {connection_timeout}")
logging.info(
f"Connecting to {connection_kwargs['host']} as {connection_kwargs['user']} with "
f"{connection_kwargs['connect_kwargs']['key_filename']}"
Expand Down
68 changes: 57 additions & 11 deletions tests/integration-tests/tests/proxy/test_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging
import os

import boto3
import pytest
from assertpy import assert_that
from cfn_stacks_factory import CfnStack
from remote_command_executor import RemoteCommandExecutor
from utils import generate_stack_name
from utils import generate_stack_name, run_command

from tests.common.schedulers_common import SlurmCommands

Expand Down Expand Up @@ -74,7 +75,7 @@ def get_instance_public_ip(instance_id, region):


@pytest.mark.usefixtures("region", "os", "instance", "scheduler")
def test_proxy(pcluster_config_reader, clusters_factory, proxy_stack_factory, scheduler_commands_factory):
def test_proxy(pcluster_config_reader, request, proxy_stack_factory, scheduler_commands_factory, clusters_factory):
"""
Test the creation and functionality of a Cluster using a proxy environment.

Expand All @@ -84,6 +85,33 @@ def test_proxy(pcluster_config_reader, clusters_factory, proxy_stack_factory, sc
3. Submit a sleep job to the cluster and verify it completes successfully.
4. Check Internet access by trying to access google.com
"""
# Start ssh-agent and capture the output
ssh_agent_result = run_command("ssh-agent -s", shell=True)
logging.info(f"SSH agent started with output: {ssh_agent_result.stdout}")

# Parse the ssh-agent output to set environment variables
for line in ssh_agent_result.stdout.splitlines():
if line.startswith("SSH_AUTH_SOCK"):
key, value = line.split(";")[0].split("=")
os.environ[key] = value
elif line.startswith("SSH_AGENT_PID"):
key, value = line.split(";")[0].split("=")
os.environ[key] = value

logging.info("Environment variables are: %s", dict(os.environ))

# Verify that the environment variables are set correctly
logging.info(f"SSH_AUTH_SOCK: {os.environ.get('SSH_AUTH_SOCK')}")
logging.info(f"SSH_AGENT_PID: {os.environ.get('SSH_AGENT_PID')}")

# Add the SSH key using the ssh-add command, passing the environment variables
ssh_add_result = run_command(f'ssh-add {request.config.getoption("key_path")}', shell=True)
logging.info(f"SSH key add result: {ssh_add_result.stderr}")

# Confirm that the key has been added
added_keys = run_command("ssh-add -l", shell=True)
logging.info(f"SSH keys added: {added_keys.stdout}")

proxy_address = proxy_stack_factory.cfn_outputs["ProxyAddress"]
subnet_with_proxy = proxy_stack_factory.cfn_outputs["PrivateSubnet"]
proxy_instance_id = proxy_stack_factory.cfn_resources.get("Proxy")
Expand All @@ -96,21 +124,39 @@ def test_proxy(pcluster_config_reader, clusters_factory, proxy_stack_factory, sc

bastion = f"ubuntu@{proxy_public_ip}"

remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion)
slurm_commands = SlurmCommands(remote_command_executor)
env_vars = {
"SSH_AUTH_SOCK": os.environ.get("SSH_AUTH_SOCK"),
"SSH_AGENT_PID": os.environ.get("SSH_AGENT_PID"),
}
env_prefix = " && ".join([f"export {key}={value}" for key, value in env_vars.items()])

_check_internet_access(remote_command_executor)
headnode_instance_ip = cluster.head_node_ip

job_id = slurm_commands.submit_command_and_assert_job_accepted(
submit_command_args={"command": "srun sleep 1", "nodes": 1}
ssh_command_result = run_command(
f"ssh -i {cluster.ssh_key} -o StrictHostKeyChecking=no {bastion} hostname",
timeout=30,
shell=True,
)
slurm_commands.wait_job_completed(job_id)
slurm_commands.assert_job_succeeded(job_id)
logging.info(f"Command output: {ssh_command_result}")

ssh_gateway_result = run_command(f"ssh -W {headnode_instance_ip}:22 -A {bastion} -vvv", shell=True, raise_on_error=False)
logging.info(f"SSH command output: {ssh_gateway_result}")

remote_command_executor = RemoteCommandExecutor(cluster=cluster, bastion=bastion, connection_timeout=300)
# slurm_commands = SlurmCommands(remote_command_executor)

_check_internet_access(remote_command_executor, env_prefix)

# job_id = slurm_commands.submit_command_and_assert_job_accepted(
# submit_command_args={"command": "srun sleep 1", "nodes": 1}
# )
# slurm_commands.wait_job_completed(job_id)
# slurm_commands.assert_job_succeeded(job_id)


def _check_internet_access(remote_command_executor):
def _check_internet_access(remote_command_executor, env_prefix):
logging.info("Checking cluster has Internet access by trying to access google.com")
internet_result = remote_command_executor.run_remote_command(
"curl --connect-timeout 10 -I https://google.com", raise_on_error=False
f"{env_prefix} && curl --connect-timeout 10 -I https://google.com", raise_on_error=False
)
assert_that(internet_result.failed).is_false()
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ HeadNode:
InstanceType: {{ instance }}
Ssh:
KeyName: {{ key_name }}
AllowedIps: 0.0.0.0/0
Networking:
SubnetId: {{ subnet_with_proxy }}
Proxy:
Expand Down
3 changes: 3 additions & 0 deletions tests/integration-tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,9 @@ def run_command(
command = shlex.split(command)
log_command = command if isinstance(command, str) else " ".join(str(arg) for arg in command)
logging.info("Executing command: {}".format(log_command))

env = env if env is not None else os.environ.copy()

try:
result = subprocess.run(
command,
Expand Down
Loading