Skip to content

Commit

Permalink
Update spaceflights tutorial and starter requirements for kedro-datas…
Browse files Browse the repository at this point in the history
…ets optional dependencies (#3664)

* Update spaceflights tutorial and starter requirements

Signed-off-by: lrcouto <[email protected]>

* fix e2e tests

Signed-off-by: lrcouto <[email protected]>

* Fix e2e tests by distinguishing `kedro-datasets` dependency for different python versions (#3802)


Signed-off-by: Merel Theisen <[email protected]>

* Update docs/source/tutorial/tutorial_template.md

Co-authored-by: Merel Theisen <[email protected]>
Signed-off-by: L. R. Couto <[email protected]>

---------

Signed-off-by: lrcouto <[email protected]>
Signed-off-by: L. R. Couto <[email protected]>
Signed-off-by: Merel Theisen <[email protected]>
Co-authored-by: Merel Theisen <[email protected]>
  • Loading branch information
lrcouto and merelcht committed Apr 11, 2024
1 parent f8bdf13 commit 44817b8
Show file tree
Hide file tree
Showing 8 changed files with 47 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ package: clean install

install-test-requirements:
python -m pip install -U "pip>=21.2"
pip install .[test]
pip install -U .[test]

install-pre-commit:
pre-commit install --install-hooks
Expand Down
1 change: 0 additions & 1 deletion docs/source/kedro_project_setup/dependencies.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ For example, your workflow might require the `pandas.ExcelDataset`, so to instal
From `kedro-datasets` version 3.0.0 onwards, the names of the optional dataset-level dependencies have been normalised to follow [PEP 685](https://peps.python.org/pep-0685/). The '.' character has been replaced with a '-' character and the names are in lowercase. For example, if you had `kedro-datasets[pandas.ExcelDataset]` in your requirements file, it would have to be changed to `kedro-datasets[pandas-exceldataset]`.
```


## Reproducible environments
To ensure that the project dependencies and the transitive dependencies are pinned to specific versions, use [`pip-tools`](https://pypi.org/project/pip-tools/) to compile the `requirements.txt` file into a `requirements.lock` file.
To install `pip-tools` in your virtual environment, run the following command:
Expand Down
2 changes: 1 addition & 1 deletion docs/source/tutorial/tutorial_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ pytest~=7.2
# Kedro dependencies and datasets to work with different data formats (including CSV, Excel, and Parquet)
kedro~=0.19.0
kedro-datasets[pandas.CSVDataset, pandas.ExcelDataset, pandas.ParquetDataset]>=1.1
kedro-datasets[pandas-csvdataset, pandas-exceldataset, pandas-parquetdataset]>=3.0
kedro-telemetry>=0.3.1
kedro-viz~=6.0 # Visualise pipelines
Expand Down
9 changes: 8 additions & 1 deletion features/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import shutil
import subprocess
import sys
import tempfile
import venv
from pathlib import Path
Expand All @@ -14,6 +15,7 @@
_PATHS_TO_REMOVE: set[Path] = set()

FRESH_VENV_TAG = "fresh_venv"
MINOR_PYTHON_38_VERSION = 8


def call(cmd, env):
Expand Down Expand Up @@ -130,6 +132,11 @@ def _install_project_requirements(context):
.splitlines()
)
install_reqs = [req for req in install_reqs if "{" not in req and "#" not in req]
install_reqs.append("kedro-datasets[pandas.CSVDataset]")
# For Python versions 3.9 and above we use the new dataset dependency format introduced in `kedro-datasets` 3.0.0
if sys.version_info.minor > MINOR_PYTHON_38_VERSION:
install_reqs.append("kedro-datasets[pandas-csvdataset]")
# For Python 3.8 we use the older `kedro-datasets` dependency format
else:
install_reqs.append("kedro-datasets[pandas.CSVDataset]")
call([context.pip, "install", *install_reqs], env=context.env)
return context
6 changes: 4 additions & 2 deletions features/steps/cli_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,8 @@ def check_one_node_run(context, number):
def check_correct_nodes_run(context, node):
expected_log_line = f"Running node: {node}"
stdout = context.result.stdout
assert expected_log_line in stdout, (
clean_logs = util.clean_up_log(stdout)
assert expected_log_line in clean_logs, (
"Expected the following message segment to be printed on stdout: "
f"{expected_log_line},\nbut got {stdout}"
)
Expand Down Expand Up @@ -595,7 +596,8 @@ def check_message_printed(context, msg):
else:
stdout = context.result.stdout

assert msg in stdout, (
clean_logs = util.clean_up_log(stdout)
assert msg in clean_logs, (
"Expected the following message segment to be printed on stdout: "
f"{msg},\nbut got {stdout}"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ ipython>=8.10
jupyterlab>=3.0
notebook
kedro~={{ cookiecutter.kedro_version}}
kedro-datasets[pandas.CSVDataset]
kedro-datasets[pandas-csvdataset]; python_version >= "3.9"
kedro-datasets[pandas.CSVDataset]<2.0.0; python_version < '3.9'
kedro-telemetry>=0.3.1
pytest-cov~=3.0
pytest-mock>=1.7.1, <2.0
Expand Down
29 changes: 29 additions & 0 deletions features/steps/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,32 @@ def parse_csv(text: str) -> list[str]:
List of string tokens
"""
return re.findall(r"\"(.+?)\"\s*,?", text)


def clean_up_log(stdout: str) -> str:
    """
    Normalises log output so that messages can be matched as plain text.

    For every line that mentions a log level, the standalone level token
    (INFO, WARNING or ERROR) and any whitespace-preceded ``<name>.py:<line>``
    marker are removed. All lines are then joined and every run of
    whitespace (including newlines) is collapsed into a single space.

    Duplicate lines are kept: the output preserves every input line in
    its original order.

    Args:
        stdout (str): The log output to be cleaned.
    Returns:
        str: Cleaned log output as a single whitespace-normalised string.
    """
    log_levels = ("WARNING", "INFO", "ERROR")
    # Matches a standalone log level, or a "<name>.py:<line>" source marker
    # together with the whitespace in front of it.
    noise_pattern = r"\b(INFO|WARNING|ERROR)\b|\s+\w+\.py:\d+"

    cleaned_lines = []
    for line in stdout.split("\n"):
        # Only lines that mention a log level are scrubbed; every other line
        # is passed through untouched so that regular output mentioning a
        # ".py:<n>" location is not altered.
        if any(level in line for level in log_levels):
            line = re.sub(noise_pattern, "", line).strip()
        cleaned_lines.append(line)

    # Collapse newlines and repeated spaces so that messages wrapped across
    # several lines can still be matched as one contiguous string.
    return re.sub(r"\s+", " ", "\n".join(cleaned_lines)).strip()
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ test = [
"jupyterlab_server>=2.11.1",
"jupyterlab>=3,<5",
"jupyter~=1.0",
"kedro-datasets",
"kedro-datasets; python_version >= '3.9'",
"kedro-datasets<2.0.0; python_version < '3.9'",
"mypy~=1.0",
"pandas~=2.0",
"pluggy>=1.0, <1.4", # pluggy 1.4 hides imports inside functions, which causes mocking issues
Expand Down

0 comments on commit 44817b8

Please sign in to comment.