Skip to content

Commit

Permalink
Both courses and programs should handle alternate unique fields; set …
Browse files Browse the repository at this point in the history
…resources/runs with past dates as unpublished instead of omitting them entirely
  • Loading branch information
mbertrand committed Jul 3, 2024
1 parent a6cdca5 commit 7e1994f
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 79 deletions.
4 changes: 4 additions & 0 deletions learning_resources/etl/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,10 @@ def load_resource_by_unique_field(
unique_field,
unique_value: any,
) -> tuple[LearningResource, bool]:
"""
Upsert a learning resource using conditional parameters depending
on whether the unique field differs from the usual readable_id
"""
readable_id = resource_data.get(READABLE_ID_FIELD)
resource_type = resource_data.get("resource_type")
if unique_field and unique_field != READABLE_ID_FIELD:
Expand Down
150 changes: 75 additions & 75 deletions learning_resources/etl/mitpe.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Professional Education ETL"""

import copy
import json
import logging
import re
Expand Down Expand Up @@ -186,8 +187,13 @@ def parse_format(format_str: str) -> list[str]:
Returns:
list of str: list of resource formats
"""
format_str = format_str.strip().lower()
if not format_str:
log.warning("No format provided, defaulting to online")
return [LearningResourceFormat.online.name]

formats = []
format_str = format_str.strip().lower()

is_online = "virtual" in format_str or "online" in format_str
in_person = "campus" in format_str
is_hybrid = is_online and in_person and " and " in format_str
Expand Down Expand Up @@ -238,19 +244,19 @@ def _transform_runs(resource_data: dict) -> list[dict]:
dates_hash = md5(json.dumps(resource_dates).encode("utf-8")).hexdigest() # noqa: S324
price = resource_data["attributes"]["field_course_fee"]
now = now_in_utc()
# Skip runs w/past enrollment end date or resource end date
if (enrollment_end_date and enrollment_end_date < now) or (
end_date and end_date < now
):
continue
# Unpublish runs w/past enrollment end date or resource end date
published = not (
(enrollment_end_date and enrollment_end_date < now)
or (end_date and end_date < now)
)
runs.append(
{
"run_id": f'{resource_data["id"]}_{dates_hash}',
"title": resource_data["attributes"]["title"],
"start_date": start_date,
"end_date": end_date,
"enrollment_end": enrollment_end_date,
"published": True,
"published": published,
"prices": [price] if price else [],
"url": parse_resource_url(resource_data),
"instructors": parse_instructors(resource_data),
Expand All @@ -271,38 +277,35 @@ def transform_course(resource_data: dict) -> dict or None:
dict: transformed course data if it has any viable runs
"""
runs = _transform_runs(resource_data)
if len(runs) > 0:
return {
"readable_id": resource_data["id"],
"offered_by": OFFERED_BY,
"platform": PlatformType.mitpe.name,
"etl_source": ETLSource.mitpe.name,
"professional": True,
"certification": True,
"certification_type": CertificationType.professional.name,
"title": resource_data["attributes"]["title"],
"url": parse_resource_url(resource_data),
"image": parse_image(resource_data),
"description": clean_data(
resource_data["attributes"]["field_featured_course_summary"]
),
"full_description": clean_data(
resource_data["attributes"]["body"]["processed"]
),
"course": {
"course_numbers": [],
},
"learning_format": parse_format(
resource_data["attributes"]["field_course_location"]
),
"published": not resource_data["attributes"][
"field_do_not_show_in_catalog"
],
"topics": parse_topics(resource_data),
"runs": runs,
"unique_field": UNIQUE_FIELD,
}
return None
return {
"readable_id": resource_data["id"],
"offered_by": copy.deepcopy(OFFERED_BY),
"platform": PlatformType.mitpe.name,
"etl_source": ETLSource.mitpe.name,
"professional": True,
"certification": True,
"certification_type": CertificationType.professional.name,
"title": resource_data["attributes"]["title"],
"url": parse_resource_url(resource_data),
"image": parse_image(resource_data),
"description": clean_data(
resource_data["attributes"]["field_featured_course_summary"]
),
"full_description": clean_data(
resource_data["attributes"]["body"]["processed"]
),
"course": {
"course_numbers": [],
},
"learning_format": parse_format(
resource_data["attributes"]["field_course_location"]
),
"published": not resource_data["attributes"]["field_do_not_show_in_catalog"]
and len([run for run in runs if run["published"] is True]) > 0,
"topics": parse_topics(resource_data),
"runs": runs,
"unique_field": UNIQUE_FIELD,
}


def transform_program(resource_data: dict) -> dict:
Expand All @@ -316,41 +319,38 @@ def transform_program(resource_data: dict) -> dict:
dict: transformed program data
"""
runs = _transform_runs(resource_data)
if len(runs) > 0:
return {
"readable_id": resource_data["id"],
"offered_by": OFFERED_BY,
"platform": PlatformType.mitpe.name,
"etl_source": ETLSource.mitpe.name,
"professional": True,
"certification": True,
"certification_type": CertificationType.professional.name,
"title": resource_data["attributes"]["title"],
"url": parse_resource_url(resource_data),
"image": parse_image(resource_data),
"description": clean_data(
resource_data["attributes"]["field_featured_course_summary"]
),
"full_description": clean_data(
resource_data["attributes"]["body"]["processed"]
),
"learning_format": parse_format(
resource_data["attributes"]["field_course_location"]
),
"published": not resource_data["attributes"][
"field_do_not_show_in_catalog"
],
"topics": parse_topics(resource_data),
"course_ids": [
course["id"]
for course in resource_data["relationships"]["field_program_courses"][
"data"
]
],
"runs": runs,
"unique_field": UNIQUE_FIELD,
}
return None
return {
"readable_id": resource_data["id"],
"offered_by": copy.deepcopy(OFFERED_BY),
"platform": PlatformType.mitpe.name,
"etl_source": ETLSource.mitpe.name,
"professional": True,
"certification": True,
"certification_type": CertificationType.professional.name,
"title": resource_data["attributes"]["title"],
"url": parse_resource_url(resource_data),
"image": parse_image(resource_data),
"description": clean_data(
resource_data["attributes"]["field_featured_course_summary"]
),
"full_description": clean_data(
resource_data["attributes"]["body"]["processed"]
),
"learning_format": parse_format(
resource_data["attributes"]["field_course_location"]
),
"published": not resource_data["attributes"]["field_do_not_show_in_catalog"]
and len([run for run in runs if run["published"] is True]) > 0,
"topics": parse_topics(resource_data),
"course_ids": [
course["id"]
for course in resource_data["relationships"]["field_program_courses"][
"data"
]
],
"runs": runs,
"unique_field": UNIQUE_FIELD,
}


def transform_program_courses(programs: list[dict], courses_data: list[dict]):
Expand All @@ -365,7 +365,7 @@ def transform_program_courses(programs: list[dict], courses_data: list[dict]):
for program in programs:
course_ids = program.pop("course_ids", [])
program["courses"] = [
course_dict[course_id]
copy.deepcopy(course_dict[course_id])
for course_id in course_ids
if course_id in course_dict
]
Expand Down
14 changes: 10 additions & 4 deletions learning_resources/etl/mitpe_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@
}
],
"courses": [EXPECTED_COURSE],
"unique_field": "url",
}


Expand Down Expand Up @@ -238,7 +239,7 @@ def test_parse_format(format_str, expected):


@pytest.mark.parametrize(
("enrollment_end", "end_date", "course_count"),
("enrollment_end", "end_date", "published_count"),
[
(None, None, 1),
(None, "2020-01-01", 0),
Expand All @@ -249,11 +250,16 @@ def test_parse_format(format_str, expected):
],
)
def test_transform_by_dates(
mock_fetch_data, prof_ed_settings, enrollment_end, end_date, course_count
mock_fetch_data, prof_ed_settings, enrollment_end, end_date, published_count
):
"""Transform should omit resources with past enrollment_end or end_dates"""
"""Transform should unpublish resources with past enrollment_end or end_dates"""
resource_data = mitpe.extract()
course_data = resource_data[1]
course_data["attributes"]["field_registration_deadline"] = enrollment_end
course_data["attributes"]["field_course_dates"][0]["end_value"] = end_date
assert len(mitpe.transform([course_data])[0]) == course_count
courses = mitpe.transform([course_data])[0]
assert len(courses) == 1
assert (
len([course for course in courses if course["published"] is True])
== published_count
)

0 comments on commit 7e1994f

Please sign in to comment.