Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add industry JRC data processing #354

Merged
merged 12 commits into from
May 2, 2024
Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## 1.2.0 (unpublished)

### Added (models)

* **ADD** fully-electrified heat demand (#284).

* **ADD** fully-electrified road transportation (#270), (#271).
Expand All @@ -11,6 +12,7 @@

### Added (workflow)

* **ADD** Module to process JRC-IDEES Excel spreadsheets (#354).
* **ADD** Ruff as our default linter and formatter (#285).
* **ADD** DAG rule that generates a visualisation of Snakemake's directed acyclic graph (#208).
* **ADD** IPython debugger to all conda environments to ease debugging (#254).
Expand Down
2 changes: 2 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ techs_template_dir = f"{model_template_dir}techs/"

include: "./rules/shapes.smk"
include: "./rules/data.smk"
include: "./rules/jrc-idees.smk"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not particularly important but I moved the previous eurostat.smk and jrc-idees.smk into data.smk, as the container for all downloading and pre-processing of all data that is not sector specific. The idea being that we don't generate too many rule files, especially not rule files that aren't feature-based. You didn't like that idea?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer one per source as they do become large enough rule files to be worth splitting off. It also is in line with the concept of modularising different major data sources.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One could imagine a future where we split off JRC processing completely and just store the pre-built files on zenodo for convenience.

include: "./rules/wind-and-solar.smk"
include: "./rules/biofuels.smk"
include: "./rules/hydro.smk"
Expand All @@ -25,6 +26,7 @@ include: "./rules/nuclear.smk"
include: "./rules/transport.smk"
include: "./rules/sync.smk"
include: "./rules/heat.smk"

min_version("7.8")
localrules: all, clean
wildcard_constraints:
Expand Down
113 changes: 106 additions & 7 deletions lib/eurocalliopelib/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
"""Utility functions."""

import logging
from typing import Literal, Optional

import pandas as pd
import pycountry
import xarray as xr

LOGGER = logging.getLogger(__name__)


def eu_country_code_to_iso3(eu_country_code):
Expand All @@ -19,7 +26,7 @@ def convert_country_code(input_country, output="alpha3"):
Converts input country code or name into either a 2- or 3-letter code.

ISO alpha2: alpha2
ISO alpha2 with Eurostat codes: alpha2_eu
brynpickering marked this conversation as resolved.
Show resolved Hide resolved
ISO alpha2 with EU codes: alpha2_eu
ISO alpha3: alpha3

"""
Expand Down Expand Up @@ -49,12 +56,109 @@ def convert_country_code(input_country, output="alpha3"):
return pycountry.countries.lookup(input_country).alpha_3


# conversion utils
def convert_valid_countries(
    country_codes: list,
    output: str = "alpha3",
    errors: Literal["raise", "ignore"] = "raise",
) -> dict:
    """
    Convert a list of country codes / names to a mapping of uniform ISO coded country codes.

    If an input item isn't a valid country (e.g. "EU27") then either raise an error,
    or log the offending code and continue, depending on `errors`.

    Args:
        country_codes (list):
            Strings defining country codes / names (["France", "FRA", "FR"] will all be treated the same).
        output (str, optional):
            pycountry output type, e.g. `alpha3` for 3-letter ISO standard.
            Defaults to "alpha3".
        errors (Literal["raise", "ignore"], optional):
            If a country code is not valid, `raise` an error and stop, or `ignore` the
            error and continue with only logging the skipped code.
            Defaults to "raise".
    Returns:
        dict: Mapping from input country code/name to output country code for all valid input countries.
    """

    mapped_codes = {}
    for country_code in country_codes:
        try:
            mapped_codes[country_code] = convert_country_code(
                country_code, output=output
            )
        except LookupError as err:
            if errors == "raise":
                raise err
            elif errors == "ignore":
                LOGGER.info(f"Skipping country/region {country_code}")
                continue
    return mapped_codes


def rename_and_groupby(
    da: xr.DataArray,
    rename_dict: dict,
    dim_name: str,
    new_dim_name: Optional[str] = None,
    dropna: bool = False,
    drop_other_dim_items: bool = True,
) -> xr.DataArray:
    """
    Rename the contents of a given dimension of an xarray DataArray, and (optionally) rename that dimension itself.

    If renaming the contents has some overlap (e.g. {'FRA': 'DEU', 'CHE': 'DEU'}),
    then the returned dataarray will be grouped over the new dimension items and summed.

    Args:
        da (xr.DataArray):
            Input dataarray with the dimension `dim_name`.
        rename_dict (dict):
            Dictionary to map items in the dimension `dim_name` to new names ({"old_item_name": "new_item_name"}).
        dim_name (str):
            Dimension on which to rename items.
        new_dim_name (Optional[str], optional): Defaults to None.
            If not None, rename the dimension "dim_name" to the given string.
        dropna (bool, optional): Defaults to False.
            If True, drop any items in "dim_name" after renaming/grouping which have all NaN values along all other dimensions.
        drop_other_dim_items (bool, optional): Defaults to True.
            If True, any dimension items _not_ referenced in `rename_dict` keys will be removed from that dimension in the returned array.
    Returns:
        (xr.DataArray): Same as "da" but with the items in "dim_name" renamed and possibly
        (a) grouped and summed, and (b) "dim_name" itself renamed.
    """
    rename_series = pd.Series(rename_dict).rename_axis(index=dim_name)
    if drop_other_dim_items is False:
        # Keep unreferenced dimension items by mapping them to themselves.
        existing_dim_items = da[dim_name].to_series()
        rename_series = rename_series.reindex(existing_dim_items).fillna(
            existing_dim_items
        )

    if new_dim_name is None:
        new_dim_name = f"_{dim_name}"  # placeholder that we'll revert
        revert_dim_name = True
    else:
        revert_dim_name = False

    rename_da = xr.DataArray(rename_series.rename(new_dim_name))
    # `min_count=1` keeps groups that are entirely NaN as NaN rather than 0.
    da = (
        da.reindex({dim_name: rename_da[dim_name]})
        .groupby(rename_da)
        .sum(dim_name, skipna=True, min_count=1, keep_attrs=True)
    )
    if revert_dim_name:
        da = da.rename({new_dim_name: dim_name})
        new_dim_name = dim_name
    if dropna:
        da = da.dropna(new_dim_name, how="all")
    return da


def ktoe_to_twh(array):
    """Convert from kilotonnes of oil equivalent (ktoe) to TWh."""
    twh_per_ktoe = 1.163e-2
    return array * twh_per_ktoe


def gwh_to_tj(array):
    """Convert from GWh to TJ (1 GWh = 3.6 TJ)."""
    tj_per_gwh = 3.6
    return array * tj_per_gwh


def pj_to_twh(array):
    """Convert from PJ to TWh (3.6 PJ = 1 TWh)."""
    pj_per_twh = 3.6
    return array / pj_per_twh
Expand All @@ -63,8 +167,3 @@ def pj_to_twh(array):
def tj_to_twh(array):
    """Convert from TJ to TWh, reusing the PJ conversion (1 PJ = 1000 TJ)."""
    as_twh_per_pj = pj_to_twh(array)
    return as_twh_per_pj / 1000


def gwh_to_tj(array):
"""Convert GWh to TJ"""
return array * 3.6
47 changes: 0 additions & 47 deletions rules/data.smk
Original file line number Diff line number Diff line change
Expand Up @@ -43,50 +43,3 @@ rule annual_energy_balances:
first_year = 2000
conda: "../envs/default.yaml"
script: "../scripts/data/annual_energy_balance.py"


"Rules regarding JRC-IDEES Data:"


# Download the zipped JRC-IDEES data for a single country.
# NOTE(review): the URL comes straight from the workflow config; presumably it
# contains the `country_code` wildcard — confirm against `config["data-sources"]`.
# The output is `protected` so the (slow) download is write-protected once fetched.
rule download_jrc_idees_zipped:
    message: "Download JRC IDEES zip file for {wildcards.country_code}"
    params: url = config["data-sources"]["jrc-idees"]
    output: protected("data/automatic/jrc-idees/{country_code}.zip")
    conda: "../envs/shell.yaml"
    localrule: True
    shell: "curl -sSLo {output} '{params.url}'"


def jrc_to_euro_calliope_sector(sector: str):
    """Return the JRC-IDEES spreadsheet sector name for a Euro-Calliope sector."""
    sector_mapping = {"transport": "Transport", "heat": "Tertiary"}
    if sector not in sector_mapping:
        raise ValueError(f"Unknown sector {sector}.")
    return sector_mapping[sector]


# Unzip one sector's Excel workbook for one country from the downloaded archive.
# `shadow: "minimal"` runs the rule in a shadow directory so the intermediate
# unzip/mv steps don't pollute the working directory.
rule jrc_idees_unzipped:
    message: "Unzip all JRC-IDEES {wildcards.sector} sector country data"
    input:
        "data/automatic/jrc-idees/{country_code}.zip"
    params:
        # Workbook names inside the zip use the JRC's own (title-case) sector names.
        file_name = lambda wildcards: f"JRC-IDEES-2015_{jrc_to_euro_calliope_sector(wildcards.sector)}_{wildcards.country_code}.xlsx"
    wildcard_constraints:
        sector = "transport|heat"
    output: temp("build/data/jrc-idees/{sector}/unprocessed/{country_code}.xlsx")
    conda: "../envs/shell.yaml"
    shadow: "minimal"
    localrule: True
    shell: """
        unzip -j {input} -d build/data/jrc-idees/{wildcards.sector}/unprocessed/
        mv build/data/jrc-idees/{wildcards.sector}/unprocessed/{params.file_name} {output}
        """


"EU28 county codes used for downloading JRC-IDEES"
JRC_IDEES_SCOPE = [
"AT", "BE", "BG", "CY", "CZ", "DE", "DK", "EE", "EL", "ES", "FI", "FR",
"HR", "HU", "IE", "IT", "LT", "LU", "LV", "MT", "NL", "PL", "PT", "RO",
"SE", "SI", "SK", "UK"
]
14 changes: 1 addition & 13 deletions rules/heat.smk
Original file line number Diff line number Diff line change
@@ -1,22 +1,10 @@
# Aggregate the unzipped per-country JRC-IDEES heat workbooks into a single
# processed CSV of commercial-sector heat data.
rule jrc_idees_heat_processed:
    message: "Process tertiary heat data from JRC-IDEES"
    input:
        data = expand(
            "build/data/jrc-idees/heat/unprocessed/{country_code}.xlsx",
            country_code=JRC_IDEES_SCOPE
        )
    output: "build/data/jrc-idees/heat/commercial/processed.csv"
    conda: "../envs/default.yaml"
    script: "../scripts/heat/jrc_idees.py"


rule annual_heat_demand:
message: "Calculate national heat demand for household and commercial sectors"
input:
hh_end_use = "data/automatic/eurostat-hh-end-use.tsv.gz",
ch_end_use = "data/automatic/ch-end-use.xlsx",
energy_balance = rules.annual_energy_balances.output[0],
commercial_demand = "build/data/jrc-idees/heat/commercial/processed.csv",
commercial_demand = "build/data/jrc-idees/tertiary/processed.csv",
carrier_names = "config/energy-balances/energy-balance-carrier-names.csv"
params:
heat_tech_params = config["parameters"]["heat"],
Expand Down
72 changes: 72 additions & 0 deletions rules/jrc-idees.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"Rules regarding JRC-IDEES Data"

# The 28 countries covered by the JRC-IDEES-2015 release, using Eurostat-style
# two-letter codes ("EL" for Greece, "UK" for the United Kingdom).
JRC_IDEES_SPATIAL_SCOPE = [
    "AT", "BE", "BG", "CY", "CZ", "DE", "DK", "EE", "EL", "ES", "FI", "FR",
    "HR", "HU", "IE", "IT", "LT", "LU", "LV", "MT", "NL", "PL", "PT", "RO",
    "SE", "SI", "SK", "UK"
]


# Download the zipped JRC-IDEES data for a single country.
# NOTE(review): the URL comes straight from the workflow config; presumably it
# contains the `country_code` wildcard — confirm against `config["data-sources"]`.
# The output is `protected` so the (slow) download is write-protected once fetched.
rule download_jrc_idees_zipped:
    message: "Download JRC IDEES zip file for {wildcards.country_code}"
    params: url = config["data-sources"]["jrc-idees"]
    output: protected("data/automatic/jrc-idees/{country_code}.zip")
    conda: "../envs/shell.yaml"
    localrule: True
    shell: "curl -sSLo {output} '{params.url}'"


# Extract a single sector workbook from a country's JRC-IDEES zip archive,
# streaming it straight to the output file with `unzip -p` (no temp files).
rule jrc_idees_unzipped:
    message: "Unzip JRC-IDEES {wildcards.sector} sector data for {wildcards.country_code}"
    input:
        country_data = "data/automatic/jrc-idees/{country_code}.zip",
    params:
        # Workbook names inside the zip use title-case sector names (e.g. "Industry").
        sector_title_case = lambda wildcards: wildcards.sector.title()
    wildcard_constraints:
        sector = "industry|transport|tertiary"
    output: temp("build/data/jrc-idees/{sector}/unprocessed/{country_code}.xlsx")
    conda: "../envs/shell.yaml"
    shell: "unzip -p {input.country_data} JRC-IDEES-2015_{params.sector_title_case}_{wildcards.country_code}.xlsx > {output}"


# Combine the per-country industry workbooks into a single netCDF file for the
# requested dataset (`energy` or `production`).
rule jrc_idees_industry_processed:
    message: "Process {wildcards.dataset} industry data from JRC-IDEES to be used in understanding current and future industry demand"
    input:
        data = expand(
            "build/data/jrc-idees/industry/unprocessed/{country_code}.xlsx",
            country_code=JRC_IDEES_SPATIAL_SCOPE
        )
    output: "build/data/jrc-idees/industry/processed-{dataset}.nc"
    wildcard_constraints:
        dataset = "energy|production"
    conda: "../envs/default.yaml"
    # NOTE(review): multithreaded, presumably to parallelise reading the many
    # Excel files — confirm against scripts/jrc-idees/industry.py.
    threads: 4
    script: "../scripts/jrc-idees/industry.py"


# Combine the per-country tertiary-sector workbooks into one processed CSV of
# tertiary (heat) data.
rule jrc_idees_tertiary_processed:
    message: "Process tertiary heat data from JRC-IDEES"
    input:
        data = expand(
            "build/data/jrc-idees/tertiary/unprocessed/{country_code}.xlsx",
            country_code=JRC_IDEES_SPATIAL_SCOPE
        )
    output: "build/data/jrc-idees/tertiary/processed.csv"
    conda: "../envs/default.yaml"
    script: "../scripts/jrc-idees/heat.py"


# Combine the per-country transport workbooks into one processed CSV for the
# requested road-transport dataset (energy, distance, or vehicle counts).
rule jrc_idees_transport_processed:
    message: "Process {wildcards.dataset} transport data from JRC-IDEES to be used in understanding current and future transport demand"
    input:
        data = expand(
            "build/data/jrc-idees/transport/unprocessed/{country_code}.xlsx",
            country_code=JRC_IDEES_SPATIAL_SCOPE
        )
    output: "build/data/jrc-idees/transport/processed-{dataset}.csv"
    params:
        # Mapping of JRC vehicle-type names, taken from the workflow config.
        vehicle_type_names = config["parameters"]["transport"]["vehicle-type-names"],
    wildcard_constraints:
        dataset = "road-energy|road-distance|road-vehicles"
    conda: "../envs/default.yaml"
    script: "../scripts/jrc-idees/transport.py"
16 changes: 0 additions & 16 deletions rules/transport.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,6 @@ rule download_transport_timeseries:
shell: "curl -sSLo {output} {params.url}"


rule jrc_idees_transport_processed:
message: "Process {wildcards.dataset} transport data from JRC-IDEES to be used in understanding current and future transport demand"
input:
data = expand(
"build/data/jrc-idees/transport/unprocessed/{country_code}.xlsx",
country_code=JRC_IDEES_SCOPE
)
output: "build/data/jrc-idees/transport/processed-{dataset}.csv"
params:
vehicle_type_names = config["parameters"]["transport"]["vehicle-type-names"],
wildcard_constraints:
dataset = "road-energy|road-distance|road-vehicles"
conda: "../envs/default.yaml"
script: "../scripts/transport/jrc_idees.py"


rule annual_transport_demand:
message: "Calculate future transport energy demand based on JRC IDEES"
input:
Expand Down
3 changes: 0 additions & 3 deletions scripts/heat/jrc_idees.py → scripts/jrc-idees/heat.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from pathlib import Path

import numpy as np
import pandas as pd

Expand Down Expand Up @@ -33,7 +31,6 @@
def process_jrc_heat_tertiary_sector_data(
paths_to_national_data: list[str], out_path: str
):
paths_to_national_data = [Path(p) for p in paths_to_national_data]
dfs = []
for file in paths_to_national_data:
df_final_energy = pd.read_excel(file, sheet_name="SER_hh_fec", index_col=0)
Expand Down
Loading
Loading