Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add industry JRC data processing #354

Merged
merged 12 commits into from
May 2, 2024
Merged
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## 1.2.0 (unpublished)

### Added (models)

* **ADD** fully-electrified heat demand (#284).

* **ADD** fully-electrified road transportation (#270), (#271).
Expand All @@ -11,6 +12,7 @@

### Added (workflow)

* **ADD** Module to process JRC-IDEES Excel spreadsheets (#354).
* **ADD** Ruff as our default linter and formatter (#285).
* **ADD** DAG rule that generates a visualisation of Snakemake's directed acyclic graph (#208).
* **ADD** IPython debugger to all conda environments to ease debugging (#254).
Expand Down
2 changes: 2 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ techs_template_dir = f"{model_template_dir}techs/"

include: "./rules/shapes.smk"
include: "./rules/data.smk"
include: "./rules/jrc-idees.smk"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not particularly important but I moved the previous eurostat.smk and jrc-idees.smk into data.smk, as the container for all downloading and pre-processing of all data that is not sector specific. The idea being that we don't generate too many rule files, especially not rule files that aren't feature-based. You didn't like that idea?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer one per source as they do become large enough rule files to be worth splitting off. It also is in line with the concept of modularising different major data sources.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One could imagine a future where we split off JRC processing completely and just store the pre-built files on zenodo for convenience.

include: "./rules/wind-and-solar.smk"
include: "./rules/biofuels.smk"
include: "./rules/hydro.smk"
Expand All @@ -25,6 +26,7 @@ include: "./rules/nuclear.smk"
include: "./rules/transport.smk"
include: "./rules/sync.smk"
include: "./rules/heat.smk"

min_version("7.8")
localrules: all, clean
wildcard_constraints:
Expand Down
113 changes: 106 additions & 7 deletions lib/eurocalliopelib/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
"""Utility functions."""

import logging
from typing import Literal, Optional

import pandas as pd
import pycountry
import xarray as xr

LOGGER = logging.getLogger(__name__)


def eu_country_code_to_iso3(eu_country_code):
Expand All @@ -19,7 +26,7 @@ def convert_country_code(input_country, output="alpha3"):
Converts input country code or name into either a 2- or 3-letter code.

ISO alpha2: alpha2
ISO alpha2 with Eurostat codes: alpha2_eu
brynpickering marked this conversation as resolved.
Show resolved Hide resolved
ISO alpha2 with EU codes: alpha2_eu
ISO alpha3: alpha3

"""
Expand Down Expand Up @@ -49,12 +56,109 @@ def convert_country_code(input_country, output="alpha3"):
return pycountry.countries.lookup(input_country).alpha_3


# conversion utils
def convert_valid_countries(
    country_codes: list,
    output: str = "alpha3",
    errors: Literal["raise", "ignore"] = "raise",
) -> dict:
    """
    Convert a list of country codes / names to a mapping of uniform ISO coded country codes.

    If an input item isn't a valid country (e.g. "EU27") then either raise an error,
    or log the offending code and continue, depending on `errors`.

    Args:
        country_codes (list):
            Strings defining country codes / names (["France", "FRA", "FR"] will all be treated the same).
        output (str, optional):
            pycountry output type, e.g. `alpha3` for 3-letter ISO standard.
            Defaults to "alpha3".
        errors (Literal["raise", "ignore"], optional):
            If a country code is not valid, `raise` an error and stop, or `ignore` the
            error and continue with only logging the skipped code.
            Defaults to "raise".
    Returns:
        dict: Mapping from input country code/name to output country code for all valid input countries.
    """

    mapped_codes = {}
    for country_code in country_codes:
        try:
            mapped_codes[country_code] = convert_country_code(
                country_code, output=output
            )
        except LookupError as err:
            if errors == "raise":
                raise err
            elif errors == "ignore":
                LOGGER.info(f"Skipping country/region {country_code}")
                continue
    return mapped_codes


def rename_and_groupby(
    da: xr.DataArray,
    rename_dict: dict,
    dim_name: str,
    new_dim_name: Optional[str] = None,
    dropna: bool = False,
    drop_other_dim_items: bool = True,
) -> xr.DataArray:
    """
    Rename the contents of a given dimension of an xarray DataArray, and (optionally) rename that dimension itself.

    If renaming the contents has some overlap (e.g. {'FRA': 'DEU', 'CHE': 'DEU'}),
    then the returned dataarray will be grouped over the new dimension items and summed.

    Args:
        da (xr.DataArray):
            Input dataarray with the dimension `dim_name`.
        rename_dict (dict):
            Dictionary to map items in the dimension `dim_name` to new names ({"old_item_name": "new_item_name"}).
        dim_name (str):
            Dimension on which to rename items.
        new_dim_name (Optional[str], optional): Defaults to None.
            If not None, rename the dimension "dim_name" to the given string.
        dropna (bool, optional): Defaults to False.
            If True, drop any items in "dim_name" after renaming/grouping which have all NaN values along all other dimensions.
        drop_other_dim_items (bool, optional): Defaults to True.
            If True, any dimension items _not_ referenced in `rename_dict` keys will be removed from that dimension in the returned array.
    Returns:
        (xr.DataArray): Same as "da" but with the items in "dim_name" renamed and possibly
        (a) grouped and summed, and (b) "dim_name" itself renamed.
    """
    rename_series = pd.Series(rename_dict).rename_axis(index=dim_name)
    if drop_other_dim_items is False:
        # Keep unreferenced dimension items by mapping them to themselves.
        existing_dim_items = da[dim_name].to_series()
        rename_series = rename_series.reindex(existing_dim_items).fillna(
            existing_dim_items
        )

    if new_dim_name is None:
        new_dim_name = f"_{dim_name}"  # placeholder that we'll revert
        revert_dim_name = True
    else:
        revert_dim_name = False

    rename_da = xr.DataArray(rename_series.rename(new_dim_name))
    # `min_count=1` keeps groups that are entirely NaN as NaN rather than 0.
    da = (
        da.reindex({dim_name: rename_da[dim_name]})
        .groupby(rename_da)
        .sum(dim_name, skipna=True, min_count=1, keep_attrs=True)
    )
    if revert_dim_name:
        da = da.rename({new_dim_name: dim_name})
        new_dim_name = dim_name
    if dropna:
        da = da.dropna(new_dim_name, how="all")
    return da


def ktoe_to_twh(array):
    """Convert from kilotonnes of oil equivalent (ktoe) to TWh."""
    twh_per_ktoe = 1.163e-2
    return array * twh_per_ktoe


def gwh_to_tj(array):
    """Convert from GWh to TJ (1 GWh = 3.6 TJ)."""
    tj_per_gwh = 3.6
    return array * tj_per_gwh


def pj_to_twh(array):
    """Convert from PJ to TWh (3.6 PJ = 1 TWh)."""
    pj_per_twh = 3.6
    return array / pj_per_twh
Expand All @@ -63,8 +167,3 @@ def pj_to_twh(array):
def tj_to_twh(array):
    """Convert from TJ to TWh, reusing the PJ conversion (1 PJ = 1000 TJ)."""
    as_twh_per_pj = pj_to_twh(array)
    return as_twh_per_pj / 1000


def gwh_to_tj(array):
"""Convert GWh to TJ"""
return array * 3.6
47 changes: 0 additions & 47 deletions rules/data.smk
Original file line number Diff line number Diff line change
Expand Up @@ -43,50 +43,3 @@ rule annual_energy_balances:
first_year = 2000
conda: "../envs/default.yaml"
script: "../scripts/data/annual_energy_balance.py"


"Rules regarding JRC-IDEES Data:"


# Download the zipped JRC-IDEES data for a single country.
# NOTE(review): the URL comes straight from the workflow config; presumably it
# contains the `country_code` wildcard — confirm against `config["data-sources"]`.
# The output is `protected` so the (slow) download is write-protected once fetched.
rule download_jrc_idees_zipped:
    message: "Download JRC IDEES zip file for {wildcards.country_code}"
    params: url = config["data-sources"]["jrc-idees"]
    output: protected("data/automatic/jrc-idees/{country_code}.zip")
    conda: "../envs/shell.yaml"
    localrule: True
    shell: "curl -sSLo {output} '{params.url}'"


def jrc_to_euro_calliope_sector(sector: str):
    """Return the JRC-IDEES spreadsheet sector name for a Euro-Calliope sector."""
    sector_mapping = {"transport": "Transport", "heat": "Tertiary"}
    if sector not in sector_mapping:
        raise ValueError(f"Unknown sector {sector}.")
    return sector_mapping[sector]


# Unzip one sector's Excel workbook for one country from the downloaded archive.
# `shadow: "minimal"` runs the rule in a shadow directory so the intermediate
# unzip/mv steps don't pollute the working directory.
rule jrc_idees_unzipped:
    message: "Unzip all JRC-IDEES {wildcards.sector} sector country data"
    input:
        "data/automatic/jrc-idees/{country_code}.zip"
    params:
        # Workbook names inside the zip use the JRC's own (title-case) sector names.
        file_name = lambda wildcards: f"JRC-IDEES-2015_{jrc_to_euro_calliope_sector(wildcards.sector)}_{wildcards.country_code}.xlsx"
    wildcard_constraints:
        sector = "transport|heat"
    output: temp("build/data/jrc-idees/{sector}/unprocessed/{country_code}.xlsx")
    conda: "../envs/shell.yaml"
    shadow: "minimal"
    localrule: True
    shell: """
        unzip -j {input} -d build/data/jrc-idees/{wildcards.sector}/unprocessed/
        mv build/data/jrc-idees/{wildcards.sector}/unprocessed/{params.file_name} {output}
        """


"EU28 county codes used for downloading JRC-IDEES"
JRC_IDEES_SCOPE = [
"AT", "BE", "BG", "CY", "CZ", "DE", "DK", "EE", "EL", "ES", "FI", "FR",
"HR", "HU", "IE", "IT", "LT", "LU", "LV", "MT", "NL", "PL", "PT", "RO",
"SE", "SI", "SK", "UK"
]
14 changes: 1 addition & 13 deletions rules/heat.smk
Original file line number Diff line number Diff line change
@@ -1,22 +1,10 @@
# Aggregate the unzipped per-country JRC-IDEES heat workbooks into a single
# processed CSV of commercial-sector heat data.
rule jrc_idees_heat_processed:
    message: "Process tertiary heat data from JRC-IDEES"
    input:
        data = expand(
            "build/data/jrc-idees/heat/unprocessed/{country_code}.xlsx",
            country_code=JRC_IDEES_SCOPE
        )
    output: "build/data/jrc-idees/heat/commercial/processed.csv"
    conda: "../envs/default.yaml"
    script: "../scripts/heat/jrc_idees.py"


rule annual_heat_demand:
message: "Calculate national heat demand for household and commercial sectors"
input:
hh_end_use = "data/automatic/eurostat-hh-end-use.tsv.gz",
ch_end_use = "data/automatic/ch-end-use.xlsx",
energy_balance = rules.annual_energy_balances.output[0],
commercial_demand = "build/data/jrc-idees/heat/commercial/processed.csv",
commercial_demand = "build/data/jrc-idees/tertiary/processed.csv",
carrier_names = "config/energy-balances/energy-balance-carrier-names.csv"
params:
heat_tech_params = config["parameters"]["heat"],
Expand Down
72 changes: 72 additions & 0 deletions rules/jrc-idees.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"Rules regarding JRC-IDEES Data"

# The 28 countries covered by the JRC-IDEES-2015 release, using Eurostat-style
# two-letter codes ("EL" for Greece, "UK" for the United Kingdom).
JRC_IDEES_SPATIAL_SCOPE = [
    "AT", "BE", "BG", "CY", "CZ", "DE", "DK", "EE", "EL", "ES", "FI", "FR",
    "HR", "HU", "IE", "IT", "LT", "LU", "LV", "MT", "NL", "PL", "PT", "RO",
    "SE", "SI", "SK", "UK"
]


# Download the zipped JRC-IDEES data for a single country.
# NOTE(review): the URL comes straight from the workflow config; presumably it
# contains the `country_code` wildcard — confirm against `config["data-sources"]`.
# The output is `protected` so the (slow) download is write-protected once fetched.
rule download_jrc_idees_zipped:
    message: "Download JRC IDEES zip file for {wildcards.country_code}"
    params: url = config["data-sources"]["jrc-idees"]
    output: protected("data/automatic/jrc-idees/{country_code}.zip")
    conda: "../envs/shell.yaml"
    localrule: True
    shell: "curl -sSLo {output} '{params.url}'"


# Extract a single sector workbook from a country's JRC-IDEES zip archive,
# streaming it straight to the output file with `unzip -p` (no temp files).
rule jrc_idees_unzipped:
    message: "Unzip JRC-IDEES {wildcards.sector} sector data for {wildcards.country_code}"
    input:
        country_data = "data/automatic/jrc-idees/{country_code}.zip",
    params:
        # Workbook names inside the zip use title-case sector names (e.g. "Industry").
        sector_title_case = lambda wildcards: wildcards.sector.title()
    wildcard_constraints:
        sector = "industry|transport|tertiary"
    output: temp("build/data/jrc-idees/{sector}/unprocessed/{country_code}.xlsx")
    conda: "../envs/shell.yaml"
    shell: "unzip -p {input.country_data} JRC-IDEES-2015_{params.sector_title_case}_{wildcards.country_code}.xlsx > {output}"


# Combine the per-country industry workbooks into a single netCDF file for the
# requested dataset (`energy` or `production`).
rule jrc_idees_industry_processed:
    message: "Process {wildcards.dataset} industry data from JRC-IDEES to be used in understanding current and future industry demand"
    input:
        data = expand(
            "build/data/jrc-idees/industry/unprocessed/{country_code}.xlsx",
            country_code=JRC_IDEES_SPATIAL_SCOPE
        )
    output: "build/data/jrc-idees/industry/processed-{dataset}.nc"
    wildcard_constraints:
        dataset = "energy|production"
    conda: "../envs/default.yaml"
    # NOTE(review): multithreaded, presumably to parallelise reading the many
    # Excel files — confirm against scripts/jrc-idees/industry.py.
    threads: 4
    script: "../scripts/jrc-idees/industry.py"


# Combine the per-country tertiary-sector workbooks into one processed CSV of
# tertiary (heat) data.
rule jrc_idees_tertiary_processed:
    message: "Process tertiary heat data from JRC-IDEES"
    input:
        data = expand(
            "build/data/jrc-idees/tertiary/unprocessed/{country_code}.xlsx",
            country_code=JRC_IDEES_SPATIAL_SCOPE
        )
    output: "build/data/jrc-idees/tertiary/processed.csv"
    conda: "../envs/default.yaml"
    script: "../scripts/jrc-idees/heat.py"


# Combine the per-country transport workbooks into one processed CSV for the
# requested road-transport dataset (energy, distance, or vehicle counts).
rule jrc_idees_transport_processed:
    message: "Process {wildcards.dataset} transport data from JRC-IDEES to be used in understanding current and future transport demand"
    input:
        data = expand(
            "build/data/jrc-idees/transport/unprocessed/{country_code}.xlsx",
            country_code=JRC_IDEES_SPATIAL_SCOPE
        )
    output: "build/data/jrc-idees/transport/processed-{dataset}.csv"
    params:
        # Mapping of JRC vehicle-type names, taken from the workflow config.
        vehicle_type_names = config["parameters"]["transport"]["vehicle-type-names"],
    wildcard_constraints:
        dataset = "road-energy|road-distance|road-vehicles"
    conda: "../envs/default.yaml"
    script: "../scripts/jrc-idees/transport.py"
16 changes: 0 additions & 16 deletions rules/transport.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,6 @@ rule download_transport_timeseries:
shell: "curl -sSLo {output} {params.url}"


rule jrc_idees_transport_processed:
message: "Process {wildcards.dataset} transport data from JRC-IDEES to be used in understanding current and future transport demand"
input:
data = expand(
"build/data/jrc-idees/transport/unprocessed/{country_code}.xlsx",
country_code=JRC_IDEES_SCOPE
)
output: "build/data/jrc-idees/transport/processed-{dataset}.csv"
params:
vehicle_type_names = config["parameters"]["transport"]["vehicle-type-names"],
wildcard_constraints:
dataset = "road-energy|road-distance|road-vehicles"
conda: "../envs/default.yaml"
script: "../scripts/transport/jrc_idees.py"


rule annual_transport_demand:
message: "Calculate future transport energy demand based on JRC IDEES"
input:
Expand Down
3 changes: 0 additions & 3 deletions scripts/heat/jrc_idees.py → scripts/jrc-idees/heat.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from pathlib import Path

import numpy as np
import pandas as pd

Expand Down Expand Up @@ -33,7 +31,6 @@
def process_jrc_heat_tertiary_sector_data(
paths_to_national_data: list[str], out_path: str
):
paths_to_national_data = [Path(p) for p in paths_to_national_data]
dfs = []
for file in paths_to_national_data:
df_final_energy = pd.read_excel(file, sheet_name="SER_hh_fec", index_col=0)
Expand Down
Loading
Loading