change serialisation of empty slices (of nodes or edges)

Change from a completely empty dataframe to an empty GeoDataFrame (no rows) with a minimal column set (geometry). This is necessary because the behaviour of geopandas/pyarrow when opening an empty GeoDataFrame has changed.
nismod · Jul 20, 2023 · 9b77e55 · 9b77e55
1 parent 291812d
commit 9b77e55
Show file tree

Hide file tree

Showing 5 changed files with 39 additions and 58 deletions.
diff --git a/src/open_gira/io.py b/src/open_gira/io.py
@@ -15,7 +15,6 @@
 from open_gira.utils import natural_sort
 
 
-NO_GEOM_ERROR_MSG: str = "No geometry columns are included in the columns"
 WGS84_EPSG = 4326
 
 # column names and dtypes for STORM synthetic tropical cyclone tracks

diff --git a/workflow/scripts/concat_and_sum_slices.py b/workflow/scripts/concat_and_sum_slices.py
@@ -15,7 +15,6 @@
 import pandas as pd
 from tqdm import tqdm
 
-from open_gira.io import NO_GEOM_ERROR_MSG
 from open_gira.utils import natural_sort
 
 
@@ -38,18 +37,11 @@
     dataframes: list[gpd.GeoDataFrame] = []
     for i, slice_path in tqdm(enumerate(slice_files)):
 
-        try:
-            gdf = gpd.read_parquet(slice_path)
+        gdf = gpd.read_parquet(slice_path)
 
-        except ValueError as error:
-            if NO_GEOM_ERROR_MSG in str(error):
-                # if the input parquet file does not contain a geometry column,
-                # geopandas will raise a ValueError rather than try to procede. we
-                # catch that here, but check the error message - to be more
-                # specific than catching and suppressing any ValueError
-
-                # use an empty geodataframe to append instead
-                gdf = gpd.GeoDataFrame([])
+        if gdf.empty is True:
+            # use an empty geodataframe to append instead
+            gdf = gpd.GeoDataFrame([], columns=["geometry"])
 
         dataframes.append(gdf)
 

diff --git a/workflow/scripts/osm_to_pq.py b/workflow/scripts/osm_to_pq.py
@@ -201,22 +201,28 @@ def get_node_by_coords(self, coords, prefix, node_list):
         }
 
 
+def empty_gdf() -> geopandas.GeoDataFrame:
+    """
+    Create and return an empty GeoDataFrame. Must explicitly specify columns
+    (despite empty list) to permit saving as geoparquet.
+    """
+    return geopandas.GeoDataFrame([], columns=["geometry"])
+
+
 if __name__ == "__main__":
     try:
         pbf_path = snakemake.input["pbf"]  # type: ignore
         edges_path = snakemake.output["edges"]  # type: ignore
         nodes_path = snakemake.output["nodes"]  # type: ignore
         keep_tags = snakemake.params["keep_tags"]  # type: ignore
-        osm_epsg = snakemake.config["osm_epsg"]  # type: ignore
     except NameError:
         # If "snakemake" doesn't exist then must be running from the
         # command line.
-        pbf_path, edges_path, nodes_path, keep_tags, osm_epsg = sys.argv[1:]
+        pbf_path, edges_path, nodes_path, keep_tags = sys.argv[1:]
         # pbf_path = 'results/slices/tanzania-mini_filter-road/slice-2.osm.pbf'
         # edges_path = 'results/slice-2.geoparquet'
         # nodes_path = 'results/slice-2.geoparquet'
         # keep_tags = 'highway, railway'
-        # osm_epsg = 4326
 
         # process comma separated string into list of strings
         keep_tags: list = keep_tags.replace(" ", "").split(",")
@@ -243,7 +249,12 @@ def get_node_by_coords(self, coords, prefix, node_list):
         tags_to_preserve=keep_tags,
     )
     h.apply_file(pbf_path, locations=True)
-    edges = geopandas.GeoDataFrame(h.output_data)
+
+    if len(h.output_data) != 0:
+        edges = geopandas.GeoDataFrame(h.output_data)
+        edges = edges.set_crs(epsg=4326)
+    else:
+        edges = empty_gdf()
     logging.info(
         f"Complete: {len(h.output_data)} segments from {len(Counter(w['osm_way_id'] for w in h.output_data))} ways."
     )
@@ -252,14 +263,13 @@ def get_node_by_coords(self, coords, prefix, node_list):
         tags_to_preserve=keep_tags,
     )
     n.apply_file(pbf_path, locations=True)
-    nodes = geopandas.GeoDataFrame(n.output_data)
-    logging.info(f"Complete: {len(n.output_data)} nodes.")
+    if len(n.output_data) != 0:
+        nodes = geopandas.GeoDataFrame(n.output_data)
+        nodes = nodes.set_crs(epsg=4326)
+    else:
+        nodes = empty_gdf()
 
-    # can't set a CRS on an empty dataframe, will AttributeError
-    if not edges.empty:
-        edges.set_crs(epsg=osm_epsg, inplace=True)
-    if not nodes.empty:
-        nodes.set_crs(epsg=osm_epsg, inplace=True)
+    logging.info(f"Complete: {len(n.output_data)} nodes.")
 
     # write to disk -- even if empty
     edges.to_parquet(edges_path)

diff --git a/workflow/scripts/transport/create_rail_network.py b/workflow/scripts/transport/create_rail_network.py
@@ -14,7 +14,7 @@
 
 from utils import annotate_country, get_administrative_data
 from open_gira.assets import RailAssets
-from open_gira.io import write_empty_frames, NO_GEOM_ERROR_MSG
+from open_gira.io import write_empty_frames
 from open_gira.network import create_network
 from open_gira.utils import str_to_bool
 
@@ -98,29 +98,15 @@ def get_rehab_costs(row: pd.Series, rehab_costs: pd.DataFrame) -> float:
     warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")
 
     # read edges
-    try:
-        edges = gpd.read_parquet(osm_edges_path)
-    except ValueError as error:
-        if NO_GEOM_ERROR_MSG in str(error):
-            logging.info("No data in geometry column, writing empty files.")
-            # if the input parquet file does not contain a geometry column, geopandas
-            # will raise a ValueError rather than try to procede
-            write_empty_frames(edges_output_path, nodes_output_path)
-            sys.exit(0)  # exit gracefully so snakemake will continue
-        else:
-            raise error
+    edges = gpd.read_parquet(osm_edges_path)
+    if edges.empty is True:
+        write_empty_frames(edges_output_path, nodes_output_path)
+        sys.exit(0)  # exit gracefully so snakemake will continue
 
     # read nodes
-    try:
-        nodes = gpd.read_parquet(osm_nodes_path)
-    except ValueError as error:
-        if NO_GEOM_ERROR_MSG in str(error):
-            logging.info(f"No nodes from OSM to process")
-            # if the input parquet file does not contain a geometry column, geopandas
-            # will raise a ValueError rather than try to procede
-            nodes = None
-        else:
-            raise error
+    nodes = gpd.read_parquet(osm_nodes_path)
+    if nodes.empty is True:
+        nodes = None
 
     # osm_to_pq.py creates these columns but we're not using them, so discard
     edges = edges.drop(

diff --git a/workflow/scripts/transport/create_road_network.py b/workflow/scripts/transport/create_road_network.py
@@ -16,7 +16,7 @@
 
 from utils import annotate_country, cast, get_administrative_data, strip_suffix
 from open_gira.assets import RoadAssets
-from open_gira.io import write_empty_frames, NO_GEOM_ERROR_MSG
+from open_gira.io import write_empty_frames
 from open_gira.network import create_network
 from open_gira.utils import str_to_bool
 
@@ -412,17 +412,11 @@ def get_rehab_costs(row: pd.Series, rehab_costs: pd.DataFrame) -> float:
     # NB though that .geoparquet is not the format to use for archiving.
     warnings.filterwarnings("ignore", message=".*initial implementation of Parquet.*")
 
-    try:
-        edges = gpd.read_parquet(osm_edges_path)
-    except ValueError as error:
-        if NO_GEOM_ERROR_MSG in str(error):
-            logging.info("No data in geometry column, writing empty files.")
-            # if the input parquet file does not contain a geometry column, geopandas
-            # will raise a ValueError rather than try to procede
-            write_empty_frames(edges_output_path, nodes_output_path)
-            sys.exit(0)  # exit gracefully so snakemake will continue
-        else:
-            raise error
+    edges = gpd.read_parquet(osm_edges_path)
+
+    if edges.empty is True:
+        write_empty_frames(edges_output_path, nodes_output_path)
+        sys.exit(0)  # exit gracefully so snakemake will continue
 
     # osm_to_pq.py creates these columns but we're not using them, so discard
     edges = edges.drop(