diff --git a/datacube/api/core.py b/datacube/api/core.py index 87986db74..9c88cde2c 100644 --- a/datacube/api/core.py +++ b/datacube/api/core.py @@ -247,7 +247,7 @@ def load(self, resolution: int | float | tuple[int | float, int | float] | Resolution | None = None, resampling: Resampling | dict[str, Resampling] | None = None, align: XY[float] | Iterable[float] | None = None, - skip_broken_datasets: bool = False, + skip_broken_datasets: bool | None = None, dask_chunks: dict[str, str | int] | None = None, like: GeoBox | xarray.Dataset | xarray.DataArray | None = None, fuse_func: FuserFunction | Mapping[str, FuserFunction | None] | None = None, @@ -423,6 +423,7 @@ def load(self, :param bool skip_broken_datasets: Optional. If this is True, then don't break when failing to load a broken dataset. + If None, the value will come from the environment variable of the same name. Default is False. :param dict dask_chunks: @@ -475,6 +476,7 @@ def filter_jan(dataset): return dataset.time.begin.month == 1 :param limit: Optional. If provided, limit the maximum number of datasets returned. Useful for testing and debugging. + Can also be provided via the ``dc_load_limit`` config option. :param driver: Optional. If provided, use the specified driver to load the data. 
@@ -494,6 +496,9 @@ def filter_jan(dataset): return dataset.time.begin.month == 1 if datasets is None: assert product is not None # For type checker + if limit is None: + # fall back to the dc_load_limit config option (settable via config file or environment variable) + limit = self.index.environment["dc_load_limit"] datasets = self.find_datasets(ensure_location=True, dataset_predicate=dataset_predicate, like=like, limit=limit, @@ -547,6 +552,10 @@ def filter_jan(dataset): return dataset.time.begin.month == 1 measurement_dicts = datacube_product.lookup_measurements(measurements) + if skip_broken_datasets is None: + # fall back to the skip_broken_datasets config option, which defaults to False + skip_broken_datasets = self.index.environment["skip_broken_datasets"] + result = self.load_data(grouped, geobox, measurement_dicts, resampling=resampling, diff --git a/datacube/cfg/api.py index f0edeb7f9..43cdd70e3 100644 --- a/datacube/cfg/api.py +++ b/datacube/cfg/api.py @@ -15,7 +15,7 @@ from .cfg import find_config, parse_text from .exceptions import ConfigException -from .opt import ODCOptionHandler, AliasOptionHandler, IndexDriverOptionHandler +from .opt import ODCOptionHandler, AliasOptionHandler, IndexDriverOptionHandler, BoolOptionHandler, IntOptionHandler from .utils import ConfigDict, check_valid_env_name @@ -272,7 +272,9 @@ def __init__(self, self._option_handlers: list[ODCOptionHandler] = [ AliasOptionHandler("alias", self), - IndexDriverOptionHandler("index_driver", self, default="default") + IndexDriverOptionHandler("index_driver", self, default="default"), + BoolOptionHandler("skip_broken_datasets", self, default=False), + IntOptionHandler("dc_load_limit", self, minval=0), ] def get_all_aliases(self): diff --git a/datacube/cfg/opt.py index 4b433b37e..09eac8fa0 100644 --- a/datacube/cfg/opt.py +++ b/datacube/cfg/opt.py @@ -90,6 +90,7 @@ def get_val_from_environment(self) -> str | None: 2. Check canonical envvar name first. E.g. option "bar" in environment "foo" -> $ODC_FOO_BAR 3. 
Check canonical envvar name for any alias environments that point to this one. 4. Check any legacy envvar names, and raise warnings if found. + 5. Check global envvar name, denoted by "all" instead of an environment name :return: First environment variable with non-empty value, or None if none found. """ @@ -97,16 +98,17 @@ def get_val_from_environment(self) -> str | None: canonical_name = f"odc_{self.env._name}_{self.name}".upper() for env_name in self.env.get_all_aliases(): envvar_name = f"odc_{env_name}_{self.name}".upper() - val = os.environ.get(envvar_name) - if val: + if val := os.environ.get(envvar_name): return val - for env_name in self.legacy_env_aliases: - val = os.environ.get(env_name) - if val: + for envvar_name in self.legacy_env_aliases: + if val := os.environ.get(envvar_name): warnings.warn( - f"Config being passed in by legacy environment variable ${env_name}. " + f"Config being passed in by legacy environment variable ${envvar_name}. " f"Please use ${canonical_name} instead.") return val + global_name = f"odc_all_{self.name}".upper() + if val := os.environ.get(global_name): + return val return None @@ -157,6 +159,8 @@ def __init__(self, *args, minval: int | None = None, maxval: int | None = None, def validate_and_normalise(self, value: Any) -> Any: # Call super() to get handle default value value = super().validate_and_normalise(value) + if value is None: + return value try: ival = int(value) except ValueError: @@ -168,6 +172,20 @@ def validate_and_normalise(self, value: Any) -> Any: return ival +class BoolOptionHandler(ODCOptionHandler): + """ + Handle config option expecting a boolean value + """ + def validate_and_normalise(self, value: Any) -> Any: + value = super().validate_and_normalise(value) + if isinstance(value, bool): + return value + elif isinstance(value, str) and value.lower() == "true": + return True + else: + return False + + class IAMAuthenticationOptionHandler(ODCOptionHandler): """ A simple boolean, compatible with the historic 
behaviour of the IAM Authentication on/off option. diff --git a/datacube/cfg/utils.py b/datacube/cfg/utils.py index 77fecf25d..a17f705a1 100644 --- a/datacube/cfg/utils.py +++ b/datacube/cfg/utils.py @@ -22,6 +22,8 @@ def check_valid_env_name(name: str) -> None: """ if not re.fullmatch(r"^[a-z][a-z0-9]*$", name): raise ConfigException(f'Environment names must consist of only lower case letters and numbers: {name}') + if name.lower() == "all": + raise ConfigException('Environments cannot be named "ALL"') def check_valid_option(name: str) -> None: diff --git a/datacube/scripts/system.py b/datacube/scripts/system.py index c19ee7701..cff4a1fff 100644 --- a/datacube/scripts/system.py +++ b/datacube/scripts/system.py @@ -79,6 +79,8 @@ def echo_field(name, value): echo_field('Index Driver', cfg_env.index_driver) db_url = psql_url_from_config(cfg_env) echo_field('Database URL:', db_url) + echo_field('Skip broken datasets:', cfg_env.skip_broken_datasets) + echo_field('Datacube load limit:', cfg_env.dc_load_limit) echo() echo('Valid connection:\t', nl=False) diff --git a/docs/about/whats_new.rst b/docs/about/whats_new.rst index 33e446d0e..940b8e0c1 100644 --- a/docs/about/whats_new.rst +++ b/docs/about/whats_new.rst @@ -9,6 +9,8 @@ v1.9.next ========= - Permissions management cleanup in postgis driver. (:pull:`1613`) +- Add `skip_broken_datasets` and `dc_load_limit` config options. 
(:pull:`1616`) +- Enable global environment variables with `ODC_ALL` naming convention (:pull:`1616`) v1.9.0-rc9 (3rd July 2024) ========================== diff --git a/integration_tests/test_config_tool.py b/integration_tests/test_config_tool.py index 0b12f383d..915fabb66 100644 --- a/integration_tests/test_config_tool.py +++ b/integration_tests/test_config_tool.py @@ -122,6 +122,8 @@ def test_config_check(clirunner, index, cfg_env): assert cfg_env['db_hostname'] in result.output assert cfg_env['db_username'] in result.output + assert str(cfg_env['skip_broken_datasets']) in result.output + assert str(cfg_env['dc_load_limit']) in result.output def test_list_users_does_not_fail(clirunner, cfg_env, index):