diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6dcffe82..c5eeb81f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,12 +12,16 @@ Added Fixed * Single-ended measurements with `fix_alpha` failed due to a bug introduced in v2.0.0 ([#173](https://github.com/dtscalibration/python-dts-calibration/pull/173)). +* Headers in the example notebooks and their appearance in the docs are now at the correct levels. Changed * Standardized parameter names. Reduced the freedom in choosing parameter names and dimension names in favor of simplifying the code. * Requiring netcdf4 >= 1.6.4 * Optional dependencies of xarray that improve performance are now required by default. +* `variance_stokes_linear()` no longer forces the fit through zero by default (`through_zero` now defaults to `False`). +* Refactored `calibration_single_ended()` and `calibration_double_ended()`. +* Moved ParameterIndices classes to calibration_utils.py. * Flox included in requirements to speed up resampling via xarray ([Xarray #5734](https://github.com/pydata/xarray/pull/5734)). Removed diff --git a/docs/_config.yml b/docs/_config.yml deleted file mode 100644 index c4192631..00000000 --- a/docs/_config.yml +++ /dev/null @@ -1 +0,0 @@ -theme: jekyll-theme-cayman \ No newline at end of file diff --git a/docs/api/dtscalibration.DataStore.rst b/docs/api/dtscalibration.DataStore.rst deleted file mode 100644 index 102209b0..00000000 --- a/docs/api/dtscalibration.DataStore.rst +++ /dev/null @@ -1,83 +0,0 @@ -DataStore ========= - -.. currentmodule:: dtscalibration - -.. autoclass:: DataStore - :show-inheritance: - - .. rubric:: Attributes Summary - - .. autosummary:: - - ~DataStore.channel_configuration - ~DataStore.chbw - ~DataStore.chfw - ~DataStore.is_double_ended - ~DataStore.sections - ~DataStore.timeseries_keys - - .. rubric:: Methods Summary - - .. autosummary:: - - ~DataStore.average_double_ended - ~DataStore.average_single_ended - ~DataStore.calibration_double_ended - ~DataStore.calibration_single_ended - ~DataStore.check_deprecated_kwargs - ~DataStore.conf_int_double_ended - ~DataStore.conf_int_single_ended - ~DataStore.get_default_encoding - ~DataStore.get_section_indices - ~DataStore.get_time_dim - ~DataStore.i_var - ~DataStore.in_confidence_interval - ~DataStore.inverse_variance_weighted_mean - ~DataStore.inverse_variance_weighted_mean_array - ~DataStore.rename_labels - ~DataStore.resample_datastore - ~DataStore.temperature_residuals - ~DataStore.to_mf_netcdf - ~DataStore.to_netcdf - ~DataStore.ufunc_per_section - ~DataStore.variance_stokes - ~DataStore.variance_stokes_constant - ~DataStore.variance_stokes_exponential - ~DataStore.variance_stokes_linear - - .. rubric:: Attributes Documentation - - .. autoattribute:: channel_configuration - .. autoattribute:: chbw - .. autoattribute:: chfw - .. autoattribute:: is_double_ended - .. autoattribute:: sections - .. autoattribute:: timeseries_keys - - .. rubric:: Methods Documentation - - .. automethod:: average_double_ended - .. automethod:: average_single_ended - .. automethod:: calibration_double_ended - .. automethod:: calibration_single_ended - .. automethod:: check_deprecated_kwargs - .. automethod:: conf_int_double_ended - .. automethod:: conf_int_single_ended - .. automethod:: get_default_encoding - .. automethod:: get_section_indices - .. automethod:: get_time_dim - .. automethod:: i_var - .. automethod:: in_confidence_interval - .. automethod:: inverse_variance_weighted_mean - .. automethod:: inverse_variance_weighted_mean_array - .. automethod:: rename_labels - .. automethod:: resample_datastore - .. automethod:: temperature_residuals - ..
automethod:: to_mf_netcdf - .. automethod:: to_netcdf - .. automethod:: ufunc_per_section - .. automethod:: variance_stokes - .. automethod:: variance_stokes_constant - .. automethod:: variance_stokes_exponential - .. automethod:: variance_stokes_linear diff --git a/docs/api/dtscalibration.check_dims.rst b/docs/api/dtscalibration.check_dims.rst deleted file mode 100644 index 1dc68e7e..00000000 --- a/docs/api/dtscalibration.check_dims.rst +++ /dev/null @@ -1,6 +0,0 @@ -check_dims -========== - -.. currentmodule:: dtscalibration - -.. autofunction:: check_dims diff --git a/docs/api/dtscalibration.check_timestep_allclose.rst b/docs/api/dtscalibration.check_timestep_allclose.rst deleted file mode 100644 index d655bfe4..00000000 --- a/docs/api/dtscalibration.check_timestep_allclose.rst +++ /dev/null @@ -1,6 +0,0 @@ -check_timestep_allclose -======================= - -.. currentmodule:: dtscalibration - -.. autofunction:: check_timestep_allclose diff --git a/docs/api/dtscalibration.get_netcdf_encoding.rst b/docs/api/dtscalibration.get_netcdf_encoding.rst deleted file mode 100644 index 820b0a0f..00000000 --- a/docs/api/dtscalibration.get_netcdf_encoding.rst +++ /dev/null @@ -1,6 +0,0 @@ -get_netcdf_encoding -=================== - -.. currentmodule:: dtscalibration - -.. autofunction:: get_netcdf_encoding diff --git a/docs/api/dtscalibration.merge_double_ended.rst b/docs/api/dtscalibration.merge_double_ended.rst deleted file mode 100644 index c8e512b5..00000000 --- a/docs/api/dtscalibration.merge_double_ended.rst +++ /dev/null @@ -1,6 +0,0 @@ -merge_double_ended -================== - -.. currentmodule:: dtscalibration - -.. autofunction:: merge_double_ended diff --git a/docs/api/dtscalibration.open_datastore.rst b/docs/api/dtscalibration.open_datastore.rst deleted file mode 100644 index 95a987bd..00000000 --- a/docs/api/dtscalibration.open_datastore.rst +++ /dev/null @@ -1,6 +0,0 @@ -open_datastore -============== - -.. currentmodule:: dtscalibration - -.. autofunction:: open_datastore diff --git a/docs/api/dtscalibration.open_mf_datastore.rst b/docs/api/dtscalibration.open_mf_datastore.rst deleted file mode 100644 index edab4ebe..00000000 --- a/docs/api/dtscalibration.open_mf_datastore.rst +++ /dev/null @@ -1,6 +0,0 @@ -open_mf_datastore -================= - -.. currentmodule:: dtscalibration - -.. autofunction:: open_mf_datastore diff --git a/docs/api/dtscalibration.plot_accuracy.rst b/docs/api/dtscalibration.plot_accuracy.rst deleted file mode 100644 index a2e41fc2..00000000 --- a/docs/api/dtscalibration.plot_accuracy.rst +++ /dev/null @@ -1,6 +0,0 @@ -plot_accuracy -============= - -.. currentmodule:: dtscalibration - -.. autofunction:: plot_accuracy diff --git a/docs/api/dtscalibration.plot_location_residuals_double_ended.rst b/docs/api/dtscalibration.plot_location_residuals_double_ended.rst deleted file mode 100644 index ad0d16db..00000000 --- a/docs/api/dtscalibration.plot_location_residuals_double_ended.rst +++ /dev/null @@ -1,6 +0,0 @@ -plot_location_residuals_double_ended -==================================== - -.. currentmodule:: dtscalibration - -.. autofunction:: plot_location_residuals_double_ended diff --git a/docs/api/dtscalibration.plot_residuals_reference_sections.rst b/docs/api/dtscalibration.plot_residuals_reference_sections.rst deleted file mode 100644 index 45f2529b..00000000 --- a/docs/api/dtscalibration.plot_residuals_reference_sections.rst +++ /dev/null @@ -1,6 +0,0 @@ -plot_residuals_reference_sections -================================= - -.. 
currentmodule:: dtscalibration - -.. autofunction:: plot_residuals_reference_sections diff --git a/docs/api/dtscalibration.plot_residuals_reference_sections_single.rst b/docs/api/dtscalibration.plot_residuals_reference_sections_single.rst deleted file mode 100644 index 725d77a0..00000000 --- a/docs/api/dtscalibration.plot_residuals_reference_sections_single.rst +++ /dev/null @@ -1,6 +0,0 @@ -plot_residuals_reference_sections_single -======================================== - -.. currentmodule:: dtscalibration - -.. autofunction:: plot_residuals_reference_sections_single diff --git a/docs/api/dtscalibration.plot_sigma_report.rst b/docs/api/dtscalibration.plot_sigma_report.rst deleted file mode 100644 index b047cdeb..00000000 --- a/docs/api/dtscalibration.plot_sigma_report.rst +++ /dev/null @@ -1,6 +0,0 @@ -plot_sigma_report -================= - -.. currentmodule:: dtscalibration - -.. autofunction:: plot_sigma_report diff --git a/docs/api/dtscalibration.read_apsensing_files.rst b/docs/api/dtscalibration.read_apsensing_files.rst deleted file mode 100644 index 04bd67d6..00000000 --- a/docs/api/dtscalibration.read_apsensing_files.rst +++ /dev/null @@ -1,6 +0,0 @@ -read_apsensing_files -==================== - -.. currentmodule:: dtscalibration - -.. autofunction:: read_apsensing_files diff --git a/docs/api/dtscalibration.read_sensornet_files.rst b/docs/api/dtscalibration.read_sensornet_files.rst deleted file mode 100644 index 541f00cb..00000000 --- a/docs/api/dtscalibration.read_sensornet_files.rst +++ /dev/null @@ -1,6 +0,0 @@ -read_sensornet_files -==================== - -.. currentmodule:: dtscalibration - -.. autofunction:: read_sensornet_files diff --git a/docs/api/dtscalibration.read_sensortran_files.rst b/docs/api/dtscalibration.read_sensortran_files.rst deleted file mode 100644 index 35d92c94..00000000 --- a/docs/api/dtscalibration.read_sensortran_files.rst +++ /dev/null @@ -1,6 +0,0 @@ -read_sensortran_files -===================== - -.. currentmodule:: dtscalibration - -.. autofunction:: read_sensortran_files diff --git a/docs/api/dtscalibration.read_silixa_files.rst b/docs/api/dtscalibration.read_silixa_files.rst deleted file mode 100644 index d5adee72..00000000 --- a/docs/api/dtscalibration.read_silixa_files.rst +++ /dev/null @@ -1,6 +0,0 @@ -read_silixa_files -================= - -.. currentmodule:: dtscalibration - -.. autofunction:: read_silixa_files diff --git a/docs/api/dtscalibration.shift_double_ended.rst b/docs/api/dtscalibration.shift_double_ended.rst deleted file mode 100644 index 0a879ea1..00000000 --- a/docs/api/dtscalibration.shift_double_ended.rst +++ /dev/null @@ -1,6 +0,0 @@ -shift_double_ended -================== - -.. currentmodule:: dtscalibration - -.. autofunction:: shift_double_ended diff --git a/docs/api/dtscalibration.suggest_cable_shift_double_ended.rst b/docs/api/dtscalibration.suggest_cable_shift_double_ended.rst deleted file mode 100644 index 14a135c0..00000000 --- a/docs/api/dtscalibration.suggest_cable_shift_double_ended.rst +++ /dev/null @@ -1,6 +0,0 @@ -suggest_cable_shift_double_ended -================================ - -.. currentmodule:: dtscalibration - -.. 
autofunction:: suggest_cable_shift_double_ended diff --git a/docs/conf.py b/docs/conf.py index b58fdee4..6e28eadd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -4,6 +4,7 @@ extensions = [ + "sphinx_rtd_theme", "sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.coverage", @@ -69,7 +70,7 @@ napoleon_use_param = False # sphinx_automodapi.automodapi -numpydoc_show_class_members = False +numpydoc_show_class_members = True # -- nbsphinx configuration -- nbsphinx_allow_errors = False diff --git a/docs/index.rst b/docs/index.rst index 1cbfeed1..894b4e21 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,11 +3,9 @@ Contents ======== .. toctree:: - :maxdepth: 2 + :maxdepth: 3 readme - installation - usage learn_by_examples reference/index contributing diff --git a/docs/installation.rst b/docs/installation.rst deleted file mode 100644 index 98bf9bbc..00000000 --- a/docs/installation.rst +++ /dev/null @@ -1,8 +0,0 @@ -============ -Installation -============ - -At the command line:: - - pip install dtscalibration - diff --git a/docs/nb_examples_to_docs.py b/docs/nb_examples_to_docs.py deleted file mode 100644 index ac6a6a3e..00000000 --- a/docs/nb_examples_to_docs.py +++ /dev/null @@ -1,95 +0,0 @@ -# coding=utf-8 -import glob -import os -import shutil -from subprocess import check_call - -clean_nb = False # save clean notebook to github - -try: - # this file is excecuted as script - wd = os.path.dirname(os.path.realpath(__file__)) - print("run as script: wd", wd) -except: - # Excecuted from console. pwd = ./docs - wd = os.getcwd() - print("run from console: wd", wd) - pass - -file_ext = "*.ipynb" - -# ./examples/notebooks -inpath = os.path.join(wd, "..", "examples", "notebooks") - -# ./docs/examples/notebooks -outpath = os.path.join(wd, "examples", "notebooks") -fp_index = os.path.join(wd, "examples", "index.rst") - -# clean outputdir -shutil.rmtree(outpath) -os.makedirs(outpath) - -filepathlist = sorted(glob.glob(os.path.join(inpath, file_ext))) -filenamelist = [os.path.basename(path) for path in filepathlist] - -for fp, fn in zip(filepathlist, filenamelist): - if clean_nb: - # save clean notebook to github - check_call( - [ - "jupyter", - "nbconvert", - "--clear-output", - "--ClearOutputPreprocessor.enabled=True", - "--inplace", - fp, - ] - ) - else: - check_call( - [ - "jupyter", - "nbconvert", - "--execute", - "--ExecutePreprocessor.kernel_name=python", - "--KernelGatewayApp.force_kernel_name=python", - "--ExecutePreprocessor.timeout=60", - "--inplace", - fp, - ] - ) - # run the notebook to: - # 1) check whether no errors occur. - # 2) save and show outputconvert notebook to rst for documentation - # outfilepath = os.path.join(outpath, fn) - check_call( - [ - "jupyter", - "nbconvert", - "--execute", - "--to", - "rst", - "--ExecutePreprocessor.kernel_name=python", - "--KernelGatewayApp.force_kernel_name=python", - "--ExecutePreprocessor.timeout=60", - "--output-dir", - outpath, - "--output", - fn, - fp, - ] - ) - -# write index file to toc -fp_index = os.path.join(wd, "examples", "index.rst") -s = """Learn by Examples -================= - -.. 
toctree:: -""" - -with open(fp_index, "w+") as fh: - fh.write(s) - for fn in filenamelist: - sf = " notebooks/{}.rst\n".format(fn) - fh.write(sf) diff --git a/docs/notebooks/07Calibrate_single_ended.ipynb b/docs/notebooks/07Calibrate_single_ended.ipynb index a130ac63..28616d9e 100644 --- a/docs/notebooks/07Calibrate_single_ended.ipynb +++ b/docs/notebooks/07Calibrate_single_ended.ipynb @@ -52,7 +52,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Read the raw data files loaded from your DTS machine\n", + "## Read the raw data files loaded from your DTS machine\n", "Use `read_silixa_files` for reading files from a Silixa device. The following functions are available for reading files from other devices: `read_sensortran_files`, `read_apsensing_files`, and `read_sensornet_files`. See Notebook 1.\n", "\n", "Calibration is performed on sections that have a known" @@ -79,7 +79,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Define the reference fiber sections that have a known temperature\n", + "## Define the reference fiber sections that have a known temperature\n", "As explained in Notebook 3. DTS devices come with temperature probes to measure the temperature of the water baths. These measurements are stored in the data that was loaded in the previous step and are loaded automatically. In the case you would like to use an external temperature sensor, have a look at notebook `09Import_timeseries` to append those measurements to the `ds` before continuing with the calibration. " ] }, @@ -100,7 +100,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Estimate the variance of the noise in the Stokes and anti-Stokes measurements\n", + "## Estimate the variance of the noise in the Stokes and anti-Stokes measurements\n", "First calculate the variance of the noise in the measured Stokes and anti-Stokes signals. See Notebook 4 for more information.\n", "\n", "The Stokes and anti-Stokes signals should follow a smooth decaying exponential. This function fits a decaying exponential to each reference section for each time step. The variance of the residuals between the measured Stokes and anti-Stokes signals and the fitted signals is used as an estimate of the variance in measured signals. This algorithm assumes that the temperature is the same for the entire section but may vary over time and differ per section." @@ -143,7 +143,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Perform calibration and compute the temperature\n", + "## Perform calibration and compute the temperature\n", "We calibrate the measurements and their uncertainty with a single method call. The temperature is stored by default as the `ds.tmpf` dataarray and the variance of its approximation as `ds.tmpf_var`." ] }, @@ -167,7 +167,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Plot the temperature and uncertainty of the estimated temperature\n", + "## Plot the temperature and uncertainty of the estimated temperature\n", "First, the temperature for the entire fiber is plotted. Second, the temperature and its standard error are plotted for the first timestep." 
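The notebook cells above walk through the single-ended workflow step by step: read the raw files, define reference sections, estimate the Stokes/anti-Stokes noise variance, calibrate, and plot. A minimal sketch of that workflow is given below. The data directory, the section bounds, and the `probe1Temperature`/`probe2Temperature` labels are placeholders (the reference labels must already exist as time series in the dataset); only the function and variable names (`read_silixa_files`, `sections`, `variance_stokes_constant`, `calibration_single_ended`, `tmpf`, `tmpf_var`) come from this changeset.

```
# Minimal sketch of the single-ended calibration workflow from notebook 07.
# The data directory, section bounds, and reference-temperature labels are
# assumptions; only the method names are taken from this changeset.
from dtscalibration import read_silixa_files

ds = read_silixa_files(directory="data/single_ended")  # or the sensortran/apsensing/sensornet readers

# Reference fiber sections with a known temperature. The slices are fiber
# distances; probe1Temperature/probe2Temperature are example time series
# assumed to be already present in ds (e.g. the built-in bath probes).
ds.sections = {
    "probe1Temperature": [slice(7.5, 17.0)],   # cold bath
    "probe2Temperature": [slice(24.0, 34.0)],  # warm bath
}

# Estimate the noise variance of the Stokes and anti-Stokes signals.
st_var, resid = ds.variance_stokes_constant(st_label="st")
ast_var, _ = ds.variance_stokes_constant(st_label="ast")

# Calibrate; the temperature is stored as ds.tmpf and its variance as ds.tmpf_var.
ds.calibration_single_ended(st_var=st_var, ast_var=ast_var, method="wls", solver="sparse")

ds.tmpf.plot()
```

For double-ended setups the same pattern applies with `calibration_double_ended()` and the additional backward-channel variances (`rst_var`, `rast_var`), as notebook 08 below shows.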
] }, diff --git a/docs/notebooks/08Calibrate_double_ended.ipynb b/docs/notebooks/08Calibrate_double_ended.ipynb index c76fcdf4..2ed0f35b 100644 --- a/docs/notebooks/08Calibrate_double_ended.ipynb +++ b/docs/notebooks/08Calibrate_double_ended.ipynb @@ -56,7 +56,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Read the raw data files loaded from your DTS machine\n", + "## Read the raw data files loaded from your DTS machine\n", "Use `read_silixa_files` for reading files from a Silixa device. The following functions are available for reading files from other devices: `read_sensortran_files`, `read_apsensing_files`, and `read_sensornet_files`. See Notebook 1. If your DTS device was configured such that your forward and backward measurements are stored in separate folders, have a look at the `11Merge_single_measurements_into_double` example notebook. " ] }, { @@ -77,7 +77,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Align measurements of the forward and backward channels" + "## Align measurements of the forward and backward channels" ] }, { @@ -147,7 +147,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Define the reference fiber sections that have a known temperature\n", + "## Define the reference fiber sections that have a known temperature\n", "As explained in Notebook 3. DTS devices come with temperature probes to measure the temperature of the water baths. These measurements are stored in the data that was loaded in the previous step and are loaded automatically. In the case you would like to use an external temperature sensor, have a look at notebook `09Import_timeseries` to append those measurements to the `ds` before continuing with the calibration." ] }, { @@ -167,7 +167,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Estimate the variance of the noise in the Stokes and anti-Stokes measurements\n", + "## Estimate the variance of the noise in the Stokes and anti-Stokes measurements\n", "First calculate the variance in the measured Stokes and anti-Stokes signals, in the forward and backward direction. See Notebook 4 for more information.\n", "\n", "The Stokes and anti-Stokes signals should follow a smooth decaying exponential. This function fits a decaying exponential to each reference section for each time step. The variance of the residuals between the measured Stokes and anti-Stokes signals and the fitted signals is used as an estimate of the variance in measured signals. This algorithm assumes that the temperature is the same for the entire section but may vary over time and differ per section." @@ -219,7 +219,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Perform calibration and compute the temperature\n", + "## Perform calibration and compute the temperature\n", "We calibrate the measurements with a single method call. Three temperatures are estimated for double-ended setups. The temperature using the Stokes and anti-Stokes measurements of the forward channel, `tmpf`. The temperature of the backward channel, `tmpb`. And the weighted average of the two, `tmpw`. The latter is the best estimate of the temperature along the fiber with the smallest uncertainty." ] }, { @@ -264,7 +264,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Uncertainty of the calibrated temperature\n", + "## Uncertainty of the calibrated temperature\n", "The uncertainty of the calibrated temperature can be computed in two manners:\n", "1. 
The variance of the calibrated temperature can be approximated using linear error propagation\n", " - Very fast computation\n", diff --git a/docs/notebooks/14Lossy_splices.ipynb b/docs/notebooks/14Lossy_splices.ipynb index b7959031..c2c98e6e 100644 --- a/docs/notebooks/14Lossy_splices.ipynb +++ b/docs/notebooks/14Lossy_splices.ipynb @@ -24,7 +24,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Demonstration\n", + "## Demonstration\n", "To demonstrate the effect of a lossy splice, we'll load the same dataset that was used in previous notebooks, and modify the data to simulate a lossy splice." ] }, @@ -230,5 +230,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/notebooks/17Temperature_uncertainty_single_ended.ipynb b/docs/notebooks/17Temperature_uncertainty_single_ended.ipynb index 7fd9000d..f270404f 100644 --- a/docs/notebooks/17Temperature_uncertainty_single_ended.ipynb +++ b/docs/notebooks/17Temperature_uncertainty_single_ended.ipynb @@ -5,7 +5,7 @@ "id": "bcd29771", "metadata": {}, "source": [ - "# Uncertainty of the temperature estimated using single-ended calibration\n", + "# 17. Uncertainty of the temperature estimated using single-ended calibration\n", "After comleting single-ended calibration, you might be interested in inspecting the uncertainty of the estimated temperature.\n", "- Decomposing the uncertainty\n", "- Monte Carlo estimate of the standard error\n", diff --git a/docs/reference/index.rst b/docs/reference/index.rst index b73574c7..9d9e4821 100644 --- a/docs/reference/index.rst +++ b/docs/reference/index.rst @@ -5,5 +5,6 @@ Reference :glob: .. automodapi:: dtscalibration - :no-inheritance-diagram: :skip: plot_dask + :members: + :no-inheritance-diagram: diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 8a51afe0..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -IPython -matplotlib>=3.0.0 -nbsphinx -recommonmark -sphinx<6 -sphinx-automodapi --e .[docs] diff --git a/docs/usage.rst b/docs/usage.rst deleted file mode 100644 index 7b73d5bf..00000000 --- a/docs/usage.rst +++ /dev/null @@ -1,7 +0,0 @@ -===== -Usage -===== - -To use dtscalibration in a project:: - - import dtscalibration diff --git a/pyproject.toml b/pyproject.toml index 1fdadfda..7b05a985 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,10 +83,13 @@ dev = [ "nbformat", # Needed to run the tests ] docs = [ # Required for ReadTheDocs + "IPython", "myst_parser", + "matplotlib>=3.0.0", "sphinx", "sphinx_rtd_theme", "sphinx-autoapi", + "sphinx-automodapi", "coverage[toml]", "nbsphinx", "ipykernel", diff --git a/src/dtscalibration/__init__.py b/src/dtscalibration/__init__.py index 2087d2b9..3836a779 100644 --- a/src/dtscalibration/__init__.py +++ b/src/dtscalibration/__init__.py @@ -1,17 +1,17 @@ # coding=utf-8 from dtscalibration.datastore import DataStore -from dtscalibration.datastore import open_datastore -from dtscalibration.datastore import open_mf_datastore -from dtscalibration.datastore import read_apsensing_files -from dtscalibration.datastore import read_sensornet_files -from dtscalibration.datastore import read_sensortran_files -from dtscalibration.datastore import read_silixa_files from dtscalibration.datastore_utils import check_dims from dtscalibration.datastore_utils import check_timestep_allclose from dtscalibration.datastore_utils import get_netcdf_encoding from dtscalibration.datastore_utils import merge_double_ended from dtscalibration.datastore_utils import shift_double_ended from 
dtscalibration.datastore_utils import suggest_cable_shift_double_ended +from dtscalibration.io import open_datastore +from dtscalibration.io import open_mf_datastore +from dtscalibration.io import read_apsensing_files +from dtscalibration.io import read_sensornet_files +from dtscalibration.io import read_sensortran_files +from dtscalibration.io import read_silixa_files from dtscalibration.plot import plot_accuracy from dtscalibration.plot import plot_location_residuals_double_ended from dtscalibration.plot import plot_residuals_reference_sections @@ -41,7 +41,7 @@ ] # filenames = ['datastore.py', 'datastore_utils.py', 'calibrate_utils.py', -# 'plot.py', 'io.py'] +# 'plot.py', 'io_utils.py'] # filenames = ['plot.py'] # # for filename in filenames: diff --git a/src/dtscalibration/calibrate_utils.py b/src/dtscalibration/calibrate_utils.py index 66047af1..6e90e208 100644 --- a/src/dtscalibration/calibrate_utils.py +++ b/src/dtscalibration/calibrate_utils.py @@ -20,7 +20,7 @@ def parse_st_var(ds, st_var, st_label="st"): If `callable` the variance of the noise from the Stokes detector is a function of the intensity, as defined in the callable function. Or when the variance is a function of the intensity (Poisson - distributed) define a DataArray of the shape shape as ds.st, where the + distributed) define a DataArray of the shape as ds.st, where the variance can be a function of time and/or x. st_label : string Name of the (reverse) stokes/anti-stokes data variable which is being @@ -46,6 +46,147 @@ def parse_st_var(ds, st_var, st_label="st"): return st_var_sec +def calibration_single_ended_helper( + self, + st_var, + ast_var, + fix_alpha, + fix_dalpha, + fix_gamma, + matching_indices, + nt, + nta, + nx, + solver, +): + """Only used in `calibration_single_ended()`""" + for input_item in [st_var, ast_var]: + assert input_item is not None, ( + "For wls define all " "variances (`st_var`, " "`ast_var`) " + ) + calc_cov = True + split = calibration_single_ended_solver( + self, + st_var, + ast_var, + calc_cov=calc_cov, + solver="external_split", + matching_indices=matching_indices, + ) + y = split["y"] + w = split["w"] + + # Stack all X's + if fix_alpha: + assert not fix_dalpha, "Use either `fix_dalpha` or `fix_alpha`" + assert fix_alpha[0].size == self.x.size, ( + "fix_alpha also needs to be defined outside the reference " "sections" + ) + assert fix_alpha[1].size == self.x.size, ( + "fix_alpha also needs to be defined outside the reference " "sections" + ) + p_val = split["p0_est_alpha"].copy() + + if np.any(matching_indices): + raise NotImplementedError( + "Configuring fix_alpha and matching sections requires extra code" + ) + + X = sp.hstack( + (split["X_gamma"], split["X_alpha"], split["X_c"], split["X_TA"]) + ).tocsr() + ip_use = list(range(1 + nx + nt + nta * nt)) + + else: + X = sp.vstack( + ( + sp.hstack( + ( + split["X_gamma"], + split["X_dalpha"], + split["X_c"], + split["X_TA"], + ) + ), + split["X_m"], + ) + ).tocsr() + p_val = split["p0_est_dalpha"].copy() + ip_use = list(range(1 + 1 + nt + nta * nt)) + p_var = np.zeros_like(p_val) + p_cov = np.zeros((p_val.size, p_val.size), dtype=float) + if fix_gamma is not None: + ip_remove = [0] + ip_use = [i for i in ip_use if i not in ip_remove] + p_val[ip_remove] = fix_gamma[0] + p_var[ip_remove] = fix_gamma[1] + + X_gamma = ( + sp.vstack((split["X_gamma"], split["X_m"].tocsr()[:, 0].tocoo())) + .toarray() + .flatten() + ) + + y -= fix_gamma[0] * X_gamma + w = 1 / (1 / w + fix_gamma[1] * X_gamma) + if fix_alpha is not None: + ip_remove = 
list(range(1, nx + 1)) + ip_use = [i for i in ip_use if i not in ip_remove] + p_val[ip_remove] = fix_alpha[0] + p_var[ip_remove] = fix_alpha[1] + + # X_alpha needs to be vertically extended to support matching sections + y -= split["X_alpha"].dot(fix_alpha[0]) + w = 1 / (1 / w + split["X_alpha"].dot(fix_alpha[1])) + if fix_dalpha is not None: + ip_remove = [1] + ip_use = [i for i in ip_use if i not in ip_remove] + p_val[ip_remove] = fix_dalpha[0] + p_var[ip_remove] = fix_dalpha[1] + + y -= np.hstack( + ( + fix_dalpha[0] * split["X_dalpha"].toarray().flatten(), + ( + fix_dalpha[0] + * split["X_m"].tocsr()[:, 1].tocoo().toarray().flatten() + ), + ) + ) + w = 1 / ( + 1 / w + + np.hstack( + ( + fix_dalpha[1] * split["X_dalpha"].toarray().flatten(), + ( + fix_dalpha[1] + * split["X_m"].tocsr()[:, 1].tocoo().toarray().flatten() + ), + ) + ) + ) + if solver == "sparse": + out = wls_sparse( + X[:, ip_use], + y, + w=w, + x0=p_val[ip_use], + calc_cov=calc_cov, + verbose=False, + ) + + elif solver == "stats": + out = wls_stats(X[:, ip_use], y, w=w, calc_cov=calc_cov, verbose=False) + + else: + assert 0, "Unknown solver" + p_val[ip_use] = out[0] + p_var[ip_use] = out[1] + np.fill_diagonal(p_cov, p_var) # set variance of all fixed params + p_cov[np.ix_(ip_use, ip_use)] = out[2] + return p_cov, p_val, p_var + + def calibration_single_ended_solver( # noqa: MC0001 ds, st_var=None, @@ -342,6 +483,623 @@ def calibration_single_ended_solver( # noqa: MC0001 return (p_sol, p_var, p_cov) if calc_cov else (p_sol, p_var) +def calibration_double_ended_helper( + self, + st_var, + ast_var, + rst_var, + rast_var, + fix_alpha, + fix_gamma, + nt, + nta, + nx, + nx_sec, + ix_sec, + matching_indices, + solver, + verbose, +): + if fix_alpha or fix_gamma: + split = calibration_double_ended_solver( + self, + st_var, + ast_var, + rst_var, + rast_var, + calc_cov=True, + solver="external_split", + matching_indices=matching_indices, + verbose=verbose, + ) + else: + out = calibration_double_ended_solver( + self, + st_var, + ast_var, + rst_var, + rast_var, + calc_cov=True, + solver=solver, + matching_indices=matching_indices, + verbose=verbose, + ) + + p_val, p_var, p_cov = out + # adjust split to fix parameters + """Wrapped in a function to reduce memory usage. + Constructing: + Z_gamma (nt * nx, 1). Data: positive 1/temp + Z_D (nt * nx, nt). Data: ones + E (nt * nx, nx). Data: ones + Zero_gamma (nt * nx, 1) + zero_d (nt * nx, nt) + Z_TA_fw (nt * nx, nta * 2 * nt) minus ones + Z_TA_bw (nt * nx, nta * 2 * nt) minus ones + Z_TA_E (nt * nx, nta * 2 * nt) + I_fw = 1/Tref*gamma - D_fw - E - TA_fw + I_bw = 1/Tref*gamma - D_bw + E - TA_bw + (I_bw - I_fw) / 2 = D_fw/2 - D_bw/2 + E + TA_fw/2 - TA_bw/2 Eq42 + """ + if fix_alpha and fix_gamma: + assert np.size(fix_alpha[0]) == self.x.size, "define alpha for each location" + assert ( + np.size(fix_alpha[1]) == self.x.size + ), "define var alpha for each location" + m = ( + "The integrated differential attenuation is zero at the " + "first index of the reference sections." 
+ ) + assert np.abs(fix_alpha[0][ix_sec[0]]) < 1e-8, m + # The array with the integrated differential att is termed E + + if np.any(matching_indices): + n_E_in_cal = split["ix_from_cal_match_to_glob"].size + p0_est = np.concatenate( + ( + split["p0_est"][1 : 1 + 2 * nt], + split["p0_est"][1 + 2 * nt + n_E_in_cal :], + ) + ) + X_E1 = sp.csr_matrix(([], ([], [])), shape=(nt * nx_sec, self.x.size)) + X_E1[:, ix_sec[1:]] = split["E"] + X_E2 = X_E1[:, split["ix_from_cal_match_to_glob"]] + X_E = sp.vstack( + ( + -X_E2, + X_E2, + split["E_match_F"], + split["E_match_B"], + split["E_match_no_cal"], + ) + ) + + X_gamma = ( + sp.vstack( + ( + split["Z_gamma"], + split["Z_gamma"], + split["Zero_eq12_gamma"], + split["Zero_eq12_gamma"], + split["Zero_eq3_gamma"], + ) + ) + .toarray() + .flatten() + ) + + X = sp.vstack( + ( + sp.hstack((-split["Z_D"], split["Zero_d"], split["Z_TA_fw"])), + sp.hstack((split["Zero_d"], -split["Z_D"], split["Z_TA_bw"])), + sp.hstack((split["Zero_d_eq12"], split["Z_TA_eq1"])), + sp.hstack((split["Zero_d_eq12"], split["Z_TA_eq2"])), + sp.hstack((split["d_no_cal"], split["Z_TA_eq3"])), + ) + ) + + y = np.concatenate( + ( + split["y_F"], + split["y_B"], + split["y_eq1"], + split["y_eq2"], + split["y_eq3"], + ) + ) + y -= X_E.dot(fix_alpha[0][split["ix_from_cal_match_to_glob"]]) + y -= fix_gamma[0] * X_gamma + + # variances are added. weight is the inverse of the variance + # of the observations + w_ = np.concatenate( + ( + split["w_F"], + split["w_B"], + split["w_eq1"], + split["w_eq2"], + split["w_eq3"], + ) + ) + w = 1 / ( + 1 / w_ + + X_E.dot(fix_alpha[1][split["ix_from_cal_match_to_glob"]]) + + fix_gamma[1] * X_gamma + ) + + else: + # X_gamma + X_E = sp.vstack((-split["E"], split["E"])) + X_gamma = ( + sp.vstack((split["Z_gamma"], split["Z_gamma"])).toarray().flatten() + ) + # Use only the remaining coefficients + # Stack all X's + X = sp.vstack( + ( + sp.hstack((-split["Z_D"], split["Zero_d"], split["Z_TA_fw"])), + sp.hstack((split["Zero_d"], -split["Z_D"], split["Z_TA_bw"])), + ) + ) + + # Move the coefficients times the fixed gamma to the + # observations + y = np.concatenate((split["y_F"], split["y_B"])) + y -= X_E.dot(fix_alpha[0][ix_sec[1:]]) + y -= fix_gamma[0] * X_gamma + # variances are added. weight is the inverse of the variance + # of the observations + w_ = np.concatenate((split["w_F"], split["w_B"])) + w = 1 / ( + 1 / w_ + X_E.dot(fix_alpha[1][ix_sec[1:]]) + fix_gamma[1] * X_gamma + ) + + # [C_1, C_2, .., C_nt, TA_fw_a_1, TA_fw_a_2, TA_fw_a_nt, + # TA_bw_a_1, TA_bw_a_2, TA_bw_a_nt] Then continues with + # TA for connector b. 
+ p0_est = np.concatenate( + ( + split["p0_est"][1 : 1 + 2 * nt], + split["p0_est"][1 + 2 * nt + nx_sec - 1 :], + ) + ) + + if solver == "sparse": + out = wls_sparse(X, y, w=w, x0=p0_est, calc_cov=True, verbose=False) + + elif solver == "stats": + out = wls_stats(X, y, w=w, calc_cov=True, verbose=False) + + # Added fixed gamma and its variance to the solution + p_val = np.concatenate( + ([fix_gamma[0]], out[0][: 2 * nt], fix_alpha[0], out[0][2 * nt :]) + ) + p_var = np.concatenate( + ([fix_gamma[1]], out[1][: 2 * nt], fix_alpha[1], out[1][2 * nt :]) + ) + + # whether it returns a copy or a view depends on what + # version of numpy you are using + p_cov = np.diag(p_var).copy() + from_i = np.concatenate( + ( + np.arange(1, 2 * nt + 1), + np.arange(1 + 2 * nt + nx, 1 + 2 * nt + nx + nta * nt * 2), + ) + ) + iox_sec1, iox_sec2 = np.meshgrid(from_i, from_i, indexing="ij") + p_cov[iox_sec1, iox_sec2] = out[2] + + elif fix_gamma: + if np.any(matching_indices): + # n_E_in_cal = split['ix_from_cal_match_to_glob'].size + p0_est = split["p0_est"][1:] + X_E1 = sp.csr_matrix(([], ([], [])), shape=(nt * nx_sec, self.x.size)) + from_i = ix_sec[1:] + X_E1[:, from_i] = split["E"] + X_E2 = X_E1[:, split["ix_from_cal_match_to_glob"]] + X = sp.vstack( + ( + sp.hstack( + ( + -split["Z_D"], + split["Zero_d"], + -X_E2, + split["Z_TA_fw"], + ) + ), + sp.hstack((split["Zero_d"], -split["Z_D"], X_E2, split["Z_TA_bw"])), + sp.hstack( + ( + split["Zero_d_eq12"], + split["E_match_F"], + split["Z_TA_eq1"], + ) + ), + sp.hstack( + ( + split["Zero_d_eq12"], + split["E_match_B"], + split["Z_TA_eq2"], + ) + ), + sp.hstack( + ( + split["d_no_cal"], + split["E_match_no_cal"], + split["Z_TA_eq3"], + ) + ), + ) + ) + X_gamma = ( + sp.vstack( + ( + split["Z_gamma"], + split["Z_gamma"], + split["Zero_eq12_gamma"], + split["Zero_eq12_gamma"], + split["Zero_eq3_gamma"], + ) + ) + .toarray() + .flatten() + ) + + y = np.concatenate( + ( + split["y_F"], + split["y_B"], + split["y_eq1"], + split["y_eq2"], + split["y_eq3"], + ) + ) + y -= fix_gamma[0] * X_gamma + + # variances are added. weight is the inverse of the variance + # of the observations + w_ = np.concatenate( + ( + split["w_F"], + split["w_B"], + split["w_eq1"], + split["w_eq2"], + split["w_eq3"], + ) + ) + w = 1 / (1 / w_ + fix_gamma[1] * X_gamma) + + else: + X_gamma = ( + sp.vstack((split["Z_gamma"], split["Z_gamma"])).toarray().flatten() + ) + # Use only the remaining coefficients + X = sp.vstack( + ( + sp.hstack( + ( + -split["Z_D"], + split["Zero_d"], + -split["E"], + split["Z_TA_fw"], + ) + ), + sp.hstack( + ( + split["Zero_d"], + -split["Z_D"], + split["E"], + split["Z_TA_bw"], + ) + ), + ) + ) + # Move the coefficients times the fixed gamma to the + # observations + y = np.concatenate((split["y_F"], split["y_B"])) + y -= fix_gamma[0] * X_gamma + # variances are added. 
weight is the inverse of the variance + # of the observations + w_ = np.concatenate((split["w_F"], split["w_B"])) + w = 1 / (1 / w_ + fix_gamma[1] * X_gamma) + + p0_est = split["p0_est"][1:] + + if solver == "sparse": + out = wls_sparse(X, y, w=w, x0=p0_est, calc_cov=True, verbose=False) + + elif solver == "stats": + out = wls_stats(X, y, w=w, calc_cov=True, verbose=False) + + # put E outside of reference section in solution + # concatenating makes a copy of the data instead of using a + # pointer + ds_sub = self[["st", "ast", "rst", "rast", "trans_att"]] + ds_sub["df"] = (("time",), out[0][:nt]) + ds_sub["df_var"] = (("time",), out[1][:nt]) + ds_sub["db"] = (("time",), out[0][nt : 2 * nt]) + ds_sub["db_var"] = (("time",), out[1][nt : 2 * nt]) + + if nta > 0: + if np.any(matching_indices): + n_E_in_cal = split["ix_from_cal_match_to_glob"].size + ta = out[0][2 * nt + n_E_in_cal :].reshape((nt, 2, nta), order="F") + ta_var = out[1][2 * nt + n_E_in_cal :].reshape((nt, 2, nta), order="F") + + else: + ta = out[0][2 * nt + nx_sec - 1 :].reshape((nt, 2, nta), order="F") + ta_var = out[1][2 * nt + nx_sec - 1 :].reshape((nt, 2, nta), order="F") + + talpha_fw = ta[:, 0, :] + talpha_bw = ta[:, 1, :] + talpha_fw_var = ta_var[:, 0, :] + talpha_bw_var = ta_var[:, 1, :] + else: + talpha_fw = None + talpha_bw = None + talpha_fw_var = None + talpha_bw_var = None + + E_all_exact, E_all_var_exact = calc_alpha_double( + mode="exact", + ds=ds_sub, + st_var=st_var, + ast_var=ast_var, + rst_var=rst_var, + rast_var=rast_var, + ix_alpha_is_zero=ix_sec[0], + talpha_fw=talpha_fw, + talpha_bw=talpha_bw, + talpha_fw_var=talpha_fw_var, + talpha_bw_var=talpha_bw_var, + ) + + if not np.any(matching_indices): + # Added fixed gamma and its variance to the solution. And + # expand to include locations outside reference sections. + p_val = np.concatenate( + ( + [fix_gamma[0]], + out[0][: 2 * nt], + E_all_exact, + out[0][2 * nt + nx_sec - 1 :], + ) + ) + p_val[1 + 2 * nt + ix_sec[1:]] = out[0][2 * nt : 2 * nt + nx_sec - 1] + p_val[1 + 2 * nt + ix_sec[0]] = 0.0 + p_var = np.concatenate( + ( + [fix_gamma[1]], + out[1][: 2 * nt], + E_all_var_exact, + out[1][2 * nt + nx_sec - 1 :], + ) + ) + p_var[1 + 2 * nt + ix_sec[1:]] = out[1][2 * nt : 2 * nt + nx_sec - 1] + else: + n_E_in_cal = split["ix_from_cal_match_to_glob"].size + + # Added fixed gamma and its variance to the solution. And + # expand to include locations outside reference sections. 
+ p_val = np.concatenate( + ( + [fix_gamma[0]], + out[0][: 2 * nt], + E_all_exact, + out[0][2 * nt + n_E_in_cal :], + ) + ) + p_val[1 + 2 * nt + split["ix_from_cal_match_to_glob"]] = out[0][ + 2 * nt : 2 * nt + n_E_in_cal + ] + p_val[1 + 2 * nt + ix_sec[0]] = 0.0 + p_var = np.concatenate( + ( + [fix_gamma[1]], + out[1][: 2 * nt], + E_all_var_exact, + out[1][2 * nt + n_E_in_cal :], + ) + ) + p_var[1 + 2 * nt + split["ix_from_cal_match_to_glob"]] = out[1][ + 2 * nt : 2 * nt + n_E_in_cal + ] + + p_cov = np.diag(p_var).copy() + + if not np.any(matching_indices): + from_i = np.concatenate( + ( + np.arange(1, 2 * nt + 1), + 2 * nt + 1 + ix_sec[1:], + np.arange(1 + 2 * nt + nx, 1 + 2 * nt + nx + nta * nt * 2), + ) + ) + else: + from_i = np.concatenate( + ( + np.arange(1, 2 * nt + 1), + 2 * nt + 1 + split["ix_from_cal_match_to_glob"], + np.arange(1 + 2 * nt + nx, 1 + 2 * nt + nx + nta * nt * 2), + ) + ) + + iox_sec1, iox_sec2 = np.meshgrid(from_i, from_i, indexing="ij") + p_cov[iox_sec1, iox_sec2] = out[2] + + elif fix_alpha: + assert np.size(fix_alpha[0]) == self.x.size, "define alpha for each location" + assert ( + np.size(fix_alpha[1]) == self.x.size + ), "define var alpha for each location" + m = ( + "The integrated differential attenuation is zero at the " + "first index of the reference sections." + ) + assert np.abs(fix_alpha[0][ix_sec[0]]) < 1e-6, m + # The array with the integrated differential att is termed E + + if not np.any(matching_indices): + # X_gamma + X_E = sp.vstack((-split["E"], split["E"])) + # Use only the remaining coefficients + # Stack all X's + X = sp.vstack( + ( + sp.hstack( + ( + split["Z_gamma"], + -split["Z_D"], + split["Zero_d"], + split["Z_TA_fw"], + ) + ), + sp.hstack( + ( + split["Z_gamma"], + split["Zero_d"], + -split["Z_D"], + split["Z_TA_bw"], + ) + ), + ) + ) + # Move the coefficients times the fixed gamma to the + # observations + y = np.concatenate((split["y_F"], split["y_B"])) + y -= X_E.dot(fix_alpha[0][ix_sec[1:]]) + + # variances are added. weight is the inverse of the variance + # of the observations + w_ = np.concatenate((split["w_F"], split["w_B"])) + w = 1 / (1 / w_ + X_E.dot(fix_alpha[1][ix_sec[1:]])) + + p0_est = np.concatenate( + ( + split["p0_est"][: 1 + 2 * nt], + split["p0_est"][1 + 2 * nt + nx_sec - 1 :], + ) + ) + + else: + n_E_in_cal = split["ix_from_cal_match_to_glob"].size + p0_est = np.concatenate( + ( + split["p0_est"][: 1 + 2 * nt], + split["p0_est"][1 + 2 * nt + n_E_in_cal :], + ) + ) + X_E1 = sp.csr_matrix(([], ([], [])), shape=(nt * nx_sec, self.x.size)) + X_E1[:, ix_sec[1:]] = split["E"] + X_E2 = X_E1[:, split["ix_from_cal_match_to_glob"]] + X_E = sp.vstack( + ( + -X_E2, + X_E2, + split["E_match_F"], + split["E_match_B"], + split["E_match_no_cal"], + ) + ) + + X = sp.vstack( + ( + sp.hstack( + ( + split["Z_gamma"], + -split["Z_D"], + split["Zero_d"], + split["Z_TA_fw"], + ) + ), + sp.hstack( + ( + split["Z_gamma"], + split["Zero_d"], + -split["Z_D"], + split["Z_TA_bw"], + ) + ), + sp.hstack( + ( + split["Zero_eq12_gamma"], + split["Zero_d_eq12"], + split["Z_TA_eq1"], + ) + ), + sp.hstack( + ( + split["Zero_eq12_gamma"], + split["Zero_d_eq12"], + split["Z_TA_eq2"], + ) + ), + sp.hstack( + ( + split["Zero_eq3_gamma"], + split["d_no_cal"], + split["Z_TA_eq3"], + ) + ), + ) + ) + + y = np.concatenate( + ( + split["y_F"], + split["y_B"], + split["y_eq1"], + split["y_eq2"], + split["y_eq3"], + ) + ) + y -= X_E.dot(fix_alpha[0][split["ix_from_cal_match_to_glob"]]) + + # variances are added. 
weight is the inverse of the variance + # of the observations + w_ = np.concatenate( + ( + split["w_F"], + split["w_B"], + split["w_eq1"], + split["w_eq2"], + split["w_eq3"], + ) + ) + w = 1 / (1 / w_ + X_E.dot(fix_alpha[1][split["ix_from_cal_match_to_glob"]])) + + if solver == "sparse": + out = wls_sparse(X, y, w=w, x0=p0_est, calc_cov=True, verbose=False) + + elif solver == "stats": + out = wls_stats(X, y, w=w, calc_cov=True, verbose=False) + + # Added fixed gamma and its variance to the solution + p_val = np.concatenate( + (out[0][: 1 + 2 * nt], fix_alpha[0], out[0][1 + 2 * nt :]) + ) + p_var = np.concatenate( + (out[1][: 1 + 2 * nt], fix_alpha[1], out[1][1 + 2 * nt :]) + ) + + p_cov = np.diag(p_var).copy() + + from_i = np.concatenate( + ( + np.arange(1 + 2 * nt), + np.arange(1 + 2 * nt + nx, 1 + 2 * nt + nx + nta * nt * 2), + ) + ) + + iox_sec1, iox_sec2 = np.meshgrid(from_i, from_i, indexing="ij") + p_cov[iox_sec1, iox_sec2] = out[2] + + else: + pass + return p_cov, p_val, p_var + + def calibration_double_ended_solver( # noqa: MC0001 ds, st_var=None, diff --git a/src/dtscalibration/datastore.py b/src/dtscalibration/datastore.py index ae5c92b0..814c21b0 100644 --- a/src/dtscalibration/datastore.py +++ b/src/dtscalibration/datastore.py @@ -1,6 +1,3 @@ -import fnmatch -import glob -import inspect import os import warnings from typing import Dict @@ -16,25 +13,16 @@ from scipy.optimize import minimize from scipy.sparse import linalg as ln -from dtscalibration.calibrate_utils import calc_alpha_double -from dtscalibration.calibrate_utils import calibration_double_ended_solver -from dtscalibration.calibrate_utils import calibration_single_ended_solver +from dtscalibration.calibrate_utils import calibration_double_ended_helper +from dtscalibration.calibrate_utils import calibration_single_ended_helper from dtscalibration.calibrate_utils import match_sections from dtscalibration.calibrate_utils import parse_st_var -from dtscalibration.calibrate_utils import wls_sparse -from dtscalibration.calibrate_utils import wls_stats +from dtscalibration.datastore_utils import ParameterIndexDoubleEnded +from dtscalibration.datastore_utils import ParameterIndexSingleEnded +from dtscalibration.datastore_utils import check_deprecated_kwargs from dtscalibration.datastore_utils import check_timestep_allclose -from dtscalibration.io import _dim_attrs -from dtscalibration.io import apsensing_xml_version_check -from dtscalibration.io import read_apsensing_files_routine -from dtscalibration.io import read_sensornet_files_routine_v3 -from dtscalibration.io import read_sensortran_files_routine -from dtscalibration.io import read_silixa_files_routine_v4 -from dtscalibration.io import read_silixa_files_routine_v6 -from dtscalibration.io import sensornet_ddf_version_check -from dtscalibration.io import sensortran_binary_version_check -from dtscalibration.io import silixa_xml_version_check -from dtscalibration.io import ziphandle_to_filepathlist +from dtscalibration.datastore_utils import ufunc_per_section_helper +from dtscalibration.io_utils import _dim_attrs dtsattr_namelist = ["double_ended_flag"] dim_attrs = {k: v for kl, v in _dim_attrs.items() for k in kl} @@ -46,7 +34,7 @@ class DataStore(xr.Dataset): """The data class that stores the measurements, contains calibration methods to relate Stokes and anti-Stokes to temperature. The user should - never initiate this class directly, but use read_xml_dir or open_datastore + never initiate this class directly, but use open_datastore functions instead. 
Parameters @@ -85,7 +73,6 @@ class DataStore(xr.Dataset): See Also -------- - dtscalibration.read_xml_dir : Load measurements stored in XML-files dtscalibration.open_datastore : Load (calibrated) measurements from netCDF-like file """ @@ -219,77 +206,117 @@ def sections(self): """ if "_sections" not in self.attrs: self.attrs["_sections"] = yaml.dump(None) + return yaml.load(self.attrs["_sections"], Loader=yaml.UnsafeLoader) @sections.setter def sections(self, sections: Dict[str, List[slice]]): - sections_fix_slice_fixed = None - - if sections: - assert isinstance(sections, dict) - - # be less restrictive for capitalized labels - # find lower cases label - labels = np.reshape( - [[s.lower(), s] for s in self.data_vars.keys()], (-1,) - ).tolist() - - sections_fix = dict() - for k, v in sections.items(): - if k.lower() in labels: - i_lower_case = labels.index(k.lower()) - i_normal_case = i_lower_case + 1 - k_normal_case = labels[i_normal_case] - sections_fix[k_normal_case] = v - else: - assert k in self.data_vars, ( - "The keys of the " - "sections-dictionary should " - "refer to a valid timeserie " - "already stored in " - "ds.data_vars " - ) + sections_validated = None - sections_fix_slice_fixed = dict() + if sections is not None: + sections_validated = self.validate_sections(sections=sections) - for k, v in sections_fix.items(): - assert isinstance(v, (list, tuple)), ( - "The values of the sections-dictionary " - "should be lists of slice objects." - ) + self.attrs["_sections"] = yaml.dump(sections_validated) + pass - for vi in v: - assert isinstance(vi, slice), ( - "The values of the sections-dictionary should " - "be lists of slice objects." - ) + @sections.deleter + def sections(self): + self.sections = None + pass - assert self.x.sel(x=vi).size > 0, ( - f"Better define the {k} section. You tried {vi}, " - "which is out of reach" - ) + def validate_sections(self, sections: Dict[str, List[slice]]): + assert isinstance(sections, dict) + + # be less restrictive for capitalized labels + # find lower cases label + labels = np.reshape( + [[s.lower(), s] for s in self.data_vars.keys()], (-1,) + ).tolist() + + sections_fix = dict() + for k, v in sections.items(): + if k.lower() in labels: + i_lower_case = labels.index(k.lower()) + i_normal_case = i_lower_case + 1 + k_normal_case = labels[i_normal_case] + sections_fix[k_normal_case] = v + else: + assert k in self.data_vars, ( + "The keys of the " + "sections-dictionary should " + "refer to a valid timeserie " + "already stored in " + "ds.data_vars " + ) - # sorted stretches - stretch_unsort = [slice(float(vi.start), float(vi.stop)) for vi in v] - stretch_start = [i.start for i in stretch_unsort] - stretch_i_sorted = np.argsort(stretch_start) - sections_fix_slice_fixed[k] = [ - stretch_unsort[i] for i in stretch_i_sorted - ] + sections_fix_slice_fixed = dict() - # Prevent overlapping slices - ix_sec = self.ufunc_per_section( - sections=sections_fix_slice_fixed, x_indices=True, calc_per="all" + for k, v in sections_fix.items(): + assert isinstance(v, (list, tuple)), ( + "The values of the sections-dictionary " + "should be lists of slice objects." ) - assert np.unique(ix_sec).size == ix_sec.size, "The sections are overlapping" - self.attrs["_sections"] = yaml.dump(sections_fix_slice_fixed) - pass + for vi in v: + assert isinstance(vi, slice), ( + "The values of the sections-dictionary should " + "be lists of slice objects." 
+ ) - @sections.deleter - def sections(self): - self.sections = None - pass + assert self.x.sel(x=vi).size > 0, ( + f"Better define the {k} section. You tried {vi}, " + "which is not within the x-dimension" + ) + + # sorted stretches + stretch_unsort = [slice(float(vi.start), float(vi.stop)) for vi in v] + stretch_start = [i.start for i in stretch_unsort] + stretch_i_sorted = np.argsort(stretch_start) + sections_fix_slice_fixed[k] = [stretch_unsort[i] for i in stretch_i_sorted] + + # Prevent overlapping slices + ix_sec = self.ufunc_per_section( + sections=sections_fix_slice_fixed, x_indices=True, calc_per="all" + ) + assert np.unique(ix_sec).size == ix_sec.size, "The sections are overlapping" + + return sections_fix_slice_fixed + + def check_reference_section_values(self): + """ + Checks if the values of the used sections are of the right datatype + (floats), if there are finite number (no NaN/inf), and if the time + dimension corresponds with the time dimension of the st/ast data. + + Parameters + ---------- + + Returns + ------- + + """ + for key in self.sections.keys(): + if not np.issubdtype(self[key].dtype, np.floating): + raise ValueError( + 'Data of reference temperature "' + + key + + '" does not have a float data type. Please ensure that ' + "the data is of a valid type (e.g. np.float32)" + ) + + if np.any(~np.isfinite(self[key].values)): + raise ValueError( + 'NaN/inf value(s) found in reference temperature "' + key + '"' + ) + + if self[key].dims != ("time",): + raise ValueError( + "Time dimension of the reference temperature timeseries " + + key + + "is not the same as the time dimension" + + " of the Stokes measurement. See examples/notebooks/09" + + "Import_timeseries.ipynb for more info" + ) @property def is_double_ended(self) -> float: @@ -481,7 +508,7 @@ def to_mf_netcdf( compute=True, time_chunks_from_key="st", ): - """Write DataStore to multiple to multiple netCDF files. + """Write DataStore to multiple netCDF files. Splits the DataStore along the time dimension using the chunks. It first checks if all chunks in `ds` are time aligned. If this is not @@ -741,66 +768,6 @@ def get_section_indices(self, sec): xis = self.x.astype(int) * 0 + np.arange(self.x.size, dtype=int) return xis.sel(x=sec).values - def check_deprecated_kwargs(self, kwargs): - """ - Internal function that parses the `kwargs` for depreciated keyword - arguments. - - Depreciated keywords raise an error, pending to be depreciated do not. - But this requires that the code currently deals with those arguments. - - Parameters - ---------- - kwargs : Dict - A dictionary with keyword arguments. - - Returns - ------- - - """ - msg = """Previously, it was possible to manually set the label from - which the Stokes and anti-Stokes were read within the DataStore - object. To reduce the clutter in the code base and be able to - maintain it, this option was removed. - See: https://github.com/dtscalibration/python-dts-calibration/issues/81 - - The new **fixed** names are: st, ast, rst, rast. - - It is still possible to use the previous defaults, for example when - reading stored measurements from netCDF, by renaming the labels. The - old default labels were ST, AST, REV-ST, REV-AST. 
- - ``` - ds = open_datastore(path_to_old_file) - ds = ds.rename_labels() - ds.calibration_double_ended( - st_var=1.5, - ast_var=1.5, - rst_var=1., - rast_var=1., - method='wls') - ``` - - ds.tmpw.plot() - """ - list_of_depr = ["st_label", "ast_label", "rst_label", "rast_label"] - list_of_pending_depr = ["transient_asym_att_x", "transient_att_x"] - - kwargs = {k: v for k, v in kwargs.items() if k not in list_of_pending_depr} - - for k in kwargs: - if k in list_of_depr: - raise NotImplementedError(msg) - - if len(kwargs) != 0: - raise NotImplementedError( - "The following keywords are not " - + "supported: " - + ", ".join(kwargs.keys()) - ) - - pass - def rename_labels(self, assertion=True): """ Renames the `ST` DataArrays (old convention) to `st` (new convention). @@ -972,21 +939,33 @@ def variance_stokes_constant(self, st_label, sections=None, reshape_residuals=Tr dtscalibration/python-dts-calibration/blob/main/examples/notebooks/\ 04Calculate_variance_Stokes.ipynb>`_ """ - if sections: - self.sections = sections + + # var_I, resid = variance_stokes_constant_util(st_label, sections=None, reshape_residuals=True) + # def variance_stokes_constant_util(st_label, sections=None, reshape_residuals=True): + def func_fit(p, xs): + return p[:xs, None] * p[None, xs:] + + def func_cost(p, data, xs): + fit = func_fit(p, xs) + return np.sum((fit - data) ** 2) + + if sections is None: + sections = self.sections else: - assert self.sections, "sections are not defined" + sections = self.validate_sections(sections) assert self[st_label].dims[0] == "x", "Stokes are transposed" check_timestep_allclose(self, eps=0.01) - data_dict = da.compute( - self.ufunc_per_section(label=st_label, calc_per="stretch") - )[ - 0 - ] # should maybe be per section. But then residuals + # should maybe be per section. But then residuals # seem to be correlated between stretches. I don't know why.. BdT. + data_dict = da.compute( + self.ufunc_per_section( + sections=sections, label=st_label, calc_per="stretch" + ) + )[0] + resid_list = [] for k, v in data_dict.items(): @@ -1011,7 +990,9 @@ def variance_stokes_constant(self, st_label, sections=None, reshape_residuals=Tr return var_I, resid else: - ix_resid = self.ufunc_per_section(x_indices=True, calc_per="all") + ix_resid = self.ufunc_per_section( + sections=sections, x_indices=True, calc_per="all" + ) resid_sorted = np.full(shape=self[st_label].shape, fill_value=np.nan) resid_sorted[ix_resid, :] = resid @@ -1148,10 +1129,10 @@ def variance_stokes_exponential( dtscalibration/python-dts-calibration/blob/main/examples/notebooks/\ 04Calculate_variance_Stokes.ipynb>`_ """ - if sections: - self.sections = sections + if sections is None: + sections = self.sections else: - assert self.sections, "sections are not defined" + sections = self.validate_sections(sections) assert self[st_label].dims[0] == "x", "Stokes are transposed" @@ -1164,7 +1145,7 @@ def variance_stokes_exponential( y_list = [] # intensities of stokes x_list = [] # length rel to start of section. 
for alpha - for k, stretches in self.sections.items(): + for k, stretches in sections.items(): for stretch in stretches: y_list.append(self[st_label].sel(x=stretch).data.T.reshape(-1)) _x = self.x.sel(x=stretch).data.copy() @@ -1258,6 +1239,7 @@ def variance_stokes_exponential( if not reshape_residuals: return var_I, resid + else: # restructure the residuals, such that they can be plotted and # added to ds @@ -1274,7 +1256,9 @@ def variance_stokes_exponential( resid_res.append(resid[lenis:lenie].T.reshape((nt, leni)).T) _resid = np.concatenate(resid_res) - _resid_x = self.ufunc_per_section(label="x", calc_per="all") + _resid_x = self.ufunc_per_section( + sections=sections, label="x", calc_per="all" + ) isort = np.argsort(_resid_x) resid_x = _resid_x[isort] # get indices from ufunc directly resid = _resid[isort, :] @@ -1288,7 +1272,7 @@ def variance_stokes_exponential( return var_I, resid_da def variance_stokes_linear( - self, st_label, sections=None, nbin=50, through_zero=True, plot_fit=False + self, st_label, sections=None, nbin=50, through_zero=False, plot_fit=False ): """ Approximate the variance of the noise in Stokes intensity measurements @@ -1403,15 +1387,17 @@ def variance_stokes_linear( """ import matplotlib.pyplot as plt - if sections: - self.sections = sections + if sections is None: + sections = self.sections else: - assert self.sections, "sections are not defined" + sections = self.validate_sections(sections) assert self[st_label].dims[0] == "x", "Stokes are transposed" - _, resid = self.variance_stokes(st_label=st_label) + _, resid = self.variance_stokes(sections=sections, st_label=st_label) - ix_sec = self.ufunc_per_section(x_indices=True, calc_per="all") + ix_sec = self.ufunc_per_section( + sections=sections, x_indices=True, calc_per="all" + ) st = self.isel(x=ix_sec)[st_label].values.ravel() diff_st = resid.isel(x=ix_sec).values.ravel() @@ -1438,6 +1424,7 @@ def variance_stokes_linear( # VAR(Stokes) = slope * Stokes offset = 0.0 slope = np.linalg.lstsq(st_sort_mean[:, None], st_sort_var, rcond=None)[0] + else: # VAR(Stokes) = slope * Stokes + offset slope, offset = np.linalg.lstsq( @@ -1453,7 +1440,9 @@ def variance_stokes_linear( f"not possible. Most likely, your {st_label} do " f"not vary enough to fit a linear curve. Either " f"use `through_zero` option or use " - f"`ds.variance_stokes_constant()`" + f"`ds.variance_stokes_constant()`. Another reason " + f"could be that your sections are defined to be " + f"wider than they actually are." ) def var_fun(stokes): @@ -1480,7 +1469,7 @@ def var_fun(stokes): return slope, offset, st_sort_mean, st_sort_var, resid, var_fun def i_var(self, st_var, ast_var, st_label="st", ast_label="ast"): - """ + r""" Compute the variance of an observation given the stokes and anti-Stokes intensities and their variance. The variance, :math:`\sigma^2_{I_{m,n}}`, of the distribution of the @@ -1612,51 +1601,7 @@ def inverse_variance_weighted_mean_array( pass - def in_confidence_interval(self, ci_label, conf_ints=None, sections=None): - """ - Returns an array with bools wether the temperature of the reference - sections are within the confidence intervals - - Parameters - ---------- - sections : Dict[str, List[slice]] - ci_label : str - The label of the data containing the confidence intervals. - conf_ints : Tuple - A tuple containing two floats between 0 and 1, representing the - levels between which the reference temperature should lay. 
- - Returns - ------- - - """ - if sections is None: - sections = self.sections - - if conf_ints is None: - conf_ints = self[ci_label].values - - assert len(conf_ints) == 2, "Please define conf_ints" - - tmp_dn = self[ci_label].sel(CI=conf_ints[0], method="nearest") - tmp_up = self[ci_label].sel(CI=conf_ints[1], method="nearest") - - ref = self.ufunc_per_section( - sections=sections, label="st", ref_temp_broadcasted=True, calc_per="all" - ) - ix_resid = self.ufunc_per_section( - sections=sections, x_indices=True, calc_per="all" - ) - ref_sorted = np.full(shape=tmp_dn.shape, fill_value=np.nan) - ref_sorted[ix_resid, :] = ref - ref_da = xr.DataArray(data=ref_sorted, coords=tmp_dn.coords) - - mask_dn = ref_da >= tmp_dn - mask_up = ref_da <= tmp_up - - return np.logical_and(mask_dn, mask_up) - - def set_trans_att(self, trans_att=None, **kwargs): + def set_trans_att(self, trans_att=None): """Gracefully set the locations that introduce directional differential attenuation @@ -1671,23 +1616,6 @@ def set_trans_att(self, trans_att=None, **kwargs): If multiple locations are defined, the losses are added. """ - - if "transient_att_x" in kwargs: - warnings.warn( - "transient_att_x argument will be deprecated in version 2, " - "use trans_att", - DeprecationWarning, - ) - trans_att = kwargs["transient_att_x"] - - if "transient_asym_att_x" in kwargs: - warnings.warn( - "transient_asym_att_x arg will be deprecated in version 2, " - "use trans_att", - DeprecationWarning, - ) - trans_att = kwargs["transient_asym_att_x"] - if "trans_att" in self.coords and self.trans_att.size > 0: raise_warning = 0 @@ -1713,42 +1641,6 @@ def set_trans_att(self, trans_att=None, **kwargs): self.trans_att.attrs = dim_attrs["trans_att"] pass - def check_reference_section_values(self): - """ - Checks if the values of the used sections are of the right datatype - (floats), if there are finite number (no NaN/inf), and if the time - dimension corresponds with the time dimension of the st/ast data. - - Parameters - ---------- - - Returns - ------- - - """ - for key in self.sections.keys(): - if not np.issubdtype(self[key].dtype, np.floating): - raise ValueError( - 'Data of reference temperature "' - + key - + '" does not have a float data type. Please ensure that ' - "the data is of a valid type (e.g. np.float32)" - ) - - if np.any(~np.isfinite(self[key].values)): - raise ValueError( - 'NaN/inf value(s) found in reference temperature "' + key + '"' - ) - - if self[key].dims != ("time",): - raise ValueError( - "Time dimension of the reference temperature timeseries " - + key - + "is not the same as the time dimension" - + " of the Stokes measurement. See examples/notebooks/09" - + "Import_timeseries.ipynb for more info" - ) - def calibration_single_ended( self, sections=None, @@ -1766,7 +1658,7 @@ def calibration_single_ended( fix_alpha=None, **kwargs, ): - """ + r""" Calibrate the Stokes (`ds.st`) and anti-Stokes (`ds.ast`) data to temperature using fiber sections with a known temperature (`ds.sections`) for single-ended setups. The calibrated temperature is @@ -1859,8 +1751,6 @@ def calibration_single_ended( has three items. The first two items are the slices of the sections that are matched. The third item is a boolean and is True if the two sections have a reverse direction ("J-configuration"). - transient_att_x, transient_asym_att_x : iterable, optional - Depreciated. See trans_att trans_att : iterable, optional Splices can cause jumps in differential attenuation. Normal single ended calibration assumes these are not present. 
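The reference-section checks can also be reproduced by hand before calibrating. A minimal sketch of the three checks, assuming `ds` is a `DataStore` with `ds.sections` defined:

```
import numpy as np

for key in ds.sections:
    # reference temperatures must be float-typed
    assert np.issubdtype(ds[key].dtype, np.floating), f"{key} is not float-typed"
    # no NaN/inf values allowed
    assert np.all(np.isfinite(ds[key].values)), f"NaN/inf value(s) in {key}"
    # the timeseries must share the time dimension of the Stokes data
    assert ds[key].dims == ("time",), f"{key} must have dims ('time',)"
```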
An additional loss @@ -1897,13 +1787,11 @@ def calibration_single_ended( 07Calibrate_single_wls.ipynb>`_ """ - self.check_deprecated_kwargs(kwargs) + check_deprecated_kwargs(kwargs) self.set_trans_att(trans_att=trans_att, **kwargs) - if sections: + if sections is not None: self.sections = sections - else: - assert self.sections, "sections are not defined" if method == "wls": assert st_var is not None and ast_var is not None, "Set `st_var`" @@ -1933,151 +1821,26 @@ def calibration_single_ended( ) if method == "wls": - for input_item in [st_var, ast_var]: - assert input_item is not None, ( - "For wls define all " "variances (`st_var`, " "`ast_var`) " - ) - - calc_cov = True - - split = calibration_single_ended_solver( + p_cov, p_val, p_var = calibration_single_ended_helper( self, st_var, ast_var, - calc_cov=calc_cov, - solver="external_split", - matching_indices=matching_indices, + fix_alpha, + fix_dalpha, + fix_gamma, + matching_indices, + nt, + nta, + nx, + solver, ) - y = split["y"] - w = split["w"] - - # Stack all X's - if fix_alpha: - assert not fix_dalpha, "Use either `fix_dalpha` or `fix_alpha`" - assert fix_alpha[0].size == self.x.size, ( - "fix_alpha also needs to be defined outside the reference " - "sections" - ) - assert fix_alpha[1].size == self.x.size, ( - "fix_alpha also needs to be defined outside the reference " - "sections" - ) - p_val = split["p0_est_alpha"].copy() - - if np.any(matching_indices): - raise NotImplementedError( - "Configuring fix_alpha and matching sections requires extra code" - ) - - X = sp.hstack( - (split["X_gamma"], split["X_alpha"], split["X_c"], split["X_TA"]) - ).tocsr() - ip_use = list(range(1 + nx + nt + nta * nt)) - - else: - X = sp.vstack( - ( - sp.hstack( - ( - split["X_gamma"], - split["X_dalpha"], - split["X_c"], - split["X_TA"], - ) - ), - split["X_m"], - ) - ).tocsr() - p_val = split["p0_est_dalpha"].copy() - ip_use = list(range(1 + 1 + nt + nta * nt)) - - p_var = np.zeros_like(p_val) - p_cov = np.zeros((p_val.size, p_val.size), dtype=float) - - if fix_gamma is not None: - ip_remove = [0] - ip_use = [i for i in ip_use if i not in ip_remove] - p_val[ip_remove] = fix_gamma[0] - p_var[ip_remove] = fix_gamma[1] - - X_gamma = ( - sp.vstack((split["X_gamma"], split["X_m"].tocsr()[:, 0].tocoo())) - .toarray() - .flatten() - ) - - y -= fix_gamma[0] * X_gamma - w = 1 / (1 / w + fix_gamma[1] * X_gamma) - - if fix_alpha is not None: - ip_remove = list(range(1, nx + 1)) - ip_use = [i for i in ip_use if i not in ip_remove] - p_val[ip_remove] = fix_alpha[0] - p_var[ip_remove] = fix_alpha[1] - - # X_alpha needs to be vertically extended to support matching sections - y -= split["X_alpha"].dot(fix_alpha[0]) - w = 1 / (1 / w + split["X_alpha"].dot(fix_alpha[1])) - - if fix_dalpha is not None: - ip_remove = [1] - ip_use = [i for i in ip_use if i not in ip_remove] - p_val[ip_remove] = fix_dalpha[0] - p_var[ip_remove] = fix_dalpha[1] - - y -= np.hstack( - ( - fix_dalpha[0] * split["X_dalpha"].toarray().flatten(), - ( - fix_dalpha[0] - * split["X_m"].tocsr()[:, 1].tocoo().toarray().flatten() - ), - ) - ) - w = 1 / ( - 1 / w - + np.hstack( - ( - fix_dalpha[1] * split["X_dalpha"].toarray().flatten(), - ( - fix_dalpha[1] - * split["X_m"].tocsr()[:, 1].tocoo().toarray().flatten() - ), - ) - ) - ) - - if solver == "sparse": - out = wls_sparse( - X[:, ip_use], - y, - w=w, - x0=p_val[ip_use], - calc_cov=calc_cov, - verbose=False, - ) - - elif solver == "stats": - out = wls_stats(X[:, ip_use], y, w=w, calc_cov=calc_cov, verbose=False) - - else: - assert 0, "Unknown 
solver" - - p_val[ip_use] = out[0] - p_var[ip_use] = out[1] - np.fill_diagonal(p_cov, p_var) # set variance of all fixed params - p_cov[np.ix_(ip_use, ip_use)] = out[2] - elif method == "external": for input_item in [p_val, p_var, p_cov]: assert ( input_item is not None ), "Define p_val, p_var, p_cov when using an external solver" - elif method == "external_split": - raise ValueError("Not implemented yet") - else: raise ValueError("Choose a valid method") @@ -2251,7 +2014,7 @@ def calibration_double_ended( verbose=False, **kwargs, ): - """ + r""" See example notebook 8 for an explanation on how to use this function. Calibrate the Stokes (`ds.st`) and anti-Stokes (`ds.ast`) of the forward channel and from the backward channel (`ds.rst`, `ds.rast`) data to @@ -2344,6 +2107,8 @@ def calibration_double_ended( estimate of the temperature is obtained from the weighted average of :math:`T_\mathrm{F}` and :math:`T_\mathrm{B}` as discussed in Section 7.2 [1]_ . + + Parameters ---------- p_val : array-like, optional @@ -2408,8 +2173,6 @@ def calibration_double_ended( memory, is faster, and gives the same result as the statsmodels solver. The statsmodels solver is mostly used to check the sparse solver. `'stats'` is the default. - transient_att_x, transient_asym_att_x : iterable, optional - Depreciated. See trans_att trans_att : iterable, optional Splices can cause jumps in differential attenuation. Normal single ended calibration assumes these are not present. An additional loss @@ -2454,14 +2217,12 @@ def calibration_double_ended( 08Calibrate_double_wls.ipynb>`_ """ # TODO: confidence intervals using the variance approximated by linear error propagation - self.check_deprecated_kwargs(kwargs) + check_deprecated_kwargs(kwargs) self.set_trans_att(trans_att=trans_att, **kwargs) - if sections: + if sections is not None: self.sections = sections - else: - assert self.sections, "sections are not defined" self.check_reference_section_values() @@ -2512,649 +2273,27 @@ def calibration_double_ended( matching_indices = match_sections(self, matching_sections) if method == "wls": - if fix_alpha or fix_gamma: - split = calibration_double_ended_solver( - self, - st_var, - ast_var, - rst_var, - rast_var, - calc_cov=True, - solver="external_split", - matching_indices=matching_indices, - verbose=verbose, - ) - else: - out = calibration_double_ended_solver( - self, - st_var, - ast_var, - rst_var, - rast_var, - calc_cov=True, - solver=solver, - matching_indices=matching_indices, - verbose=verbose, - ) + p_cov, p_val, p_var = calibration_double_ended_helper( + self, + st_var, + ast_var, + rst_var, + rast_var, + fix_alpha, + fix_gamma, + nt, + nta, + nx, + nx_sec, + ix_sec, + matching_indices, + solver, + verbose, + ) - p_val, p_var, p_cov = out - - # adjust split to fix parameters - """Wrapped in a function to reduce memory usage. - Constructing: - Z_gamma (nt * nx, 1). Data: positive 1/temp - Z_D (nt * nx, nt). Data: ones - E (nt * nx, nx). 
Data: ones - Zero_gamma (nt * nx, 1) - zero_d (nt * nx, nt) - Z_TA_fw (nt * nx, nta * 2 * nt) minus ones - Z_TA_bw (nt * nx, nta * 2 * nt) minus ones - Z_TA_E (nt * nx, nta * 2 * nt) - I_fw = 1/Tref*gamma - D_fw - E - TA_fw - I_bw = 1/Tref*gamma - D_bw + E - TA_bw - (I_bw - I_fw) / 2 = D_fw/2 - D_bw/2 + E + TA_fw/2 - TA_bw/2 Eq42 - """ - if fix_alpha and fix_gamma: - assert ( - np.size(fix_alpha[0]) == self.x.size - ), "define alpha for each location" - assert ( - np.size(fix_alpha[1]) == self.x.size - ), "define var alpha for each location" - m = ( - "The integrated differential attenuation is zero at the " - "first index of the reference sections." - ) - assert np.abs(fix_alpha[0][ix_sec[0]]) < 1e-8, m - # The array with the integrated differential att is termed E - - if np.any(matching_indices): - n_E_in_cal = split["ix_from_cal_match_to_glob"].size - p0_est = np.concatenate( - ( - split["p0_est"][1 : 1 + 2 * nt], - split["p0_est"][1 + 2 * nt + n_E_in_cal :], - ) - ) - X_E1 = sp.csr_matrix( - ([], ([], [])), shape=(nt * nx_sec, self.x.size) - ) - X_E1[:, ix_sec[1:]] = split["E"] - X_E2 = X_E1[:, split["ix_from_cal_match_to_glob"]] - X_E = sp.vstack( - ( - -X_E2, - X_E2, - split["E_match_F"], - split["E_match_B"], - split["E_match_no_cal"], - ) - ) - - X_gamma = ( - sp.vstack( - ( - split["Z_gamma"], - split["Z_gamma"], - split["Zero_eq12_gamma"], - split["Zero_eq12_gamma"], - split["Zero_eq3_gamma"], - ) - ) - .toarray() - .flatten() - ) - - X = sp.vstack( - ( - sp.hstack( - (-split["Z_D"], split["Zero_d"], split["Z_TA_fw"]) - ), - sp.hstack( - (split["Zero_d"], -split["Z_D"], split["Z_TA_bw"]) - ), - sp.hstack((split["Zero_d_eq12"], split["Z_TA_eq1"])), - sp.hstack((split["Zero_d_eq12"], split["Z_TA_eq2"])), - sp.hstack((split["d_no_cal"], split["Z_TA_eq3"])), - ) - ) - - y = np.concatenate( - ( - split["y_F"], - split["y_B"], - split["y_eq1"], - split["y_eq2"], - split["y_eq3"], - ) - ) - y -= X_E.dot(fix_alpha[0][split["ix_from_cal_match_to_glob"]]) - y -= fix_gamma[0] * X_gamma - - # variances are added. weight is the inverse of the variance - # of the observations - w_ = np.concatenate( - ( - split["w_F"], - split["w_B"], - split["w_eq1"], - split["w_eq2"], - split["w_eq3"], - ) - ) - w = 1 / ( - 1 / w_ - + X_E.dot(fix_alpha[1][split["ix_from_cal_match_to_glob"]]) - + fix_gamma[1] * X_gamma - ) - - else: - # X_gamma - X_E = sp.vstack((-split["E"], split["E"])) - X_gamma = ( - sp.vstack((split["Z_gamma"], split["Z_gamma"])) - .toarray() - .flatten() - ) - # Use only the remaining coefficients - # Stack all X's - X = sp.vstack( - ( - sp.hstack( - (-split["Z_D"], split["Zero_d"], split["Z_TA_fw"]) - ), - sp.hstack( - (split["Zero_d"], -split["Z_D"], split["Z_TA_bw"]) - ), - ) - ) - - # Move the coefficients times the fixed gamma to the - # observations - y = np.concatenate((split["y_F"], split["y_B"])) - y -= X_E.dot(fix_alpha[0][ix_sec[1:]]) - y -= fix_gamma[0] * X_gamma - # variances are added. weight is the inverse of the variance - # of the observations - w_ = np.concatenate((split["w_F"], split["w_B"])) - w = 1 / ( - 1 / w_ - + X_E.dot(fix_alpha[1][ix_sec[1:]]) - + fix_gamma[1] * X_gamma - ) - - # [C_1, C_2, .., C_nt, TA_fw_a_1, TA_fw_a_2, TA_fw_a_nt, - # TA_bw_a_1, TA_bw_a_2, TA_bw_a_nt] Then continues with - # TA for connector b. 
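The fixed-parameter branches above are now handled inside `calibration_double_ended_helper`, so from the user side the call is unchanged. A minimal sketch of a double-ended calibration with a fixed `gamma`, assuming a double-ended `DataStore` with `sections` defined (all numerical values are illustrative, not recommended constants):

```
ds.calibration_double_ended(
    sections=sections,
    st_var=1.5,
    ast_var=1.5,
    rst_var=1.0,
    rast_var=1.0,
    method="wls",
    fix_gamma=(482.6, 0.0),  # (value, variance) pair; example value only
)
ds.tmpw.plot()
```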
- p0_est = np.concatenate( - ( - split["p0_est"][1 : 1 + 2 * nt], - split["p0_est"][1 + 2 * nt + nx_sec - 1 :], - ) - ) - - if solver == "sparse": - out = wls_sparse(X, y, w=w, x0=p0_est, calc_cov=True, verbose=False) - - elif solver == "stats": - out = wls_stats(X, y, w=w, calc_cov=True, verbose=False) - - # Added fixed gamma and its variance to the solution - p_val = np.concatenate( - ([fix_gamma[0]], out[0][: 2 * nt], fix_alpha[0], out[0][2 * nt :]) - ) - p_var = np.concatenate( - ([fix_gamma[1]], out[1][: 2 * nt], fix_alpha[1], out[1][2 * nt :]) - ) - - # whether it returns a copy or a view depends on what - # version of numpy you are using - p_cov = np.diag(p_var).copy() - from_i = np.concatenate( - ( - np.arange(1, 2 * nt + 1), - np.arange(1 + 2 * nt + nx, 1 + 2 * nt + nx + nta * nt * 2), - ) - ) - iox_sec1, iox_sec2 = np.meshgrid(from_i, from_i, indexing="ij") - p_cov[iox_sec1, iox_sec2] = out[2] - - elif fix_gamma: - if np.any(matching_indices): - # n_E_in_cal = split['ix_from_cal_match_to_glob'].size - p0_est = split["p0_est"][1:] - X_E1 = sp.csr_matrix( - ([], ([], [])), shape=(nt * nx_sec, self.x.size) - ) - from_i = ix_sec[1:] - X_E1[:, from_i] = split["E"] - X_E2 = X_E1[:, split["ix_from_cal_match_to_glob"]] - X = sp.vstack( - ( - sp.hstack( - ( - -split["Z_D"], - split["Zero_d"], - -X_E2, - split["Z_TA_fw"], - ) - ), - sp.hstack( - (split["Zero_d"], -split["Z_D"], X_E2, split["Z_TA_bw"]) - ), - sp.hstack( - ( - split["Zero_d_eq12"], - split["E_match_F"], - split["Z_TA_eq1"], - ) - ), - sp.hstack( - ( - split["Zero_d_eq12"], - split["E_match_B"], - split["Z_TA_eq2"], - ) - ), - sp.hstack( - ( - split["d_no_cal"], - split["E_match_no_cal"], - split["Z_TA_eq3"], - ) - ), - ) - ) - X_gamma = ( - sp.vstack( - ( - split["Z_gamma"], - split["Z_gamma"], - split["Zero_eq12_gamma"], - split["Zero_eq12_gamma"], - split["Zero_eq3_gamma"], - ) - ) - .toarray() - .flatten() - ) - - y = np.concatenate( - ( - split["y_F"], - split["y_B"], - split["y_eq1"], - split["y_eq2"], - split["y_eq3"], - ) - ) - y -= fix_gamma[0] * X_gamma - - # variances are added. weight is the inverse of the variance - # of the observations - w_ = np.concatenate( - ( - split["w_F"], - split["w_B"], - split["w_eq1"], - split["w_eq2"], - split["w_eq3"], - ) - ) - w = 1 / (1 / w_ + fix_gamma[1] * X_gamma) - - else: - X_gamma = ( - sp.vstack((split["Z_gamma"], split["Z_gamma"])) - .toarray() - .flatten() - ) - # Use only the remaining coefficients - X = sp.vstack( - ( - sp.hstack( - ( - -split["Z_D"], - split["Zero_d"], - -split["E"], - split["Z_TA_fw"], - ) - ), - sp.hstack( - ( - split["Zero_d"], - -split["Z_D"], - split["E"], - split["Z_TA_bw"], - ) - ), - ) - ) - # Move the coefficients times the fixed gamma to the - # observations - y = np.concatenate((split["y_F"], split["y_B"])) - y -= fix_gamma[0] * X_gamma - # variances are added. 
weight is the inverse of the variance - # of the observations - w_ = np.concatenate((split["w_F"], split["w_B"])) - w = 1 / (1 / w_ + fix_gamma[1] * X_gamma) - - p0_est = split["p0_est"][1:] - - if solver == "sparse": - out = wls_sparse(X, y, w=w, x0=p0_est, calc_cov=True, verbose=False) - - elif solver == "stats": - out = wls_stats(X, y, w=w, calc_cov=True, verbose=False) - - # put E outside of reference section in solution - # concatenating makes a copy of the data instead of using a - # pointer - ds_sub = self[["st", "ast", "rst", "rast", "trans_att"]] - ds_sub["df"] = (("time",), out[0][:nt]) - ds_sub["df_var"] = (("time",), out[1][:nt]) - ds_sub["db"] = (("time",), out[0][nt : 2 * nt]) - ds_sub["db_var"] = (("time",), out[1][nt : 2 * nt]) - - if nta > 0: - if np.any(matching_indices): - n_E_in_cal = split["ix_from_cal_match_to_glob"].size - ta = out[0][2 * nt + n_E_in_cal :].reshape( - (nt, 2, nta), order="F" - ) - ta_var = out[1][2 * nt + n_E_in_cal :].reshape( - (nt, 2, nta), order="F" - ) - - else: - ta = out[0][2 * nt + nx_sec - 1 :].reshape( - (nt, 2, nta), order="F" - ) - ta_var = out[1][2 * nt + nx_sec - 1 :].reshape( - (nt, 2, nta), order="F" - ) - - talpha_fw = ta[:, 0, :] - talpha_bw = ta[:, 1, :] - talpha_fw_var = ta_var[:, 0, :] - talpha_bw_var = ta_var[:, 1, :] - else: - talpha_fw = None - talpha_bw = None - talpha_fw_var = None - talpha_bw_var = None - - E_all_exact, E_all_var_exact = calc_alpha_double( - mode="exact", - ds=ds_sub, - st_var=st_var, - ast_var=ast_var, - rst_var=rst_var, - rast_var=rast_var, - ix_alpha_is_zero=ix_sec[0], - talpha_fw=talpha_fw, - talpha_bw=talpha_bw, - talpha_fw_var=talpha_fw_var, - talpha_bw_var=talpha_bw_var, - ) - - if not np.any(matching_indices): - # Added fixed gamma and its variance to the solution. And - # expand to include locations outside reference sections. - p_val = np.concatenate( - ( - [fix_gamma[0]], - out[0][: 2 * nt], - E_all_exact, - out[0][2 * nt + nx_sec - 1 :], - ) - ) - p_val[1 + 2 * nt + ix_sec[1:]] = out[0][ - 2 * nt : 2 * nt + nx_sec - 1 - ] - p_val[1 + 2 * nt + ix_sec[0]] = 0.0 - p_var = np.concatenate( - ( - [fix_gamma[1]], - out[1][: 2 * nt], - E_all_var_exact, - out[1][2 * nt + nx_sec - 1 :], - ) - ) - p_var[1 + 2 * nt + ix_sec[1:]] = out[1][ - 2 * nt : 2 * nt + nx_sec - 1 - ] - else: - n_E_in_cal = split["ix_from_cal_match_to_glob"].size - - # Added fixed gamma and its variance to the solution. And - # expand to include locations outside reference sections. 
- p_val = np.concatenate( - ( - [fix_gamma[0]], - out[0][: 2 * nt], - E_all_exact, - out[0][2 * nt + n_E_in_cal :], - ) - ) - p_val[1 + 2 * nt + split["ix_from_cal_match_to_glob"]] = out[0][ - 2 * nt : 2 * nt + n_E_in_cal - ] - p_val[1 + 2 * nt + ix_sec[0]] = 0.0 - p_var = np.concatenate( - ( - [fix_gamma[1]], - out[1][: 2 * nt], - E_all_var_exact, - out[1][2 * nt + n_E_in_cal :], - ) - ) - p_var[1 + 2 * nt + split["ix_from_cal_match_to_glob"]] = out[1][ - 2 * nt : 2 * nt + n_E_in_cal - ] - - p_cov = np.diag(p_var).copy() - - if not np.any(matching_indices): - from_i = np.concatenate( - ( - np.arange(1, 2 * nt + 1), - 2 * nt + 1 + ix_sec[1:], - np.arange(1 + 2 * nt + nx, 1 + 2 * nt + nx + nta * nt * 2), - ) - ) - else: - from_i = np.concatenate( - ( - np.arange(1, 2 * nt + 1), - 2 * nt + 1 + split["ix_from_cal_match_to_glob"], - np.arange(1 + 2 * nt + nx, 1 + 2 * nt + nx + nta * nt * 2), - ) - ) - - iox_sec1, iox_sec2 = np.meshgrid(from_i, from_i, indexing="ij") - p_cov[iox_sec1, iox_sec2] = out[2] - - elif fix_alpha: - assert ( - np.size(fix_alpha[0]) == self.x.size - ), "define alpha for each location" - assert ( - np.size(fix_alpha[1]) == self.x.size - ), "define var alpha for each location" - m = ( - "The integrated differential attenuation is zero at the " - "first index of the reference sections." - ) - assert np.abs(fix_alpha[0][ix_sec[0]]) < 1e-6, m - # The array with the integrated differential att is termed E - - if not np.any(matching_indices): - # X_gamma - X_E = sp.vstack((-split["E"], split["E"])) - # Use only the remaining coefficients - # Stack all X's - X = sp.vstack( - ( - sp.hstack( - ( - split["Z_gamma"], - -split["Z_D"], - split["Zero_d"], - split["Z_TA_fw"], - ) - ), - sp.hstack( - ( - split["Z_gamma"], - split["Zero_d"], - -split["Z_D"], - split["Z_TA_bw"], - ) - ), - ) - ) - # Move the coefficients times the fixed gamma to the - # observations - y = np.concatenate((split["y_F"], split["y_B"])) - y -= X_E.dot(fix_alpha[0][ix_sec[1:]]) - - # variances are added. weight is the inverse of the variance - # of the observations - w_ = np.concatenate((split["w_F"], split["w_B"])) - w = 1 / (1 / w_ + X_E.dot(fix_alpha[1][ix_sec[1:]])) - - p0_est = np.concatenate( - ( - split["p0_est"][: 1 + 2 * nt], - split["p0_est"][1 + 2 * nt + nx_sec - 1 :], - ) - ) - - else: - n_E_in_cal = split["ix_from_cal_match_to_glob"].size - p0_est = np.concatenate( - ( - split["p0_est"][: 1 + 2 * nt], - split["p0_est"][1 + 2 * nt + n_E_in_cal :], - ) - ) - X_E1 = sp.csr_matrix( - ([], ([], [])), shape=(nt * nx_sec, self.x.size) - ) - X_E1[:, ix_sec[1:]] = split["E"] - X_E2 = X_E1[:, split["ix_from_cal_match_to_glob"]] - X_E = sp.vstack( - ( - -X_E2, - X_E2, - split["E_match_F"], - split["E_match_B"], - split["E_match_no_cal"], - ) - ) - - X = sp.vstack( - ( - sp.hstack( - ( - split["Z_gamma"], - -split["Z_D"], - split["Zero_d"], - split["Z_TA_fw"], - ) - ), - sp.hstack( - ( - split["Z_gamma"], - split["Zero_d"], - -split["Z_D"], - split["Z_TA_bw"], - ) - ), - sp.hstack( - ( - split["Zero_eq12_gamma"], - split["Zero_d_eq12"], - split["Z_TA_eq1"], - ) - ), - sp.hstack( - ( - split["Zero_eq12_gamma"], - split["Zero_d_eq12"], - split["Z_TA_eq2"], - ) - ), - sp.hstack( - ( - split["Zero_eq3_gamma"], - split["d_no_cal"], - split["Z_TA_eq3"], - ) - ), - ) - ) - - y = np.concatenate( - ( - split["y_F"], - split["y_B"], - split["y_eq1"], - split["y_eq2"], - split["y_eq3"], - ) - ) - y -= X_E.dot(fix_alpha[0][split["ix_from_cal_match_to_glob"]]) - - # variances are added. 
weight is the inverse of the variance - # of the observations - w_ = np.concatenate( - ( - split["w_F"], - split["w_B"], - split["w_eq1"], - split["w_eq2"], - split["w_eq3"], - ) - ) - w = 1 / ( - 1 / w_ - + X_E.dot(fix_alpha[1][split["ix_from_cal_match_to_glob"]]) - ) - - if solver == "sparse": - out = wls_sparse(X, y, w=w, x0=p0_est, calc_cov=True, verbose=False) - - elif solver == "stats": - out = wls_stats(X, y, w=w, calc_cov=True, verbose=False) - - # Added fixed gamma and its variance to the solution - p_val = np.concatenate( - (out[0][: 1 + 2 * nt], fix_alpha[0], out[0][1 + 2 * nt :]) - ) - p_var = np.concatenate( - (out[1][: 1 + 2 * nt], fix_alpha[1], out[1][1 + 2 * nt :]) - ) - - p_cov = np.diag(p_var).copy() - - from_i = np.concatenate( - ( - np.arange(1 + 2 * nt), - np.arange(1 + 2 * nt + nx, 1 + 2 * nt + nx + nta * nt * 2), - ) - ) - - iox_sec1, iox_sec2 = np.meshgrid(from_i, from_i, indexing="ij") - p_cov[iox_sec1, iox_sec2] = out[2] - - else: - pass - - elif method == "external": - for input_item in [p_val, p_var, p_cov]: - assert input_item is not None + elif method == "external": + for input_item in [p_val, p_var, p_cov]: + assert input_item is not None elif method == "external_split": raise ValueError("Not implemented yet") @@ -3462,7 +2601,7 @@ def calibration_double_ended( pass - def conf_int_single_ended( + def average_single_ended( self, p_val="p_val", p_cov="p_cov", @@ -3470,29 +2609,24 @@ def conf_int_single_ended( ast_var=None, conf_ints=None, mc_sample_size=100, + ci_avg_time_flag1=False, + ci_avg_time_flag2=False, + ci_avg_time_sel=None, + ci_avg_time_isel=None, + ci_avg_x_flag1=False, + ci_avg_x_flag2=False, + ci_avg_x_sel=None, + ci_avg_x_isel=None, + var_only_sections=None, da_random_state=None, mc_remove_set_flag=True, reduce_memory_usage=False, **kwargs, ): """ - Estimation of the confidence intervals for the temperatures measured - with a single-ended setup. It consists of five steps. First, the variances - of the Stokes and anti-Stokes intensity measurements are estimated - following the steps in Section 4 [1]_. A Normal - distribution is assigned to each intensity measurement that is centered - at the measurement and using the estimated variance. Second, a multi- - variate Normal distribution is assigned to the estimated parameters - using the covariance matrix from the calibration procedure presented in - Section 5 [1]_. Third, the distributions are sampled, and the - temperature is computed with Equation 12 [1]_. Fourth, step - three is repeated, e.g., 10,000 times for each location and for each - time. The resulting 10,000 realizations of the temperatures - approximate the probability density functions of the estimated - temperature at that location and time. Fifth, the standard uncertainties - are computed with the standard deviations of the realizations of the - temperatures, and the 95\% confidence intervals are computed from the - 2.5\% and 97.5\% percentiles of the realizations of the temperatures. + Average temperatures from single-ended setups. + + Four types of averaging are implemented. Please see Example Notebook 16. Parameters @@ -3523,6 +2657,80 @@ def conf_int_single_ended( mc_sample_size : int Size of the monte carlo parameter set used to calculate the confidence interval + ci_avg_time_flag1 : bool + The confidence intervals differ each time step. Assumes the + temperature varies during the measurement period. Computes the + arithmic temporal mean. 
If you would like to know the confidence + interfal of: + (1) a single additional measurement. So you can state "if another + measurement were to be taken, it would have this ci" + (2) all measurements. So you can state "The temperature remained + during the entire measurement period between these ci bounds". + Adds "tmpw" + '_avg1' and "tmpw" + '_mc_avg1_var' to the + DataStore. If `conf_ints` are set, also the confidence intervals + `_mc_avg1` are added to the DataStore. Works independently of the + ci_avg_time_flag2 and ci_avg_x_flag. + ci_avg_time_flag2 : bool + The confidence intervals differ each time step. Assumes the + temperature remains constant during the measurement period. + Computes the inverse-variance-weighted-temporal-mean temperature + and its uncertainty. + If you would like to know the confidence interfal of: + (1) I want to estimate a background temperature with confidence + intervals. I hereby assume the temperature does not change over + time and average all measurements to get a better estimate of the + background temperature. + Adds "tmpw" + '_avg2' and "tmpw" + '_mc_avg2_var' to the + DataStore. If `conf_ints` are set, also the confidence intervals + `_mc_avg2` are added to the DataStore. Works independently of the + ci_avg_time_flag1 and ci_avg_x_flag. + ci_avg_time_sel : slice + Compute ci_avg_time_flag1 and ci_avg_time_flag2 using only a + selection of the data + ci_avg_time_isel : iterable of int + Compute ci_avg_time_flag1 and ci_avg_time_flag2 using only a + selection of the data + ci_avg_x_flag1 : bool + The confidence intervals differ at each location. Assumes the + temperature varies over `x` and over time. Computes the + arithmic spatial mean. If you would like to know the confidence + interfal of: + (1) a single additional measurement location. So you can state "if + another measurement location were to be taken, + it would have this ci" + (2) all measurement locations. So you can state "The temperature + along the fiber remained between these ci bounds". + Adds "tmpw" + '_avgx1' and "tmpw" + '_mc_avgx1_var' to the + DataStore. If `conf_ints` are set, also the confidence intervals + `_mc_avgx1` are added to the DataStore. Works independently of the + ci_avg_time_flag1, ci_avg_time_flag2 and ci_avg_x2_flag. + ci_avg_x_flag2 : bool + The confidence intervals differ at each location. Assumes the + temperature is the same at each location but varies over time. + Computes the inverse-variance-weighted-spatial-mean temperature + and its uncertainty. + If you would like to know the confidence interfal of: + (1) I have put a lot of fiber in water, and I know that the + temperature variation in the water is much smaller than along + other parts of the fiber. And I would like to average the + measurements from multiple locations to improve the estimated + temperature. + Adds "tmpw" + '_avg2' and "tmpw" + '_mc_avg2_var' to the + DataStore. If `conf_ints` are set, also the confidence intervals + `_mc_avg2` are added to the DataStore. Works independently of the + ci_avg_time_flag1 and ci_avg_x_flag. + ci_avg_x_sel : slice + Compute ci_avg_time_flag1 and ci_avg_time_flag2 using only a + selection of the data + ci_avg_x_isel : iterable of int + Compute ci_avg_time_flag1 and ci_avg_time_flag2 using only a + selection of the data + var_only_sections : bool + useful if using the ci_avg_x_flag. Only calculates the var over the + sections, so that the values can be compared with accuracy along the + reference sections. 
Where the accuracy is the variance of the + residuals between the estimated temperature and temperature of the + water baths. da_random_state For testing purposes. Similar to random seed. The seed for dask. Makes random not so random. To produce reproducable results for @@ -3533,207 +2741,227 @@ def conf_int_single_ended( reduce_memory_usage : bool Use less memory but at the expense of longer computation time + Returns + ------- - References - ---------- - .. [1] des Tombe, B., Schilperoort, B., & Bakker, M. (2020). Estimation - of Temperature and Associated Uncertainty from Fiber-Optic Raman- - Spectrum Distributed Temperature Sensing. Sensors, 20(8), 2235. - https://doi.org/10.3390/s20082235 """ - self.check_deprecated_kwargs(kwargs) - - if da_random_state: - state = da_random_state - else: - state = da.random.RandomState() + check_deprecated_kwargs(kwargs) - no, nt = self.st.data.shape - if "trans_att" in self.keys(): - nta = self.trans_att.size - else: - nta = 0 - - assert isinstance(p_val, (str, np.ndarray, np.generic)) - if isinstance(p_val, str): - p_val = self[p_val].data + if var_only_sections is not None: + raise NotImplementedError() - npar = p_val.size + self.conf_int_single_ended( + p_val=p_val, + p_cov=p_cov, + st_var=st_var, + ast_var=ast_var, + conf_ints=None, + mc_sample_size=mc_sample_size, + da_random_state=da_random_state, + mc_remove_set_flag=False, + reduce_memory_usage=reduce_memory_usage, + **kwargs, + ) - # number of parameters - if npar == nt + 2 + nt * nta: - fixed_alpha = False - elif npar == 1 + no + nt + nt * nta: - fixed_alpha = True - else: - raise Exception("The size of `p_val` is not what I expected") + if ci_avg_time_sel is not None: + time_dim2 = "time" + "_avg" + x_dim2 = "x" + self.coords[time_dim2] = ( + (time_dim2,), + self["time"].sel(**{"time": ci_avg_time_sel}).data, + ) + self["tmpf_avgsec"] = ( + ("x", time_dim2), + self["tmpf"].sel(**{"time": ci_avg_time_sel}).data, + ) + self["tmpf_mc_set"] = ( + ("mc", "x", time_dim2), + self["tmpf" + "_mc_set"].sel(**{"time": ci_avg_time_sel}).data, + ) - self.coords["mc"] = range(mc_sample_size) + elif ci_avg_time_isel is not None: + time_dim2 = "time" + "_avg" + x_dim2 = "x" + self.coords[time_dim2] = ( + (time_dim2,), + self["time"].isel(**{"time": ci_avg_time_isel}).data, + ) + self["tmpf_avgsec"] = ( + ("x", time_dim2), + self["tmpf"].isel(**{"time": ci_avg_time_isel}).data, + ) + self["tmpf_mc_set"] = ( + ("mc", "x", time_dim2), + self["tmpf" + "_mc_set"].isel(**{"time": ci_avg_time_isel}).data, + ) - if conf_ints: - self.coords["CI"] = conf_ints + elif ci_avg_x_sel is not None: + time_dim2 = "time" + x_dim2 = "x_avg" + self.coords[x_dim2] = ((x_dim2,), self.x.sel(x=ci_avg_x_sel).data) + self["tmpf_avgsec"] = ( + (x_dim2, "time"), + self["tmpf"].sel(x=ci_avg_x_sel).data, + ) + self["tmpf_mc_set"] = ( + ("mc", x_dim2, "time"), + self["tmpf_mc_set"].sel(x=ci_avg_x_sel).data, + ) - # WLS - if isinstance(p_cov, str): - p_cov = self[p_cov].data - assert p_cov.shape == (npar, npar) + elif ci_avg_x_isel is not None: + time_dim2 = "time" + x_dim2 = "x_avg" + self.coords[x_dim2] = ((x_dim2,), self.x.isel(x=ci_avg_x_isel).data) + self["tmpf_avgsec"] = ( + (x_dim2, time_dim2), + self["tmpf"].isel(x=ci_avg_x_isel).data, + ) + self["tmpf_mc_set"] = ( + ("mc", x_dim2, time_dim2), + self["tmpf_mc_set"].isel(x=ci_avg_x_isel).data, + ) + else: + self["tmpf_avgsec"] = self["tmpf"] + x_dim2 = "x" + time_dim2 = "time" - p_mc = sst.multivariate_normal.rvs(mean=p_val, cov=p_cov, size=mc_sample_size) + # subtract the mean 
temperature + q = self["tmpf_mc_set"] - self["tmpf_avgsec"] + self["tmpf_mc" + "_avgsec_var"] = q.var(dim="mc", ddof=1) - if fixed_alpha: - self["alpha_mc"] = (("mc", "x"), p_mc[:, 1 : no + 1]) - self["c_mc"] = (("mc", "time"), p_mc[:, 1 + no : 1 + no + nt]) - else: - self["dalpha_mc"] = (("mc",), p_mc[:, 1]) - self["c_mc"] = (("mc", "time"), p_mc[:, 2 : nt + 2]) + if ci_avg_x_flag1: + # unweighted mean + self["tmpf_avgx1"] = self["tmpf" + "_avgsec"].mean(dim=x_dim2) - self["gamma_mc"] = (("mc",), p_mc[:, 0]) - if nta: - self["ta_mc"] = ( - ("mc", "trans_att", "time"), - np.reshape(p_mc[:, -nt * nta :], (mc_sample_size, nta, nt)), - ) + q = self["tmpf_mc_set"] - self["tmpf_avgsec"] + qvar = q.var(dim=["mc", x_dim2], ddof=1) + self["tmpf_mc_avgx1_var"] = qvar - rsize = (self.mc.size, self.x.size, self.time.size) + if conf_ints: + new_chunks = (len(conf_ints), self["tmpf_mc_set"].chunks[2]) + avg_axis = self["tmpf_mc_set"].get_axis_num(["mc", x_dim2]) + q = self["tmpf_mc_set"].data.map_blocks( + lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), + chunks=new_chunks, # + drop_axis=avg_axis, + # avg dimensions are dropped from input arr + new_axis=0, + ) # The new CI dim is added as firsaxis - if reduce_memory_usage: - memchunk = da.ones( - (mc_sample_size, no, nt), chunks={0: -1, 1: 1, 2: "auto"} - ).chunks - else: - memchunk = da.ones( - (mc_sample_size, no, nt), chunks={0: -1, 1: "auto", 2: "auto"} - ).chunks + self["tmpf_mc_avgx1"] = (("CI", time_dim2), q) - # Draw from the normal distributions for the Stokes intensities - for k, st_labeli, st_vari in zip( - ["r_st", "r_ast"], ["st", "ast"], [st_var, ast_var] - ): - # Load the mean as chunked Dask array, otherwise eats memory - if type(self[st_labeli].data) == da.core.Array: - loc = da.asarray(self[st_labeli].data, chunks=memchunk[1:]) - else: - loc = da.from_array(self[st_labeli].data, chunks=memchunk[1:]) + if ci_avg_x_flag2: + q = self["tmpf_mc_set"] - self["tmpf_avgsec"] - # Make sure variance is of size (no, nt) - if np.size(st_vari) > 1: - if st_vari.shape == self[st_labeli].shape: - pass - else: - st_vari = np.broadcast_to(st_vari, (no, nt)) - else: - pass + qvar = q.var(dim=["mc"], ddof=1) - # Load variance as chunked Dask array, otherwise eats memory - if type(st_vari) == da.core.Array: - st_vari_da = da.asarray(st_vari, chunks=memchunk[1:]) + # Inverse-variance weighting + avg_x_var = 1 / (1 / qvar).sum(dim=x_dim2) - elif callable(st_vari) and type(self[st_labeli].data) == da.core.Array: - st_vari_da = da.asarray( - st_vari(self[st_labeli]).data, chunks=memchunk[1:] - ) + self["tmpf_mc_avgx2_var"] = avg_x_var - elif callable(st_vari) and type(self[st_labeli].data) != da.core.Array: - st_vari_da = da.from_array( - st_vari(self[st_labeli]).data, chunks=memchunk[1:] - ) + self["tmpf" + "_mc_avgx2_set"] = (self["tmpf_mc_set"] / qvar).sum( + dim=x_dim2 + ) * avg_x_var + self["tmpf" + "_avgx2"] = self["tmpf" + "_mc_avgx2_set"].mean(dim="mc") - else: - st_vari_da = da.from_array(st_vari, chunks=memchunk[1:]) + if conf_ints: + new_chunks = (len(conf_ints), self["tmpf_mc_set"].chunks[2]) + avg_axis_avgx = self["tmpf_mc_set"].get_axis_num("mc") - self[k] = ( - ("mc", "x", "time"), - state.normal( - loc=loc, # has chunks=memchunk[1:] - scale=st_vari_da**0.5, - size=rsize, - chunks=memchunk, - ), - ) + qq = self["tmpf_mc_avgx2_set"].data.map_blocks( + lambda x: np.percentile(x, q=conf_ints, axis=avg_axis_avgx), + chunks=new_chunks, # + drop_axis=avg_axis_avgx, + # avg dimensions are dropped from input arr + new_axis=0, + dtype=float, + 
) # The new CI dimension is added as + # firsaxis + self["tmpf_mc_avgx2"] = (("CI", time_dim2), qq) - ta_arr = np.zeros((mc_sample_size, no, nt)) + if ci_avg_time_flag1 is not None: + # unweighted mean + self["tmpf_avg1"] = self["tmpf" + "_avgsec"].mean(dim=time_dim2) - if nta: - for ii, ta in enumerate(self["ta_mc"]): - for tai, taxi in zip(ta.values, self.trans_att.values): - ta_arr[ii, self.x.values >= taxi] = ( - ta_arr[ii, self.x.values >= taxi] + tai - ) - self["ta_mc_arr"] = (("mc", "x", "time"), ta_arr) + q = self["tmpf_mc_set"] - self["tmpf_avgsec"] + qvar = q.var(dim=["mc", time_dim2], ddof=1) + self["tmpf_mc_avg1_var"] = qvar - if fixed_alpha: - self["tmpf_mc_set"] = ( - self["gamma_mc"] - / ( - ( - np.log(self["r_st"]) - - np.log(self["r_ast"]) - + (self["c_mc"] + self["ta_mc_arr"]) - ) - + self["alpha_mc"] - ) - - 273.15 - ) - else: - self["tmpf_mc_set"] = ( - self["gamma_mc"] - / ( - ( - np.log(self["r_st"]) - - np.log(self["r_ast"]) - + (self["c_mc"] + self["ta_mc_arr"]) - ) - + (self["dalpha_mc"] * self.x) - ) - - 273.15 - ) + if conf_ints: + new_chunks = (len(conf_ints), self["tmpf_mc_set"].chunks[1]) + avg_axis = self["tmpf_mc_set"].get_axis_num(["mc", time_dim2]) + q = self["tmpf_mc_set"].data.map_blocks( + lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), + chunks=new_chunks, # + drop_axis=avg_axis, + # avg dimensions are dropped from input arr + new_axis=0, + ) # The new CI dim is added as firsaxis - avg_dims = ["mc"] + self["tmpf_mc_avg1"] = (("CI", x_dim2), q) - avg_axis = self["tmpf_mc_set"].get_axis_num(avg_dims) + if ci_avg_time_flag2: + q = self["tmpf_mc_set"] - self["tmpf_avgsec"] - self["tmpf_mc_var"] = (self["tmpf_mc_set"] - self["tmpf"]).var( - dim=avg_dims, ddof=1 - ) + qvar = q.var(dim=["mc"], ddof=1) - if conf_ints: - new_chunks = ((len(conf_ints),),) + self["tmpf_mc_set"].chunks[1:] + # Inverse-variance weighting + avg_time_var = 1 / (1 / qvar).sum(dim=time_dim2) - qq = self["tmpf_mc_set"] + self["tmpf_mc_avg2_var"] = avg_time_var - q = qq.data.map_blocks( - lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), - chunks=new_chunks, # - drop_axis=avg_axis, # avg dimesnions are dropped from input arr - new_axis=0, - ) # The new CI dimension is added as first axis + self["tmpf" + "_mc_avg2_set"] = (self["tmpf_mc_set"] / qvar).sum( + dim=time_dim2 + ) * avg_time_var + self["tmpf_avg2"] = self["tmpf" + "_mc_avg2_set"].mean(dim="mc") - self["tmpf_mc"] = (("CI", "x", "time"), q) + if conf_ints: + new_chunks = (len(conf_ints), self["tmpf_mc_set"].chunks[1]) + avg_axis_avg2 = self["tmpf_mc_set"].get_axis_num("mc") - if mc_remove_set_flag: - drop_var = [ + qq = self["tmpf_mc_avg2_set"].data.map_blocks( + lambda x: np.percentile(x, q=conf_ints, axis=avg_axis_avg2), + chunks=new_chunks, # + drop_axis=avg_axis_avg2, + # avg dimensions are dropped from input arr + new_axis=0, + dtype=float, + ) # The new CI dimension is added as + # firsaxis + self["tmpf_mc_avg2"] = (("CI", x_dim2), qq) + # Clean up the garbage. All arrays with a Monte Carlo dimension. 
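A minimal sketch of the time-averaging options, assuming a single-ended `DataStore` that has already been calibrated with `calibration_single_ended(..., method='wls')` so that `p_val`, `p_cov`, and `tmpf` are present (variances and sample size are illustrative):

```
ds.average_single_ended(
    st_var=1.5,
    ast_var=1.5,
    conf_ints=[0.025, 0.975],
    mc_sample_size=500,        # illustrative; larger samples give smoother CIs
    ci_avg_time_flag1=True,    # arithmetic mean over time
    ci_avg_time_flag2=True,    # inverse-variance-weighted mean over time
)
ds["tmpf_avg1"].plot()
ds["tmpf_avg2"].plot()
```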
+ if mc_remove_set_flag: + remove_mc_set = [ + "r_st", + "r_ast", "gamma_mc", "dalpha_mc", - "alpha_mc", "c_mc", + "x_avg", + "time_avg", "mc", - "r_st", - "r_ast", - "tmpf_mc_set", "ta_mc_arr", ] - for k in drop_var: + remove_mc_set.append("tmpf_avgsec") + remove_mc_set.append("tmpf_mc_set") + remove_mc_set.append("tmpf_mc_avg2_set") + remove_mc_set.append("tmpf_mc_avgx2_set") + remove_mc_set.append("tmpf_mc_avgsec_var") + + for k in remove_mc_set: if k in self: del self[k] - pass - def average_single_ended( + def average_double_ended( self, p_val="p_val", p_cov="p_cov", st_var=None, ast_var=None, + rst_var=None, + rast_var=None, conf_ints=None, mc_sample_size=100, ci_avg_time_flag1=False, @@ -3744,31 +2972,29 @@ def average_single_ended( ci_avg_x_flag2=False, ci_avg_x_sel=None, ci_avg_x_isel=None, - var_only_sections=None, da_random_state=None, mc_remove_set_flag=True, reduce_memory_usage=False, **kwargs, ): """ - Average temperatures from single-ended setups. + Average temperatures from double-ended setups. Four types of averaging are implemented. Please see Example Notebook 16. - Parameters ---------- p_val : array-like, optional Define `p_val`, `p_var`, `p_cov` if you used an external function - for calibration. Has size 2 + `nt`. First value is :math:`\gamma`, - second is :math:`\Delta \\alpha`, others are :math:`C` for each + for calibration. Has size 2 + `nt`. First value is :math:`\\gamma`, + second is :math:`\\Delta \\alpha`, others are :math:`C` for each timestep. If set to False, no uncertainty in the parameters is propagated into the confidence intervals. Similar to the spec sheets of the DTS manufacturers. And similar to passing an array filled with zeros p_cov : array-like, optional The covariances of `p_val`. - st_var, ast_var : float, callable, array-like, optional + st_var, ast_var, rst_var, rast_var : float, callable, array-like, optional The variance of the measurement noise of the Stokes signals in the forward direction. If `float` the variance of the noise from the Stokes detector is described with a single value. @@ -3852,12 +3078,6 @@ def average_single_ended( ci_avg_x_isel : iterable of int Compute ci_avg_time_flag1 and ci_avg_time_flag2 using only a selection of the data - var_only_sections : bool - useful if using the ci_avg_x_flag. Only calculates the var over the - sections, so that the values can be compared with accuracy along the - reference sections. Where the accuracy is the variance of the - residuals between the estimated temperature and temperature of the - water baths. da_random_state For testing purposes. Similar to random seed. The seed for dask. Makes random not so random. 
To produce reproducable results for @@ -3872,16 +3092,62 @@ def average_single_ended( ------- """ - self.check_deprecated_kwargs(kwargs) - if var_only_sections is not None: - raise NotImplementedError() + def create_da_ta2(no, i_splice, direction="fw", chunks=None): + """create mask array mc, o, nt""" - self.conf_int_single_ended( + if direction == "fw": + arr = da.concatenate( + ( + da.zeros( + (1, i_splice, 1), chunks=((1, i_splice, 1)), dtype=bool + ), + da.ones( + (1, no - i_splice, 1), + chunks=(1, no - i_splice, 1), + dtype=bool, + ), + ), + axis=1, + ).rechunk((1, chunks[1], 1)) + else: + arr = da.concatenate( + ( + da.ones((1, i_splice, 1), chunks=(1, i_splice, 1), dtype=bool), + da.zeros( + (1, no - i_splice, 1), + chunks=((1, no - i_splice, 1)), + dtype=bool, + ), + ), + axis=1, + ).rechunk((1, chunks[1], 1)) + return arr + + check_deprecated_kwargs(kwargs) + + if (ci_avg_x_flag1 or ci_avg_x_flag2) and ( + ci_avg_time_flag1 or ci_avg_time_flag2 + ): + raise NotImplementedError( + "Incompatible flags. Can not pick " "the right chunks" + ) + + elif not ( + ci_avg_x_flag1 or ci_avg_x_flag2 or ci_avg_time_flag1 or ci_avg_time_flag2 + ): + raise NotImplementedError("Pick one of the averaging options") + + else: + pass + + self.conf_int_double_ended( p_val=p_val, p_cov=p_cov, st_var=st_var, ast_var=ast_var, + rst_var=rst_var, + rast_var=rast_var, conf_ints=None, mc_sample_size=mc_sample_size, da_random_state=da_random_state, @@ -3890,291 +3156,382 @@ def average_single_ended( **kwargs, ) - if ci_avg_time_sel is not None: - time_dim2 = "time" + "_avg" - x_dim2 = "x" - self.coords[time_dim2] = ( - (time_dim2,), - self["time"].sel(**{"time": ci_avg_time_sel}).data, - ) - self["tmpf_avgsec"] = ( - ("x", time_dim2), - self["tmpf"].sel(**{"time": ci_avg_time_sel}).data, - ) - self["tmpf_mc_set"] = ( - ("mc", "x", time_dim2), - self["tmpf" + "_mc_set"].sel(**{"time": ci_avg_time_sel}).data, - ) - - elif ci_avg_time_isel is not None: - time_dim2 = "time" + "_avg" - x_dim2 = "x" - self.coords[time_dim2] = ( - (time_dim2,), - self["time"].isel(**{"time": ci_avg_time_isel}).data, - ) - self["tmpf_avgsec"] = ( - ("x", time_dim2), - self["tmpf"].isel(**{"time": ci_avg_time_isel}).data, - ) - self["tmpf_mc_set"] = ( - ("mc", "x", time_dim2), - self["tmpf" + "_mc_set"].isel(**{"time": ci_avg_time_isel}).data, - ) - - elif ci_avg_x_sel is not None: - time_dim2 = "time" - x_dim2 = "x_avg" - self.coords[x_dim2] = ((x_dim2,), self.x.sel(x=ci_avg_x_sel).data) - self["tmpf_avgsec"] = ( - (x_dim2, "time"), - self["tmpf"].sel(x=ci_avg_x_sel).data, - ) - self["tmpf_mc_set"] = ( - ("mc", x_dim2, "time"), - self["tmpf_mc_set"].sel(x=ci_avg_x_sel).data, - ) - - elif ci_avg_x_isel is not None: - time_dim2 = "time" - x_dim2 = "x_avg" - self.coords[x_dim2] = ((x_dim2,), self.x.isel(x=ci_avg_x_isel).data) - self["tmpf_avgsec"] = ( - (x_dim2, time_dim2), - self["tmpf"].isel(x=ci_avg_x_isel).data, - ) - self["tmpf_mc_set"] = ( - ("mc", x_dim2, time_dim2), - self["tmpf_mc_set"].isel(x=ci_avg_x_isel).data, - ) - else: - self["tmpf_avgsec"] = self["tmpf"] - x_dim2 = "x" - time_dim2 = "time" + for label in ["tmpf", "tmpb"]: + if ci_avg_time_sel is not None: + time_dim2 = "time" + "_avg" + x_dim2 = "x" + self.coords[time_dim2] = ( + (time_dim2,), + self["time"].sel(**{"time": ci_avg_time_sel}).data, + ) + self[label + "_avgsec"] = ( + ("x", time_dim2), + self[label].sel(**{"time": ci_avg_time_sel}).data, + ) + self[label + "_mc_set"] = ( + ("mc", "x", time_dim2), + self[label + "_mc_set"].sel(**{"time": 
ci_avg_time_sel}).data, + ) - # subtract the mean temperature - q = self["tmpf_mc_set"] - self["tmpf_avgsec"] - self["tmpf_mc" + "_avgsec_var"] = q.var(dim="mc", ddof=1) + elif ci_avg_time_isel is not None: + time_dim2 = "time" + "_avg" + x_dim2 = "x" + self.coords[time_dim2] = ( + (time_dim2,), + self["time"].isel(**{"time": ci_avg_time_isel}).data, + ) + self[label + "_avgsec"] = ( + ("x", time_dim2), + self[label].isel(**{"time": ci_avg_time_isel}).data, + ) + self[label + "_mc_set"] = ( + ("mc", "x", time_dim2), + self[label + "_mc_set"].isel(**{"time": ci_avg_time_isel}).data, + ) - if ci_avg_x_flag1: - # unweighted mean - self["tmpf_avgx1"] = self["tmpf" + "_avgsec"].mean(dim=x_dim2) + elif ci_avg_x_sel is not None: + time_dim2 = "time" + x_dim2 = "x_avg" + self.coords[x_dim2] = ((x_dim2,), self.x.sel(x=ci_avg_x_sel).data) + self[label + "_avgsec"] = ( + (x_dim2, "time"), + self[label].sel(x=ci_avg_x_sel).data, + ) + self[label + "_mc_set"] = ( + ("mc", x_dim2, "time"), + self[label + "_mc_set"].sel(x=ci_avg_x_sel).data, + ) - q = self["tmpf_mc_set"] - self["tmpf_avgsec"] - qvar = q.var(dim=["mc", x_dim2], ddof=1) - self["tmpf_mc_avgx1_var"] = qvar + elif ci_avg_x_isel is not None: + time_dim2 = "time" + x_dim2 = "x_avg" + self.coords[x_dim2] = ((x_dim2,), self.x.isel(x=ci_avg_x_isel).data) + self[label + "_avgsec"] = ( + (x_dim2, time_dim2), + self[label].isel(x=ci_avg_x_isel).data, + ) + self[label + "_mc_set"] = ( + ("mc", x_dim2, time_dim2), + self[label + "_mc_set"].isel(x=ci_avg_x_isel).data, + ) + else: + self[label + "_avgsec"] = self[label] + x_dim2 = "x" + time_dim2 = "time" - if conf_ints: - new_chunks = (len(conf_ints), self["tmpf_mc_set"].chunks[2]) - avg_axis = self["tmpf_mc_set"].get_axis_num(["mc", x_dim2]) - q = self["tmpf_mc_set"].data.map_blocks( - lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), - chunks=new_chunks, # - drop_axis=avg_axis, - # avg dimensions are dropped from input arr - new_axis=0, - ) # The new CI dim is added as firsaxis + memchunk = self[label + "_mc_set"].chunks - self["tmpf_mc_avgx1"] = (("CI", time_dim2), q) + # subtract the mean temperature + q = self[label + "_mc_set"] - self[label + "_avgsec"] + self[label + "_mc" + "_avgsec_var"] = q.var(dim="mc", ddof=1) - if ci_avg_x_flag2: - q = self["tmpf_mc_set"] - self["tmpf_avgsec"] + if ci_avg_x_flag1: + # unweighted mean + self[label + "_avgx1"] = self[label + "_avgsec"].mean(dim=x_dim2) - qvar = q.var(dim=["mc"], ddof=1) + q = self[label + "_mc_set"] - self[label + "_avgsec"] + qvar = q.var(dim=["mc", x_dim2], ddof=1) + self[label + "_mc_avgx1_var"] = qvar - # Inverse-variance weighting - avg_x_var = 1 / (1 / qvar).sum(dim=x_dim2) + if conf_ints: + new_chunks = (len(conf_ints), self[label + "_mc_set"].chunks[2]) + avg_axis = self[label + "_mc_set"].get_axis_num(["mc", x_dim2]) + q = self[label + "_mc_set"].data.map_blocks( + lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), + chunks=new_chunks, # + drop_axis=avg_axis, + # avg dimensions are dropped from input arr + new_axis=0, + ) # The new CI dim is added as firsaxis - self["tmpf_mc_avgx2_var"] = avg_x_var + self[label + "_mc_avgx1"] = (("CI", time_dim2), q) - self["tmpf" + "_mc_avgx2_set"] = (self["tmpf_mc_set"] / qvar).sum( - dim=x_dim2 - ) * avg_x_var - self["tmpf" + "_avgx2"] = self["tmpf" + "_mc_avgx2_set"].mean(dim="mc") + if ci_avg_x_flag2: + q = self[label + "_mc_set"] - self[label + "_avgsec"] - if conf_ints: - new_chunks = (len(conf_ints), self["tmpf_mc_set"].chunks[2]) - avg_axis_avgx = 
self["tmpf_mc_set"].get_axis_num("mc") + qvar = q.var(dim=["mc"], ddof=1) - qq = self["tmpf_mc_avgx2_set"].data.map_blocks( - lambda x: np.percentile(x, q=conf_ints, axis=avg_axis_avgx), - chunks=new_chunks, # - drop_axis=avg_axis_avgx, - # avg dimensions are dropped from input arr - new_axis=0, - dtype=float, - ) # The new CI dimension is added as - # firsaxis - self["tmpf_mc_avgx2"] = (("CI", time_dim2), qq) + # Inverse-variance weighting + avg_x_var = 1 / (1 / qvar).sum(dim=x_dim2) - if ci_avg_time_flag1 is not None: - # unweighted mean - self["tmpf_avg1"] = self["tmpf" + "_avgsec"].mean(dim=time_dim2) + self[label + "_mc_avgx2_var"] = avg_x_var - q = self["tmpf_mc_set"] - self["tmpf_avgsec"] - qvar = q.var(dim=["mc", time_dim2], ddof=1) - self["tmpf_mc_avg1_var"] = qvar + self[label + "_mc_avgx2_set"] = (self[label + "_mc_set"] / qvar).sum( + dim=x_dim2 + ) * avg_x_var + self[label + "_avgx2"] = self[label + "_mc_avgx2_set"].mean(dim="mc") + + if conf_ints: + new_chunks = (len(conf_ints), self[label + "_mc_set"].chunks[2]) + avg_axis_avgx = self[label + "_mc_set"].get_axis_num("mc") + + qq = self[label + "_mc_avgx2_set"].data.map_blocks( + lambda x: np.percentile(x, q=conf_ints, axis=avg_axis_avgx), + chunks=new_chunks, # + drop_axis=avg_axis_avgx, + # avg dimensions are dropped from input arr + new_axis=0, + dtype=float, + ) # The new CI dimension is added as + # firsaxis + self[label + "_mc_avgx2"] = (("CI", time_dim2), qq) + + if ci_avg_time_flag1 is not None: + # unweighted mean + self[label + "_avg1"] = self[label + "_avgsec"].mean(dim=time_dim2) + + q = self[label + "_mc_set"] - self[label + "_avgsec"] + qvar = q.var(dim=["mc", time_dim2], ddof=1) + self[label + "_mc_avg1_var"] = qvar + + if conf_ints: + new_chunks = (len(conf_ints), self[label + "_mc_set"].chunks[1]) + avg_axis = self[label + "_mc_set"].get_axis_num(["mc", time_dim2]) + q = self[label + "_mc_set"].data.map_blocks( + lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), + chunks=new_chunks, # + drop_axis=avg_axis, + # avg dimensions are dropped from input arr + new_axis=0, + ) # The new CI dim is added as firsaxis + + self[label + "_mc_avg1"] = (("CI", x_dim2), q) + + if ci_avg_time_flag2: + q = self[label + "_mc_set"] - self[label + "_avgsec"] + + qvar = q.var(dim=["mc"], ddof=1) + + # Inverse-variance weighting + avg_time_var = 1 / (1 / qvar).sum(dim=time_dim2) + + self[label + "_mc_avg2_var"] = avg_time_var + + self[label + "_mc_avg2_set"] = (self[label + "_mc_set"] / qvar).sum( + dim=time_dim2 + ) * avg_time_var + self[label + "_avg2"] = self[label + "_mc_avg2_set"].mean(dim="mc") + + if conf_ints: + new_chunks = (len(conf_ints), self[label + "_mc_set"].chunks[1]) + avg_axis_avg2 = self[label + "_mc_set"].get_axis_num("mc") + + qq = self[label + "_mc_avg2_set"].data.map_blocks( + lambda x: np.percentile(x, q=conf_ints, axis=avg_axis_avg2), + chunks=new_chunks, # + drop_axis=avg_axis_avg2, + # avg dimensions are dropped from input arr + new_axis=0, + dtype=float, + ) # The new CI dimension is added as + # firsaxis + self[label + "_mc_avg2"] = (("CI", x_dim2), qq) + + # Weighted mean of the forward and backward + tmpw_var = 1 / ( + 1 / self["tmpf_mc" + "_avgsec_var"] + 1 / self["tmpb_mc" + "_avgsec_var"] + ) + + q = ( + self["tmpf_mc_set"] / self["tmpf_mc" + "_avgsec_var"] + + self["tmpb_mc_set"] / self["tmpb_mc" + "_avgsec_var"] + ) * tmpw_var + + self["tmpw" + "_mc_set"] = q # + + # self["tmpw"] = self["tmpw" + '_mc_set'].mean(dim='mc') + self["tmpw" + "_avgsec"] = ( + self["tmpf_avgsec"] / self["tmpf_mc" + 
"_avgsec_var"] + + self["tmpb_avgsec"] / self["tmpb_mc" + "_avgsec_var"] + ) * tmpw_var + + q = self["tmpw" + "_mc_set"] - self["tmpw" + "_avgsec"] + self["tmpw" + "_mc" + "_avgsec_var"] = q.var(dim="mc", ddof=1) + + if ci_avg_time_flag1: + self["tmpw" + "_avg1"] = self["tmpw" + "_avgsec"].mean(dim=time_dim2) + + self["tmpw" + "_mc_avg1_var"] = self["tmpw" + "_mc_set"].var( + dim=["mc", time_dim2] + ) if conf_ints: - new_chunks = (len(conf_ints), self["tmpf_mc_set"].chunks[1]) - avg_axis = self["tmpf_mc_set"].get_axis_num(["mc", time_dim2]) - q = self["tmpf_mc_set"].data.map_blocks( + new_chunks_weighted = ((len(conf_ints),),) + (memchunk[1],) + avg_axis = self["tmpw" + "_mc_set"].get_axis_num(["mc", time_dim2]) + q2 = self["tmpw" + "_mc_set"].data.map_blocks( lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), - chunks=new_chunks, # - drop_axis=avg_axis, - # avg dimensions are dropped from input arr + chunks=new_chunks_weighted, + # Explicitly define output chunks + drop_axis=avg_axis, # avg dimensions are dropped new_axis=0, - ) # The new CI dim is added as firsaxis - - self["tmpf_mc_avg1"] = (("CI", x_dim2), q) + dtype=float, + ) # The new CI dimension is added as + # first axis + self["tmpw" + "_mc_avg1"] = (("CI", x_dim2), q2) if ci_avg_time_flag2: - q = self["tmpf_mc_set"] - self["tmpf_avgsec"] + tmpw_var_avg2 = 1 / ( + 1 / self["tmpf_mc_avg2_var"] + 1 / self["tmpb_mc_avg2_var"] + ) - qvar = q.var(dim=["mc"], ddof=1) + q = ( + self["tmpf_mc_avg2_set"] / self["tmpf_mc_avg2_var"] + + self["tmpb_mc_avg2_set"] / self["tmpb_mc_avg2_var"] + ) * tmpw_var_avg2 - # Inverse-variance weighting - avg_time_var = 1 / (1 / qvar).sum(dim=time_dim2) + self["tmpw" + "_mc_avg2_set"] = q # - self["tmpf_mc_avg2_var"] = avg_time_var + self["tmpw" + "_avg2"] = ( + self["tmpf_avg2"] / self["tmpf_mc_avg2_var"] + + self["tmpb_avg2"] / self["tmpb_mc_avg2_var"] + ) * tmpw_var_avg2 - self["tmpf" + "_mc_avg2_set"] = (self["tmpf_mc_set"] / qvar).sum( - dim=time_dim2 - ) * avg_time_var - self["tmpf_avg2"] = self["tmpf" + "_mc_avg2_set"].mean(dim="mc") + self["tmpw" + "_mc_avg2_var"] = tmpw_var_avg2 if conf_ints: - new_chunks = (len(conf_ints), self["tmpf_mc_set"].chunks[1]) - avg_axis_avg2 = self["tmpf_mc_set"].get_axis_num("mc") - - qq = self["tmpf_mc_avg2_set"].data.map_blocks( + # We first need to know the x-dim-chunk-size + new_chunks_weighted = ((len(conf_ints),),) + (memchunk[1],) + avg_axis_avg2 = self["tmpw" + "_mc_avg2_set"].get_axis_num("mc") + q2 = self["tmpw" + "_mc_avg2_set"].data.map_blocks( lambda x: np.percentile(x, q=conf_ints, axis=avg_axis_avg2), - chunks=new_chunks, # - drop_axis=avg_axis_avg2, - # avg dimensions are dropped from input arr + chunks=new_chunks_weighted, + # Explicitly define output chunks + drop_axis=avg_axis_avg2, # avg dimensions are dropped + new_axis=0, + dtype=float, + ) # The new CI dimension is added as firstax + self["tmpw" + "_mc_avg2"] = (("CI", x_dim2), q2) + + if ci_avg_x_flag1: + self["tmpw" + "_avgx1"] = self["tmpw" + "_avgsec"].mean(dim=x_dim2) + + self["tmpw" + "_mc_avgx1_var"] = self["tmpw" + "_mc_set"].var(dim=x_dim2) + + if conf_ints: + new_chunks_weighted = ((len(conf_ints),),) + (memchunk[2],) + avg_axis = self["tmpw" + "_mc_set"].get_axis_num(["mc", x_dim2]) + q2 = self["tmpw" + "_mc_set"].data.map_blocks( + lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), + chunks=new_chunks_weighted, + # Explicitly define output chunks + drop_axis=avg_axis, # avg dimensions are dropped new_axis=0, dtype=float, ) # The new CI dimension is added as - # firsaxis - 
self["tmpf_mc_avg2"] = (("CI", x_dim2), qq) + # first axis + self["tmpw" + "_mc_avgx1"] = (("CI", time_dim2), q2) + + if ci_avg_x_flag2: + tmpw_var_avgx2 = 1 / ( + 1 / self["tmpf_mc_avgx2_var"] + 1 / self["tmpb_mc_avgx2_var"] + ) + + q = ( + self["tmpf_mc_avgx2_set"] / self["tmpf_mc_avgx2_var"] + + self["tmpb_mc_avgx2_set"] / self["tmpb_mc_avgx2_var"] + ) * tmpw_var_avgx2 + + self["tmpw" + "_mc_avgx2_set"] = q # + + self["tmpw" + "_avgx2"] = ( + self["tmpf_avgx2"] / self["tmpf_mc_avgx2_var"] + + self["tmpb_avgx2"] / self["tmpb_mc_avgx2_var"] + ) * tmpw_var_avgx2 + + self["tmpw" + "_mc_avgx2_var"] = tmpw_var_avgx2 + + if conf_ints: + # We first need to know the x-dim-chunk-size + new_chunks_weighted = ((len(conf_ints),),) + (memchunk[2],) + avg_axis_avgx2 = self["tmpw" + "_mc_avgx2_set"].get_axis_num("mc") + q2 = self["tmpw" + "_mc_avgx2_set"].data.map_blocks( + lambda x: np.percentile(x, q=conf_ints, axis=avg_axis_avgx2), + chunks=new_chunks_weighted, + # Explicitly define output chunks + drop_axis=avg_axis_avgx2, # avg dimensions are dropped + new_axis=0, + dtype=float, + ) # The new CI dimension is added as firstax + self["tmpw" + "_mc_avgx2"] = (("CI", time_dim2), q2) + # Clean up the garbage. All arrays with a Monte Carlo dimension. if mc_remove_set_flag: remove_mc_set = [ "r_st", "r_ast", + "r_rst", + "r_rast", "gamma_mc", - "dalpha_mc", - "c_mc", + "alpha_mc", + "df_mc", + "db_mc", "x_avg", "time_avg", "mc", - "ta_mc_arr", ] - remove_mc_set.append("tmpf_avgsec") - remove_mc_set.append("tmpf_mc_set") - remove_mc_set.append("tmpf_mc_avg2_set") - remove_mc_set.append("tmpf_mc_avgx2_set") - remove_mc_set.append("tmpf_mc_avgsec_var") + + for i in ["tmpf", "tmpb", "tmpw"]: + remove_mc_set.append(i + "_avgsec") + remove_mc_set.append(i + "_mc_set") + remove_mc_set.append(i + "_mc_avg2_set") + remove_mc_set.append(i + "_mc_avgx2_set") + remove_mc_set.append(i + "_mc_avgsec_var") + + if self.trans_att.size: + remove_mc_set.append('talpha"_fw_mc') + remove_mc_set.append('talpha"_bw_mc') for k in remove_mc_set: if k in self: del self[k] pass - def conf_int_double_ended( + def conf_int_single_ended( self, p_val="p_val", p_cov="p_cov", st_var=None, ast_var=None, - rst_var=None, - rast_var=None, conf_ints=None, mc_sample_size=100, - var_only_sections=False, da_random_state=None, mc_remove_set_flag=True, reduce_memory_usage=False, **kwargs, ): - """ + r""" Estimation of the confidence intervals for the temperatures measured - with a double-ended setup. - Double-ended setups require four additional steps to estimate the - confidence intervals for the temperature. First, the variances of the - Stokes and anti-Stokes intensity measurements of the forward and - backward channels are estimated following the steps in - Section 4 [1]_. See `ds.variance_stokes_constant()`. - A Normal distribution is assigned to each - intensity measurement that is centered at the measurement and using the - estimated variance. Second, a multi-variate Normal distribution is - assigned to the estimated parameters using the covariance matrix from - the calibration procedure presented in Section 6 [1]_ (`p_cov`). Third, - Normal distributions are assigned for :math:`A` (`ds.alpha`) - for each location - outside of the reference sections. These distributions are centered - around :math:`A_p` and have variance :math:`\sigma^2\left[A_p\\right]` - given by Equations 44 and 45. Fourth, the distributions are sampled - and :math:`T_{\mathrm{F},m,n}` and :math:`T_{\mathrm{B},m,n}` are - computed with Equations 16 and 17, respectively. 
Fifth, step four is repeated to - compute, e.g., 10,000 realizations (`mc_sample_size`) of :math:`T_{\mathrm{F},m,n}` and - :math:`T_{\mathrm{B},m,n}` to approximate their probability density - functions. Sixth, the standard uncertainties of - :math:`T_{\mathrm{F},m,n}` and :math:`T_{\mathrm{B},m,n}` - (:math:`\sigma\left[T_{\mathrm{F},m,n}\\right]` and - :math:`\sigma\left[T_{\mathrm{B},m,n}\\right]`) are estimated with the - standard deviation of their realizations. Seventh, for each realization - :math:`i` the temperature :math:`T_{m,n,i}` is computed as the weighted - average of :math:`T_{\mathrm{F},m,n,i}` and - :math:`T_{\mathrm{B},m,n,i}`: - - .. math:: - - T_{m,n,i} =\ - \sigma^2\left[T_{m,n}\\right]\left({\\frac{T_{\mathrm{F},m,n,i}}{\ - \sigma^2\left[T_{\mathrm{F},m,n}\\right]} +\ - \\frac{T_{\mathrm{B},m,n,i}}{\ - \sigma^2\left[T_{\mathrm{B},m,n}\\right]}}\\right) - - where - - .. math:: - - \sigma^2\left[T_{m,n}\\right] = \\frac{1}{1 /\ - \sigma^2\left[T_{\mathrm{F},m,n}\\right] + 1 /\ - \sigma^2\left[T_{\mathrm{B},m,n}\\right]} - - The best estimate of the temperature :math:`T_{m,n}` is computed - directly from the best estimates of :math:`T_{\mathrm{F},m,n}` and - :math:`T_{\mathrm{B},m,n}` as: - - .. math:: - T_{m,n} =\ - \sigma^2\left[T_{m,n}\\right]\left({\\frac{T_{\mathrm{F},m,n}}{\ - \sigma^2\left[T_{\mathrm{F},m,n}\\right]} + \\frac{T_{\mathrm{B},m,n}}{\ - \sigma^2\left[T_{\mathrm{B},m,n}\\right]}}\\right) + with a single-ended setup. It consists of five steps. First, the variances + of the Stokes and anti-Stokes intensity measurements are estimated + following the steps in Section 4 [1]_. A Normal + distribution is assigned to each intensity measurement that is centered + at the measurement and using the estimated variance. Second, a multi- + variate Normal distribution is assigned to the estimated parameters + using the covariance matrix from the calibration procedure presented in + Section 5 [1]_. Third, the distributions are sampled, and the + temperature is computed with Equation 12 [1]_. Fourth, step + three is repeated, e.g., 10,000 times for each location and for each + time. The resulting 10,000 realizations of the temperatures + approximate the probability density functions of the estimated + temperature at that location and time. Fifth, the standard uncertainties + are computed with the standard deviations of the realizations of the + temperatures, and the 95\% confidence intervals are computed from the + 2.5\% and 97.5\% percentiles of the realizations of the temperatures. - Alternatively, the best estimate of :math:`T_{m,n}` can be approximated - with the mean of the :math:`T_{m,n,i}` values. Finally, the 95\% - confidence interval for :math:`T_{m,n}` are estimated with the 2.5\% and - 97.5\% percentiles of :math:`T_{m,n,i}`. Parameters ---------- p_val : array-like, optional Define `p_val`, `p_var`, `p_cov` if you used an external function - for calibration. Has size `1 + 2 * nt + nx + 2 * nt * nta`. - First value is :math:`\gamma`, then `nt` times - :math:`D_\mathrm{F}`, then `nt` times - :math:`D_\mathrm{B}`, then for each location :math:`D_\mathrm{B}`, - then for each connector that introduces directional attenuation two - parameters per time step. - p_cov : array-like, optional - The covariances of `p_val`. Square matrix. + for calibration. Has size 2 + `nt`. First value is :math:`\gamma`, + second is :math:`\Delta \\alpha`, others are :math:`C` for each + timestep. 
If set to False, no uncertainty in the parameters is propagated into the confidence intervals. Similar to the spec sheets of the DTS - manufacturers. And similar to passing an array filled with zeros. - st_var, ast_var, rst_var, rast_var : float, callable, array-like, optional + manufacturers. And similar to passing an array filled with zeros + p_cov : array-like, optional + The covariances of `p_val`. + st_var, ast_var : float, callable, array-like, optional The variance of the measurement noise of the Stokes signals in the forward direction. If `float` the variance of the noise from the Stokes detector is described with a single value. @@ -4185,16 +3542,11 @@ def conf_int_double_ended( x. Required if method is wls. conf_ints : iterable object of float A list with the confidence boundaries that are calculated. Valid - values are between [0, 1]. + values are between + [0, 1]. mc_sample_size : int Size of the monte carlo parameter set used to calculate the confidence interval - var_only_sections : bool - useful if using the ci_avg_x_flag. Only calculates the var over the - sections, so that the values can be compared with accuracy along the - reference sections. Where the accuracy is the variance of the - residuals between the estimated temperature and temperature of the - water baths. da_random_state For testing purposes. Similar to random seed. The seed for dask. Makes random not so random. To produce reproducable results for @@ -4205,8 +3557,6 @@ def conf_int_double_ended( reduce_memory_usage : bool Use less memory but at the expense of longer computation time - Returns - ------- References ---------- @@ -4214,61 +3564,61 @@ def conf_int_double_ended( of Temperature and Associated Uncertainty from Fiber-Optic Raman- Spectrum Distributed Temperature Sensing. Sensors, 20(8), 2235. 
https://doi.org/10.3390/s20082235 - """ - - def create_da_ta2(no, i_splice, direction="fw", chunks=None): - """create mask array mc, o, nt""" - - if direction == "fw": - arr = da.concatenate( - ( - da.zeros( - (1, i_splice, 1), chunks=((1, i_splice, 1)), dtype=bool - ), - da.ones( - (1, no - i_splice, 1), - chunks=(1, no - i_splice, 1), - dtype=bool, - ), - ), - axis=1, - ).rechunk((1, chunks[1], 1)) - else: - arr = da.concatenate( - ( - da.ones((1, i_splice, 1), chunks=(1, i_splice, 1), dtype=bool), - da.zeros( - (1, no - i_splice, 1), - chunks=((1, no - i_splice, 1)), - dtype=bool, - ), - ), - axis=1, - ).rechunk((1, chunks[1], 1)) - return arr - - self.check_deprecated_kwargs(kwargs) + check_deprecated_kwargs(kwargs) if da_random_state: - # In testing environments - assert isinstance(da_random_state, da.random.RandomState) state = da_random_state else: state = da.random.RandomState() + no, nt = self.st.data.shape + if "trans_att" in self.keys(): + nta = self.trans_att.size + else: + nta = 0 + + assert isinstance(p_val, (str, np.ndarray, np.generic)) + if isinstance(p_val, str): + p_val = self[p_val].data + + npar = p_val.size + + # number of parameters + if npar == nt + 2 + nt * nta: + fixed_alpha = False + elif npar == 1 + no + nt + nt * nta: + fixed_alpha = True + else: + raise Exception("The size of `p_val` is not what I expected") + + self.coords["mc"] = range(mc_sample_size) + if conf_ints: - assert "tmpw", ( - "Current implementation requires you to " - 'define "tmpw" when estimating confidence ' - "intervals" - ) + self.coords["CI"] = conf_ints - no, nt = self.st.shape - nta = self.trans_att.size - npar = 1 + 2 * nt + no + nt * 2 * nta # number of parameters + # WLS + if isinstance(p_cov, str): + p_cov = self[p_cov].data + assert p_cov.shape == (npar, npar) - rsize = (mc_sample_size, no, nt) + p_mc = sst.multivariate_normal.rvs(mean=p_val, cov=p_cov, size=mc_sample_size) + + if fixed_alpha: + self["alpha_mc"] = (("mc", "x"), p_mc[:, 1 : no + 1]) + self["c_mc"] = (("mc", "time"), p_mc[:, 1 + no : 1 + no + nt]) + else: + self["dalpha_mc"] = (("mc",), p_mc[:, 1]) + self["c_mc"] = (("mc", "time"), p_mc[:, 2 : nt + 2]) + + self["gamma_mc"] = (("mc",), p_mc[:, 0]) + if nta: + self["ta_mc"] = ( + ("mc", "trans_att", "time"), + np.reshape(p_mc[:, -nt * nta :], (mc_sample_size, nta, nt)), + ) + + rsize = (self.mc.size, self.x.size, self.time.size) if reduce_memory_usage: memchunk = da.ones( @@ -4279,164 +3629,28 @@ def create_da_ta2(no, i_splice, direction="fw", chunks=None): (mc_sample_size, no, nt), chunks={0: -1, 1: "auto", 2: "auto"} ).chunks - self.coords["mc"] = range(mc_sample_size) - if conf_ints: - self.coords["CI"] = conf_ints - - assert isinstance(p_val, (str, np.ndarray, np.generic)) - if isinstance(p_val, str): - p_val = self[p_val].values - assert p_val.shape == (npar,), ( - "Did you set 'talpha' as " - "keyword argument of the " - "conf_int_double_ended() function?" 
- ) + # Draw from the normal distributions for the Stokes intensities + for k, st_labeli, st_vari in zip( + ["r_st", "r_ast"], ["st", "ast"], [st_var, ast_var] + ): + # Load the mean as chunked Dask array, otherwise eats memory + if type(self[st_labeli].data) == da.core.Array: + loc = da.asarray(self[st_labeli].data, chunks=memchunk[1:]) + else: + loc = da.from_array(self[st_labeli].data, chunks=memchunk[1:]) - assert isinstance(p_cov, (str, np.ndarray, np.generic, bool)) + # Make sure variance is of size (no, nt) + if np.size(st_vari) > 1: + if st_vari.shape == self[st_labeli].shape: + pass + else: + st_vari = np.broadcast_to(st_vari, (no, nt)) + else: + pass - if isinstance(p_cov, bool) and not p_cov: - # Exclude parameter uncertainty if p_cov == False - gamma = p_val[0] - d_fw = p_val[1 : nt + 1] - d_bw = p_val[1 + nt : 2 * nt + 1] - alpha = p_val[2 * nt + 1 : 2 * nt + 1 + no] - - self["gamma_mc"] = (tuple(), gamma) - self["alpha_mc"] = (("x",), alpha) - self["df_mc"] = (("time",), d_fw) - self["db_mc"] = (("time",), d_bw) - - if nta: - ta = p_val[2 * nt + 1 + no :].reshape((nt, 2, nta), order="F") - ta_fw = ta[:, 0, :] - ta_bw = ta[:, 1, :] - - ta_fw_arr = np.zeros((no, nt)) - for tai, taxi in zip(ta_fw.T, self.coords["trans_att"].values): - ta_fw_arr[self.x.values >= taxi] = ( - ta_fw_arr[self.x.values >= taxi] + tai - ) - - ta_bw_arr = np.zeros((no, nt)) - for tai, taxi in zip(ta_bw.T, self.coords["trans_att"].values): - ta_bw_arr[self.x.values < taxi] = ( - ta_bw_arr[self.x.values < taxi] + tai - ) - - self["talpha_fw_mc"] = (("x", "time"), ta_fw_arr) - self["talpha_bw_mc"] = (("x", "time"), ta_bw_arr) - - elif isinstance(p_cov, bool) and p_cov: - raise NotImplementedError("Not an implemented option. Check p_cov argument") - - else: - # WLS - if isinstance(p_cov, str): - p_cov = self[p_cov].values - assert p_cov.shape == (npar, npar) - - ix_sec = self.ufunc_per_section(x_indices=True, calc_per="all") - nx_sec = ix_sec.size - from_i = np.concatenate( - ( - np.arange(1 + 2 * nt), - 1 + 2 * nt + ix_sec, - np.arange(1 + 2 * nt + no, 1 + 2 * nt + no + nt * 2 * nta), - ) - ) - iox_sec1, iox_sec2 = np.meshgrid(from_i, from_i, indexing="ij") - po_val = p_val[from_i] - po_cov = p_cov[iox_sec1, iox_sec2] - - po_mc = sst.multivariate_normal.rvs( - mean=po_val, cov=po_cov, size=mc_sample_size - ) - - gamma = po_mc[:, 0] - d_fw = po_mc[:, 1 : nt + 1] - d_bw = po_mc[:, 1 + nt : 2 * nt + 1] - - self["gamma_mc"] = (("mc",), gamma) - self["df_mc"] = (("mc", "time"), d_fw) - self["db_mc"] = (("mc", "time"), d_bw) - - # calculate alpha seperately - alpha = np.zeros((mc_sample_size, no), dtype=float) - alpha[:, ix_sec] = po_mc[:, 1 + 2 * nt : 1 + 2 * nt + nx_sec] - - not_ix_sec = np.array([i for i in range(no) if i not in ix_sec]) - - if np.any(not_ix_sec): - not_alpha_val = p_val[2 * nt + 1 + not_ix_sec] - not_alpha_var = p_cov[2 * nt + 1 + not_ix_sec, 2 * nt + 1 + not_ix_sec] - - not_alpha_mc = np.random.normal( - loc=not_alpha_val, - scale=not_alpha_var**0.5, - size=(mc_sample_size, not_alpha_val.size), - ) - - alpha[:, not_ix_sec] = not_alpha_mc - - self["alpha_mc"] = (("mc", "x"), alpha) - - if nta: - ta = po_mc[:, 2 * nt + 1 + nx_sec :].reshape( - (mc_sample_size, nt, 2, nta), order="F" - ) - ta_fw = ta[:, :, 0, :] - ta_bw = ta[:, :, 1, :] - - ta_fw_arr = da.zeros( - (mc_sample_size, no, nt), chunks=memchunk, dtype=float - ) - for tai, taxi in zip( - ta_fw.swapaxes(0, 2), self.coords["trans_att"].values - ): - # iterate over the splices - i_splice = sum(self.x.values < taxi) - mask = 
create_da_ta2(no, i_splice, direction="fw", chunks=memchunk) - - ta_fw_arr += mask * tai.T[:, None, :] - - ta_bw_arr = da.zeros( - (mc_sample_size, no, nt), chunks=memchunk, dtype=float - ) - for tai, taxi in zip( - ta_bw.swapaxes(0, 2), self.coords["trans_att"].values - ): - i_splice = sum(self.x.values < taxi) - mask = create_da_ta2(no, i_splice, direction="bw", chunks=memchunk) - - ta_bw_arr += mask * tai.T[:, None, :] - - self["talpha_fw_mc"] = (("mc", "x", "time"), ta_fw_arr) - self["talpha_bw_mc"] = (("mc", "x", "time"), ta_bw_arr) - - # Draw from the normal distributions for the Stokes intensities - for k, st_labeli, st_vari in zip( - ["r_st", "r_ast", "r_rst", "r_rast"], - ["st", "ast", "rst", "rast"], - [st_var, ast_var, rst_var, rast_var], - ): - # Load the mean as chunked Dask array, otherwise eats memory - if type(self[st_labeli].data) == da.core.Array: - loc = da.asarray(self[st_labeli].data, chunks=memchunk[1:]) - else: - loc = da.from_array(self[st_labeli].data, chunks=memchunk[1:]) - - # Make sure variance is of size (no, nt) - if np.size(st_vari) > 1: - if st_vari.shape == self[st_labeli].shape: - pass - else: - st_vari = np.broadcast_to(st_vari, (no, nt)) - else: - pass - - # Load variance as chunked Dask array, otherwise eats memory - if type(st_vari) == da.core.Array: - st_vari_da = da.asarray(st_vari, chunks=memchunk[1:]) + # Load variance as chunked Dask array, otherwise eats memory + if type(st_vari) == da.core.Array: + st_vari_da = da.asarray(st_vari, chunks=memchunk[1:]) elif callable(st_vari) and type(self[st_labeli].data) == da.core.Array: st_vari_da = da.asarray( @@ -4461,134 +3675,84 @@ def create_da_ta2(no, i_splice, direction="fw", chunks=None): ), ) - for label in ["tmpf", "tmpb"]: - if "tmpw" or label: - if label == "tmpf": - if nta: - self["tmpf_mc_set"] = ( - self["gamma_mc"] - / ( - np.log(self["r_st"] / self["r_ast"]) - + self["df_mc"] - + self["alpha_mc"] - + self["talpha_fw_mc"] - ) - - 273.15 - ) - else: - self["tmpf_mc_set"] = ( - self["gamma_mc"] - / ( - np.log(self["r_st"] / self["r_ast"]) - + self["df_mc"] - + self["alpha_mc"] - ) - - 273.15 - ) - else: - if nta: - self["tmpb_mc_set"] = ( - self["gamma_mc"] - / ( - np.log(self["r_rst"] / self["r_rast"]) - + self["db_mc"] - - self["alpha_mc"] - + self["talpha_bw_mc"] - ) - - 273.15 - ) - else: - self["tmpb_mc_set"] = ( - self["gamma_mc"] - / ( - np.log(self["r_rst"] / self["r_rast"]) - + self["db_mc"] - - self["alpha_mc"] - ) - - 273.15 - ) - - if var_only_sections: - # sets the values outside the reference sections to NaN - xi = self.ufunc_per_section(x_indices=True, calc_per="all") - x_mask_ = [True if ix in xi else False for ix in range(self.x.size)] - x_mask = np.reshape(x_mask_, (1, -1, 1)) - self[label + "_mc_set"] = self[label + "_mc_set"].where(x_mask) + ta_arr = np.zeros((mc_sample_size, no, nt)) - # subtract the mean temperature - q = self[label + "_mc_set"] - self[label] - self[label + "_mc_var"] = q.var(dim="mc", ddof=1) + if nta: + for ii, ta in enumerate(self["ta_mc"]): + for tai, taxi in zip(ta.values, self.trans_att.values): + ta_arr[ii, self.x.values >= taxi] = ( + ta_arr[ii, self.x.values >= taxi] + tai + ) + self["ta_mc_arr"] = (("mc", "x", "time"), ta_arr) - if conf_ints: - new_chunks = list(self[label + "_mc_set"].chunks) - new_chunks[0] = (len(conf_ints),) - avg_axis = self[label + "_mc_set"].get_axis_num("mc") - q = self[label + "_mc_set"].data.map_blocks( - lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), - chunks=new_chunks, # - drop_axis=avg_axis, - # avg 
dimensions are dropped from input arr - new_axis=0, - ) # The new CI dimension is added as firsaxis + if fixed_alpha: + self["tmpf_mc_set"] = ( + self["gamma_mc"] + / ( + ( + np.log(self["r_st"]) + - np.log(self["r_ast"]) + + (self["c_mc"] + self["ta_mc_arr"]) + ) + + self["alpha_mc"] + ) + - 273.15 + ) + else: + self["tmpf_mc_set"] = ( + self["gamma_mc"] + / ( + ( + np.log(self["r_st"]) + - np.log(self["r_ast"]) + + (self["c_mc"] + self["ta_mc_arr"]) + ) + + (self["dalpha_mc"] * self.x) + ) + - 273.15 + ) - self[label + "_mc"] = (("CI", "x", "time"), q) + avg_dims = ["mc"] - # Weighted mean of the forward and backward - tmpw_var = 1 / (1 / self["tmpf_mc_var"] + 1 / self["tmpb_mc_var"]) + avg_axis = self["tmpf_mc_set"].get_axis_num(avg_dims) - q = ( - self["tmpf_mc_set"] / self["tmpf_mc_var"] - + self["tmpb_mc_set"] / self["tmpb_mc_var"] - ) * tmpw_var + self["tmpf_mc_var"] = (self["tmpf_mc_set"] - self["tmpf"]).var( + dim=avg_dims, ddof=1 + ) - self["tmpw" + "_mc_set"] = q # + if conf_ints: + new_chunks = ((len(conf_ints),),) + self["tmpf_mc_set"].chunks[1:] - self["tmpw"] = ( - self["tmpf"] / self["tmpf_mc_var"] + self["tmpb"] / self["tmpb_mc_var"] - ) * tmpw_var + qq = self["tmpf_mc_set"] - q = self["tmpw" + "_mc_set"] - self["tmpw"] - self["tmpw" + "_mc_var"] = q.var(dim="mc", ddof=1) - - # Calculate the CI of the weighted MC_set - if conf_ints: - new_chunks_weighted = ((len(conf_ints),),) + memchunk[1:] - avg_axis = self["tmpw" + "_mc_set"].get_axis_num("mc") - q2 = self["tmpw" + "_mc_set"].data.map_blocks( + q = qq.data.map_blocks( lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), - chunks=new_chunks_weighted, # Explicitly define output chunks - drop_axis=avg_axis, # avg dimensions are dropped + chunks=new_chunks, # + drop_axis=avg_axis, # avg dimesnions are dropped from input arr new_axis=0, - dtype=float, ) # The new CI dimension is added as first axis - self["tmpw" + "_mc"] = (("CI", "x", "time"), q2) - # Clean up the garbage. All arrays with a Monte Carlo dimension. + self["tmpf_mc"] = (("CI", "x", "time"), q) + if mc_remove_set_flag: - remove_mc_set = [ - "r_st", - "r_ast", - "r_rst", - "r_rast", + drop_var = [ "gamma_mc", + "dalpha_mc", "alpha_mc", - "df_mc", - "db_mc", + "c_mc", + "mc", + "r_st", + "r_ast", + "tmpf_mc_set", + "ta_mc_arr", ] - - for i in ["tmpf", "tmpb", "tmpw"]: - remove_mc_set.append(i + "_mc_set") - - if nta: - remove_mc_set.append('talpha"_fw_mc') - remove_mc_set.append('talpha"_bw_mc') - - for k in remove_mc_set: + for k in drop_var: if k in self: del self[k] + pass - def average_double_ended( + def conf_int_double_ended( self, p_val="p_val", p_cov="p_cov", @@ -4598,36 +3762,91 @@ def average_double_ended( rast_var=None, conf_ints=None, mc_sample_size=100, - ci_avg_time_flag1=False, - ci_avg_time_flag2=False, - ci_avg_time_sel=None, - ci_avg_time_isel=None, - ci_avg_x_flag1=False, - ci_avg_x_flag2=False, - ci_avg_x_sel=None, - ci_avg_x_isel=None, + var_only_sections=False, da_random_state=None, mc_remove_set_flag=True, reduce_memory_usage=False, **kwargs, ): - """ - Average temperatures from double-ended setups. + r""" + Estimation of the confidence intervals for the temperatures measured + with a double-ended setup. + Double-ended setups require four additional steps to estimate the + confidence intervals for the temperature. First, the variances of the + Stokes and anti-Stokes intensity measurements of the forward and + backward channels are estimated following the steps in + Section 4 [1]_. See `ds.variance_stokes_constant()`. 
+ A Normal distribution is assigned to each + intensity measurement that is centered at the measurement and using the + estimated variance. Second, a multi-variate Normal distribution is + assigned to the estimated parameters using the covariance matrix from + the calibration procedure presented in Section 6 [1]_ (`p_cov`). Third, + Normal distributions are assigned for :math:`A` (`ds.alpha`) + for each location + outside of the reference sections. These distributions are centered + around :math:`A_p` and have variance :math:`\sigma^2\left[A_p\\right]` + given by Equations 44 and 45. Fourth, the distributions are sampled + and :math:`T_{\mathrm{F},m,n}` and :math:`T_{\mathrm{B},m,n}` are + computed with Equations 16 and 17, respectively. Fifth, step four is repeated to + compute, e.g., 10,000 realizations (`mc_sample_size`) of :math:`T_{\mathrm{F},m,n}` and + :math:`T_{\mathrm{B},m,n}` to approximate their probability density + functions. Sixth, the standard uncertainties of + :math:`T_{\mathrm{F},m,n}` and :math:`T_{\mathrm{B},m,n}` + (:math:`\sigma\left[T_{\mathrm{F},m,n}\\right]` and + :math:`\sigma\left[T_{\mathrm{B},m,n}\\right]`) are estimated with the + standard deviation of their realizations. Seventh, for each realization + :math:`i` the temperature :math:`T_{m,n,i}` is computed as the weighted + average of :math:`T_{\mathrm{F},m,n,i}` and + :math:`T_{\mathrm{B},m,n,i}`: - Four types of averaging are implemented. Please see Example Notebook 16. + .. math:: + + T_{m,n,i} =\ + \sigma^2\left[T_{m,n}\\right]\left({\\frac{T_{\mathrm{F},m,n,i}}{\ + \sigma^2\left[T_{\mathrm{F},m,n}\\right]} +\ + \\frac{T_{\mathrm{B},m,n,i}}{\ + \sigma^2\left[T_{\mathrm{B},m,n}\\right]}}\\right) + + where + + .. math:: + + \sigma^2\left[T_{m,n}\\right] = \\frac{1}{1 /\ + \sigma^2\left[T_{\mathrm{F},m,n}\\right] + 1 /\ + \sigma^2\left[T_{\mathrm{B},m,n}\\right]} + + The best estimate of the temperature :math:`T_{m,n}` is computed + directly from the best estimates of :math:`T_{\mathrm{F},m,n}` and + :math:`T_{\mathrm{B},m,n}` as: + + .. math:: + T_{m,n} =\ + \sigma^2\left[T_{m,n}\\right]\left({\\frac{T_{\mathrm{F},m,n}}{\ + \sigma^2\left[T_{\mathrm{F},m,n}\\right]} + \\frac{T_{\mathrm{B},m,n}}{\ + \sigma^2\left[T_{\mathrm{B},m,n}\\right]}}\\right) + + Alternatively, the best estimate of :math:`T_{m,n}` can be approximated + with the mean of the :math:`T_{m,n,i}` values. Finally, the 95\% + confidence interval for :math:`T_{m,n}` are estimated with the 2.5\% and + 97.5\% percentiles of :math:`T_{m,n,i}`. + + Assumes sections are set. Parameters ---------- p_val : array-like, optional Define `p_val`, `p_var`, `p_cov` if you used an external function - for calibration. Has size 2 + `nt`. First value is :math:`\\gamma`, - second is :math:`\\Delta \\alpha`, others are :math:`C` for each - timestep. + for calibration. Has size `1 + 2 * nt + nx + 2 * nt * nta`. + First value is :math:`\gamma`, then `nt` times + :math:`D_\mathrm{F}`, then `nt` times + :math:`D_\mathrm{B}`, then for each location :math:`D_\mathrm{B}`, + then for each connector that introduces directional attenuation two + parameters per time step. + p_cov : array-like, optional + The covariances of `p_val`. Square matrix. If set to False, no uncertainty in the parameters is propagated into the confidence intervals. Similar to the spec sheets of the DTS - manufacturers. And similar to passing an array filled with zeros - p_cov : array-like, optional - The covariances of `p_val`. + manufacturers. And similar to passing an array filled with zeros. 
st_var, ast_var, rst_var, rast_var : float, callable, array-like, optional The variance of the measurement noise of the Stokes signals in the forward direction. If `float` the variance of the noise from the @@ -4639,79 +3858,16 @@ def average_double_ended( x. Required if method is wls. conf_ints : iterable object of float A list with the confidence boundaries that are calculated. Valid - values are between - [0, 1]. + values are between [0, 1]. mc_sample_size : int Size of the monte carlo parameter set used to calculate the confidence interval - ci_avg_time_flag1 : bool - The confidence intervals differ each time step. Assumes the - temperature varies during the measurement period. Computes the - arithmic temporal mean. If you would like to know the confidence - interfal of: - (1) a single additional measurement. So you can state "if another - measurement were to be taken, it would have this ci" - (2) all measurements. So you can state "The temperature remained - during the entire measurement period between these ci bounds". - Adds "tmpw" + '_avg1' and "tmpw" + '_mc_avg1_var' to the - DataStore. If `conf_ints` are set, also the confidence intervals - `_mc_avg1` are added to the DataStore. Works independently of the - ci_avg_time_flag2 and ci_avg_x_flag. - ci_avg_time_flag2 : bool - The confidence intervals differ each time step. Assumes the - temperature remains constant during the measurement period. - Computes the inverse-variance-weighted-temporal-mean temperature - and its uncertainty. - If you would like to know the confidence interfal of: - (1) I want to estimate a background temperature with confidence - intervals. I hereby assume the temperature does not change over - time and average all measurements to get a better estimate of the - background temperature. - Adds "tmpw" + '_avg2' and "tmpw" + '_mc_avg2_var' to the - DataStore. If `conf_ints` are set, also the confidence intervals - `_mc_avg2` are added to the DataStore. Works independently of the - ci_avg_time_flag1 and ci_avg_x_flag. - ci_avg_time_sel : slice - Compute ci_avg_time_flag1 and ci_avg_time_flag2 using only a - selection of the data - ci_avg_time_isel : iterable of int - Compute ci_avg_time_flag1 and ci_avg_time_flag2 using only a - selection of the data - ci_avg_x_flag1 : bool - The confidence intervals differ at each location. Assumes the - temperature varies over `x` and over time. Computes the - arithmic spatial mean. If you would like to know the confidence - interfal of: - (1) a single additional measurement location. So you can state "if - another measurement location were to be taken, - it would have this ci" - (2) all measurement locations. So you can state "The temperature - along the fiber remained between these ci bounds". - Adds "tmpw" + '_avgx1' and "tmpw" + '_mc_avgx1_var' to the - DataStore. If `conf_ints` are set, also the confidence intervals - `_mc_avgx1` are added to the DataStore. Works independently of the - ci_avg_time_flag1, ci_avg_time_flag2 and ci_avg_x2_flag. - ci_avg_x_flag2 : bool - The confidence intervals differ at each location. Assumes the - temperature is the same at each location but varies over time. - Computes the inverse-variance-weighted-spatial-mean temperature - and its uncertainty. - If you would like to know the confidence interfal of: - (1) I have put a lot of fiber in water, and I know that the - temperature variation in the water is much smaller than along - other parts of the fiber. 
And I would like to average the - measurements from multiple locations to improve the estimated - temperature. - Adds "tmpw" + '_avg2' and "tmpw" + '_mc_avg2_var' to the - DataStore. If `conf_ints` are set, also the confidence intervals - `_mc_avg2` are added to the DataStore. Works independently of the - ci_avg_time_flag1 and ci_avg_x_flag. - ci_avg_x_sel : slice - Compute ci_avg_time_flag1 and ci_avg_time_flag2 using only a - selection of the data - ci_avg_x_isel : iterable of int - Compute ci_avg_time_flag1 and ci_avg_time_flag2 using only a - selection of the data + var_only_sections : bool + useful if using the ci_avg_x_flag. Only calculates the var over the + sections, so that the values can be compared with accuracy along the + reference sections. Where the accuracy is the variance of the + residuals between the estimated temperature and temperature of the + water baths. da_random_state For testing purposes. Similar to random seed. The seed for dask. Makes random not so random. To produce reproducable results for @@ -4725,6 +3881,13 @@ def average_double_ended( Returns ------- + References + ---------- + .. [1] des Tombe, B., Schilperoort, B., & Bakker, M. (2020). Estimation + of Temperature and Associated Uncertainty from Fiber-Optic Raman- + Spectrum Distributed Temperature Sensing. Sensors, 20(8), 2235. + https://doi.org/10.3390/s20082235 + """ def create_da_ta2(no, i_splice, direction="fw", chunks=None): @@ -4758,335 +3921,320 @@ def create_da_ta2(no, i_splice, direction="fw", chunks=None): ).rechunk((1, chunks[1], 1)) return arr - self.check_deprecated_kwargs(kwargs) + check_deprecated_kwargs(kwargs) - if (ci_avg_x_flag1 or ci_avg_x_flag2) and ( - ci_avg_time_flag1 or ci_avg_time_flag2 - ): - raise NotImplementedError( - "Incompatible flags. Can not pick " "the right chunks" + if da_random_state: + # In testing environments + assert isinstance(da_random_state, da.random.RandomState) + state = da_random_state + else: + state = da.random.RandomState() + + if conf_ints: + assert "tmpw", ( + "Current implementation requires you to " + 'define "tmpw" when estimating confidence ' + "intervals" ) - elif not ( - ci_avg_x_flag1 or ci_avg_x_flag2 or ci_avg_time_flag1 or ci_avg_time_flag2 - ): - raise NotImplementedError("Pick one of the averaging options") + no, nt = self.st.shape + nta = self.trans_att.size + npar = 1 + 2 * nt + no + nt * 2 * nta # number of parameters + + rsize = (mc_sample_size, no, nt) + if reduce_memory_usage: + memchunk = da.ones( + (mc_sample_size, no, nt), chunks={0: -1, 1: 1, 2: "auto"} + ).chunks else: - pass + memchunk = da.ones( + (mc_sample_size, no, nt), chunks={0: -1, 1: "auto", 2: "auto"} + ).chunks - self.conf_int_double_ended( - p_val=p_val, - p_cov=p_cov, - st_var=st_var, - ast_var=ast_var, - rst_var=rst_var, - rast_var=rast_var, - conf_ints=None, - mc_sample_size=mc_sample_size, - da_random_state=da_random_state, - mc_remove_set_flag=False, - reduce_memory_usage=reduce_memory_usage, - **kwargs, + self.coords["mc"] = range(mc_sample_size) + if conf_ints: + self.coords["CI"] = conf_ints + + assert isinstance(p_val, (str, np.ndarray, np.generic)) + if isinstance(p_val, str): + p_val = self[p_val].values + assert p_val.shape == (npar,), ( + "Did you set 'talpha' as " + "keyword argument of the " + "conf_int_double_ended() function?" 
) - for label in ["tmpf", "tmpb"]: - if ci_avg_time_sel is not None: - time_dim2 = "time" + "_avg" - x_dim2 = "x" - self.coords[time_dim2] = ( - (time_dim2,), - self["time"].sel(**{"time": ci_avg_time_sel}).data, - ) - self[label + "_avgsec"] = ( - ("x", time_dim2), - self[label].sel(**{"time": ci_avg_time_sel}).data, - ) - self[label + "_mc_set"] = ( - ("mc", "x", time_dim2), - self[label + "_mc_set"].sel(**{"time": ci_avg_time_sel}).data, - ) + assert isinstance(p_cov, (str, np.ndarray, np.generic, bool)) - elif ci_avg_time_isel is not None: - time_dim2 = "time" + "_avg" - x_dim2 = "x" - self.coords[time_dim2] = ( - (time_dim2,), - self["time"].isel(**{"time": ci_avg_time_isel}).data, - ) - self[label + "_avgsec"] = ( - ("x", time_dim2), - self[label].isel(**{"time": ci_avg_time_isel}).data, - ) - self[label + "_mc_set"] = ( - ("mc", "x", time_dim2), - self[label + "_mc_set"].isel(**{"time": ci_avg_time_isel}).data, - ) + if isinstance(p_cov, bool) and not p_cov: + # Exclude parameter uncertainty if p_cov == False + gamma = p_val[0] + d_fw = p_val[1 : nt + 1] + d_bw = p_val[1 + nt : 2 * nt + 1] + alpha = p_val[2 * nt + 1 : 2 * nt + 1 + no] - elif ci_avg_x_sel is not None: - time_dim2 = "time" - x_dim2 = "x_avg" - self.coords[x_dim2] = ((x_dim2,), self.x.sel(x=ci_avg_x_sel).data) - self[label + "_avgsec"] = ( - (x_dim2, "time"), - self[label].sel(x=ci_avg_x_sel).data, - ) - self[label + "_mc_set"] = ( - ("mc", x_dim2, "time"), - self[label + "_mc_set"].sel(x=ci_avg_x_sel).data, + self["gamma_mc"] = (tuple(), gamma) + self["alpha_mc"] = (("x",), alpha) + self["df_mc"] = (("time",), d_fw) + self["db_mc"] = (("time",), d_bw) + + if nta: + ta = p_val[2 * nt + 1 + no :].reshape((nt, 2, nta), order="F") + ta_fw = ta[:, 0, :] + ta_bw = ta[:, 1, :] + + ta_fw_arr = np.zeros((no, nt)) + for tai, taxi in zip(ta_fw.T, self.coords["trans_att"].values): + ta_fw_arr[self.x.values >= taxi] = ( + ta_fw_arr[self.x.values >= taxi] + tai + ) + + ta_bw_arr = np.zeros((no, nt)) + for tai, taxi in zip(ta_bw.T, self.coords["trans_att"].values): + ta_bw_arr[self.x.values < taxi] = ( + ta_bw_arr[self.x.values < taxi] + tai + ) + + self["talpha_fw_mc"] = (("x", "time"), ta_fw_arr) + self["talpha_bw_mc"] = (("x", "time"), ta_bw_arr) + + elif isinstance(p_cov, bool) and p_cov: + raise NotImplementedError("Not an implemented option. 
Check p_cov argument") + + else: + # WLS + if isinstance(p_cov, str): + p_cov = self[p_cov].values + assert p_cov.shape == (npar, npar) + + ix_sec = self.ufunc_per_section(x_indices=True, calc_per="all") + nx_sec = ix_sec.size + from_i = np.concatenate( + ( + np.arange(1 + 2 * nt), + 1 + 2 * nt + ix_sec, + np.arange(1 + 2 * nt + no, 1 + 2 * nt + no + nt * 2 * nta), ) + ) + iox_sec1, iox_sec2 = np.meshgrid(from_i, from_i, indexing="ij") + po_val = p_val[from_i] + po_cov = p_cov[iox_sec1, iox_sec2] - elif ci_avg_x_isel is not None: - time_dim2 = "time" - x_dim2 = "x_avg" - self.coords[x_dim2] = ((x_dim2,), self.x.isel(x=ci_avg_x_isel).data) - self[label + "_avgsec"] = ( - (x_dim2, time_dim2), - self[label].isel(x=ci_avg_x_isel).data, + po_mc = sst.multivariate_normal.rvs( + mean=po_val, cov=po_cov, size=mc_sample_size + ) + + gamma = po_mc[:, 0] + d_fw = po_mc[:, 1 : nt + 1] + d_bw = po_mc[:, 1 + nt : 2 * nt + 1] + + self["gamma_mc"] = (("mc",), gamma) + self["df_mc"] = (("mc", "time"), d_fw) + self["db_mc"] = (("mc", "time"), d_bw) + + # calculate alpha seperately + alpha = np.zeros((mc_sample_size, no), dtype=float) + alpha[:, ix_sec] = po_mc[:, 1 + 2 * nt : 1 + 2 * nt + nx_sec] + + not_ix_sec = np.array([i for i in range(no) if i not in ix_sec]) + + if np.any(not_ix_sec): + not_alpha_val = p_val[2 * nt + 1 + not_ix_sec] + not_alpha_var = p_cov[2 * nt + 1 + not_ix_sec, 2 * nt + 1 + not_ix_sec] + + not_alpha_mc = np.random.normal( + loc=not_alpha_val, + scale=not_alpha_var**0.5, + size=(mc_sample_size, not_alpha_val.size), ) - self[label + "_mc_set"] = ( - ("mc", x_dim2, time_dim2), - self[label + "_mc_set"].isel(x=ci_avg_x_isel).data, + + alpha[:, not_ix_sec] = not_alpha_mc + + self["alpha_mc"] = (("mc", "x"), alpha) + + if nta: + ta = po_mc[:, 2 * nt + 1 + nx_sec :].reshape( + (mc_sample_size, nt, 2, nta), order="F" ) - else: - self[label + "_avgsec"] = self[label] - x_dim2 = "x" - time_dim2 = "time" + ta_fw = ta[:, :, 0, :] + ta_bw = ta[:, :, 1, :] - memchunk = self[label + "_mc_set"].chunks + ta_fw_arr = da.zeros( + (mc_sample_size, no, nt), chunks=memchunk, dtype=float + ) + for tai, taxi in zip( + ta_fw.swapaxes(0, 2), self.coords["trans_att"].values + ): + # iterate over the splices + i_splice = sum(self.x.values < taxi) + mask = create_da_ta2(no, i_splice, direction="fw", chunks=memchunk) - # subtract the mean temperature - q = self[label + "_mc_set"] - self[label + "_avgsec"] - self[label + "_mc" + "_avgsec_var"] = q.var(dim="mc", ddof=1) + ta_fw_arr += mask * tai.T[:, None, :] - if ci_avg_x_flag1: - # unweighted mean - self[label + "_avgx1"] = self[label + "_avgsec"].mean(dim=x_dim2) + ta_bw_arr = da.zeros( + (mc_sample_size, no, nt), chunks=memchunk, dtype=float + ) + for tai, taxi in zip( + ta_bw.swapaxes(0, 2), self.coords["trans_att"].values + ): + i_splice = sum(self.x.values < taxi) + mask = create_da_ta2(no, i_splice, direction="bw", chunks=memchunk) - q = self[label + "_mc_set"] - self[label + "_avgsec"] - qvar = q.var(dim=["mc", x_dim2], ddof=1) - self[label + "_mc_avgx1_var"] = qvar + ta_bw_arr += mask * tai.T[:, None, :] - if conf_ints: - new_chunks = (len(conf_ints), self[label + "_mc_set"].chunks[2]) - avg_axis = self[label + "_mc_set"].get_axis_num(["mc", x_dim2]) - q = self[label + "_mc_set"].data.map_blocks( - lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), - chunks=new_chunks, # - drop_axis=avg_axis, - # avg dimensions are dropped from input arr - new_axis=0, - ) # The new CI dim is added as firsaxis + self["talpha_fw_mc"] = (("mc", "x", "time"), 
ta_fw_arr) + self["talpha_bw_mc"] = (("mc", "x", "time"), ta_bw_arr) - self[label + "_mc_avgx1"] = (("CI", time_dim2), q) + # Draw from the normal distributions for the Stokes intensities + for k, st_labeli, st_vari in zip( + ["r_st", "r_ast", "r_rst", "r_rast"], + ["st", "ast", "rst", "rast"], + [st_var, ast_var, rst_var, rast_var], + ): + # Load the mean as chunked Dask array, otherwise eats memory + if type(self[st_labeli].data) == da.core.Array: + loc = da.asarray(self[st_labeli].data, chunks=memchunk[1:]) + else: + loc = da.from_array(self[st_labeli].data, chunks=memchunk[1:]) - if ci_avg_x_flag2: - q = self[label + "_mc_set"] - self[label + "_avgsec"] + # Make sure variance is of size (no, nt) + if np.size(st_vari) > 1: + if st_vari.shape == self[st_labeli].shape: + pass + else: + st_vari = np.broadcast_to(st_vari, (no, nt)) + else: + pass - qvar = q.var(dim=["mc"], ddof=1) + # Load variance as chunked Dask array, otherwise eats memory + if type(st_vari) == da.core.Array: + st_vari_da = da.asarray(st_vari, chunks=memchunk[1:]) - # Inverse-variance weighting - avg_x_var = 1 / (1 / qvar).sum(dim=x_dim2) + elif callable(st_vari) and type(self[st_labeli].data) == da.core.Array: + st_vari_da = da.asarray( + st_vari(self[st_labeli]).data, chunks=memchunk[1:] + ) - self[label + "_mc_avgx2_var"] = avg_x_var + elif callable(st_vari) and type(self[st_labeli].data) != da.core.Array: + st_vari_da = da.from_array( + st_vari(self[st_labeli]).data, chunks=memchunk[1:] + ) - self[label + "_mc_avgx2_set"] = (self[label + "_mc_set"] / qvar).sum( - dim=x_dim2 - ) * avg_x_var - self[label + "_avgx2"] = self[label + "_mc_avgx2_set"].mean(dim="mc") + else: + st_vari_da = da.from_array(st_vari, chunks=memchunk[1:]) - if conf_ints: - new_chunks = (len(conf_ints), self[label + "_mc_set"].chunks[2]) - avg_axis_avgx = self[label + "_mc_set"].get_axis_num("mc") + self[k] = ( + ("mc", "x", "time"), + state.normal( + loc=loc, # has chunks=memchunk[1:] + scale=st_vari_da**0.5, + size=rsize, + chunks=memchunk, + ), + ) - qq = self[label + "_mc_avgx2_set"].data.map_blocks( - lambda x: np.percentile(x, q=conf_ints, axis=avg_axis_avgx), - chunks=new_chunks, # - drop_axis=avg_axis_avgx, - # avg dimensions are dropped from input arr - new_axis=0, - dtype=float, - ) # The new CI dimension is added as - # firsaxis - self[label + "_mc_avgx2"] = (("CI", time_dim2), qq) + for label in ["tmpf", "tmpb"]: + if "tmpw" or label: + if label == "tmpf": + if nta: + self["tmpf_mc_set"] = ( + self["gamma_mc"] + / ( + np.log(self["r_st"] / self["r_ast"]) + + self["df_mc"] + + self["alpha_mc"] + + self["talpha_fw_mc"] + ) + - 273.15 + ) + else: + self["tmpf_mc_set"] = ( + self["gamma_mc"] + / ( + np.log(self["r_st"] / self["r_ast"]) + + self["df_mc"] + + self["alpha_mc"] + ) + - 273.15 + ) + else: + if nta: + self["tmpb_mc_set"] = ( + self["gamma_mc"] + / ( + np.log(self["r_rst"] / self["r_rast"]) + + self["db_mc"] + - self["alpha_mc"] + + self["talpha_bw_mc"] + ) + - 273.15 + ) + else: + self["tmpb_mc_set"] = ( + self["gamma_mc"] + / ( + np.log(self["r_rst"] / self["r_rast"]) + + self["db_mc"] + - self["alpha_mc"] + ) + - 273.15 + ) - if ci_avg_time_flag1 is not None: - # unweighted mean - self[label + "_avg1"] = self[label + "_avgsec"].mean(dim=time_dim2) + if var_only_sections: + # sets the values outside the reference sections to NaN + xi = self.ufunc_per_section(x_indices=True, calc_per="all") + x_mask_ = [True if ix in xi else False for ix in range(self.x.size)] + x_mask = np.reshape(x_mask_, (1, -1, 1)) + self[label + "_mc_set"] 
= self[label + "_mc_set"].where(x_mask) - q = self[label + "_mc_set"] - self[label + "_avgsec"] - qvar = q.var(dim=["mc", time_dim2], ddof=1) - self[label + "_mc_avg1_var"] = qvar + # subtract the mean temperature + q = self[label + "_mc_set"] - self[label] + self[label + "_mc_var"] = q.var(dim="mc", ddof=1) if conf_ints: - new_chunks = (len(conf_ints), self[label + "_mc_set"].chunks[1]) - avg_axis = self[label + "_mc_set"].get_axis_num(["mc", time_dim2]) + new_chunks = list(self[label + "_mc_set"].chunks) + new_chunks[0] = (len(conf_ints),) + avg_axis = self[label + "_mc_set"].get_axis_num("mc") q = self[label + "_mc_set"].data.map_blocks( lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), chunks=new_chunks, # drop_axis=avg_axis, # avg dimensions are dropped from input arr new_axis=0, - ) # The new CI dim is added as firsaxis - - self[label + "_mc_avg1"] = (("CI", x_dim2), q) - - if ci_avg_time_flag2: - q = self[label + "_mc_set"] - self[label + "_avgsec"] - - qvar = q.var(dim=["mc"], ddof=1) - - # Inverse-variance weighting - avg_time_var = 1 / (1 / qvar).sum(dim=time_dim2) - - self[label + "_mc_avg2_var"] = avg_time_var - - self[label + "_mc_avg2_set"] = (self[label + "_mc_set"] / qvar).sum( - dim=time_dim2 - ) * avg_time_var - self[label + "_avg2"] = self[label + "_mc_avg2_set"].mean(dim="mc") - - if conf_ints: - new_chunks = (len(conf_ints), self[label + "_mc_set"].chunks[1]) - avg_axis_avg2 = self[label + "_mc_set"].get_axis_num("mc") + ) # The new CI dimension is added as firsaxis - qq = self[label + "_mc_avg2_set"].data.map_blocks( - lambda x: np.percentile(x, q=conf_ints, axis=avg_axis_avg2), - chunks=new_chunks, # - drop_axis=avg_axis_avg2, - # avg dimensions are dropped from input arr - new_axis=0, - dtype=float, - ) # The new CI dimension is added as - # firsaxis - self[label + "_mc_avg2"] = (("CI", x_dim2), qq) + self[label + "_mc"] = (("CI", "x", "time"), q) # Weighted mean of the forward and backward - tmpw_var = 1 / ( - 1 / self["tmpf_mc" + "_avgsec_var"] + 1 / self["tmpb_mc" + "_avgsec_var"] - ) + tmpw_var = 1 / (1 / self["tmpf_mc_var"] + 1 / self["tmpb_mc_var"]) q = ( - self["tmpf_mc_set"] / self["tmpf_mc" + "_avgsec_var"] - + self["tmpb_mc_set"] / self["tmpb_mc" + "_avgsec_var"] + self["tmpf_mc_set"] / self["tmpf_mc_var"] + + self["tmpb_mc_set"] / self["tmpb_mc_var"] ) * tmpw_var self["tmpw" + "_mc_set"] = q # - # self["tmpw"] = self["tmpw" + '_mc_set'].mean(dim='mc') - self["tmpw" + "_avgsec"] = ( - self["tmpf_avgsec"] / self["tmpf_mc" + "_avgsec_var"] - + self["tmpb_avgsec"] / self["tmpb_mc" + "_avgsec_var"] + self["tmpw"] = ( + self["tmpf"] / self["tmpf_mc_var"] + self["tmpb"] / self["tmpb_mc_var"] ) * tmpw_var - q = self["tmpw" + "_mc_set"] - self["tmpw" + "_avgsec"] - self["tmpw" + "_mc" + "_avgsec_var"] = q.var(dim="mc", ddof=1) - - if ci_avg_time_flag1: - self["tmpw" + "_avg1"] = self["tmpw" + "_avgsec"].mean(dim=time_dim2) - - self["tmpw" + "_mc_avg1_var"] = self["tmpw" + "_mc_set"].var( - dim=["mc", time_dim2] - ) - - if conf_ints: - new_chunks_weighted = ((len(conf_ints),),) + (memchunk[1],) - avg_axis = self["tmpw" + "_mc_set"].get_axis_num(["mc", time_dim2]) - q2 = self["tmpw" + "_mc_set"].data.map_blocks( - lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), - chunks=new_chunks_weighted, - # Explicitly define output chunks - drop_axis=avg_axis, # avg dimensions are dropped - new_axis=0, - dtype=float, - ) # The new CI dimension is added as - # first axis - self["tmpw" + "_mc_avg1"] = (("CI", x_dim2), q2) - - if ci_avg_time_flag2: - tmpw_var_avg2 = 1 
/ ( - 1 / self["tmpf_mc_avg2_var"] + 1 / self["tmpb_mc_avg2_var"] - ) - - q = ( - self["tmpf_mc_avg2_set"] / self["tmpf_mc_avg2_var"] - + self["tmpb_mc_avg2_set"] / self["tmpb_mc_avg2_var"] - ) * tmpw_var_avg2 - - self["tmpw" + "_mc_avg2_set"] = q # - - self["tmpw" + "_avg2"] = ( - self["tmpf_avg2"] / self["tmpf_mc_avg2_var"] - + self["tmpb_avg2"] / self["tmpb_mc_avg2_var"] - ) * tmpw_var_avg2 - - self["tmpw" + "_mc_avg2_var"] = tmpw_var_avg2 - - if conf_ints: - # We first need to know the x-dim-chunk-size - new_chunks_weighted = ((len(conf_ints),),) + (memchunk[1],) - avg_axis_avg2 = self["tmpw" + "_mc_avg2_set"].get_axis_num("mc") - q2 = self["tmpw" + "_mc_avg2_set"].data.map_blocks( - lambda x: np.percentile(x, q=conf_ints, axis=avg_axis_avg2), - chunks=new_chunks_weighted, - # Explicitly define output chunks - drop_axis=avg_axis_avg2, # avg dimensions are dropped - new_axis=0, - dtype=float, - ) # The new CI dimension is added as firstax - self["tmpw" + "_mc_avg2"] = (("CI", x_dim2), q2) - - if ci_avg_x_flag1: - self["tmpw" + "_avgx1"] = self["tmpw" + "_avgsec"].mean(dim=x_dim2) - - self["tmpw" + "_mc_avgx1_var"] = self["tmpw" + "_mc_set"].var(dim=x_dim2) - - if conf_ints: - new_chunks_weighted = ((len(conf_ints),),) + (memchunk[2],) - avg_axis = self["tmpw" + "_mc_set"].get_axis_num(["mc", x_dim2]) - q2 = self["tmpw" + "_mc_set"].data.map_blocks( - lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), - chunks=new_chunks_weighted, - # Explicitly define output chunks - drop_axis=avg_axis, # avg dimensions are dropped - new_axis=0, - dtype=float, - ) # The new CI dimension is added as - # first axis - self["tmpw" + "_mc_avgx1"] = (("CI", time_dim2), q2) - - if ci_avg_x_flag2: - tmpw_var_avgx2 = 1 / ( - 1 / self["tmpf_mc_avgx2_var"] + 1 / self["tmpb_mc_avgx2_var"] - ) - - q = ( - self["tmpf_mc_avgx2_set"] / self["tmpf_mc_avgx2_var"] - + self["tmpb_mc_avgx2_set"] / self["tmpb_mc_avgx2_var"] - ) * tmpw_var_avgx2 - - self["tmpw" + "_mc_avgx2_set"] = q # - - self["tmpw" + "_avgx2"] = ( - self["tmpf_avgx2"] / self["tmpf_mc_avgx2_var"] - + self["tmpb_avgx2"] / self["tmpb_mc_avgx2_var"] - ) * tmpw_var_avgx2 - - self["tmpw" + "_mc_avgx2_var"] = tmpw_var_avgx2 + q = self["tmpw" + "_mc_set"] - self["tmpw"] + self["tmpw" + "_mc_var"] = q.var(dim="mc", ddof=1) - if conf_ints: - # We first need to know the x-dim-chunk-size - new_chunks_weighted = ((len(conf_ints),),) + (memchunk[2],) - avg_axis_avgx2 = self["tmpw" + "_mc_avgx2_set"].get_axis_num("mc") - q2 = self["tmpw" + "_mc_avgx2_set"].data.map_blocks( - lambda x: np.percentile(x, q=conf_ints, axis=avg_axis_avgx2), - chunks=new_chunks_weighted, - # Explicitly define output chunks - drop_axis=avg_axis_avgx2, # avg dimensions are dropped - new_axis=0, - dtype=float, - ) # The new CI dimension is added as firstax - self["tmpw" + "_mc_avgx2"] = (("CI", time_dim2), q2) + # Calculate the CI of the weighted MC_set + if conf_ints: + new_chunks_weighted = ((len(conf_ints),),) + memchunk[1:] + avg_axis = self["tmpw" + "_mc_set"].get_axis_num("mc") + q2 = self["tmpw" + "_mc_set"].data.map_blocks( + lambda x: np.percentile(x, q=conf_ints, axis=avg_axis), + chunks=new_chunks_weighted, # Explicitly define output chunks + drop_axis=avg_axis, # avg dimensions are dropped + new_axis=0, + dtype=float, + ) # The new CI dimension is added as first axis + self["tmpw" + "_mc"] = (("CI", "x", "time"), q2) # Clean up the garbage. All arrays with a Monte Carlo dimension. 
if mc_remove_set_flag: @@ -5099,26 +4247,65 @@ def create_da_ta2(no, i_splice, direction="fw", chunks=None): "alpha_mc", "df_mc", "db_mc", - "x_avg", - "time_avg", - "mc", ] - for i in ["tmpf", "tmpb", "tmpw"]: - remove_mc_set.append(i + "_avgsec") - remove_mc_set.append(i + "_mc_set") - remove_mc_set.append(i + "_mc_avg2_set") - remove_mc_set.append(i + "_mc_avgx2_set") - remove_mc_set.append(i + "_mc_avgsec_var") + for i in ["tmpf", "tmpb", "tmpw"]: + remove_mc_set.append(i + "_mc_set") + + if nta: + remove_mc_set.append('talpha"_fw_mc') + remove_mc_set.append('talpha"_bw_mc') + + for k in remove_mc_set: + if k in self: + del self[k] + pass + + def in_confidence_interval(self, ci_label, conf_ints=None, sections=None): + """ + Returns an array with bools wether the temperature of the reference + sections are within the confidence intervals + + Parameters + ---------- + sections : Dict[str, List[slice]] + ci_label : str + The label of the data containing the confidence intervals. + conf_ints : Tuple + A tuple containing two floats between 0 and 1, representing the + levels between which the reference temperature should lay. + + Returns + ------- + + """ + if sections is None: + sections = self.sections + else: + sections = self.validate_sections(sections) + + if conf_ints is None: + conf_ints = self[ci_label].values + + assert len(conf_ints) == 2, "Please define conf_ints" - if self.trans_att.size: - remove_mc_set.append('talpha"_fw_mc') - remove_mc_set.append('talpha"_bw_mc') + tmp_dn = self[ci_label].sel(CI=conf_ints[0], method="nearest") + tmp_up = self[ci_label].sel(CI=conf_ints[1], method="nearest") - for k in remove_mc_set: - if k in self: - del self[k] - pass + ref = self.ufunc_per_section( + sections=sections, label="st", ref_temp_broadcasted=True, calc_per="all" + ) + ix_resid = self.ufunc_per_section( + sections=sections, x_indices=True, calc_per="all" + ) + ref_sorted = np.full(shape=tmp_dn.shape, fill_value=np.nan) + ref_sorted[ix_resid, :] = ref + ref_da = xr.DataArray(data=ref_sorted, coords=tmp_dn.coords) + + mask_dn = ref_da >= tmp_dn + mask_up = ref_da <= tmp_up + + return np.logical_and(mask_dn, mask_up) def temperature_residuals(self, label=None, sections=None): """ @@ -5145,6 +4332,11 @@ def temperature_residuals(self, label=None, sections=None): resid_da : xarray.DataArray The residuals as DataArray """ + if sections is None: + sections = self.sections + else: + sections = self.validate_sections(sections) + resid_temp = self.ufunc_per_section( sections=sections, label=label, temp_err=True, calc_per="all" ) @@ -5287,1013 +4479,34 @@ def ufunc_per_section( if sections is None: sections = self.sections - if not func: - - def func(a): - """ - - Parameters - ---------- - a - - Returns - ------- - - """ - return a - - elif isinstance(func, str) and func == "var": - - def func(a): - """ - - Parameters - ---------- - a - - Returns - ------- - - """ - return np.var(a, ddof=1) - + if label is None: + dataarray = None else: - assert callable(func) + dataarray = self[label] - assert calc_per in ["all", "section", "stretch"] - - if not x_indices and ( - (label and hasattr(self[label].data, "chunks")) - or ( - subtract_from_label - and hasattr(self[subtract_from_label].data, "chunks") - ) - ): - concat = da.concatenate + if x_indices: + x_coords = self.x + reference_dataset = None else: - concat = np.concatenate - - out = dict() - - for k, section in sections.items(): - out[k] = [] - for stretch in section: - if x_indices: - assert not subtract_from_label - assert not temp_err - 
assert not ref_temp_broadcasted - # so it is slicable with x-indices - self["_x_indices"] = self.x.astype(int) * 0 + np.arange(self.x.size) - arg1 = self["_x_indices"].sel(x=stretch).data - del self["_x_indices"] - - else: - arg1 = self[label].sel(x=stretch).data - - if subtract_from_label: - # calculate std wrt other series - # check_dims(self, [subtract_from_label], - # correct_dims=('x', 'time')) - - assert not temp_err - - arg2 = self[subtract_from_label].sel(x=stretch).data - out[k].append(arg1 - arg2) - - elif temp_err: - # calculate std wrt reference temperature of the - # corresponding bath - arg2 = self[k].data - out[k].append(arg1 - arg2) - - elif ref_temp_broadcasted: - assert not temp_err - assert not subtract_from_label - - arg2 = da.broadcast_to(self[k].data, arg1.shape) - out[k].append(arg2) - - else: - # calculate std wrt mean value - out[k].append(arg1) - - if calc_per == "stretch": - out[k] = [func(argi, **func_kwargs) for argi in out[k]] - - elif calc_per == "section": - # flatten the out_dict to sort them - start = [i.start for i in section] - i_sorted = np.argsort(start) - out_flat_sort = [out[k][i] for i in i_sorted] - out[k] = func(concat(out_flat_sort), **func_kwargs) - - if calc_per == "all": - # flatten the out_dict to sort them - start = [item.start for sublist in sections.values() for item in sublist] - i_sorted = np.argsort(start) - out_flat = [item for sublist in out.values() for item in sublist] - out_flat_sort = [out_flat[i] for i in i_sorted] - out = func(concat(out_flat_sort, axis=0), **func_kwargs) - - if ( - hasattr(out, "chunks") - and len(out.chunks) > 0 - and "x" in self[label].dims - ): - # also sum the chunksize in the x dimension - # first find out where the x dim is - ixdim = self[label].dims.index("x") - c_old = out.chunks - c_new = list(c_old) - c_new[ixdim] = sum(c_old[ixdim]) - out = out.rechunk(c_new) - + x_coords = None + reference_dataset = {k: self[k] for k in sections} + + out = ufunc_per_section_helper( + x_coords=x_coords, + sections=sections, + func=func, + dataarray=dataarray, + subtract_from_dataarray=subtract_from_label, + reference_dataset=reference_dataset, + subtract_reference_from_dataarray=temp_err, + ref_temp_broadcasted=ref_temp_broadcasted, + calc_per=calc_per, + **func_kwargs, + ) return out def resample_datastore(*args, **kwargs): - raise "ds.resample_datastore() is deprecated. Use " "from dtscalibration import DataStore; DataStore(ds.resample()) " "instead. See example notebook 2." 
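
For orientation, the inverse-variance weighting that conf_int_double_ended and average_double_ended apply to the forward and backward Monte Carlo temperature sets can be reproduced with plain numpy. The sketch below is illustrative only: the shapes and variances are made up, and the names merely mirror the tmpf/tmpb/tmpw variables used in the methods above.

    import numpy as np

    rng = np.random.default_rng(42)
    mc, nx, nt = 1000, 50, 20  # Monte Carlo samples, locations, time steps

    # Stand-ins for the tmpf_mc_set and tmpb_mc_set realizations (degrees C).
    tmpf_mc = 20.0 + rng.normal(scale=0.30, size=(mc, nx, nt))
    tmpb_mc = 20.0 + rng.normal(scale=0.20, size=(mc, nx, nt))

    # Per location and time step, the variance over the Monte Carlo dimension.
    tmpf_var = tmpf_mc.var(axis=0, ddof=1)
    tmpb_var = tmpb_mc.var(axis=0, ddof=1)

    # Inverse-variance weighted combination, matching the docstring equations.
    tmpw_var = 1.0 / (1.0 / tmpf_var + 1.0 / tmpb_var)
    tmpw_mc = (tmpf_mc / tmpf_var + tmpb_mc / tmpb_var) * tmpw_var

    # 95% interval from the 2.5% and 97.5% percentiles over the mc axis.
    ci_low, ci_high = np.percentile(tmpw_mc, q=[2.5, 97.5], axis=0)
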
- - -class ParameterIndexDoubleEnded: - """ - npar = 1 + 2 * nt + nx + 2 * nt * nta - assert pval.size == npar - assert p_var.size == npar - if calc_cov: - assert p_cov.shape == (npar, npar) - gamma = pval[0] - d_fw = pval[1:nt + 1] - d_bw = pval[1 + nt:1 + 2 * nt] - alpha = pval[1 + 2 * nt:1 + 2 * nt + nx] - # store calibration parameters in DataStore - self["gamma"] = (tuple(), gamma) - self["alpha"] = (('x',), alpha) - self["df"] = (('time',), d_fw) - self["db"] = (('time',), d_bw) - if nta > 0: - ta = pval[1 + 2 * nt + nx:].reshape((nt, 2, nta), order='F') - self['talpha_fw'] = (('time', 'trans_att'), ta[:, 0, :]) - self['talpha_bw'] = (('time', 'trans_att'), ta[:, 1, :]) - """ - - def __init__(self, nt, nx, nta, fix_gamma=False, fix_alpha=False): - self.nt = nt - self.nx = nx - self.nta = nta - self.fix_gamma = fix_gamma - self.fix_alpha = fix_alpha - - @property - def all(self): - return np.concatenate( - (self.gamma, self.df, self.db, self.alpha, self.ta.flatten(order="F")) - ) - - @property - def npar(self): - if not self.fix_gamma and not self.fix_alpha: - return 1 + 2 * self.nt + self.nx + 2 * self.nt * self.nta - elif self.fix_gamma and not self.fix_alpha: - return 2 * self.nt + self.nx + 2 * self.nt * self.nta - elif not self.fix_gamma and self.fix_alpha: - return 1 + 2 * self.nt + 2 * self.nt * self.nta - elif self.fix_gamma and self.fix_alpha: - return 2 * self.nt + 2 * self.nt * self.nta - - @property - def gamma(self): - if self.fix_gamma: - return [] - else: - return [0] - - @property - def df(self): - if self.fix_gamma: - return list(range(self.nt)) - else: - return list(range(1, self.nt + 1)) - - @property - def db(self): - if self.fix_gamma: - return list(range(self.nt, 2 * self.nt)) - else: - return list(range(1 + self.nt, 1 + 2 * self.nt)) - - @property - def alpha(self): - if self.fix_alpha: - return [] - elif self.fix_gamma: - return list(range(2 * self.nt, 1 + 2 * self.nt + self.nx)) - elif not self.fix_gamma: - return list(range(1 + 2 * self.nt, 1 + 2 * self.nt + self.nx)) - - @property - def ta(self): - if self.nta == 0: - return np.zeros((self.nt, 2, 0)) - elif not self.fix_gamma and not self.fix_alpha: - arr = np.arange(1 + 2 * self.nt + self.nx, self.npar) - elif self.fix_gamma and not self.fix_alpha: - arr = np.arange(2 * self.nt + self.nx, self.npar) - elif not self.fix_gamma and self.fix_alpha: - arr = np.arange(1 + 2 * self.nt, self.npar) - elif self.fix_gamma and self.fix_alpha: - arr = np.arange(2 * self.nt, self.npar) - - return arr.reshape((self.nt, 2, self.nta), order="F") - - @property - def taf(self): - """ - Use `.reshape((nt, nta))` to convert array to (time-dim and transatt-dim). Order is the default C order. - ta = pval[1 + 2 * nt + nx:].reshape((nt, 2, nta), order='F') - self['talpha_fw'] = (('time', 'trans_att'), ta[:, 0, :]) - """ - return self.ta[:, 0, :].flatten(order="C") - - @property - def tab(self): - """ - Use `.reshape((nt, nta))` to convert array to (time-dim and transatt-dim). Order is the default C order. 
- ta = pval[1 + 2 * nt + nx:].reshape((nt, 2, nta), order='F') - self['talpha_bw'] = (('time', 'trans_att'), ta[:, 1, :]) - """ - return self.ta[:, 1, :].flatten(order="C") - - def get_ta_pars(self, pval): - if self.nta > 0: - if pval.ndim == 1: - return np.take_along_axis(pval[None, None], self.ta, axis=-1) - - else: - # assume shape is (a, npar) and returns shape (nt, 2, nta, a) - assert pval.shape[1] == self.npar and pval.ndim == 2 - return np.stack([self.get_ta_pars(v) for v in pval], axis=-1) - - else: - return np.zeros(shape=(self.nt, 2, 0)) - - def get_taf_pars(self, pval): - """returns taf parameters of shape (nt, nta) or (nt, nta, a)""" - return self.get_ta_pars(pval=pval)[:, 0, :] - - def get_tab_pars(self, pval): - """returns taf parameters of shape (nt, nta) or (nt, nta, a)""" - return self.get_ta_pars(pval=pval)[:, 1, :] - - def get_taf_values(self, pval, x, trans_att, axis=""): - """returns taf parameters of shape (nx, nt)""" - pval = np.atleast_2d(pval) - - assert pval.ndim == 2 and pval.shape[1] == self.npar - - arr_out = np.zeros((self.nx, self.nt)) - - if self.nta == 0: - pass - - elif axis == "": - assert pval.shape[0] == 1 - - arr = np.transpose(self.get_taf_pars(pval), axes=(1, 2, 0)) # (nta, 1, nt) - - for tai, taxi in zip(arr, trans_att): - arr_out[x >= taxi] += tai - - elif axis == "x": - assert pval.shape[0] == self.nx - - arr = np.transpose(self.get_taf_pars(pval), axes=(1, 2, 0)) # (nta, nx, nt) - - for tai, taxi in zip(arr, trans_att): - # loop over nta, tai has shape (x, t) - arr_out[x >= taxi] += tai[x >= taxi] - - elif axis == "time": - assert pval.shape[0] == self.nt - - # arr (nt, nta, nt) to have shape (nta, nt, nt) - arr = np.transpose(self.get_taf_pars(pval), axes=(1, 2, 0)) - - for tai, taxi in zip(arr, trans_att): - # loop over nta, tai has shape (t, t) - arr_out[x >= taxi] += np.diag(tai)[None] - - return arr_out - - def get_tab_values(self, pval, x, trans_att, axis=""): - """returns tab parameters of shape (nx, nt)""" - assert pval.shape[-1] == self.npar - - arr_out = np.zeros((self.nx, self.nt)) - - if self.nta == 0: - pass - - elif axis == "": - pval = np.squeeze(pval) - assert pval.ndim == 1 - arr = np.transpose(self.get_tab_pars(pval), axes=(1, 0)) - - for tai, taxi in zip(arr, trans_att): - arr_out[x < taxi] += tai - - elif axis == "x": - assert pval.ndim == 2 and pval.shape[0] == self.nx - - arr = np.transpose(self.get_tab_pars(pval), axes=(1, 2, 0)) - - for tai, taxi in zip(arr, trans_att): - # loop over nta, tai has shape (x, t) - arr_out[x < taxi] += tai[x < taxi] - - elif axis == "time": - assert pval.ndim == 2 and pval.shape[0] == self.nt - - # arr (nt, nta, nt) to have shape (nta, nt, nt) - arr = np.transpose(self.get_tab_pars(pval), axes=(1, 2, 0)) - - for tai, taxi in zip(arr, trans_att): - # loop over nta, tai has shape (t, t) - arr_out[x < taxi] += np.diag(tai) - - return arr_out - - -class ParameterIndexSingleEnded: - """ - if parameter fixed, they are not in - npar = 1 + 1 + nt + nta * nt - """ - - def __init__(self, nt, nx, nta, includes_alpha=False, includes_dalpha=True): - assert not ( - includes_alpha and includes_dalpha - ), "Cannot hold both dalpha and alpha" - self.nt = nt - self.nx = nx - self.nta = nta - self.includes_alpha = includes_alpha - self.includes_dalpha = includes_dalpha - - @property - def all(self): - return np.concatenate( - (self.gamma, self.dalpha, self.alpha, self.c, self.ta.flatten(order="F")) - ) - - @property - def npar(self): - if self.includes_alpha: - return 1 + self.nx + self.nt + self.nta * self.nt - 
elif self.includes_dalpha: - return 1 + 1 + self.nt + self.nta * self.nt - else: - return 1 + self.nt + self.nta * self.nt - - @property - def gamma(self): - return [0] - - @property - def dalpha(self): - if self.includes_dalpha: - return [1] - else: - return [] - - @property - def alpha(self): - if self.includes_alpha: - return list(range(1, 1 + self.nx)) - else: - return [] - - @property - def c(self): - if self.includes_alpha: - return list(range(1 + self.nx, 1 + self.nx + self.nt)) - elif self.includes_dalpha: - return list(range(1 + 1, 1 + 1 + self.nt)) - else: - return list(range(1, 1 + self.nt)) - - @property - def taf(self): - """returns taf parameters of shape (nt, nta) or (nt, nta, a)""" - # ta = p_val[-nt * nta:].reshape((nt, nta), order='F') - # self["talpha"] = (('time', 'trans_att'), ta[:, :]) - if self.includes_alpha: - return np.arange( - 1 + self.nx + self.nt, 1 + self.nx + self.nt + self.nt * self.nta - ).reshape((self.nt, self.nta), order="F") - elif self.includes_dalpha: - return np.arange( - 1 + 1 + self.nt, 1 + 1 + self.nt + self.nt * self.nta - ).reshape((self.nt, self.nta), order="F") - else: - return np.arange(1 + self.nt, 1 + self.nt + self.nt * self.nta).reshape( - (self.nt, self.nta), order="F" - ) - - def get_taf_pars(self, pval): - if self.nta > 0: - if pval.ndim == 1: - # returns shape (nta, nt) - assert len(pval) == self.npar, "Length of pval is incorrect" - return np.stack([pval[tafi] for tafi in self.taf.T]) - - else: - # assume shape is (a, npar) and returns shape (nta, nt, a) - assert pval.shape[1] == self.npar and pval.ndim == 2 - return np.stack([self.get_taf_pars(v) for v in pval], axis=-1) - - else: - return np.zeros(shape=(self.nt, 0)) - - def get_taf_values(self, pval, x, trans_att, axis=""): - """returns taf parameters of shape (nx, nt)""" - # assert pval.ndim == 2 and pval.shape[1] == self.npar - - arr_out = np.zeros((self.nx, self.nt)) - - if self.nta == 0: - pass - - elif axis == "": - pval = pval.flatten() - assert pval.shape == (self.npar,) - arr = self.get_taf_pars(pval) - assert arr.shape == ( - self.nta, - self.nt, - ) - - for tai, taxi in zip(arr, trans_att): - arr_out[x >= taxi] += tai - - elif axis == "x": - assert pval.shape == (self.nx, self.npar) - arr = self.get_taf_pars(pval) - assert arr.shape == (self.nta, self.nx, self.nt) - - for tai, taxi in zip(arr, trans_att): - # loop over nta, tai has shape (x, t) - arr_out[x >= taxi] += tai[x >= taxi] - - elif axis == "time": - assert pval.shape == (self.nt, self.npar) - arr = self.get_taf_pars(pval) - assert arr.shape == (self.nta, self.nt, self.nt) - - for tai, taxi in zip(arr, trans_att): - # loop over nta, tai has shape (t, t) - arr_out[x >= taxi] += np.diag(tai)[None] - - return arr_out - - -def open_datastore( - filename_or_obj, - group=None, - decode_cf=True, - mask_and_scale=None, - decode_times=True, - concat_characters=True, - decode_coords=True, - engine=None, - chunks=None, - lock=None, - cache=None, - drop_variables=None, - backend_kwargs=None, - load_in_memory=False, - **kwargs, -): - """Load and decode a datastore from a file or file-like object. - Parameters - ---------- - filename_or_obj : str, Path, file or xarray.backends.*DataStore - Strings and Path objects are interpreted as a path to a netCDF file - or an OpenDAP URL and opened with python-netCDF4, unless the filename - ends with .gz, in which case the file is gunzipped and opened with - scipy.io.netcdf (only netCDF3 supported). File-like objects are opened - with scipy.io.netcdf (only netCDF3 supported). 
- group : str, optional - Path to the netCDF4 group in the given file to open (only works for - netCDF4 files). - decode_cf : bool, optional - Whether to decode these variables, assuming they were saved according - to CF conventions. - mask_and_scale : bool, optional - If True, replace array values equal to `_FillValue` with NA and scale - values according to the formula `original_values * scale_factor + - add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are - taken from variable attributes (if they exist). If the `_FillValue` or - `missing_value` attribute contains multiple values a warning will be - issued and all array values matching one of the multiple values will - be replaced by NA. mask_and_scale defaults to True except for the - pseudonetcdf backend. - decode_times : bool, optional - If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. - concat_characters : bool, optional - If True, concatenate along the last dimension of character arrays to - form string arrays. Dimensions will only be concatenated over (and - removed) if they have no corresponding variable and if they are only - used as the last dimension of character arrays. - decode_coords : bool, optional - If True, decode the 'coordinates' attribute to identify coordinates in - the resulting dataset. - engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', - 'pseudonetcdf'}, optional - Engine to use when reading files. If not provided, the default engine - is chosen based on available dependencies, with a preference for - 'netcdf4'. - chunks : int or dict, optional - If chunks is provided, it used to load the new dataset into dask - arrays. ``chunks={}`` loads the dataset with dask using a single - chunk for all arrays. - lock : False, True or threading.Lock, optional - If chunks is provided, this argument is passed on to - :py:func:`dask.array.from_array`. By default, a global lock is - used when reading data from netCDF files with the netcdf4 and h5netcdf - engines to avoid issues with concurrent access when using dask's - multithreaded backend. - cache : bool, optional - If True, cache data loaded from the underlying datastore in memory as - NumPy arrays when accessed to avoid reading from the underlying data- - store multiple times. Defaults to True unless you specify the `chunks` - argument to use dask, in which case it defaults to False. Does not - change the behavior of coordinates corresponding to dimensions, which - always load their data from disk into a ``pandas.Index``. - drop_variables: string or iterable, optional - A variable or list of variables to exclude from being parsed from the - dataset. This may be useful to drop variables with problems or - inconsistent values. - backend_kwargs: dictionary, optional - A dictionary of keyword arguments to pass on to the backend. This - may be useful when backend options would improve performance or - allow user control of dataset processing. - - Returns - ------- - dataset : Dataset - The newly created dataset. 
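For orientation, a minimal sketch of how this loader is typically called (the file name and chunk sizes are hypothetical, and `open_datastore` is assumed to be importable from the package root, as in the migration snippet further down):

```
from dtscalibration import open_datastore

# "processed.nc" is a hypothetical netCDF written earlier from a DataStore
ds = open_datastore("processed.nc", chunks={"time": 100})  # lazy, dask-backed variables
ds_mem = open_datastore("processed.nc", load_in_memory=True)  # read everything into memory at once
```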
- - See Also - -------- - xarray.open_dataset - xarray.load_dataset - """ - - xr_kws = inspect.signature(xr.open_dataset).parameters.keys() - - ds_kwargs = {k: v for k, v in kwargs.items() if k not in xr_kws} - - if chunks is None: - chunks = {} - - with xr.open_dataset( - filename_or_obj, - group=group, - decode_cf=decode_cf, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - concat_characters=concat_characters, - decode_coords=decode_coords, - engine=engine, - chunks=chunks, - lock=lock, - cache=cache, - drop_variables=drop_variables, - backend_kwargs=backend_kwargs, - ) as ds_xr: - ds = DataStore( - data_vars=ds_xr.data_vars, - coords=ds_xr.coords, - attrs=ds_xr.attrs, - **ds_kwargs, - ) - - # to support deprecated st_labels - ds = ds.rename_labels(assertion=False) - - if load_in_memory: - if "cache" in kwargs: - raise TypeError("cache has no effect in this context") - return ds.load() - - else: - return ds - - -def open_mf_datastore( - path=None, paths=None, combine="by_coords", load_in_memory=False, **kwargs -): - """ - Open a datastore from multiple netCDF files. This script assumes the - datastore was split along the time dimension. But only variables with a - time dimension should be concatenated in the time dimension. Other - options from xarray do not support this. - - Parameters - ---------- - combine : {'by_coords', 'nested'}, optional - Leave it at by_coords - path : str - A file path to the stored netcdf files with an asterisk in the - filename to list all. Ensure you have leading zeros in the file - numbering. - paths : list - Define you own list of file paths. - Returns - ------- - dataset : Dataset - The newly created dataset. - """ - from xarray.backends.api import open_mfdataset - - if paths is None: - paths = sorted(glob.glob(path)) - assert paths, "No files match found with: " + path - - with open_mfdataset(paths=paths, combine=combine, **kwargs) as xds: - ds = DataStore(data_vars=xds.data_vars, coords=xds.coords, attrs=xds.attrs) - - # to support deprecated st_labels - ds = ds.rename_labels(assertion=False) - - if load_in_memory: - if "cache" in kwargs: - raise TypeError("cache has no effect in this context") - return ds.load() - - else: - return ds - - -def read_silixa_files( - filepathlist=None, - directory=None, - zip_handle=None, - file_ext="*.xml", - timezone_netcdf="UTC", - silent=False, - load_in_memory="auto", - **kwargs, -): - """Read a folder with measurement files. Each measurement file contains - values for a - single timestep. Remember to check which timezone you are working in. - - The silixa files are already timezone aware - - Parameters - ---------- - filepathlist : list of str, optional - List of paths that point the the silixa files - directory : str, Path, optional - Path to folder - timezone_netcdf : str, optional - Timezone string of the netcdf file. UTC follows CF-conventions. - file_ext : str, optional - file extension of the measurement files - silent : bool - If set tot True, some verbose texts are not printed to stdout/screen - load_in_memory : {'auto', True, False} - If 'auto' the Stokes data is only loaded to memory for small files - kwargs : dict-like, optional - keyword-arguments are passed to DataStore initialization - - Returns - ------- - datastore : DataStore - The newly created datastore. 
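A short usage sketch for this reader (the folder name is hypothetical; the keyword values shown are the documented defaults):

```
from dtscalibration import read_silixa_files

# "raw_silixa/" is a hypothetical folder with one Silixa XML file per time step
ds = read_silixa_files(
    directory="raw_silixa/",
    file_ext="*.xml",
    timezone_netcdf="UTC",
    load_in_memory="auto",  # small data sets are read straight into memory
)
print(ds["st"].dims)  # Stokes intensity, dimensions ('x', 'time')
```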
- """ - - assert "timezone_input_files" not in kwargs, ( - "The silixa files are " "already timezone aware" - ) - - if filepathlist is None and zip_handle is None: - filepathlist = sorted(glob.glob(os.path.join(directory, file_ext))) - - # Make sure that the list of files contains any files - assert ( - len(filepathlist) >= 1 - ), "No measurement files found in provided " "directory: \n" + str(directory) - - elif filepathlist is None and zip_handle: - filepathlist = ziphandle_to_filepathlist(fh=zip_handle, extension=file_ext) - - # Make sure that the list of files contains any files - assert len(filepathlist) >= 1, ( - "No measurement files found in provided " "list/directory" - ) - - xml_version = silixa_xml_version_check(filepathlist) - - if xml_version == 4: - data_vars, coords, attrs = read_silixa_files_routine_v4( - filepathlist, - timezone_netcdf=timezone_netcdf, - silent=silent, - load_in_memory=load_in_memory, - ) - - elif xml_version in (6, 7, 8): - data_vars, coords, attrs = read_silixa_files_routine_v6( - filepathlist, - xml_version=xml_version, - timezone_netcdf=timezone_netcdf, - silent=silent, - load_in_memory=load_in_memory, - ) - - else: - raise NotImplementedError( - "Silixa xml version " + "{0} not implemented".format(xml_version) - ) - - ds = DataStore(data_vars=data_vars, coords=coords, attrs=attrs, **kwargs) - return ds - - -def read_sensortran_files(directory, timezone_netcdf="UTC", silent=False, **kwargs): - """Read a folder with measurement files. Each measurement file contains - values for a - single timestep. Remember to check which timezone you are working in. - - The sensortran files are already timezone aware - - Parameters - ---------- - directory : str, Path - Path to folder containing BinaryRawDTS and BinaryTemp files - timezone_netcdf : str, optional - Timezone string of the netcdf file. UTC follows CF-conventions. - silent : bool - If set tot True, some verbose texts are not printed to stdout/screen - kwargs : dict-like, optional - keyword-arguments are passed to DataStore initialization - - Returns - ------- - datastore : DataStore - The newly created datastore. - """ - - filepathlist_dts = sorted(glob.glob(os.path.join(directory, "*BinaryRawDTS.dat"))) - - # Make sure that the list of files contains any files - assert ( - len(filepathlist_dts) >= 1 - ), "No RawDTS measurement files found " "in provided directory: \n" + str(directory) - - filepathlist_temp = [f.replace("RawDTS", "Temp") for f in filepathlist_dts] - - for ii, fname in enumerate(filepathlist_dts): - # Check if corresponding temperature file exists - if not os.path.isfile(filepathlist_temp[ii]): - raise FileNotFoundError( - "Could not find BinaryTemp " + "file corresponding to {}".format(fname) - ) - - version = sensortran_binary_version_check(filepathlist_dts) - - if version == 3: - data_vars, coords, attrs = read_sensortran_files_routine( - filepathlist_dts, - filepathlist_temp, - timezone_netcdf=timezone_netcdf, - silent=silent, + raise ( + "ds.resample_datastore() is deprecated. Use from dtscalibration import DataStore; " + "DataStore(ds.resample()) instead. See example notebook 2." 
) - else: - raise NotImplementedError( - "Sensortran binary version " + "{0} not implemented".format(version) - ) - - ds = DataStore(data_vars=data_vars, coords=coords, attrs=attrs, **kwargs) - return ds - - -def read_apsensing_files( - filepathlist=None, - directory=None, - file_ext="*.xml", - timezone_netcdf="UTC", - timezone_input_files="UTC", - silent=False, - load_in_memory="auto", - **kwargs, -): - """Read a folder with measurement files. Each measurement file contains - values for a single timestep. Remember to check which timezone - you are working in. - - Parameters - ---------- - filepathlist : list of str, optional - List of paths that point the the silixa files - directory : str, Path, optional - Path to folder - timezone_netcdf : str, optional - Timezone string of the netcdf file. UTC follows CF-conventions. - timezone_input_files : str, optional - Timezone string of the measurement files. - Remember to check when measurements are taken. - Also if summertime is used. - file_ext : str, optional - file extension of the measurement files - silent : bool - If set tot True, some verbose texts are not printed to stdout/screen - load_in_memory : {'auto', True, False} - If 'auto' the Stokes data is only loaded to memory for small files - kwargs : dict-like, optional - keyword-arguments are passed to DataStore initialization - - Notes - ----- - Only XML files are supported for now - - Returns - ------- - datastore : DataStore - The newly created datastore. - """ - if not file_ext == "*.xml": - raise NotImplementedError("Only .xml files are supported for now") - - if filepathlist is None: - filepathlist = sorted(glob.glob(os.path.join(directory, file_ext))) - - # Make sure that the list of files contains any files - assert ( - len(filepathlist) >= 1 - ), "No measurement files found in provided " "directory: \n" + str(directory) - - # Make sure that the list of files contains any files - assert len(filepathlist) >= 1, ( - "No measurement files found in provided " "list/directory" - ) - - device = apsensing_xml_version_check(filepathlist) - - valid_devices = ["CP320"] - - if device in valid_devices: - pass - - else: - warnings.warn( - "AP sensing device " - '"{0}"'.format(device) - + " has not been tested.\nPlease open an issue on github" - + " and provide an example file" - ) - - data_vars, coords, attrs = read_apsensing_files_routine( - filepathlist, - timezone_netcdf=timezone_netcdf, - silent=silent, - load_in_memory=load_in_memory, - ) - - ds = DataStore(data_vars=data_vars, coords=coords, attrs=attrs, **kwargs) - return ds - - -def read_sensornet_files( - filepathlist=None, - directory=None, - file_ext="*.ddf", - timezone_netcdf="UTC", - timezone_input_files="UTC", - silent=False, - add_internal_fiber_length=50.0, - fiber_length=None, - **kwargs, -): - """Read a folder with measurement files. Each measurement file contains - values for a single timestep. Remember to check which timezone - you are working in. - - Parameters - ---------- - filepathlist : list of str, optional - List of paths that point the the silixa files - directory : str, Path, optional - Path to folder - timezone_netcdf : str, optional - Timezone string of the netcdf file. UTC follows CF-conventions. - timezone_input_files : str, optional - Timezone string of the measurement files. - Remember to check when measurements are taken. - Also if summertime is used. 
- file_ext : str, optional - file extension of the measurement files - silent : bool - If set tot True, some verbose texts are not printed to stdout/screen - add_internal_fiber_length : float - Set to zero if only the measurements of the fiber connected to the DTS - system of interest. Set to 50 if you also want to keep the internal - reference section. - fiber_length : float - It is the fiber length between the two connector entering the DTS - device. If left to `None`, it is approximated with - `x[-1] - add_internal_fiber_length`. - kwargs : dict-like, optional - keyword-arguments are passed to DataStore initialization - - Notes - ----- - Compressed sensornet files can not be directly decoded, - because the files are encoded with encoding='windows-1252' instead of - UTF-8. - - Returns - ------- - datastore : DataStore - The newly created datastore. - """ - if filepathlist is None: - # Also look for files in sub-folders - filepathlist_unsorted = glob.glob( - os.path.join(directory, "**", file_ext), recursive=True - ) - - # Make sure that the list of files contains any files - msg = "No measurement files found in provided directory: \n" + str(directory) - assert len(filepathlist_unsorted) >= 1, msg - - # sort based on dates in filesname. A simple sorted() is not sufficient - # as month folders do not sort well - basenames = [os.path.basename(fp) for fp in filepathlist_unsorted] - dates = ["".join(bn.split(" ")[2:4]) for bn in basenames] - i_sort = np.argsort(dates) - filepathlist = [filepathlist_unsorted[i] for i in i_sort] - - # Check measurements are all from same channel - chno = [bn.split(" ")[1] for bn in basenames] - assert ( - len(set(chno)) == 1 - ), "Folder contains measurements from multiple channels" - - # Make sure that the list of files contains any files - assert len(filepathlist) >= 1, ( - "No measurement files found in provided " "list/directory" - ) - - ddf_version = sensornet_ddf_version_check(filepathlist) - - valid_versions = [ - "Halo DTS v1*", - "ORYX F/W v1.02 Oryx Data Collector v3*", - "ORYX F/W v4.00 Oryx Data Collector v3*", - "Sentinel DTS v5*", - ] - - valid = any([fnmatch.fnmatch(ddf_version, v_) for v_ in valid_versions]) - - if valid: - if fnmatch.fnmatch(ddf_version, "Halo DTS v1*"): - flip_reverse_measurements = True - elif fnmatch.fnmatch(ddf_version, "Sentinel DTS v5*"): - flip_reverse_measurements = True - else: - flip_reverse_measurements = False - - else: - flip_reverse_measurements = False - warnings.warn( - "Sensornet .dff version " - '"{0}"'.format(ddf_version) - + " has not been tested.\nPlease open an issue on github" - + " and provide an example file" - ) - - data_vars, coords, attrs = read_sensornet_files_routine_v3( - filepathlist, - timezone_netcdf=timezone_netcdf, - timezone_input_files=timezone_input_files, - silent=silent, - add_internal_fiber_length=add_internal_fiber_length, - fiber_length=fiber_length, - flip_reverse_measurements=flip_reverse_measurements, - ) - - ds = DataStore(data_vars=data_vars, coords=coords, attrs=attrs, **kwargs) - return ds - - -def func_fit(p, xs): - return p[:xs, None] * p[None, xs:] - - -def func_cost(p, data, xs): - fit = func_fit(p, xs) - return np.sum((fit - data) ** 2) diff --git a/src/dtscalibration/datastore_utils.py b/src/dtscalibration/datastore_utils.py index fe23f53c..dc2daad3 100644 --- a/src/dtscalibration/datastore_utils.py +++ b/src/dtscalibration/datastore_utils.py @@ -3,6 +3,7 @@ from typing import Optional from typing import Union +import dask.array as da import matplotlib.pyplot as plt 
import numpy as np import numpy.typing as npt @@ -54,30 +55,412 @@ def check_dims( ) -def get_netcdf_encoding( - ds: "DataStore", zlib: bool = True, complevel: int = 5, **kwargs -) -> dict: - """Get default netcdf compression parameters. The same for each data variable. +class ParameterIndexDoubleEnded: + """ + npar = 1 + 2 * nt + nx + 2 * nt * nta + assert pval.size == npar + assert p_var.size == npar + if calc_cov: + assert p_cov.shape == (npar, npar) + gamma = pval[0] + d_fw = pval[1:nt + 1] + d_bw = pval[1 + nt:1 + 2 * nt] + alpha = pval[1 + 2 * nt:1 + 2 * nt + nx] + # store calibration parameters in DataStore + self["gamma"] = (tuple(), gamma) + self["alpha"] = (('x',), alpha) + self["df"] = (('time',), d_fw) + self["db"] = (('time',), d_bw) + if nta > 0: + ta = pval[1 + 2 * nt + nx:].reshape((nt, 2, nta), order='F') + self['talpha_fw'] = (('time', 'trans_att'), ta[:, 0, :]) + self['talpha_bw'] = (('time', 'trans_att'), ta[:, 1, :]) + """ - TODO: Truncate precision to XML precision per data variable + def __init__(self, nt, nx, nta, fix_gamma=False, fix_alpha=False): + self.nt = nt + self.nx = nx + self.nta = nta + self.fix_gamma = fix_gamma + self.fix_alpha = fix_alpha + + @property + def all(self): + return np.concatenate( + (self.gamma, self.df, self.db, self.alpha, self.ta.flatten(order="F")) + ) + + @property + def npar(self): + if not self.fix_gamma and not self.fix_alpha: + return 1 + 2 * self.nt + self.nx + 2 * self.nt * self.nta + elif self.fix_gamma and not self.fix_alpha: + return 2 * self.nt + self.nx + 2 * self.nt * self.nta + elif not self.fix_gamma and self.fix_alpha: + return 1 + 2 * self.nt + 2 * self.nt * self.nta + elif self.fix_gamma and self.fix_alpha: + return 2 * self.nt + 2 * self.nt * self.nta + + @property + def gamma(self): + if self.fix_gamma: + return [] + else: + return [0] + + @property + def df(self): + if self.fix_gamma: + return list(range(self.nt)) + else: + return list(range(1, self.nt + 1)) + + @property + def db(self): + if self.fix_gamma: + return list(range(self.nt, 2 * self.nt)) + else: + return list(range(1 + self.nt, 1 + 2 * self.nt)) + + @property + def alpha(self): + if self.fix_alpha: + return [] + elif self.fix_gamma: + return list(range(2 * self.nt, 1 + 2 * self.nt + self.nx)) + elif not self.fix_gamma: + return list(range(1 + 2 * self.nt, 1 + 2 * self.nt + self.nx)) + + @property + def ta(self): + if self.nta == 0: + return np.zeros((self.nt, 2, 0)) + elif not self.fix_gamma and not self.fix_alpha: + arr = np.arange(1 + 2 * self.nt + self.nx, self.npar) + elif self.fix_gamma and not self.fix_alpha: + arr = np.arange(2 * self.nt + self.nx, self.npar) + elif not self.fix_gamma and self.fix_alpha: + arr = np.arange(1 + 2 * self.nt, self.npar) + elif self.fix_gamma and self.fix_alpha: + arr = np.arange(2 * self.nt, self.npar) + + return arr.reshape((self.nt, 2, self.nta), order="F") + + @property + def taf(self): + """ + Use `.reshape((nt, nta))` to convert array to (time-dim and transatt-dim). Order is the default C order. + ta = pval[1 + 2 * nt + nx:].reshape((nt, 2, nta), order='F') + self['talpha_fw'] = (('time', 'trans_att'), ta[:, 0, :]) + """ + return self.ta[:, 0, :].flatten(order="C") + + @property + def tab(self): + """ + Use `.reshape((nt, nta))` to convert array to (time-dim and transatt-dim). Order is the default C order. 
+ ta = pval[1 + 2 * nt + nx:].reshape((nt, 2, nta), order='F') + self['talpha_bw'] = (('time', 'trans_att'), ta[:, 1, :]) + """ + return self.ta[:, 1, :].flatten(order="C") + + def get_ta_pars(self, pval): + if self.nta > 0: + if pval.ndim == 1: + return np.take_along_axis(pval[None, None], self.ta, axis=-1) + + else: + # assume shape is (a, npar) and returns shape (nt, 2, nta, a) + assert pval.shape[1] == self.npar and pval.ndim == 2 + return np.stack([self.get_ta_pars(v) for v in pval], axis=-1) + + else: + return np.zeros(shape=(self.nt, 2, 0)) + + def get_taf_pars(self, pval): + """returns taf parameters of shape (nt, nta) or (nt, nta, a)""" + return self.get_ta_pars(pval=pval)[:, 0, :] + + def get_tab_pars(self, pval): + """returns taf parameters of shape (nt, nta) or (nt, nta, a)""" + return self.get_ta_pars(pval=pval)[:, 1, :] + + def get_taf_values(self, pval, x, trans_att, axis=""): + """returns taf parameters of shape (nx, nt)""" + pval = np.atleast_2d(pval) + + assert pval.ndim == 2 and pval.shape[1] == self.npar + + arr_out = np.zeros((self.nx, self.nt)) + + if self.nta == 0: + pass + + elif axis == "": + assert pval.shape[0] == 1 + + arr = np.transpose(self.get_taf_pars(pval), axes=(1, 2, 0)) # (nta, 1, nt) + + for tai, taxi in zip(arr, trans_att): + arr_out[x >= taxi] += tai + + elif axis == "x": + assert pval.shape[0] == self.nx + + arr = np.transpose(self.get_taf_pars(pval), axes=(1, 2, 0)) # (nta, nx, nt) + + for tai, taxi in zip(arr, trans_att): + # loop over nta, tai has shape (x, t) + arr_out[x >= taxi] += tai[x >= taxi] + + elif axis == "time": + assert pval.shape[0] == self.nt + + # arr (nt, nta, nt) to have shape (nta, nt, nt) + arr = np.transpose(self.get_taf_pars(pval), axes=(1, 2, 0)) + + for tai, taxi in zip(arr, trans_att): + # loop over nta, tai has shape (t, t) + arr_out[x >= taxi] += np.diag(tai)[None] + + return arr_out + + def get_tab_values(self, pval, x, trans_att, axis=""): + """returns tab parameters of shape (nx, nt)""" + assert pval.shape[-1] == self.npar + + arr_out = np.zeros((self.nx, self.nt)) + + if self.nta == 0: + pass + + elif axis == "": + pval = np.squeeze(pval) + assert pval.ndim == 1 + arr = np.transpose(self.get_tab_pars(pval), axes=(1, 0)) + + for tai, taxi in zip(arr, trans_att): + arr_out[x < taxi] += tai + + elif axis == "x": + assert pval.ndim == 2 and pval.shape[0] == self.nx + + arr = np.transpose(self.get_tab_pars(pval), axes=(1, 2, 0)) + + for tai, taxi in zip(arr, trans_att): + # loop over nta, tai has shape (x, t) + arr_out[x < taxi] += tai[x < taxi] + + elif axis == "time": + assert pval.ndim == 2 and pval.shape[0] == self.nt + # arr (nt, nta, nt) to have shape (nta, nt, nt) + arr = np.transpose(self.get_tab_pars(pval), axes=(1, 2, 0)) + + for tai, taxi in zip(arr, trans_att): + # loop over nta, tai has shape (t, t) + arr_out[x < taxi] += np.diag(tai) + + return arr_out + + +class ParameterIndexSingleEnded: + """ + if parameter fixed, they are not in + npar = 1 + 1 + nt + nta * nt + """ + + def __init__(self, nt, nx, nta, includes_alpha=False, includes_dalpha=True): + assert not ( + includes_alpha and includes_dalpha + ), "Cannot hold both dalpha and alpha" + self.nt = nt + self.nx = nx + self.nta = nta + self.includes_alpha = includes_alpha + self.includes_dalpha = includes_dalpha + + @property + def all(self): + return np.concatenate( + (self.gamma, self.dalpha, self.alpha, self.c, self.ta.flatten(order="F")) + ) + + @property + def npar(self): + if self.includes_alpha: + return 1 + self.nx + self.nt + self.nta * self.nt + 
elif self.includes_dalpha: + return 1 + 1 + self.nt + self.nta * self.nt + else: + return 1 + self.nt + self.nta * self.nt + + @property + def gamma(self): + return [0] + + @property + def dalpha(self): + if self.includes_dalpha: + return [1] + else: + return [] + + @property + def alpha(self): + if self.includes_alpha: + return list(range(1, 1 + self.nx)) + else: + return [] + + @property + def c(self): + if self.includes_alpha: + return list(range(1 + self.nx, 1 + self.nx + self.nt)) + elif self.includes_dalpha: + return list(range(1 + 1, 1 + 1 + self.nt)) + else: + return list(range(1, 1 + self.nt)) + + @property + def taf(self): + """returns taf parameters of shape (nt, nta) or (nt, nta, a)""" + # ta = p_val[-nt * nta:].reshape((nt, nta), order='F') + # self["talpha"] = (('time', 'trans_att'), ta[:, :]) + if self.includes_alpha: + return np.arange( + 1 + self.nx + self.nt, 1 + self.nx + self.nt + self.nt * self.nta + ).reshape((self.nt, self.nta), order="F") + elif self.includes_dalpha: + return np.arange( + 1 + 1 + self.nt, 1 + 1 + self.nt + self.nt * self.nta + ).reshape((self.nt, self.nta), order="F") + else: + return np.arange(1 + self.nt, 1 + self.nt + self.nt * self.nta).reshape( + (self.nt, self.nta), order="F" + ) + + def get_taf_pars(self, pval): + if self.nta > 0: + if pval.ndim == 1: + # returns shape (nta, nt) + assert len(pval) == self.npar, "Length of pval is incorrect" + return np.stack([pval[tafi] for tafi in self.taf.T]) + + else: + # assume shape is (a, npar) and returns shape (nta, nt, a) + assert pval.shape[1] == self.npar and pval.ndim == 2 + return np.stack([self.get_taf_pars(v) for v in pval], axis=-1) + + else: + return np.zeros(shape=(self.nt, 0)) + + def get_taf_values(self, pval, x, trans_att, axis=""): + """returns taf parameters of shape (nx, nt)""" + # assert pval.ndim == 2 and pval.shape[1] == self.npar + + arr_out = np.zeros((self.nx, self.nt)) + + if self.nta == 0: + pass + + elif axis == "": + pval = pval.flatten() + assert pval.shape == (self.npar,) + arr = self.get_taf_pars(pval) + assert arr.shape == ( + self.nta, + self.nt, + ) + + for tai, taxi in zip(arr, trans_att): + arr_out[x >= taxi] += tai + + elif axis == "x": + assert pval.shape == (self.nx, self.npar) + arr = self.get_taf_pars(pval) + assert arr.shape == (self.nta, self.nx, self.nt) + + for tai, taxi in zip(arr, trans_att): + # loop over nta, tai has shape (x, t) + arr_out[x >= taxi] += tai[x >= taxi] + + elif axis == "time": + assert pval.shape == (self.nt, self.npar) + arr = self.get_taf_pars(pval) + assert arr.shape == (self.nta, self.nt, self.nt) + + for tai, taxi in zip(arr, trans_att): + # loop over nta, tai has shape (t, t) + arr_out[x >= taxi] += np.diag(tai)[None] + + return arr_out + + +def check_deprecated_kwargs(kwargs): + """ + Internal function that parses the `kwargs` for depreciated keyword + arguments. + + Depreciated keywords raise an error, pending to be depreciated do not. + But this requires that the code currently deals with those arguments. Parameters ---------- - zlib - complevel - ds : DataStore + kwargs : Dict + A dictionary with keyword arguments. Returns ------- - encoding: - Encoding dictionary. + """ - comp = dict(zlib=zlib, complevel=complevel) - comp.update(kwargs) - encoding = {var: comp for var in ds.data_vars} + msg = """Previously, it was possible to manually set the label from + which the Stokes and anti-Stokes were read within the DataStore + object. To reduce the clutter in the code base and be able to + maintain it, this option was removed. 
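As a quick illustration of the single-ended index defined above, here is a sketch of how a flat solver vector is split into named parameters (the problem sizes are made up; the import path follows where this diff places the class):

```
import numpy as np

from dtscalibration.datastore_utils import ParameterIndexSingleEnded

# hypothetical problem size: 3 time steps, 100 x-steps, 1 transient attenuation location
pidx = ParameterIndexSingleEnded(nt=3, nx=100, nta=1, includes_dalpha=True)

pidx.npar    # 1 (gamma) + 1 (dalpha) + 3 (c per time step) + 1 * 3 (ta per time step) == 8
pidx.gamma   # [0]
pidx.dalpha  # [1]
pidx.c       # [2, 3, 4]

# slice a solution vector into named parameters
pval = np.arange(pidx.npar, dtype=float)
gamma = pval[pidx.gamma]
c = pval[pidx.c]
ta_fw = pidx.get_taf_pars(pval)  # transient attenuation terms, shape (nta, nt)
```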
+ See: https://github.com/dtscalibration/python-dts-calibration/issues/81 + + The new **fixed** names are: st, ast, rst, rast. + + It is still possible to use the previous defaults, for example when + reading stored measurements from netCDF, by renaming the labels. The + old default labels were ST, AST, REV-ST, REV-AST. + + ``` + ds = open_datastore(path_to_old_file) + ds = ds.rename_labels() + ds.calibration_double_ended( + st_var=1.5, + ast_var=1.5, + rst_var=1., + rast_var=1., + method='wls') + ``` + + ds.tmpw.plot() + """ + list_of_depr = [ + "st_label", + "ast_label", + "rst_label", + "rast_label", + "transient_asym_att_x", + "transient_att_x", + ] + list_of_pending_depr = [] + + kwargs = {k: v for k, v in kwargs.items() if k not in list_of_pending_depr} + + for k in kwargs: + if k in list_of_depr: + raise NotImplementedError(msg) + + if len(kwargs) != 0: + raise NotImplementedError( + "The following keywords are not " + "supported: " + ", ".join(kwargs.keys()) + ) - return encoding + pass def check_timestep_allclose(ds: "DataStore", eps: float = 0.01) -> None: @@ -119,6 +502,32 @@ def check_timestep_allclose(ds: "DataStore", eps: float = 0.01) -> None: ) +def get_netcdf_encoding( + ds: "DataStore", zlib: bool = True, complevel: int = 5, **kwargs +) -> dict: + """Get default netcdf compression parameters. The same for each data variable. + + TODO: Truncate precision to XML precision per data variable + + + Parameters + ---------- + zlib + complevel + ds : DataStore + + Returns + ------- + encoding: + Encoding dictionary. + """ + comp = dict(zlib=zlib, complevel=complevel) + comp.update(kwargs) + encoding = {var: comp for var in ds.data_vars} + + return encoding + + def merge_double_ended( ds_fw: "DataStore", ds_bw: "DataStore", @@ -145,6 +554,7 @@ def merge_double_ended( Manually estimated cable length to base alignment on plot_result : bool Plot the aligned Stokes of the forward and backward channels + verbose : bool Returns ------- @@ -539,3 +949,261 @@ def suggest_cable_shift_double_ended( plt.tight_layout() return ishift1, ishift2 + + +def ufunc_per_section_helper( + sections=None, + func=None, + x_coords=None, + reference_dataset=None, + dataarray=None, + subtract_from_dataarray=None, + subtract_reference_from_dataarray=False, + ref_temp_broadcasted=False, + calc_per="stretch", + **func_kwargs, +): + """ + User function applied to parts of the cable. Super useful, + many options and slightly + complicated. + + The function `func` is taken over all the timesteps and calculated + per `calc_per`. This + is returned as a dictionary + + Parameters + ---------- + sections : Dict[str, List[slice]], optional + If `None` is supplied, `ds.sections` is used. Define calibration + sections. Each section requires a reference temperature time series, + such as the temperature measured by an external temperature sensor. + They should already be part of the DataStore object. `sections` + is defined with a dictionary with its keywords of the + names of the reference temperature time series. Its values are + lists of slice objects, where each slice object is a fiber stretch + that has the reference temperature. Afterwards, `sections` is stored + under `ds.sections`. + func : callable, str + A numpy function, or lambda function to apply to each 'calc_per'. + x_coords : xarray.DataArray, optional + x-coordinates, stored as ds.x. If supplied, returns the x-indices of + the reference sections. 
+ reference_dataset : xarray.Dataset or Dict, optional + Contains the reference temperature timeseries refered to in `sections`. + Not required if `x_indices`. + dataarray : xarray.DataArray, optional + Pass your DataArray of which you want to compute the statistics. Has an + (x,) dimension or (x, time) dimensions. + subtract_from_dataarray : xarray.DataArray, optional + Pass your DataArray of which you want to subtract from `dataarray` before you + compute the statistics. Has an (x,) dimension or (x, time) dimensions. + subtract_reference_from_dataarray : bool + If True the reference temperature according to sections is subtracted from + dataarray before computing statistics + ref_temp_broadcasted : bool + Use if you want to return the reference temperature of shape of the reference + sections + calc_per : {'all', 'section', 'stretch'} + func_kwargs : dict + Dictionary with options that are passed to func + + Returns + ------- + + Examples + -------- + + 1. Calculate the variance of the residuals in the along ALL the\ + reference sections wrt the temperature of the water baths + + >>> tmpf_var = ufunc_per_section_helper( + >>> func='var', + >>> calc_per='all', + >>> dataarray=d['tmpf'], + >>> subtract_reference_from_dataarray=True) + + 2. Calculate the variance of the residuals in the along PER\ + reference section wrt the temperature of the water baths + + >>> tmpf_var = ufunc_per_section_helper + >>> func='var', + >>> calc_per='stretch', + >>> dataarray=d['tmpf'], + >>> subtract_reference_from_dataarray=True) + + 3. Calculate the variance of the residuals in the along PER\ + water bath wrt the temperature of the water baths + + >>> tmpf_var = ufunc_per_section_helper( + >>> func='var', + >>> calc_per='section', + >>> dataarray=d['tmpf'], + >>> subtract_reference_from_dataarray=True) + + 4. Obtain the coordinates of the measurements per section + + >>> locs = ufunc_per_section_helper( + >>> func=None, + >>> dataarray=d.x, + >>> subtract_reference_from_dataarray=False, + >>> ref_temp_broadcasted=False, + >>> calc_per='stretch') + + 5. Number of observations per stretch + + >>> nlocs = ufunc_per_section_helper( + >>> func=len, + >>> dataarray=d.x, + >>> subtract_reference_from_dataarray=False, + >>> ref_temp_broadcasted=False, + >>> calc_per='stretch') + + 6. broadcast the temperature of the reference sections to\ + stretch/section/all dimensions. The value of the reference\ + temperature (a timeseries) is broadcasted to the shape of self[\ + label]. The dataarray is not used for anything else. + + >>> temp_ref = ufunc_per_section_helper( + >>> dataarray=d["st"], + >>> ref_temp_broadcasted=True, + >>> calc_per='all') + + 7. 
x-coordinate index + + >>> ix_loc = ufunc_per_section_helperx_coords=d.x) + + + Note + ---- + If `dataarray` or `subtract_from_dataarray` is a Dask array, a Dask + array is returned else a numpy array is returned + """ + if not func: + + def func(a): + """ + + Parameters + ---------- + a + + Returns + ------- + + """ + return a + + elif isinstance(func, str) and func == "var": + + def func(a): + """ + + Parameters + ---------- + a + + Returns + ------- + + """ + return np.var(a, ddof=1) + + else: + assert callable(func) + + assert calc_per in ["all", "section", "stretch"] + + if x_coords is None and ( + (dataarray is not None and hasattr(dataarray.data, "chunks")) + or (subtract_from_dataarray and hasattr(subtract_from_dataarray.data, "chunks")) + ): + concat = da.concatenate + else: + concat = np.concatenate + + out = dict() + + for k, section in sections.items(): + out[k] = [] + for stretch in section: + if x_coords is not None: + # get indices from stretches + assert subtract_from_dataarray is None + assert not subtract_reference_from_dataarray + assert not ref_temp_broadcasted + # so it is slicable with x-indices + _x_indices = x_coords.astype(int) * 0 + np.arange(x_coords.size) + arg1 = _x_indices.sel(x=stretch).data + out[k].append(arg1) + + elif ( + subtract_from_dataarray is not None + and not subtract_reference_from_dataarray + and not ref_temp_broadcasted + ): + # calculate std wrt other series + arg1 = dataarray.sel(x=stretch).data + arg2 = subtract_from_dataarray.sel(x=stretch).data + out[k].append(arg1 - arg2) + + elif ( + subtract_from_dataarray is None + and subtract_reference_from_dataarray + and not ref_temp_broadcasted + ): + # calculate std wrt reference temperature of the corresponding bath + arg1 = dataarray.sel(x=stretch).data + arg2 = reference_dataset[k].data + out[k].append(arg1 - arg2) + + elif ( + subtract_from_dataarray is None + and not subtract_reference_from_dataarray + and ref_temp_broadcasted + ): + # Broadcast the reference temperature to the length of the stretch + arg1 = dataarray.sel(x=stretch).data + arg2 = da.broadcast_to(reference_dataset[k].data, arg1.shape) + out[k].append(arg2) + + elif ( + subtract_from_dataarray is None + and not subtract_reference_from_dataarray + and not ref_temp_broadcasted + ): + # calculate std wrt mean value + arg1 = dataarray.sel(x=stretch).data + out[k].append(arg1) + + if calc_per == "stretch": + out[k] = [func(argi, **func_kwargs) for argi in out[k]] + + elif calc_per == "section": + # flatten the out_dict to sort them + start = [i.start for i in section] + i_sorted = np.argsort(start) + out_flat_sort = [out[k][i] for i in i_sorted] + out[k] = func(concat(out_flat_sort), **func_kwargs) + + elif calc_per == "all": + pass + + if calc_per == "all": + # flatten the out_dict to sort them + start = [item.start for sublist in sections.values() for item in sublist] + i_sorted = np.argsort(start) + out_flat = [item for sublist in out.values() for item in sublist] + out_flat_sort = [out_flat[i] for i in i_sorted] + out = func(concat(out_flat_sort, axis=0), **func_kwargs) + + if hasattr(out, "chunks") and len(out.chunks) > 0 and "x" in dataarray.dims: + # also sum the chunksize in the x dimension + # first find out where the x dim is + ixdim = dataarray.dims.index("x") + c_old = out.chunks + c_new = list(c_old) + c_new[ixdim] = sum(c_old[ixdim]) + out = out.rechunk(c_new) + + return out diff --git a/src/dtscalibration/io.py b/src/dtscalibration/io.py index db10c5bd..be7d0dd6 100644 --- a/src/dtscalibration/io.py +++ 
b/src/dtscalibration/io.py @@ -1,2006 +1,564 @@ -# coding=utf-8 +import fnmatch +import glob +import inspect import os -import struct -from contextlib import contextmanager -from xml.etree import ElementTree +import warnings -import dask -import dask.array as da import numpy as np -import pandas as pd - -# Returns a dictionary with the attributes to the dimensions. -# The keys refer to the naming used in the raw files. -# TODO: attrs for st_var and use it in parse_st_var function -_dim_attrs = { - ("x", "distance"): dict( - name="distance", - description="Length along fiber", - long_description="Starting at connector of forward channel", - units="m", - ), - ("tmp", "temperature"): dict( - name="tmp", description="Temperature calibrated by device", units=r"$^\circ$C" - ), - ("st",): dict(name="st", description="Stokes intensity", units="-"), - ("ast",): dict(name="ast", description="anti-Stokes intensity", units="-"), - ("rst",): dict(name="rst", description="reverse Stokes intensity", units="-"), - ("rast",): dict( - name="rast", description="reverse anti-Stokes intensity", units="-" - ), - ("tmpf",): dict( - name="tmpf", - description="Temperature estimated using the Stokes and anti-Stokes from the Forward channel", - units=r"$^\circ$C", - ), - ("tmpb",): dict( - name="tmpb", - description="Temperature estimated using the Stokes and anti-Stokes from the Backward channel", - units=r"$^\circ$C", - ), - ("tmpw",): dict( - name="tmpw", - description="Temperature estimated using the Stokes and anti-Stokes from both the Forward and Backward channel.", - units=r"$^\circ$C", - ), - ("tmpf_var",): dict( - name="tmpf_var", - description="Uncertainty variance in tmpf estimated with linear-error propagation", - units=r"$^\circ$C$^2$", - ), - ("tmpb_var",): dict( - name="tmpb_var", - description="Uncertainty variance in tmpb estimated with linear-error propagation", - units=r"$^\circ$C$^2$", - ), - ("tmpw_var",): dict( - name="tmpw_var", - description="Uncertainty variance in tmpw estimated with linear-error propagation", - units=r"$^\circ$C$^2$", - ), - ("tmpw_var_lower",): dict( - name="tmpw_var_lower", - description="Lower bound of uncertainty variance in tmpw estimated with linear-error propagation. " - "Excludes the parameter uncertainties.", - units=r"$^\circ$C$^2$", - ), - ("tmpw_var_approx",): dict( - name="tmpw_var_upper", - description="Upper bound of uncertainty variance in tmpw estimated with linear-error propagation. 
" - "Excludes the correlation between tmpf and tmpb caused by alpha.", - units=r"$^\circ$C$^2$", - ), - ("tmpf_mc_var",): dict( - name="tmpf_mc_var", - description="Uncertainty variance in tmpf estimated with Monte Carlo", - units=r"$^\circ$C$^2$", - ), - ("tmpb_mc_var",): dict( - name="tmpb_mc_var", - description="Uncertainty variance in tmpb estimated with Monte Carlo", - units=r"$^\circ$C$^2$", - ), - ("tmpw_mc_var",): dict( - name="tmpw_mc_var", - description="Uncertainty variance in tmpw estimated with Monte Carlo", - units=r"$^\circ$C$^2$", - ), - ("acquisitionTime",): dict( - name="acquisitionTime", - description="Measurement duration of forward channel", - long_description="Actual measurement duration of forward " "channel", - units="seconds", - ), - ("userAcquisitionTimeFW",): dict( - name="userAcquisitionTimeFW", - description="Measurement duration of forward channel", - long_description="Desired measurement duration of forward " "channel", - units="seconds", - ), - ("userAcquisitionTimeBW",): dict( - name="userAcquisitionTimeBW", - description="Measurement duration of backward channel", - long_description="Desired measurement duration of backward " "channel", - units="seconds", - ), - ("trans_att",): dict( - name="Locations introducing transient directional differential " "attenuation", - description="Locations along the x-dimension introducing transient " - "directional differential attenuation", - long_description="Connectors introduce additional differential " - "attenuation that is different for the forward " - "and backward direction, and varies over time.", - units="m", - ), -} - -# Because variations in the names exist between the different file formats. The -# tuple as key contains the possible keys, which is expanded below. -dim_attrs = {k: v for kl, v in _dim_attrs.items() for k in kl} - -dim_attrs_apsensing = dict(dim_attrs) -dim_attrs_apsensing["TEMP"] = dim_attrs_apsensing.pop("tmp") -dim_attrs_apsensing["TEMP"]["name"] = "TEMP" -dim_attrs_apsensing.pop("acquisitionTime") -dim_attrs_apsensing.pop("userAcquisitionTimeFW") -dim_attrs_apsensing.pop("userAcquisitionTimeBW") - - -@contextmanager -def open_file(path, **kwargs): - if isinstance(path, tuple): - # print('\nabout to open zipfile', path[0], '. from', path[1]) - # assert isinstance(path[1], zip) - the_file = path[1].open(path[0], **kwargs) - - else: - the_file = open(path, **kwargs) - - yield the_file - the_file.close() - - -def silixa_xml_version_check(filepathlist): - """Function which tests which version of xml files have to be read. 
+import xarray as xr + +from dtscalibration import DataStore +from dtscalibration.io_utils import apsensing_xml_version_check +from dtscalibration.io_utils import read_apsensing_files_routine +from dtscalibration.io_utils import read_sensornet_files_routine_v3 +from dtscalibration.io_utils import read_sensortran_files_routine +from dtscalibration.io_utils import read_silixa_files_routine_v4 +from dtscalibration.io_utils import read_silixa_files_routine_v6 +from dtscalibration.io_utils import sensornet_ddf_version_check +from dtscalibration.io_utils import sensortran_binary_version_check +from dtscalibration.io_utils import silixa_xml_version_check +from dtscalibration.io_utils import ziphandle_to_filepathlist + + +def open_datastore( + filename_or_obj, + group=None, + decode_cf=True, + mask_and_scale=None, + decode_times=True, + concat_characters=True, + decode_coords=True, + engine=None, + chunks=None, + lock=None, + cache=None, + drop_variables=None, + backend_kwargs=None, + load_in_memory=False, + **kwargs, +): + """Load and decode a datastore from a file or file-like object. Most + arguments are passed to xarray.open_dataset(). Parameters ---------- - filepathlist + filename_or_obj : str, Path, file or xarray.backends.*DataStore + Strings and Path objects are interpreted as a path to a netCDF file + or an OpenDAP URL and opened with python-netCDF4, unless the filename + ends with .gz, in which case the file is gunzipped and opened with + scipy.io.netcdf (only netCDF3 supported). File-like objects are opened + with scipy.io.netcdf (only netCDF3 supported). + group : str, optional + Path to the netCDF4 group in the given file to open (only works for + netCDF4 files). + decode_cf : bool, optional + Whether to decode these variables, assuming they were saved according + to CF conventions. + mask_and_scale : bool, optional + If True, replace array values equal to `_FillValue` with NA and scale + values according to the formula `original_values * scale_factor + + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are + taken from variable attributes (if they exist). If the `_FillValue` or + `missing_value` attribute contains multiple values a warning will be + issued and all array values matching one of the multiple values will + be replaced by NA. mask_and_scale defaults to True except for the + pseudonetcdf backend. + decode_times : bool, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, leave them encoded as numbers. + concat_characters : bool, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + decode_coords : bool, optional + If True, decode the 'coordinates' attribute to identify coordinates in + the resulting dataset. + engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', + 'pseudonetcdf'}, optional + Engine to use when reading files. If not provided, the default engine + is chosen based on available dependencies, with a preference for + 'netcdf4'. + chunks : int or dict, optional + If chunks is provided, it used to load the new dataset into dask + arrays. ``chunks={}`` loads the dataset with dask using a single + chunk for all arrays. + lock : False, True or threading.Lock, optional + If chunks is provided, this argument is passed on to + :py:func:`dask.array.from_array`. 
By default, a global lock is + used when reading data from netCDF files with the netcdf4 and h5netcdf + engines to avoid issues with concurrent access when using dask's + multithreaded backend. + cache : bool, optional + If True, cache data loaded from the underlying datastore in memory as + NumPy arrays when accessed to avoid reading from the underlying data- + store multiple times. Defaults to True unless you specify the `chunks` + argument to use dask, in which case it defaults to False. Does not + change the behavior of coordinates corresponding to dimensions, which + always load their data from disk into a ``pandas.Index``. + drop_variables: string or iterable, optional + A variable or list of variables to exclude from being parsed from the + dataset. This may be useful to drop variables with problems or + inconsistent values. + backend_kwargs: dictionary, optional + A dictionary of keyword arguments to pass on to the backend. This + may be useful when backend options would improve performance or + allow user control of dataset processing. Returns ------- + dataset : Dataset + The newly created dataset. + See Also + -------- + xarray.open_dataset + xarray.load_dataset """ - sep = ":" - attrs = read_silixa_attrs_singlefile(filepathlist[0], sep) - - version_string = attrs["customData:SystemSettings:softwareVersion"] - - # Get major version from string. Tested for Ultima v4, v6, v7 XT-DTS v6 - major_version = int(version_string.replace(" ", "").split(":")[-1][0]) - - return major_version + xr_kws = inspect.signature(xr.open_dataset).parameters.keys() + + ds_kwargs = {k: v for k, v in kwargs.items() if k not in xr_kws} + + if chunks is None: + chunks = {} + + with xr.open_dataset( + filename_or_obj, + group=group, + decode_cf=decode_cf, + mask_and_scale=mask_and_scale, + decode_times=decode_times, + concat_characters=concat_characters, + decode_coords=decode_coords, + engine=engine, + chunks=chunks, + lock=lock, + cache=cache, + drop_variables=drop_variables, + backend_kwargs=backend_kwargs, + ) as ds_xr: + ds = DataStore( + data_vars=ds_xr.data_vars, + coords=ds_xr.coords, + attrs=ds_xr.attrs, + **ds_kwargs, + ) + # to support deprecated st_labels + ds = ds.rename_labels(assertion=False) -def apsensing_xml_version_check(filepathlist): - """Function which tests which version of xml files are read. + if load_in_memory: + if "cache" in kwargs: + raise TypeError("cache has no effect in this context") + return ds.load() - Parameters - ---------- - filepathlist + else: + return ds - Returns - ------- +def open_mf_datastore( + path=None, paths=None, combine="by_coords", load_in_memory=False, **kwargs +): """ - - sep = ":" - attrs, _ = read_apsensing_attrs_singlefile(filepathlist[0], sep) - - return attrs["wellbore:uid"] - - -def sensornet_ddf_version_check(filepathlist): - """Function which checks and returns the .ddf file version + Open a datastore from multiple netCDF files. This script assumes the + datastore was split along the time dimension. But only variables with a + time dimension should be concatenated in the time dimension. Other + options from xarray do not support this. Parameters ---------- - filepathlist - + combine : {'by_coords', 'nested'}, optional + Leave it at by_coords + path : str + A file path to the stored netcdf files with an asterisk in the + filename to list all. Ensure you have leading zeros in the file + numbering. + paths : list + Define you own list of file paths. Returns ------- - ddf_version - + dataset : Dataset + The newly created dataset. 
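A minimal sketch of calling this function (the paths are hypothetical; the function is assumed to be exported at the package root like the other readers):

```
from dtscalibration import open_mf_datastore

# hypothetical set of netCDF files split along the time dimension,
# numbered with leading zeros so the glob sorts chronologically
ds = open_mf_datastore(path="out/calibrated_*.nc", load_in_memory=True)

# or pass an explicit, pre-sorted list of files
ds = open_mf_datastore(paths=["out/calibrated_00.nc", "out/calibrated_01.nc"])
```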
""" + from xarray.backends.api import open_mfdataset - # Obtain metadata fro mthe first file - _, meta = read_sensornet_single(filepathlist[0]) - - if "Software version number" in meta.keys(): - version_string = meta["Software version number"] - else: - raise ValueError( - "Software version number could not be detected in .ddf file" - + "Either file is corrupted or not supported" - ) - - ddf_version = version_string.replace(",", ".") + if paths is None: + paths = sorted(glob.glob(path)) + assert paths, "No files match found with: " + path - return ddf_version + with open_mfdataset(paths=paths, combine=combine, **kwargs) as xds: + ds = DataStore(data_vars=xds.data_vars, coords=xds.coords, attrs=xds.attrs) + # to support deprecated st_labels + ds = ds.rename_labels(assertion=False) -def sensortran_binary_version_check(filepathlist): - """Function which tests which version the sensortran binaries are. + if load_in_memory: + if "cache" in kwargs: + raise TypeError("cache has no effect in this context") + return ds.load() - Parameters - ---------- - filepathlist - - Returns - ------- - - """ - fname = filepathlist[0] - - with open(fname, "rb") as f: - f.read(2) - version = struct.unpack("= 1 + ), "No measurement files found in provided " "directory: \n" + str(directory) - attrs["forwardMeasurementChannel"] = attrs["customData:forwardMeasurementChannel"] - if double_ended_flag: - attrs["backwardMeasurementChannel"] = attrs[ - "customData:reverseMeasurementChannel" - ] - else: - attrs["backwardMeasurementChannel"] = "N/A" + elif filepathlist is None and zip_handle: + filepathlist = ziphandle_to_filepathlist(fh=zip_handle, extension=file_ext) - # obtain basic data info - data_item_names = ( - attrs["logData:mnemonicList"].replace(" ", "").strip(" ").split(",") + # Make sure that the list of files contains any files + assert len(filepathlist) >= 1, ( + "No measurement files found in provided " "list/directory" ) - nitem = len(data_item_names) - - ntime = len(filepathlist) - - double_ended_flag = bool(int(attrs["isDoubleEnded"])) - chFW = int(attrs["forwardMeasurementChannel"]) - 1 # zero-based - if double_ended_flag: - chBW = int(attrs["backwardMeasurementChannel"]) - 1 # zero-based - else: - # no backward channel is negative value. 
writes better to netcdf - chBW = -1 - # print summary - if not silent: - print(f"{ntime} files were found, each representing a single timestep") - print(f"{nitem} recorded vars were found: " + ", ".join(data_item_names)) - print(f"Recorded at {nx} points along the cable") + xml_version = silixa_xml_version_check(filepathlist) - if double_ended_flag: - print("The measurement is double ended") - else: - print("The measurement is single ended") - - # obtain timeseries from data - timeseries_loc_in_hierarchy = [ - ("log", "customData", "acquisitionTime"), - ("log", "customData", "referenceTemperature"), - ("log", "customData", "probe1Temperature"), - ("log", "customData", "probe2Temperature"), - ("log", "customData", "referenceProbeVoltage"), - ("log", "customData", "probe1Voltage"), - ("log", "customData", "probe2Voltage"), - ( - "log", - "customData", - "UserConfiguration", - "ChannelConfiguration", - "AcquisitionConfiguration", - "AcquisitionTime", - "userAcquisitionTimeFW", - ), - ] - - if double_ended_flag: - timeseries_loc_in_hierarchy.append( - ( - "log", - "customData", - "UserConfiguration", - "ChannelConfiguration", - "AcquisitionConfiguration", - "AcquisitionTime", - "userAcquisitionTimeBW", - ) + if xml_version == 4: + data_vars, coords, attrs = read_silixa_files_routine_v4( + filepathlist, + timezone_netcdf=timezone_netcdf, + silent=silent, + load_in_memory=load_in_memory, ) - timeseries = { - item[-1]: dict(loc=item, array=np.zeros(ntime, dtype=np.float32)) - for item in timeseries_loc_in_hierarchy - } - - # add units to timeseries (unit of measurement) - for key, item in timeseries.items(): - if f"customData:{key}:uom" in attrs: - item["uom"] = attrs[f"customData:{key}:uom"] - else: - item["uom"] = "" - - # Gather data - arr_path = "s:" + "/s:".join(["log", "logData", "data"]) - - @dask.delayed - def grab_data_per_file(file_handle): - """ - - Parameters - ---------- - file_handle - - Returns - ------- - - """ - with open_file(file_handle, mode="r") as f_h: - eltree = ElementTree.parse(f_h) - arr_el = eltree.findall(arr_path, namespaces=ns) - - if not len(arr_el) == nx: - raise ValueError( - "Inconsistent length of x-dimension" - + "\nCheck if files are mixed up, or if the number of " - + "data points vary per file." 
- ) - - # remove the breaks on both sides of the string - # split the string on the comma - arr_str = [arr_eli.text[1:-1].split(",") for arr_eli in arr_el] - - return np.array(arr_str, dtype=float) - - data_lst_dly = [grab_data_per_file(fp) for fp in filepathlist] - data_lst = [ - da.from_delayed(x, shape=(nx, nitem), dtype=float) for x in data_lst_dly - ] - data_arr = da.stack(data_lst).T # .compute() - - # Check whether to compute data_arr (if possible 25% faster) - data_arr_cnk = data_arr.rechunk({0: -1, 1: -1, 2: "auto"}) - if load_in_memory == "auto" and data_arr_cnk.npartitions <= 5: - if not silent: - print("Reading the data from disk") - data_arr = data_arr_cnk.compute() - elif load_in_memory: - if not silent: - print("Reading the data from disk") - data_arr = data_arr_cnk.compute() - else: - if not silent: - print("Not reading the data from disk") - data_arr = data_arr_cnk - - data_vars = {} - for name, data_arri in zip(data_item_names, data_arr): - if name == "LAF": - continue - if tld[name] in dim_attrs: - data_vars[tld[name]] = (["x", "time"], data_arri, dim_attrs[tld[name]]) - else: - raise ValueError(f"Dont know what to do with the {name} data column") - - # Obtaining the timeseries data (reference temperature etc) - _ts_dtype = [(k, np.float32) for k in timeseries] - _time_dtype = [ - ("filename_tstamp", np.int64), - ("minDateTimeIndex", "= 1 + ), "No RawDTS measurement files found " "in provided directory: \n" + str(directory) - # Amount of datapoints is the size of the logdata tree - nx = len(logdata_tree) - - sep = ":" - ns = {"s": namespace[1:-1]} - - # Obtain metadata from the first file - attrs = read_silixa_attrs_singlefile(filepathlist[0], sep) - - # Add standardised required attributes - attrs["isDoubleEnded"] = attrs["customData:isDoubleEnded"] - double_ended_flag = bool(int(attrs["isDoubleEnded"])) - - attrs["forwardMeasurementChannel"] = attrs["customData:forwardMeasurementChannel"] - if double_ended_flag: - attrs["backwardMeasurementChannel"] = attrs[ - "customData:reverseMeasurementChannel" - ] - else: - attrs["backwardMeasurementChannel"] = "N/A" - - chFW = int(attrs["forwardMeasurementChannel"]) - 1 # zero-based - if double_ended_flag: - chBW = int(attrs["backwardMeasurementChannel"]) - 1 # zero-based - else: - # no backward channel is negative value. 
writes better to netcdf - chBW = -1 - - # obtain basic data info - if double_ended_flag: - data_item_names = [attrs[f"logCurveInfo_{x}:mnemonic"] for x in range(6)] - else: - data_item_names = [attrs[f"logCurveInfo_{x}:mnemonic"] for x in range(4)] - - nitem = len(data_item_names) - - ntime = len(filepathlist) - - # print summary - if not silent: - print(f"{ntime} files were found, each representing a single timestep") - print(f"{nitem} recorded vars were found: " + ", ".join(data_item_names)) - print(f"Recorded at {nx} points along the cable") - - if double_ended_flag: - print("The measurement is double ended") - else: - print("The measurement is single ended") - - # obtain timeseries from data - timeseries_loc_in_hierarchy = [ - ("wellLog", "customData", "acquisitionTime"), - ("wellLog", "customData", "referenceTemperature"), - ("wellLog", "customData", "probe1Temperature"), - ("wellLog", "customData", "probe2Temperature"), - ("wellLog", "customData", "referenceProbeVoltage"), - ("wellLog", "customData", "probe1Voltage"), - ("wellLog", "customData", "probe2Voltage"), - ( - "wellLog", - "customData", - "UserConfiguration", - "ChannelConfiguration", - "AcquisitionConfiguration", - "AcquisitionTime", - "userAcquisitionTimeFW", - ), - ] + filepathlist_temp = [f.replace("RawDTS", "Temp") for f in filepathlist_dts] - if double_ended_flag: - timeseries_loc_in_hierarchy.append( - ( - "wellLog", - "customData", - "UserConfiguration", - "ChannelConfiguration", - "AcquisitionConfiguration", - "AcquisitionTime", - "userAcquisitionTimeBW", + for ii, fname in enumerate(filepathlist_dts): + # Check if corresponding temperature file exists + if not os.path.isfile(filepathlist_temp[ii]): + raise FileNotFoundError( + "Could not find BinaryTemp " + "file corresponding to {}".format(fname) ) - ) - - timeseries = { - item[-1]: dict(loc=item, array=np.zeros(ntime, dtype=np.float32)) - for item in timeseries_loc_in_hierarchy - } - - # add units to timeseries (unit of measurement) - for key, item in timeseries.items(): - if f"customData:{key}:uom" in attrs: - item["uom"] = attrs[f"customData:{key}:uom"] - else: - item["uom"] = "" - - # Gather data - arr_path = "s:" + "/s:".join(["wellLog", "logData", "data"]) - - @dask.delayed - def grab_data_per_file(file_handle): - """ - - Parameters - ---------- - file_handle - - Returns - ------- - - """ - with open_file(file_handle, mode="r") as f_h: - eltree = ElementTree.parse(f_h) - arr_el = eltree.findall(arr_path, namespaces=ns) - - if not len(arr_el) == nx: - raise ValueError( - "Inconsistent length of x-dimension" - + "\nCheck if files are mixed up, or if the number of " - + "data points vary per file." 
- ) - - # remove the breaks on both sides of the string - # split the string on the comma - arr_str = [arr_eli.text.split(",") for arr_eli in arr_el] - return np.array(arr_str, dtype=float) - - data_lst_dly = [grab_data_per_file(fp) for fp in filepathlist] - data_lst = [ - da.from_delayed(x, shape=(nx, nitem), dtype=float) for x in data_lst_dly - ] - data_arr = da.stack(data_lst).T # .compute() - - # Check whether to compute data_arr (if possible 25% faster) - data_arr_cnk = data_arr.rechunk({0: -1, 1: -1, 2: "auto"}) - if load_in_memory == "auto" and data_arr_cnk.npartitions <= 5: - if not silent: - print("Reading the data from disk") - data_arr = data_arr_cnk.compute() - elif load_in_memory: - if not silent: - print("Reading the data from disk") - data_arr = data_arr_cnk.compute() - else: - if not silent: - print("Not reading the data from disk") - data_arr = data_arr_cnk - - data_vars = {} - for name, data_arri in zip(data_item_names, data_arr): - if name == "LAF": - continue - if tld[name] in dim_attrs: - data_vars[tld[name]] = (["x", "time"], data_arri, dim_attrs[tld[name]]) + version = sensortran_binary_version_check(filepathlist_dts) - else: - raise ValueError("Dont know what to do with the" " {name} data column") - - # Obtaining the timeseries data (reference temperature etc) - _ts_dtype = [(k, np.float32) for k in timeseries] - _time_dtype = [ - ("filename_tstamp", np.int64), - ("minDateTimeIndex", "= 1 + ), "No measurement files found in provided " "directory: \n" + str(directory) - double_ended_flag = bool(int(attrs["isDoubleEnded"])) - - attrs["forwardMeasurementChannel"] = meta["forward channel"][-1] - if double_ended_flag: - attrs["backwardMeasurementChannel"] = "N/A" - else: - attrs["backwardMeasurementChannel"] = meta["reverse channel"][-1] - - # obtain basic data info - nx = data["x"].size - - ntime = len(filepathlist) + # Make sure that the list of files contains any files + assert len(filepathlist) >= 1, ( + "No measurement files found in provided " "list/directory" + ) - # chFW = int(attrs['forwardMeasurementChannel']) - 1 # zero-based - # if double_ended_flag: - # chBW = int(attrs['backwardMeasurementChannel']) - 1 # zero-based - # else: - # # no backward channel is negative value. writes better to netcdf - # chBW = -1 + device = apsensing_xml_version_check(filepathlist) - # print summary - if not silent: - print("%s files were found," % ntime + " each representing a single timestep") - print("Recorded at %s points along the cable" % nx) + valid_devices = ["CP320"] - if double_ended_flag: - print("The measurement is double ended") - else: - print("The measurement is single ended") - - # Gather data - # x has already been read. should not change over time - xraw = data["x"] - - # Define all variables - referenceTemperature = np.zeros(ntime) - probe1temperature = np.zeros(ntime) - probe2temperature = np.zeros(ntime) - gamma_ddf = np.zeros(ntime) - k_internal = np.zeros(ntime) - k_external = np.zeros(ntime) - acquisitiontimeFW = np.zeros(ntime) - acquisitiontimeBW = np.zeros(ntime) - - timestamp = [""] * ntime - ST = np.zeros((nx, ntime)) - AST = np.zeros((nx, ntime)) - TMP = np.zeros((nx, ntime)) - - if double_ended_flag: - REV_ST = np.zeros((nx, ntime)) - REV_AST = np.zeros((nx, ntime)) - - for ii in range(ntime): - data, meta = read_sensornet_single(filepathlist[ii]) - - timestamp[ii] = pd.DatetimeIndex([meta["date"] + " " + meta["time"]])[0] - probe1temperature[ii] = float(meta["T ext. ref 1 (°C)"]) - probe2temperature[ii] = float(meta["T ext. 
ref 2 (°C)"]) - referenceTemperature[ii] = float(meta["T internal ref (°C)"]) - gamma_ddf[ii] = float(meta["gamma"]) - k_internal[ii] = float(meta["k internal"]) - k_external[ii] = float(meta["k external"]) - acquisitiontimeFW[ii] = float(meta["forward acquisition time"]) - acquisitiontimeBW[ii] = float(meta["reverse acquisition time"]) - - ST[:, ii] = data["st"] - AST[:, ii] = data["ast"] - TMP[:, ii] = data["tmp"] - - if double_ended_flag: - REV_ST[:, ii] = data["rst"] - REV_AST[:, ii] = data["rast"] - - if fiber_length is None and double_ended_flag: - fiber_length = np.max([0.0, xraw[-1] - add_internal_fiber_length]) - elif fiber_length is None and not double_ended_flag: - fiber_length = xraw[-1] - else: + if device in valid_devices: pass - assert fiber_length > 0.0, ( - "`fiber_length` is not defined. Use key" - "word argument in read function." + str(fiber_length) - ) - - fiber_start_index = (np.abs(xraw + add_internal_fiber_length)).argmin() - fiber_0_index = np.abs(xraw).argmin() - fiber_1_index = (np.abs(xraw - fiber_length)).argmin() - fiber_n_indices = fiber_1_index - fiber_0_index - fiber_n_indices_internal = fiber_0_index - fiber_start_index - if double_ended_flag: - fiber_end_index = np.min([xraw.size, fiber_1_index + fiber_n_indices_internal]) else: - fiber_end_index = fiber_1_index - - if double_ended_flag: - if not flip_reverse_measurements: - # fiber length how the backward channel is aligned - fiber_length_raw = float(meta["fibre end"]) - fiber_bw_1_index = np.abs(xraw - fiber_length_raw).argmin() - fiber_bw_end_index = np.min( - [xraw.size, fiber_bw_1_index + (fiber_end_index - fiber_1_index)] - ) - fiber_bw_start_index = np.max( - [0, fiber_bw_1_index - fiber_n_indices - fiber_n_indices_internal] - ) - - REV_ST = REV_ST[fiber_bw_start_index:fiber_bw_end_index] - REV_AST = REV_AST[fiber_bw_start_index:fiber_bw_end_index] - - else: - # Use the fiber indices from the forward channel - n_indices_internal_left = fiber_0_index - fiber_start_index - n_indices_internal_right = np.max([0, fiber_end_index - fiber_1_index]) - n_indices_internal_shortest = np.min( - [n_indices_internal_left, n_indices_internal_right] - ) - fiber_start_index = fiber_0_index - n_indices_internal_shortest - fiber_end_index = ( - fiber_0_index + fiber_n_indices + n_indices_internal_shortest - ) - REV_ST = REV_ST[fiber_end_index:fiber_start_index:-1] - REV_AST = REV_AST[fiber_end_index:fiber_start_index:-1] - - x = xraw[fiber_start_index:fiber_end_index] - TMP = TMP[fiber_start_index:fiber_end_index] - ST = ST[fiber_start_index:fiber_end_index] - AST = AST[fiber_start_index:fiber_end_index] - - data_vars = { - "st": (["x", "time"], ST, dim_attrs["st"]), - "ast": (["x", "time"], AST, dim_attrs["ast"]), - "tmp": (["x", "time"], TMP, dim_attrs["tmp"]), - "probe1Temperature": ( - "time", - probe1temperature, - { - "name": "Probe 1 temperature", - "description": "reference probe 1 " "temperature", - "units": r"$^\circ$C", - }, - ), - "probe2Temperature": ( - "time", - probe2temperature, - { - "name": "Probe 2 temperature", - "description": "reference probe 2 " "temperature", - "units": r"$^\circ$C", - }, - ), - "referenceTemperature": ( - "time", - referenceTemperature, - { - "name": "reference temperature", - "description": "Internal reference " "temperature", - "units": r"$^\circ$C", - }, - ), - "gamma_ddf": ( - "time", - gamma_ddf, - { - "name": "gamma ddf", - "description": "machine " "calibrated gamma", - "units": "-", - }, - ), - "k_internal": ( - "time", - k_internal, - { - "name": "k internal", - 
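
The fibre slicing in this routine is driven by nearest-neighbour lookups along the raw x coordinate. A compact sketch of that lookup-and-slice pattern for the forward channel only; the function name is illustrative, and the real routine also handles the internal reference section past the fibre end and the (optionally flipped) backward channel:

    import numpy as np

    def slice_forward_channel(xraw, st, fiber_length, add_internal_fiber_length):
        """Keep the samples between the internal reference section and the fibre end."""
        # nearest index to a physical position: minimise |x - target|
        fiber_start_index = np.abs(xraw + add_internal_fiber_length).argmin()
        fiber_1_index = np.abs(xraw - fiber_length).argmin()
        return xraw[fiber_start_index:fiber_1_index], st[fiber_start_index:fiber_1_index]

For double-ended measurements the backward Stokes arrays are sliced with a reversed index range (REV_ST[fiber_end_index:fiber_start_index:-1]) when flip_reverse_measurements is set, as in the surrounding code.
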
"description": "machine calibrated " "internal k", - "units": "-", - }, - ), - "k_external": ( - "time", - k_external, - { - "name": "reference temperature", - "description": "machine calibrated " "external k", - "units": "-", - }, - ), - "userAcquisitionTimeFW": ( - "time", - acquisitiontimeFW, - dim_attrs["userAcquisitionTimeFW"], - ), - "userAcquisitionTimeBW": ( - "time", - acquisitiontimeBW, - dim_attrs["userAcquisitionTimeBW"], - ), - } - - if double_ended_flag: - data_vars["rst"] = (["x", "time"], REV_ST, dim_attrs["rst"]) - data_vars["rast"] = (["x", "time"], REV_AST, dim_attrs["rast"]) - - filenamelist = [os.path.split(f)[-1] for f in filepathlist] - - coords = {"x": ("x", x, dim_attrs["x"]), "filename": ("time", filenamelist)} - - dtFW = data_vars["userAcquisitionTimeFW"][1].astype("timedelta64[s]") - dtBW = data_vars["userAcquisitionTimeBW"][1].astype("timedelta64[s]") - if not double_ended_flag: - tcoords = coords_time( - np.array(timestamp).astype("datetime64[ns]"), - timezone_netcdf=timezone_netcdf, - timezone_input_files=timezone_input_files, - dtFW=dtFW, - double_ended_flag=double_ended_flag, + warnings.warn( + "AP sensing device " + '"{0}"'.format(device) + + " has not been tested.\nPlease open an issue on github" + + " and provide an example file" ) - else: - tcoords = coords_time( - np.array(timestamp).astype("datetime64[ns]"), - timezone_netcdf=timezone_netcdf, - timezone_input_files=timezone_input_files, - dtFW=dtFW, - dtBW=dtBW, - double_ended_flag=double_ended_flag, - ) - - coords.update(tcoords) - - return data_vars, coords, attrs - -def read_silixa_attrs_singlefile(filename, sep): - """ - - Parameters - ---------- - filename - sep - - Returns - ------- - - """ - import xmltodict - - def metakey(meta, dict_to_parse, prefix): - """ - Fills the metadata dictionairy with data from dict_to_parse. - The dict_to_parse is the raw data from a silixa xml-file. - dict_to_parse is a nested dictionary to represent the - different levels of hierarchy. For example, - toplevel = {lowlevel: {key: value}}. - This function returns {'toplevel:lowlevel:key': value}. - Where prefix is the flattened hierarchy. - - Parameters - ---------- - meta : dict - the output dictionairy with prcessed metadata - dict_to_parse : dict - prefix - - Returns - ------- - - """ - - for key in dict_to_parse: - if prefix == "": - prefix_parse = key.replace("@", "") - else: - prefix_parse = sep.join([prefix, key.replace("@", "")]) - - if prefix_parse == sep.join(("logData", "data")): - # skip the LAF , ST data - continue - - if hasattr(dict_to_parse[key], "keys"): - # Nested dictionaries, flatten hierarchy. - meta.update(metakey(meta, dict_to_parse[key], prefix_parse)) - - elif isinstance(dict_to_parse[key], list): - # if the key has values for the multiple channels - for ival, val in enumerate(dict_to_parse[key]): - num_key = prefix_parse + "_" + str(ival) - meta.update(metakey(meta, val, num_key)) - else: - meta[prefix_parse] = dict_to_parse[key] - - return meta - - with open_file(filename) as fh: - doc_ = xmltodict.parse(fh.read()) - - if "wellLogs" in doc_.keys(): - doc = doc_["wellLogs"]["wellLog"] - else: - doc = doc_["logs"]["log"] - - return metakey({}, doc, "") - - -def read_sensortran_files_routine( - filepathlist_dts, filepathlist_temp, timezone_netcdf="UTC", silent=False -): - """ - Internal routine that reads sensortran files. - Use dtscalibration.read_sensortran_files function instead. 
- - The sensortran files are in UTC time - - Parameters - ---------- - filepathlist_dts - filepathlist_temp - timezone_netcdf - silent - - Returns - ------- - - """ - - # Obtain metadata from the first file - data_dts, meta_dts = read_sensortran_single(filepathlist_dts[0]) - data_temp, meta_temp = read_sensortran_single(filepathlist_temp[0]) - - attrs = meta_dts - - # Add standardised required attributes - attrs["isDoubleEnded"] = "0" - - attrs["forwardMeasurementChannel"] = meta_dts["channel_id"] - 1 - attrs["backwardMeasurementChannel"] = "N/A" - - # obtain basic data info - nx = meta_temp["num_points"] - - ntime = len(filepathlist_dts) - - # print summary - if not silent: - print("%s files were found," % ntime + " each representing a single timestep") - print("Recorded at %s points along the cable" % nx) - - print("The measurement is single ended") - - # Gather data - # x has already been read. should not change over time - x = data_temp["x"] - - # Define all variables - referenceTemperature = np.zeros(ntime) - acquisitiontimeFW = np.ones(ntime) - - timestamp = [""] * ntime - ST = np.zeros((nx, ntime), dtype=np.int32) - AST = np.zeros((nx, ntime), dtype=np.int32) - TMP = np.zeros((nx, ntime)) - - ST_zero = np.zeros((ntime)) - AST_zero = np.zeros((ntime)) - - for ii in range(ntime): - data_dts, meta_dts = read_sensortran_single(filepathlist_dts[ii]) - data_temp, meta_temp = read_sensortran_single(filepathlist_temp[ii]) - - timestamp[ii] = data_dts["time"] - - referenceTemperature[ii] = data_temp["reference_temperature"] - 273.15 - - ST[:, ii] = data_dts["st"][:nx] - AST[:, ii] = data_dts["ast"][:nx] - # The TMP can vary by 1 or 2 datapoints, dynamically assign the values - TMP[: meta_temp["num_points"], ii] = data_temp["tmp"][:nx] - - zero_index = (meta_dts["num_points"] - nx) // 2 - ST_zero[ii] = np.mean(data_dts["st"][nx + zero_index :]) - AST_zero[ii] = np.mean(data_dts["ast"][nx + zero_index :]) - - data_vars = { - "st": (["x", "time"], ST, dim_attrs["st"]), - "ast": (["x", "time"], AST, dim_attrs["ast"]), - "tmp": ( - ["x", "time"], - TMP, - { - "name": "tmp", - "description": "Temperature calibrated by device", - "units": meta_temp["y_units"], - }, - ), - "referenceTemperature": ( - "time", - referenceTemperature, - { - "name": "reference temperature", - "description": "Internal reference " "temperature", - "units": r"$^\circ$C", - }, - ), - "st_zero": ( - ["time"], - ST_zero, - { - "name": "ST_zero", - "description": "Stokes zero count", - "units": meta_dts["y_units"], - }, - ), - "ast_zero": ( - ["time"], - AST_zero, - { - "name": "AST_zero", - "description": "anit-Stokes zero count", - "units": meta_dts["y_units"], - }, - ), - "userAcquisitionTimeFW": ( - "time", - acquisitiontimeFW, - dim_attrs["userAcquisitionTimeFW"], - ), - } - - filenamelist_dts = [os.path.split(f)[-1] for f in filepathlist_dts] - filenamelist_temp = [os.path.split(f)[-1] for f in filepathlist_temp] - - coords = { - "x": ( - "x", - x, - { - "name": "distance", - "description": "Length along fiber", - "long_description": "Starting at connector " + "of forward channel", - "units": "m", - }, - ), - "filename": ("time", filenamelist_dts), - "filename_temp": ("time", filenamelist_temp), - } - - dtFW = data_vars["userAcquisitionTimeFW"][1].astype("timedelta64[s]") - - tcoords = coords_time( - np.array(timestamp).astype("datetime64[ns]"), + data_vars, coords, attrs = read_apsensing_files_routine( + filepathlist, timezone_netcdf=timezone_netcdf, - timezone_input_files="UTC", - dtFW=dtFW, - double_ended_flag=False, 
+ timezone_input_files=timezone_input_files, + silent=silent, + load_in_memory=load_in_memory, ) - coords.update(tcoords) + ds = DataStore(data_vars=data_vars, coords=coords, attrs=attrs, **kwargs) + return ds - return data_vars, coords, attrs - -def read_sensortran_single(fname): - """ - Internal routine that reads a single sensortran file. - Use dtscalibration.read_sensortran_files function instead. - - Parameters - ---------- - fname - - Returns - ------- - data, metadata - """ - import struct - from datetime import datetime - - meta = {} - data = {} - with open(fname, "rb") as f: - meta["survey_type"] = struct.unpack("= 1, msg + + # sort based on dates in filesname. A simple sorted() is not sufficient + # as month folders do not sort well + basenames = [os.path.basename(fp) for fp in filepathlist_unsorted] + dates = ["".join(bn.split(" ")[2:4]) for bn in basenames] + i_sort = np.argsort(dates) + filepathlist = [filepathlist_unsorted[i] for i in i_sort] + + # Check measurements are all from same channel + chno = [bn.split(" ")[1] for bn in basenames] + assert ( + len(set(chno)) == 1 + ), "Folder contains measurements from multiple channels" + + # Make sure that the list of files contains any files + assert len(filepathlist) >= 1, ( + "No measurement files found in provided " "list/directory" ) - logdata_tree = logtree.find("./{0}logData".format(namespace)) - - # Amount of datapoints is the size of the logdata tree - nx = len(logdata_tree) - sep = ":" - ns = {"s": namespace[1:-1]} - - # Obtain metadata from the first file - attrs, skip_chars = read_apsensing_attrs_singlefile(filepathlist[0], sep) - - # Add standardised required attributes - # No example of DE file available - attrs["isDoubleEnded"] = "0" - double_ended_flag = bool(int(attrs["isDoubleEnded"])) - - attrs["forwardMeasurementChannel"] = attrs[ - "wellbore:dtsMeasurementSet:dtsMeasurement:connectedToFiber:uidRef" - ] - attrs["backwardMeasurementChannel"] = "N/A" + ddf_version = sensornet_ddf_version_check(filepathlist) - data_item_names = [ - attrs["wellbore:wellLogSet:wellLog:logCurveInfo_{0}:mnemonic".format(x)] - for x in range(0, 4) + valid_versions = [ + "Halo DTS v1*", + "ORYX F/W v1.02 Oryx Data Collector v3*", + "ORYX F/W v4.00 Oryx Data Collector v3*", + "Sentinel DTS v5*", ] - nitem = len(data_item_names) + valid = any([fnmatch.fnmatch(ddf_version, v_) for v_ in valid_versions]) - ntime = len(filepathlist) - - # print summary - if not silent: - print("%s files were found, each representing a single timestep" % ntime) - print("%s recorded vars were found: " % nitem + ", ".join(data_item_names)) - print("Recorded at %s points along the cable" % nx) - - if double_ended_flag: - print("The measurement is double ended") + if valid: + if fnmatch.fnmatch(ddf_version, "Halo DTS v1*"): + flip_reverse_measurements = True + elif fnmatch.fnmatch(ddf_version, "Sentinel DTS v5*"): + flip_reverse_measurements = True else: - print("The measurement is single ended") - - # Gather data - arr_path = "s:" + "/s:".join( - [ - "wellSet", - "well", - "wellboreSet", - "wellbore", - "wellLogSet", - "wellLog", - "logData", - "data", - ] - ) - - @dask.delayed - def grab_data_per_file(file_handle): - """ - - Parameters - ---------- - file_handle - - Returns - ------- - - """ - - with open_file(file_handle, mode="r") as f_h: - if skip_chars: - f_h.read(3) - eltree = ElementTree.parse(f_h) - arr_el = eltree.findall(arr_path, namespaces=ns) - - if not len(arr_el) == nx: - raise ValueError( - "Inconsistent length of x-dimension" - + "\nCheck if 
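
Sorting the sensornet files chronologically by the date embedded in the filename, and requiring that they all come from one channel, takes only a little numpy. A sketch assuming the space-separated filename layout implied by the code (field 1 is the channel, fields 2 and 3 the date); sort_sensornet_paths is an illustrative name:

    import os
    import numpy as np

    def sort_sensornet_paths(filepathlist_unsorted):
        """Sort measurement files chronologically using date fields in the filename."""
        basenames = [os.path.basename(fp) for fp in filepathlist_unsorted]
        # a plain sorted() is not sufficient because month folders do not sort well
        dates = ["".join(bn.split(" ")[2:4]) for bn in basenames]
        filepathlist = [filepathlist_unsorted[i] for i in np.argsort(dates)]

        # all files must come from the same measurement channel
        channels = {bn.split(" ")[1] for bn in basenames}
        assert len(channels) == 1, "Folder contains measurements from multiple channels"
        return filepathlist
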
files are mixed up, or if the number of " - + "data points vary per file." - ) - - # remove the breaks on both sides of the string - # split the string on the comma - arr_str = [arr_eli.text.split(",") for arr_eli in arr_el] - return np.array(arr_str, dtype=float) + flip_reverse_measurements = False - data_lst_dly = [grab_data_per_file(fp) for fp in filepathlist] - - data_lst = [ - da.from_delayed(x, shape=(nx, nitem), dtype=float) for x in data_lst_dly - ] - data_arr = da.stack(data_lst).T # .compute() - - # Check whether to compute data_arr (if possible 25% faster) - data_arr_cnk = data_arr.rechunk({0: -1, 1: -1, 2: "auto"}) - if load_in_memory == "auto" and data_arr_cnk.npartitions <= 5: - if not silent: - print("Reading the data from disk") - data_arr = data_arr_cnk.compute() - elif load_in_memory: - if not silent: - print("Reading the data from disk") - data_arr = data_arr_cnk.compute() else: - if not silent: - print("Not reading the data from disk") - data_arr = data_arr_cnk - - data_vars = {} - for name, data_arri in zip(data_item_names, data_arr): - if name == "LAF": - continue - - if tld[name] in dim_attrs_apsensing: - data_vars[tld[name]] = ( - ["x", "time"], - data_arri, - dim_attrs_apsensing[tld[name]], - ) - elif name in dim_attrs_apsensing: - data_vars[tld[name]] = (["x", "time"], data_arri, dim_attrs_apsensing[name]) - else: - raise ValueError( - "Dont know what to do with the" + " {} data column".format(name) - ) - - _time_dtype = [("filename_tstamp", np.int64), ("acquisitionTime", " 0.0, ( + "`fiber_length` is not defined. Use key" + "word argument in read function." + str(fiber_length) + ) + + fiber_start_index = (np.abs(xraw + add_internal_fiber_length)).argmin() + fiber_0_index = np.abs(xraw).argmin() + fiber_1_index = (np.abs(xraw - fiber_length)).argmin() + fiber_n_indices = fiber_1_index - fiber_0_index + fiber_n_indices_internal = fiber_0_index - fiber_start_index + if double_ended_flag: + fiber_end_index = np.min([xraw.size, fiber_1_index + fiber_n_indices_internal]) + else: + fiber_end_index = fiber_1_index + + if double_ended_flag: + if not flip_reverse_measurements: + # fiber length how the backward channel is aligned + fiber_length_raw = float(meta["fibre end"]) + fiber_bw_1_index = np.abs(xraw - fiber_length_raw).argmin() + fiber_bw_end_index = np.min( + [xraw.size, fiber_bw_1_index + (fiber_end_index - fiber_1_index)] + ) + fiber_bw_start_index = np.max( + [0, fiber_bw_1_index - fiber_n_indices - fiber_n_indices_internal] + ) + + REV_ST = REV_ST[fiber_bw_start_index:fiber_bw_end_index] + REV_AST = REV_AST[fiber_bw_start_index:fiber_bw_end_index] + + else: + # Use the fiber indices from the forward channel + n_indices_internal_left = fiber_0_index - fiber_start_index + n_indices_internal_right = np.max([0, fiber_end_index - fiber_1_index]) + n_indices_internal_shortest = np.min( + [n_indices_internal_left, n_indices_internal_right] + ) + fiber_start_index = fiber_0_index - n_indices_internal_shortest + fiber_end_index = ( + fiber_0_index + fiber_n_indices + n_indices_internal_shortest + ) + REV_ST = REV_ST[fiber_end_index:fiber_start_index:-1] + REV_AST = REV_AST[fiber_end_index:fiber_start_index:-1] + + x = xraw[fiber_start_index:fiber_end_index] + TMP = TMP[fiber_start_index:fiber_end_index] + ST = ST[fiber_start_index:fiber_end_index] + AST = AST[fiber_start_index:fiber_end_index] + + data_vars = { + "st": (["x", "time"], ST, dim_attrs["st"]), + "ast": (["x", "time"], AST, dim_attrs["ast"]), + "tmp": (["x", "time"], TMP, dim_attrs["tmp"]), + 
"probe1Temperature": ( + "time", + probe1temperature, + { + "name": "Probe 1 temperature", + "description": "reference probe 1 " "temperature", + "units": r"$^\circ$C", + }, + ), + "probe2Temperature": ( + "time", + probe2temperature, + { + "name": "Probe 2 temperature", + "description": "reference probe 2 " "temperature", + "units": r"$^\circ$C", + }, + ), + "referenceTemperature": ( + "time", + referenceTemperature, + { + "name": "reference temperature", + "description": "Internal reference " "temperature", + "units": r"$^\circ$C", + }, + ), + "gamma_ddf": ( + "time", + gamma_ddf, + { + "name": "gamma ddf", + "description": "machine " "calibrated gamma", + "units": "-", + }, + ), + "k_internal": ( + "time", + k_internal, + { + "name": "k internal", + "description": "machine calibrated " "internal k", + "units": "-", + }, + ), + "k_external": ( + "time", + k_external, + { + "name": "reference temperature", + "description": "machine calibrated " "external k", + "units": "-", + }, + ), + "userAcquisitionTimeFW": ( + "time", + acquisitiontimeFW, + dim_attrs["userAcquisitionTimeFW"], + ), + "userAcquisitionTimeBW": ( + "time", + acquisitiontimeBW, + dim_attrs["userAcquisitionTimeBW"], + ), + } + + if double_ended_flag: + data_vars["rst"] = (["x", "time"], REV_ST, dim_attrs["rst"]) + data_vars["rast"] = (["x", "time"], REV_AST, dim_attrs["rast"]) + + filenamelist = [os.path.split(f)[-1] for f in filepathlist] + + coords = {"x": ("x", x, dim_attrs["x"]), "filename": ("time", filenamelist)} + + dtFW = data_vars["userAcquisitionTimeFW"][1].astype("timedelta64[s]") + dtBW = data_vars["userAcquisitionTimeBW"][1].astype("timedelta64[s]") + if not double_ended_flag: + tcoords = coords_time( + np.array(timestamp).astype("datetime64[ns]"), + timezone_netcdf=timezone_netcdf, + timezone_input_files=timezone_input_files, + dtFW=dtFW, + double_ended_flag=double_ended_flag, + ) + else: + tcoords = coords_time( + np.array(timestamp).astype("datetime64[ns]"), + timezone_netcdf=timezone_netcdf, + timezone_input_files=timezone_input_files, + dtFW=dtFW, + dtBW=dtBW, + double_ended_flag=double_ended_flag, + ) + + coords.update(tcoords) + + return data_vars, coords, attrs + + +def read_silixa_attrs_singlefile(filename, sep): + """ + + Parameters + ---------- + filename + sep + + Returns + ------- + + """ + import xmltodict + + def metakey(meta, dict_to_parse, prefix): + """ + Fills the metadata dictionairy with data from dict_to_parse. + The dict_to_parse is the raw data from a silixa xml-file. + dict_to_parse is a nested dictionary to represent the + different levels of hierarchy. For example, + toplevel = {lowlevel: {key: value}}. + This function returns {'toplevel:lowlevel:key': value}. + Where prefix is the flattened hierarchy. + + Parameters + ---------- + meta : dict + the output dictionairy with prcessed metadata + dict_to_parse : dict + prefix + + Returns + ------- + + """ + + for key in dict_to_parse: + if prefix == "": + prefix_parse = key.replace("@", "") + else: + prefix_parse = sep.join([prefix, key.replace("@", "")]) + + if prefix_parse == sep.join(("logData", "data")): + # skip the LAF , ST data + continue + + if hasattr(dict_to_parse[key], "keys"): + # Nested dictionaries, flatten hierarchy. 
+ meta.update(metakey(meta, dict_to_parse[key], prefix_parse)) + + elif isinstance(dict_to_parse[key], list): + # if the key has values for the multiple channels + for ival, val in enumerate(dict_to_parse[key]): + num_key = prefix_parse + "_" + str(ival) + meta.update(metakey(meta, val, num_key)) + else: + meta[prefix_parse] = dict_to_parse[key] + + return meta + + with open_file(filename) as fh: + doc_ = xmltodict.parse(fh.read()) + + if "wellLogs" in doc_.keys(): + doc = doc_["wellLogs"]["wellLog"] + else: + doc = doc_["logs"]["log"] + + return metakey({}, doc, "") + + +def read_sensortran_files_routine( + filepathlist_dts, + filepathlist_temp, + timezone_input_files="UTC", + timezone_netcdf="UTC", + silent=False, +): + """ + Internal routine that reads sensortran files. + Use dtscalibration.read_sensortran_files function instead. + + The sensortran files are in UTC time + + Parameters + ---------- + filepathlist_dts + filepathlist_temp + timezone_netcdf + silent + + Returns + ------- + + """ + assert timezone_input_files == "UTC", "The sensortran files are always in UTC time." + + # Obtain metadata from the first file + data_dts, meta_dts = read_sensortran_single(filepathlist_dts[0]) + data_temp, meta_temp = read_sensortran_single(filepathlist_temp[0]) + + attrs = meta_dts + + # Add standardised required attributes + attrs["isDoubleEnded"] = "0" + + attrs["forwardMeasurementChannel"] = meta_dts["channel_id"] - 1 + attrs["backwardMeasurementChannel"] = "N/A" + + # obtain basic data info + nx = meta_temp["num_points"] + + ntime = len(filepathlist_dts) + + # print summary + if not silent: + print("%s files were found," % ntime + " each representing a single timestep") + print("Recorded at %s points along the cable" % nx) + + print("The measurement is single ended") + + # Gather data + # x has already been read. 
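
The per-timestep loop of the sensortran routine (visible in full both in the removed copy above and in the re-added one below) converts the reference temperature from kelvin and averages the trailing samples into a Stokes zero count. A sketch of one timestep, assuming the dictionary keys used by read_sensortran_single; the helper name is illustrative:

    import numpy as np

    def sensortran_timestep(data_dts, meta_dts, data_temp, nx):
        """Extract raw counts, reference temperature and zero levels for one file pair."""
        reference_celsius = data_temp["reference_temperature"] - 273.15  # kelvin -> degC
        st = data_dts["st"][:nx]
        ast = data_dts["ast"][:nx]
        # average the samples past the temperature trace as the zero count
        zero_index = (meta_dts["num_points"] - nx) // 2
        st_zero = np.mean(data_dts["st"][nx + zero_index:])
        ast_zero = np.mean(data_dts["ast"][nx + zero_index:])
        return reference_celsius, st, ast, st_zero, ast_zero
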
should not change over time + x = data_temp["x"] + + # Define all variables + referenceTemperature = np.zeros(ntime) + acquisitiontimeFW = np.ones(ntime) + + timestamp = [""] * ntime + ST = np.zeros((nx, ntime), dtype=np.int32) + AST = np.zeros((nx, ntime), dtype=np.int32) + TMP = np.zeros((nx, ntime)) + + ST_zero = np.zeros((ntime)) + AST_zero = np.zeros((ntime)) + + for ii in range(ntime): + data_dts, meta_dts = read_sensortran_single(filepathlist_dts[ii]) + data_temp, meta_temp = read_sensortran_single(filepathlist_temp[ii]) + + timestamp[ii] = data_dts["time"] + + referenceTemperature[ii] = data_temp["reference_temperature"] - 273.15 + + ST[:, ii] = data_dts["st"][:nx] + AST[:, ii] = data_dts["ast"][:nx] + # The TMP can vary by 1 or 2 datapoints, dynamically assign the values + TMP[: meta_temp["num_points"], ii] = data_temp["tmp"][:nx] + + zero_index = (meta_dts["num_points"] - nx) // 2 + ST_zero[ii] = np.mean(data_dts["st"][nx + zero_index :]) + AST_zero[ii] = np.mean(data_dts["ast"][nx + zero_index :]) + + data_vars = { + "st": (["x", "time"], ST, dim_attrs["st"]), + "ast": (["x", "time"], AST, dim_attrs["ast"]), + "tmp": ( + ["x", "time"], + TMP, + { + "name": "tmp", + "description": "Temperature calibrated by device", + "units": meta_temp["y_units"], + }, + ), + "referenceTemperature": ( + "time", + referenceTemperature, + { + "name": "reference temperature", + "description": "Internal reference " "temperature", + "units": r"$^\circ$C", + }, + ), + "st_zero": ( + ["time"], + ST_zero, + { + "name": "ST_zero", + "description": "Stokes zero count", + "units": meta_dts["y_units"], + }, + ), + "ast_zero": ( + ["time"], + AST_zero, + { + "name": "AST_zero", + "description": "anit-Stokes zero count", + "units": meta_dts["y_units"], + }, + ), + "userAcquisitionTimeFW": ( + "time", + acquisitiontimeFW, + dim_attrs["userAcquisitionTimeFW"], + ), + } + + filenamelist_dts = [os.path.split(f)[-1] for f in filepathlist_dts] + filenamelist_temp = [os.path.split(f)[-1] for f in filepathlist_temp] + + coords = { + "x": ( + "x", + x, + { + "name": "distance", + "description": "Length along fiber", + "long_description": "Starting at connector " + "of forward channel", + "units": "m", + }, + ), + "filename": ("time", filenamelist_dts), + "filename_temp": ("time", filenamelist_temp), + } + + dtFW = data_vars["userAcquisitionTimeFW"][1].astype("timedelta64[s]") + + tcoords = coords_time( + np.array(timestamp).astype("datetime64[ns]"), + timezone_netcdf=timezone_netcdf, + timezone_input_files="UTC", + dtFW=dtFW, + double_ended_flag=False, + ) + + coords.update(tcoords) + + return data_vars, coords, attrs + + +def read_sensortran_single(fname): + """ + Internal routine that reads a single sensortran file. + Use dtscalibration.read_sensortran_files function instead. + + Parameters + ---------- + fname + + Returns + ------- + data, metadata + """ + import struct + from datetime import datetime + + meta = {} + data = {} + with open(fname, "rb") as f: + meta["survey_type"] = struct.unpack("