diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f08ec8d3f..a3cfcd644 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ['3.8', '3.9'] + python-version: ['3.9', '3.10'] steps: - uses: actions/checkout@v2 diff --git a/binder/requirements.txt b/binder/requirements.txt index b913cb3e4..9ee80ec8d 100644 --- a/binder/requirements.txt +++ b/binder/requirements.txt @@ -1,4 +1,4 @@ --r ../readthedocs-requirements.txt +scikit-fda[docs] jupytext sphinx-gallery<=0.7.0 . \ No newline at end of file diff --git a/docs/_static/switcher.json b/docs/_static/switcher.json new file mode 100644 index 000000000..f4e362095 --- /dev/null +++ b/docs/_static/switcher.json @@ -0,0 +1,13 @@ +[ + { + "name": "dev", + "version": "dev", + "url": "https://fda.readthedocs.io/en/latest/" + }, + { + "name": "0.9 (stable)", + "version": "stable", + "url": "https://fda.readthedocs.io/en/stable/", + "preferred": true + } +] \ No newline at end of file diff --git a/docs/_templates/autosummary/class.rst b/docs/_templates/autosummary/class.rst index fede4ca4e..d1f44b52e 100644 --- a/docs/_templates/autosummary/class.rst +++ b/docs/_templates/autosummary/class.rst @@ -10,12 +10,16 @@ .. autosummary:: {% for item in methods %} + {% if item != "__init__" %} ~{{ name }}.{{ item }} + {% endif %} {%- endfor %} {% endif %} {% for item in methods %} + {% if item != "__init__" %} .. automethod:: {{ item }} + {% endif %} {%- endfor %} {% endblock %} diff --git a/docs/conf.py b/docs/conf.py index 80583b65e..e25650181 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -46,8 +46,16 @@ "Universidad Autónoma de Madrid" ) github_url = "https://github.com/GAA-UAM/scikit-fda" -rtd_version = os.environ.get("READTHEDOCS_VERSION", "latest") -branch = "master" if rtd_version == "stable" else "develop" +rtd_version = os.environ.get("READTHEDOCS_VERSION") +rtd_version_type = os.environ.get("READTHEDOCS_VERSION_TYPE") + +switcher_version = rtd_version +if switcher_version == "latest": + switcher_version = "dev" +elif rtd_version_type not in {"branch", "tag"}: + switcher_version = skfda.__version__ + +rtd_branch = os.environ.get(" READTHEDOCS_GIT_IDENTIFIER", "develop") language = "en" try: @@ -112,6 +120,14 @@ html_theme_options = { "use_edit_page_button": True, "github_url": github_url, + "switcher": { + "json_url": ( + "https://fda.readthedocs.io/en/latest/_static/switcher.json" + ), + "version_match": switcher_version, + }, + "show_version_warning_banner": True, + "navbar_start": ["navbar-logo", "version-switcher"], "icon_links": [ { "name": "PyPI", @@ -289,7 +305,7 @@ def linkcode_resolve(domain: str, info: Mapping[str, str]) -> str | None: else: linespec = "" - return f"{github_url}/tree/{branch}/skfda/{fn}{linespec}" + return f"{github_url}/tree/{rtd_branch}/skfda/{fn}{linespec}" # -- Options for "sphinx.ext.mathjax" -- @@ -391,7 +407,7 @@ def __repr__(self) -> str: "binder": { "org": "GAA-UAM", "repo": "scikit-fda", - "branch": branch, + "branch": rtd_branch, "binderhub_url": "https://mybinder.org", "dependencies": ["../binder/requirements.txt"], "notebooks_dir": "../examples", diff --git a/docs/index.rst b/docs/index.rst index 55cbe94dc..b5ee3c933 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -34,7 +34,7 @@ Github you can find more information related to the development of the package. 
:caption: More documentation apilist - glossary + Glossary contributors An exhaustive list of all the contents of the package can be found in the diff --git a/docs/modules/exploratory/stats.rst b/docs/modules/exploratory/stats.rst index 17642a732..b55b05fe6 100644 --- a/docs/modules/exploratory/stats.rst +++ b/docs/modules/exploratory/stats.rst @@ -31,4 +31,5 @@ statistics can be used. skfda.exploratory.stats.cov skfda.exploratory.stats.var + skfda.exploratory.stats.std diff --git a/docs/modules/ml/regression.rst b/docs/modules/ml/regression.rst index 539d38ec2..b3e224e78 100644 --- a/docs/modules/ml/regression.rst +++ b/docs/modules/ml/regression.rst @@ -57,4 +57,17 @@ regression is fitted using the coefficients of the functions in said basis. .. autosummary:: :toctree: autosummary - skfda.ml.regression.FPCARegression \ No newline at end of file + skfda.ml.regression.FPCARegression + +FPLS regression +----------------- +This module includes the implementation of FPLS (Functional Partial Least Squares) +regression. This implementation accepts either functional or multivariate data as the regressor and the response. +FPLS regression consists in performing the FPLS dimensionality reduction algorithm +but using a regression deflation strategy. + + +.. autosummary:: + :toctree: autosummary + + skfda.ml.regression.FPLSRegression \ No newline at end of file diff --git a/docs/modules/preprocessing/dim_reduction.rst b/docs/modules/preprocessing/dim_reduction.rst index e35e5675f..d78d8705c 100644 --- a/docs/modules/preprocessing/dim_reduction.rst +++ b/docs/modules/preprocessing/dim_reduction.rst @@ -36,9 +36,12 @@ Other dimensionality reduction methods construct new features from existing ones. For example, in functional principal component analysis, we project the data samples into a smaller sample of functions that preserve most of the original -variance. +variance. Similarly, in functional partial least squares, we project +the data samples into a smaller sample of functions that preserve most +of the covariance between the two data blocks. .. autosummary:: :toctree: autosummary - skfda.preprocessing.dim_reduction.FPCA \ No newline at end of file + skfda.preprocessing.dim_reduction.FPCA + skfda.preprocessing.dim_reduction.FPLS \ No newline at end of file diff --git a/docs/refs.bib b/docs/refs.bib index 1d415f3c2..a40d1a108 100644 --- a/docs/refs.bib +++ b/docs/refs.bib @@ -115,6 +115,21 @@ @article{berrendero++_2022_functional keywords = {62J12,62R10,Functional data,Kernel methods in statistics,Logistic regression,Mathematics - Statistics Theory,Reproducing kernel Hilbert spaces} } +@article{borggaard+thodberg_1992_optimal, + title = {Optimal Minimal Neural Interpretation of Spectra}, + author = {Borggaard, {\relax Claus}. and Thodberg, Hans Henrik.}, + year = {1992}, + month = mar, + journal = {Analytical Chemistry}, + volume = {64}, + number = {5}, + pages = {545--551}, + issn = {0003-2700}, + doi = {10.1021/ac00029a018}, + url = {https://doi.org/10.1021/ac00029a018}, + urldate = {2018-12-09} +} + @article{cuesta-albertos++_2017_ddgclassifier, title = {The {{DDᴳ}}-Classifier in the Functional Setting}, author = {{Cuesta-Albertos}, J. A. and {Febrero-Bande}, M.
and {Oviedo~de~la~Fuente}, M.}, @@ -310,6 +325,25 @@ @article{gutierrez++_1992_numerical pages={67--77}, year={1992}, publisher={Wiley Online Library} + +@article{hastie++_1995_penalized, + title = {Penalized Discriminant Analysis}, + author = {Hastie, Trevor and Buja, Andreas and Tibshirani, Robert}, + year = {1995}, + month = feb, + journal = {The Annals of Statistics}, + volume = {23}, + number = {1}, + pages = {73--102}, + issn = {0090-5364, 2168-8966}, + doi = {10.1214/aos/1176324456}, + url = {https://projecteuclid.org/euclid.aos/1176324456}, + urldate = {2018-12-09}, + abstract = {Fisher's linear discriminant analysis (LDA) is a popular data-analytic tool for studying the relationship between a set of predictors and a categorical response. In this paper we describe a penalized version of LDA. It is designed for situations in which there are many highly correlated predictors, such as those obtained by discretizing a function, or the grey-scale values of the pixels in a series of images. In cases such as these it is natural, efficient and sometimes essential to impose a spatial smoothness constraint on the coefficients, both for improved prediction performance and interpretability. We cast the classification problem into a regression framework via optimal scoring. Using this, our proposal facilitates the use of any penalized regression technique in the classification setting. The technique is illustrated with examples in speech recognition and handwritten character recognition.}, + langid = {english}, + mrnumber = {MR1331657}, + zmnumber = {0821.62031}, + keywords = {discrimination,regularization,Signal and image classification} } @article{li++_2012_ddclassifier, @@ -366,6 +400,14 @@ @article{marron++_2015_functional keywords = {alignment,dynamic time warping,elastic metric,Fisher–Rao metric,Functional data analysis,registration,warping} } +@misc{meteorologicalstateagencyofspainaemet_2009_meteorological, + title = {Meteorological Data of {{Spanish}} Weather Stations between Years 1980 and 2009.}, + author = {{Meteorological State Agency of Spain (AEMET)}}, + year = {2009}, + url = {http://www.aemet.es/}, + abstract = {The data were obtained from the FTP of AEMET in 2009.} +} + @article{pini++_2018_hotelling, title = {Hotelling's {{T2}} in Separable {{Hilbert}} Spaces}, author = {Pini, Alessia and Stamm, Aymeric and Vantini, Simone}, @@ -383,6 +425,35 @@ @article{pini++_2018_hotelling keywords = {Functional data,High-dimensional data Hotelling’s,Hilbert space,Nonparametric inference,Permutation test} } +@incollection{pintado+romo_2005_depthbased, + title = {Depth-Based Classification for Functional Data}, + shorttitle = {Data {{Depth}}}, + booktitle = {Data {{Depth}}: {{Robust Multivariate Analysis}}, {{Computational Geometry}} and {{Applications}}}, + author = {Pintado, Sara and Romo, Juan}, + year = {2005}, + month = nov, + series = {{{DIMACS Series}} in {{Discrete Mathematics}} and {{Theoretical Computer Science}}}, + volume = {72}, + pages = {103--119}, + publisher = {{American Mathematical Society}}, + doi = {10.1090/dimacs/072/08}, + isbn = {978-0-8218-3596-8} +} + +@inproceedings{ramos-carreno++_2022_scikitfda, + title = {{{scikit-fda}}: {{Computational}} Tools for Machine Learning with Functional Data}, + shorttitle = {Scikit-Fda}, + booktitle = {2022 {{IEEE}} 34th {{International Conference}} on {{Tools}} with {{Artificial Intelligence}} ({{ICTAI}})}, + author = {{Ramos-Carre{\~n}o}, Carlos and Torrecilla, Jos{\'e} Luis and Hong, Yujian and Su{\'a}rez, Alberto}, + year = 
{2022}, + month = oct, + pages = {213--218}, + issn = {2375-0197}, + doi = {10.1109/ICTAI56018.2022.00038}, + abstract = {Machine learning from functional data poses particular challenges that require specific computational tools that take into account their structure. In this work, we present scikit-fda, a Python library for functional data analysis, visualization, preprocessing, and machine learning. The library is designed for smooth integration in the Python scientific ecosystem. In particular, it complements and can be used in combination with scikit-learn, the reference Python library for machine learning. The functionality of scikit-fda is illustrated in clustering, regression, and classification problems from different areas of application.}, + keywords = {Data analysis,data visualization,Data visualization,Documentation,Ecosystems,Extrapolation,functional data analysis,Interpolation,machine learning,Machine learning,Python toolbox} +} + @inbook{ramsay+silverman_2005_functionala, title = {From Functional Data to Smooth Functions}, booktitle = {Functional Data Analysis}, @@ -440,6 +511,24 @@ @inbook{ramsay+silverman_2005_registration keywords = {Multivariate analysis} } +@incollection{romeo+marzoljaen_2014_analisis, + title = {{An\'alisis del viento y la niebla en el aeropuerto de Los Rodeos (Tenerife). Cambios y tendencias}}, + booktitle = {{Cambio clim\'atico y cambio global.}}, + author = {Romeo, V{\'i}ctor M. and Marzol Ja{\'e}n, M{\textordfeminine} Victoria}, + editor = {Fern{\'a}ndez Montes, Sonia and S{\'a}nchez Rodrigo, Fernando}, + year = {2014}, + series = {{Publicaciones de la Asociaci\'on Espa\~nola de Climatolog\'ia. Serie A.}}, + number = {9}, + pages = {325--334}, + publisher = {{Asociaci\'on Espa\~nola de Climatolog\'ia}}, + url = {https://repositorio.aemet.es/handle/20.500.11765/8173}, + urldate = {2022-07-28}, + abstract = {[ES]La particular localizaci\'on del aeropuerto Tenerife Norte-Los Rodeos, en el nordeste de la isla de Tenerife, es la causa de la modificaci\'on de la direcci\'on habitual de los vientos alisios del NE a vientos del NO, acompa\~nados frecuentemente por nubosidad. Esto ocasiona que en un n\'umero significativo de d\'ias al a\~no este aeropuerto est\'e cubierto por la niebla, lo que afecta de una forma muy importante a su operatividad. El objetivo del trabajo es caracterizar su r\'egimen de vientos y de la niebla durante los \'ultimos trece a\~nos (2000-2012) y comprobar si se han producido cambios significativos en ambas variables clim\'aticas con respecto a los periodos normales de 1961-1990 y 1971-2000. 
Los resultados indican que las dos direcciones dominantes del viento, NO y SE, mantienen sus frecuencias no as\'i la niebla que ha aumentado su presencia en los \'ultimos a\~nos.}, + copyright = {Licencia CC: Reconocimiento CC BY}, + isbn = {978-84-16027-69-9}, + langid = {spanish} +} + @article{ruiz-meana++_2003_cariporide, title = {Cariporide Preserves Mitochondrial Proton Gradient and Delays {{ATP}} Depletion in Cardiomyocytes during Ischemic Conditions}, author = {{Ruiz-Meana}, Marisol and {Garcia-Dorado}, David and Pina, Pilar and Inserte, Javier and Agull{\'o}, Luis and {Soler-Soler}, Jordi}, diff --git a/examples/full_examples/README.txt b/examples/full_examples/README.txt deleted file mode 100644 index 658318787..000000000 --- a/examples/full_examples/README.txt +++ /dev/null @@ -1,5 +0,0 @@ -ICTAI examples -============== - -Examples of complete analyses showcasing the main functionalities of the scikit-fda package, -presented at the 34th IEEE International Conference on Tools with Artificial Intelligence (ICTAI). diff --git a/examples/full_examples/plot_aemet_unsupervised.py b/examples/plot_aemet_unsupervised.py similarity index 53% rename from examples/full_examples/plot_aemet_unsupervised.py rename to examples/plot_aemet_unsupervised.py index 8f8886c30..5d68b8fd0 100644 --- a/examples/full_examples/plot_aemet_unsupervised.py +++ b/examples/plot_aemet_unsupervised.py @@ -14,11 +14,13 @@ from typing import Any, Mapping, Tuple +import cartopy.crs as ccrs import matplotlib.pyplot as plt import numpy as np import sklearn.cluster +from cartopy.io.img_tiles import GoogleTiles from matplotlib.axes import Axes -from mpl_toolkits.basemap import Basemap +from matplotlib.figure import Figure from skfda.datasets import fetch_aemet from skfda.exploratory.depth import ModifiedBandDepth @@ -28,31 +30,56 @@ from skfda.ml.clustering import KMeans from skfda.preprocessing.dim_reduction import FPCA -############################################################################## -# We will first load the AEMET dataset and plot it. +# %% +# In this example we explore the curves of daily temperatures at +# different weather stations from Spain, included in the AEMET dataset\ +# :footcite:p:`meteorologicalstateagencyofspainaemet_2009_meteorological`. +# +# We first show how the visualization tools of scikit-fda can be used to +# detect and interpret magnitude and shape outliers. +# We also explain how to employ a clustering method on these temperature +# curves using scikit-fda. +# Then, the location of each weather station is plotted into a map of Spain +# with a different color according to its cluster. +# This reveals a remarkable resemblance to a Spanish climate map. +# A posterior analysis using functional principal component analysis (FPCA) +# explains how the two first principal components are related with relevant +# meteorological concepts, and can be used to reconstruct and interpret +# the original clustering. +# +# This is one of the examples presented in the ICTAI conference\ +# :footcite:p:`ramos-carreno++_2022_scikitfda`. + +# %% +# We first load the AEMET dataset and plot it. X, _ = fetch_aemet(return_X_y=True) X = X.coordinates[0] X.plot() plt.show() -############################################################################## +# %% # A boxplot can show magnitude outliers, in this case Navacerrada. +# Here the temperatures are lower than in the other curves, as this +# weather station is at a high altitude, near a ski resort. 
Boxplot( X, depth_method=ModifiedBandDepth(), ).plot() plt.show() -############################################################################## +# %% # A magnitude-shape plot can be used to detect shape outliers, such as the # Canary islands, with a less steep temperature curve. +# The Canary islands are at a lower latitude, closer to the equator. +# Thus, they have a subtropical climate which presents less temperature +# variation during the year. MagnitudeShapePlot( X, ).plot() plt.show() -############################################################################## +# %% # We now attempt to cluster the curves using functional k-means. n_clusters = 5 n_init = 10 @@ -65,11 +92,11 @@ ) fda_clusters = fda_kmeans.fit_predict(X) -############################################################################## +# %% # We want to plot the cluster of each station in the map of Spain. We need to # first define auxiliary variables and functions for plotting. -coords_spain = (-10, 34.98, 5, 44.8) -coords_canary = (-18.5, 27.5, -13, 29.5) +coords_spain = (-10, 5, 34.98, 44.8) +coords_canary = (-18.5, -13, 27.5, 29.5) # It is easier to obtain the longitudes and latitudes from the data in # a Pandas dataframe. @@ -81,24 +108,20 @@ def create_map( coords: Tuple[float, float, float, float], - ax: Axes, -) -> Basemap: + figsize: Tuple[float, float], +) -> Figure: """Create a map for a region of the world.""" - basemap = Basemap( - *coords, - projection='merc', - resolution="h", - epsg=4326, - ax=ax, - fix_aspect=False, - ) - basemap.arcgisimage( - service='World_Imagery', - xpixels=1000, - dpi=100, - ) + tiler = GoogleTiles(style="satellite") + mercator = tiler.crs + + fig = plt.figure(figsize=figsize) + ax = fig.add_axes([0, 0, 1, 1], projection=mercator) + ax.set_extent(coords, crs=ccrs.PlateCarree()) + + ax.add_image(tiler, 8) + ax.set_adjustable('datalim') - return basemap + return fig def plot_cluster_points( @@ -106,19 +129,18 @@ def plot_cluster_points( latitudes: np.typing.NDArray[np.floating[Any]], clusters: np.typing.NDArray[np.integer[Any]], color_map: Mapping[int, str], - basemap: Basemap, ax: Axes, ) -> None: """Plot the stations in a map with their cluster color.""" - x, y = basemap(longitudes, latitudes) for cluster in range(n_clusters): selection = (clusters == cluster) ax.scatter( - x[selection], - y[selection], + longitudes[selection], + latitudes[selection], s=64, color=color_map[cluster], edgecolors='white', + transform=ccrs.Geodetic(), ) @@ -133,44 +155,59 @@ def plot_cluster_points( # Names of each climate (for this particular seed) climate_names = { - 0: "Cold-mountain", + 0: "Cold/mountain", 1: "Mediterranean", 2: "Atlantic", 3: "Subtropical", 4: "Continental", } -############################################################################## +# %% # We now plot the obtained clustering in the maps. +# +# It is possible to notice that each cluster seems to roughly correspond to +# a particular climate: +# +# - Red points, only present in the Canary islands, correspond to a subtropical +# climate. +# - Yellow points, on the Mediterranean coast, correspond to a Mediterranean +# climate. +# - Green points, on the Atlantic coast of mainland Spain, correspond to an +# Atlantic or oceanic climate. +# - Orange points, in the center of mainland Spain, correspond to a continental +# climate. +# - Finally, the purple points are located in the coldest regions of Spain, such +# as high mountain ranges. +# Thus, this cluster can be associated with a cold or mountain climate.
+# +# Notice that a couple of points in the Canary islands are not red. +# The purple one is easy to explain, as it is on Mount Teide, the highest +# mountain of Spain. +# The yellow one is at the airport of Los Rodeos, which has its own +# microclimate\ :footcite:p:`romeo+marzoljaen_2014_analisis`. # Mainland -fig_spain = plt.figure(figsize=(8, 6)) -ax_spain = fig_spain.add_axes([0, 0, 1, 1]) -map_spain = create_map(coords_spain, ax=ax_spain) +fig_spain = create_map(coords_spain, figsize=(8, 6)) plot_cluster_points( longitudes=station_longitudes, latitudes=station_latitudes, clusters=fda_clusters, color_map=fda_color_map, - basemap=map_spain, - ax=ax_spain, + ax=fig_spain.axes[0], ) # Canary Islands -fig_canary = plt.figure(figsize=(8, 3)) -ax_canary = fig_canary.add_axes([0, 0, 1, 1]) -map_canary = create_map(coords_canary, ax=ax_canary) +fig_canary = create_map(coords_canary, figsize=(8, 3)) plot_cluster_points( longitudes=station_longitudes, latitudes=station_latitudes, clusters=fda_clusters, color_map=fda_color_map, - basemap=map_canary, - ax=ax_canary, + ax=fig_canary.axes[0], ) plt.show() -############################################################################## +# %% # We now can compute the first two principal components for interpretability, # and project the data over these directions. fpca = FPCA(n_components=2) @@ -182,12 +219,17 @@ def plot_cluster_points( X_red = fpca.transform(X) -############################################################################## +# %% # We now plot the first two principal components as perturbations over the # mean. # # The ``factor`` parameter is a number that multiplies each component in # order to make its effect more noticeable. +# +# It is possible to observe that the first component measures a global +# increase/decrease in temperature. +# The second component instead has the effect of increasing/decreasing +# the variability of the temperatures during the year. fig = plt.figure(figsize=(8, 4)) FPCAPlot( fpca.mean_, @@ -197,8 +239,15 @@ def plot_cluster_points( ).plot() plt.show() -############################################################################## -# We also plot the projections over the first two principal components. +# %% +# We also plot the projections over the first two principal components, +# keeping the cluster colors. +# Here we can easily observe the corresponding characteristics of each +# climate in terms of global temperature and annual variability. +# The two outliers of the Mediterranean climate correspond to the +# aforementioned airport of Los Rodeos, and to Tarifa, which is +# located at the strait of Gibraltar and thus also receives the cold +# waters of the Atlantic, which explains its lower annual variability. fig, ax = plt.subplots(1, 1) for cluster in range(n_clusters): selection = fda_clusters == cluster @@ -214,7 +263,7 @@ def plot_cluster_points( ax.legend() plt.show() -############################################################################## +# %% # We now attempt a multivariate clustering using only these projections. mv_kmeans = sklearn.cluster.KMeans( n_clusters=n_clusters, @@ -223,41 +272,44 @@ def plot_cluster_points( mv_clusters = mv_kmeans.fit_predict(X_red) -############################################################################## +# %% # We now plot the multivariate clustering in the maps. We define a different # color map to match cluster colors with the previously obtained ones.
+# As you can see, the clustering using only the first two principal components +# matches almost perfectly the original one, which used the complete +# temperature curves. mv_color_map = { - 0: "red", - 1: "purple", - 2: "yellow", - 3: "green", - 4: "orange", + 0: "yellow", + 1: "orange", + 2: "red", + 3: "purple", + 4: "green", } # Mainland -fig_spain = plt.figure(figsize=(8, 6)) -ax_spain = fig_spain.add_axes([0, 0, 1, 1]) -map_spain = create_map(coords_spain, ax=ax_spain) +fig_spain = create_map(coords_spain, figsize=(8, 6)) plot_cluster_points( longitudes=station_longitudes, latitudes=station_latitudes, clusters=mv_clusters, color_map=mv_color_map, - basemap=map_spain, - ax=ax_spain, + ax=fig_spain.axes[0], ) # Canary Islands -fig_canary = plt.figure(figsize=(8, 3)) -ax_canary = fig_canary.add_axes([0, 0, 1, 1]) -map_canary = create_map(coords_canary, ax=ax_canary) +fig_canary = create_map(coords_canary, figsize=(8, 3)) plot_cluster_points( longitudes=station_longitudes, latitudes=station_latitudes, clusters=mv_clusters, color_map=mv_color_map, - basemap=map_canary, - ax=ax_canary, + ax=fig_canary.axes[0], ) plt.show() + +# %% +# References +# ---------- +# +# .. footbibliography:: diff --git a/examples/plot_functional_regression.py b/examples/plot_functional_regression.py new file mode 100644 index 000000000..640b29e9f --- /dev/null +++ b/examples/plot_functional_regression.py @@ -0,0 +1,89 @@ +""" +Functional Linear Regression with multivariate covariates. +========================================================== + +This example explores the use of the linear regression with +multivariate (scalar) covariates and functional response. + +""" + +# Author: Rafael Hidalgo Alejo +# License: MIT + +import numpy as np +import pandas as pd +from sklearn.preprocessing import OneHotEncoder + +import skfda +from skfda.ml.regression import LinearRegression +from skfda.representation.basis import FDataBasis, FourierBasis + +# %% +# In this example, we will demonstrate the use of the Linear Regression with +# functional response and multivariate covariates using the +# :func:`weather <skfda.datasets.fetch_weather>` dataset. +# It is possible to divide the weather stations into four groups: +# Atlantic, Pacific, Continental and Arctic. +# There are a total of 35 stations in this dataset. + +X_weather, y_weather = skfda.datasets.fetch_weather( + return_X_y=True, as_frame=True, +) +fd = X_weather.iloc[:, 0].values

# %% +# The main goal is to study the effect of the stations' geographic location +# on the shape of the temperature curves. +# So we will have a model with a functional response, the temperature curve, +# and five covariates. The first one is the intercept (all entries equal to 1) +# and it shows the contribution of the Canadian mean temperature. The remaining +# covariates use one-hot encoding, with 1 if that weather station is in the +# corresponding climate zone and 0 otherwise. + +# We first create the one-hot encoding of the climates. + +enc = OneHotEncoder(handle_unknown='ignore') +enc.fit([['Atlantic'], ['Continental'], ['Pacific']]) +X = np.array(y_weather).reshape(-1, 1) +X = enc.transform(X).toarray() + +# %% +# Then, we construct a dataframe with each covariate in a different column and +# the temperature curves (responses). + +X_df = pd.DataFrame(X) + +y_basis = FourierBasis(n_basis=65) +y_fd = fd.coordinates[0].to_basis(y_basis) + +# %% +# An intercept term is incorporated. +# All functional coefficients will have the same basis as the response.
+ +funct_reg = LinearRegression(fit_intercept=True) +funct_reg.fit(X_df, y_fd) + +# %% +# The regression coefficients are shown below. The first one is the intercept +# coefficient, corresponding to the Canadian mean temperature. + +funct_reg.intercept_.plot() +funct_reg.coef_[0].plot() +funct_reg.coef_[1].plot() +funct_reg.coef_[2].plot() + +# %% +# Finally, we show a panel with the predictions for all climate zones. + +predictions = [] + +predictions.append(funct_reg.predict([[0, 1, 0, 0]])[0]) +predictions.append(funct_reg.predict([[0, 0, 1, 0]])[0]) +predictions.append(funct_reg.predict([[0, 0, 0, 1]])[0]) + +predictions_conc = FDataBasis.concatenate(*predictions) + +predictions_conc.argument_names = ('day',) +predictions_conc.coordinate_names = ('temperature (ºC)',) + +predictions_conc.plot() diff --git a/examples/full_examples/plot_phonemes_classification.py b/examples/plot_phonemes_classification.py similarity index 66% rename from examples/full_examples/plot_phonemes_classification.py rename to examples/plot_phonemes_classification.py index 75d0fd8e7..4b9eafd1e 100644 --- a/examples/full_examples/plot_phonemes_classification.py +++ b/examples/plot_phonemes_classification.py @@ -24,9 +24,22 @@ from skfda.preprocessing.registration import FisherRaoElasticRegistration from skfda.preprocessing.smoothing import KernelSmoother -############################################################################## -# We will first load the (binary) Phoneme dataset, restricted to the first -# 150 variables, and plot the first 20 functions. +# %% +# This example uses the Phoneme dataset\ :footcite:`hastie++_1995_penalized` +# containing the frequency curves of some common phonemes as pronounced by +# different people. +# We use these data to illustrate the preprocessing and +# classification techniques available in scikit-fda. +# +# This is one of the examples presented in the ICTAI conference\ +# :footcite:p:`ramos-carreno++_2022_scikitfda`. + +# %% +# We will first load the (binary) Phoneme dataset and plot the first 20 +# functions. +# We restrict the data to the first 150 variables, as done in +# :footcite:t:`ferraty+vieu_2006_computational`, because most of the +# useful information is in the lower frequencies. X, y = fetch_phoneme(return_X_y=True) X = X[(y == 0) | (y == 1)] @@ -47,8 +60,11 @@ X[:n_plot].plot(group=y) plt.show() -############################################################################## -# We now smooth and plot the data again, as well as the class means. +# %% +# As we just saw, the curves are very noisy. +# We can leverage the continuity of the trajectories by smoothing, using +# a Nadaraya-Watson estimator. +# We then plot the data again, as well as the class means. smoother = KernelSmoother( NadarayaWatsonHatMatrix( bandwidth=0.1, @@ -66,8 +82,23 @@ X_smooth_ao.mean().plot(fig=fig, color="red", linewidth=3) plt.show() -############################################################################## -# We now register the data per class. As Fisher-Rao elastic registration is +# %% +# The smoothed curves are easier to interpret. +# Now, it is possible to appreciate the characteristic landmarks of each class, +# such as maxima or minima. +# However, not all these landmarks appear at the same frequency for each +# trajectory. +# One way to solve this is by registering (aligning) the data. +# We use Fisher-Rao elastic registration, a state-of-the-art registration +# method, to illustrate the effect of registration.
+# Although this registration method achieves very good results, it +# attempts to align all the curves to a common template. +# Thus, in order to clearly view the specific landmarks of each class, we +# have to register the data per class. +# This also means that we cannot use this method for a classification +# task in which the landmarks of each class are very different, as it is not able +# to do per-class registration with unlabeled data. +# As Fisher-Rao elastic registration is # very slow, we only register the plotted curves as an approximation. reg = FisherRaoElasticRegistration( penalty=0.01, @@ -83,7 +114,7 @@ X_reg_ao.mean().plot(fig=fig, color="red", linewidth=3) plt.show() -############################################################################## +# %% # We now split the smoothed data in train and test datasets. # Note that there is no data leakage because no parameters are fitted in # the smoothing step, but normally you would want to do all preprocessing in @@ -97,7 +128,7 @@ stratify=y, ) -############################################################################## +# %% # We use a k-nn classifier with a functional analog to the Mahalanobis # distance and a fixed number of neighbors. n_neighbors = int(np.sqrt(X_smooth.n_samples)) @@ -115,7 +146,7 @@ score = accuracy_score(y_test, y_pred) print(score) -############################################################################## +# %% # If we wanted to optimize hyperparameters, we can use scikit-learn tools. pipeline = Pipeline([ ("smoother", smoother), @@ -138,3 +169,9 @@ # y_pred = grid_search.predict(X_test) # score = accuracy_score(y_test, y_pred) # print(score) + +# %% +# References +# ---------- +# +# .. footbibliography:: diff --git a/examples/full_examples/plot_tecator_regression.py b/examples/plot_tecator_regression.py similarity index 68% rename from examples/full_examples/plot_tecator_regression.py rename to examples/plot_tecator_regression.py index 96333ce4a..3c19dfa3a 100644 --- a/examples/full_examples/plot_tecator_regression.py +++ b/examples/plot_tecator_regression.py @@ -25,7 +25,19 @@ ) from skfda.representation.basis import BSplineBasis -############################################################################## +# %% +# This example uses the Tecator dataset\ +# :footcite:`borggaard+thodberg_1992_optimal` +# in order to illustrate the problems of functional regression +# and functional variable selection. +# This dataset contains the spectra of absorbances of several pieces of +# finely chopped meat, as well as the percentage of its content in water, +# fat and protein. +# +# This is one of the examples presented in the ICTAI conference\ +# :footcite:p:`ramos-carreno++_2022_scikitfda`. + +# %% # We will first load the Tecator data, keeping only the fat content target, # and plot it. X, y = fetch_tecator(return_X_y=True) @@ -34,13 +46,17 @@ X.plot(gradient_criteria=y) plt.show() -############################################################################## -# We compute numerically the second derivative and plot it. +# %% +# For spectrometric data, the relevant information of the curves can often +# be found in the derivatives\ :footcite:`ferraty+vieu_2006_computational`. +# Thus, we compute numerically the second derivative and plot it.
X_der = X.derivative(order=2) X_der.plot(gradient_criteria=y) plt.show() -############################################################################## +# %% +# We first apply a simple linear regression model to compute a baseline +# for our regression predictions. # In order to compute functional linear regression we first convert the data # to a basis expansion. basis = BSplineBasis( @@ -48,7 +64,7 @@ ) X_der_basis = X_der.to_basis(basis) -############################################################################## +# %% # We split the data in train and test, and compute the regression score using # the linear regression model. X_train, X_test, y_train, y_test = train_test_split( @@ -63,8 +79,15 @@ score = r2_score(y_test, y_pred) print(score) -############################################################################## -# Another approach is to use variable selection using maxima hunting. +# %% +# We now will take a different approach. +# It is possible to note from the plot of the derivatives that most information +# necessary for regression can be found at some particular "impact" points. +# Thus, we now apply a functional variable selection method to detect those +# points and use them with a multivariate classifier. +# The variable selection method that we employ here is maxima hunting\ +# :footcite:`berrendero++_2016_variable`, a filter method that computes a +# relevance score for each point of the curve and selects all the local maxima. var_sel = MaximaHunting( local_maxima_selector=RelativeLocalMaximaSelector(max_points=2), ) @@ -72,21 +95,21 @@ print(var_sel.indexes_) -############################################################################## +# %% # We can visualize the relevance function and the selected points. var_sel.dependence_.plot() for p in var_sel.indexes_: plt.axvline(X_der.grid_points[0][p], color="black") plt.show() -############################################################################## +# %% # We also can visualize the selected points on the curves. X_der.plot(gradient_criteria=y) for p in var_sel.indexes_: plt.axvline(X_der.grid_points[0][p], color="black") plt.show() -############################################################################## +# %% # We split the data again (using the same seed), but this time without the # basis expansion. X_train, X_test, y_train, y_test = train_test_split( @@ -95,7 +118,7 @@ random_state=0, ) -############################################################################## +# %% # We now make a pipeline with the variable selection and a multivariate linear # regression method for comparison. pipeline = Pipeline([ @@ -107,7 +130,7 @@ score = r2_score(y_test, y_predicted) print(score) -############################################################################## +# %% # We can use a tree regressor instead to improve both the score and the # interpretability. pipeline = Pipeline([ @@ -119,8 +142,14 @@ score = r2_score(y_test, y_predicted) print(score) -############################################################################## +# %% # We can plot the final version of the tree to explain every prediction. fig, ax = plt.subplots(figsize=(10, 10)) plot_tree(pipeline.named_steps["classifier"], precision=6, filled=True, ax=ax) plt.show() + +# %% +# References +# ---------- +# +# .. footbibliography:: diff --git a/pyproject.toml b/pyproject.toml index dc5ea6708..dfd1891d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "scikit-fda" description = "Functional Data Analysis Python package." 
readme = "README.rst" -requires-python = ">=3.8" +requires-python = ">=3.9" license = {file = "LICENSE.txt"} keywords = [ "functional data", @@ -20,7 +20,7 @@ classifiers = [ "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Software Development :: Libraries :: Python Modules", "Typing :: Typed", @@ -45,6 +45,19 @@ dependencies = [ ] [project.optional-dependencies] +docs = [ + "cartopy", + "ipykernel", + "jupyter-sphinx", + "myst-parser", + "pillow", + "pydata-sphinx-theme", + "pytest", + "setuptools>=41.2", + "sphinx>=3", + "sphinx-gallery", + "sphinxcontrib-bibtex", +] test = [ "pytest", "pytest-env", diff --git a/readthedocs-requirements.txt b/readthedocs-requirements.txt deleted file mode 100644 index f646304da..000000000 --- a/readthedocs-requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ --r ./requirements.txt -basemap -basemap-data -basemap-data-hires -ipykernel -Sphinx>=3 -pydata-sphinx-theme -sphinx-gallery -pillow -setuptools>=41.2 -jupyter-sphinx -myst-parser -pytest -sphinxcontrib-bibtex \ No newline at end of file diff --git a/readthedocs.yml b/readthedocs.yml index 2c148156d..46689c30d 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -5,6 +5,11 @@ # Required version: 2 +build: + os: ubuntu-22.04 + tools: + python: "3.9" + # Build documentation in the docs/ directory with Sphinx sphinx: builder: html @@ -18,8 +23,8 @@ sphinx: # Optionally set the version of Python and requirements required to build your docs python: - version: "3.8" install: - - requirements: readthedocs-requirements.txt - method: pip - path: . \ No newline at end of file + path: . 
+ extra_requirements: + - docs \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 2a806336a..000000000 --- a/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -matplotlib -numpy -scipy -setuptools -scikit-learn -multimethod>=1.2 -findiff diff --git a/setup.cfg b/setup.cfg index bcbba2d86..f628b58b6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -84,6 +84,8 @@ ignore = WPS507, # Comparison with not is not the same as with equality WPS520, + # Found bad magic module function: {0} + WPS413 per-file-ignores = __init__.py: diff --git a/skfda/__init__.py b/skfda/__init__.py index 1a8f41021..ac12bc7f5 100644 --- a/skfda/__init__.py +++ b/skfda/__init__.py @@ -30,4 +30,4 @@ concatenate as concatenate, ) -__version__ = "0.8.1" +__version__ = "0.9.1.dev1" diff --git a/skfda/_utils/__init__.py b/skfda/_utils/__init__.py index 387f22b58..bb0636fdd 100644 --- a/skfda/_utils/__init__.py +++ b/skfda/_utils/__init__.py @@ -20,6 +20,7 @@ "_same_domain", "_to_grid", "_to_grid_points", + "function_to_fdatabasis", "nquad_vec", ], '_warping': [ @@ -44,9 +45,9 @@ _same_domain as _same_domain, _to_grid as _to_grid, _to_grid_points as _to_grid_points, + function_to_fdatabasis as function_to_fdatabasis, nquad_vec as nquad_vec, ) - from ._warping import ( invert_warping as invert_warping, normalize_scale as normalize_scale, diff --git a/skfda/_utils/_utils.py b/skfda/_utils/_utils.py index 9a6382b57..aebb5189f 100644 --- a/skfda/_utils/_utils.py +++ b/skfda/_utils/_utils.py @@ -4,7 +4,6 @@ import functools import numbers -from functools import singledispatch from typing import ( TYPE_CHECKING, Any, @@ -36,7 +35,7 @@ ArrayDTypeT = TypeVar("ArrayDTypeT", bound="np.generic") if TYPE_CHECKING: - from ..representation import FData, FDataGrid + from ..representation import FData, FDataBasis, FDataGrid from ..representation.basis import Basis from ..representation.extrapolation import ExtrapolationLike @@ -605,3 +604,36 @@ def _classifier_get_classes( f'one; got {classes.size} class', ) return classes, y_ind + + +def function_to_fdatabasis( + f: Callable[[NDArrayFloat], NDArrayFloat], + new_basis: Basis, +) -> FDataBasis: + """Express a math function as a FDataBasis with a given basis. + + Args: + f: math function. + new_basis: the basis of the output. + + Returns: + FDataBasis: FDataBasis with calculated coefficients and the new + basis. + """ + from .. 
import FDataBasis # noqa: WPS442 + from ..misc._math import inner_product_matrix + + if isinstance(f, FDataBasis) and f.basis == new_basis: + return f.copy() + + inner_prod = inner_product_matrix( + new_basis, + f, + _domain_range=new_basis.domain_range, + ) + + gram_matrix = new_basis.gram_matrix() + + coefs = np.linalg.solve(gram_matrix, inner_prod) + + return FDataBasis(new_basis, coefs.T) diff --git a/skfda/_utils/ndfunction/__init__.py b/skfda/_utils/ndfunction/__init__.py new file mode 100644 index 000000000..aa0d1fe33 --- /dev/null +++ b/skfda/_utils/ndfunction/__init__.py @@ -0,0 +1,18 @@ +from typing import TYPE_CHECKING + +import lazy_loader as lazy + +__getattr__, __dir__, __all__ = lazy.attach( + __name__, + submod_attrs={ + "_functions": [ + "average_function_value", + ], + }, +) + +if TYPE_CHECKING: + + from ._functions import ( + average_function_value as average_function_value, + ) diff --git a/skfda/_utils/ndfunction/_functions.py b/skfda/_utils/ndfunction/_functions.py new file mode 100644 index 000000000..8f7c9b980 --- /dev/null +++ b/skfda/_utils/ndfunction/_functions.py @@ -0,0 +1,109 @@ +"""Functions for working with arrays of functions.""" +from __future__ import annotations + +import math +from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar + +from ...misc.validation import validate_domain_range +from .. import nquad_vec + +if TYPE_CHECKING: + from ...typing._numpy import NDArrayFloat + from ...representation import FData + from ...typing._base import DomainRangeLike + + +UfuncMethod = Literal[ + "__call__", + "reduce", + "reduceat", + "accumulate", + "outer", + "inner", +] + + +class _SupportsArrayUFunc(Protocol): + def __array_ufunc__( + self, + ufunc: Any, + method: UfuncMethod, + *inputs: Any, + **kwargs: Any, + ) -> Any: + pass + + +T = TypeVar("T", bound=_SupportsArrayUFunc) + + +class _UnaryUfunc(Protocol): + + def __call__(self, __arg: T) -> T: # noqa: WPS112 + pass + + +def _average_function_ufunc( + data: FData, + ufunc: _UnaryUfunc, + *, + domain: DomainRangeLike | None = None, +) -> NDArrayFloat: + + if domain is None: + domain = data.domain_range + else: + domain = validate_domain_range(domain) + + lebesgue_measure = math.prod( + ( + (iterval[1] - iterval[0]) + for iterval in domain + ), + ) + + try: + data_eval = ufunc(data) + except TypeError: + + def integrand(*args: NDArrayFloat) -> NDArrayFloat: # noqa: WPS430 + f1 = data(args)[:, 0, :] + return ufunc(f1) + + return nquad_vec( + integrand, + domain, + ) / lebesgue_measure + + else: + return data_eval.integrate(domain=domain) / lebesgue_measure + + +def average_function_value( + data: FData, + *, + domain: DomainRangeLike | None = None, +) -> NDArrayFloat: + r""" + Calculate the average function value for each function. + + This is the value that, if integrated over the whole domain of each + function, has the same integral as the function itself. + + .. math:: + \bar{x} = \frac{1}{\text{Vol}(\mathcal{T})}\int_{\mathcal{T}} x(t) dt + + Args: + data: Functions where we want to calculate the expected value. + domain: Integration domain. By default, the whole domain is used. + + Returns: + ndarray of shape (n_dimensions, n_samples) with the values of the + expectations. 
+ + See also: + `Entry on Wikipedia + `_ + + """ + return _average_function_ufunc(data, ufunc=lambda x: x, domain=domain) diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index 28415c76c..833e398fe 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -180,7 +180,7 @@ def fetch_ucr( The UCR/UEA Time Series Classification repository, hosted at www.timeseriesclassification.com includes plenty of classification problems with univariate and multivariate time series - \ :footcite:p:`dau++_2019_ucr,bagnall++_2018_uea`. + :footcite:p:`dau++_2019_ucr,bagnall++_2018_uea`. They are widely used in the functional data classification literature. Args: @@ -293,6 +293,7 @@ def _fetch_fda_usc(name: str) -> Any: Discriminant Analysis. Ann. Statist. 23 (1995), no. 1, 73--102. doi:10.1214/aos/1176324456. https://projecteuclid.org/euclid.aos/1176324456 + """ @@ -887,14 +888,15 @@ def fetch_weather( Series of daily summaries of 73 spanish weather stations selected for the period 1980-2009. The dataset contains the geographic information of each station and the average for the period 1980-2009 of daily temperature, - daily precipitation and daily wind speed. Meteorological State Agency of - Spain (AEMET), http://www.aemet.es/. Government of Spain. - - Authors: - Manuel Febrero Bande, Manuel Oviedo de la Fuente + daily precipitation and daily wind speed. Source: + Meteorological State Agency of Spain (AEMET). (2009). + Meteorological data of Spanish weather stations between years 1980 + and 2009. [dataset]. http://www.aemet.es/ + The data were obtained from the FTP of AEMET in 2009. + """ diff --git a/skfda/exploratory/depth/_depth.py b/skfda/exploratory/depth/_depth.py index a0d27c2e5..68e51dee8 100644 --- a/skfda/exploratory/depth/_depth.py +++ b/skfda/exploratory/depth/_depth.py @@ -11,9 +11,9 @@ from typing import TypeVar import numpy as np -import scipy.integrate from ..._utils._sklearn_adapter import BaseEstimator +from ..._utils.ndfunction import average_function_value from ...misc.metrics import l2_distance from ...misc.metrics._utils import _fit_metric from ...representation import FData, FDataGrid @@ -82,25 +82,11 @@ def fit( # noqa: D102 def transform(self, X: FDataGrid) -> NDArrayFloat: # noqa: D102 - pointwise_depth = self.multivariate_depth_.transform(X.data_matrix) - - interval_len = ( - self._domain_range[0][1] - - self._domain_range[0][0] + pointwise_depth = X.copy( + data_matrix=self.multivariate_depth_.transform(X.data_matrix), ) - integrand = pointwise_depth - - for d, s in zip(X.domain_range, X.grid_points): - integrand = scipy.integrate.simps( - integrand, - x=s, - axis=1, - ) - interval_len = d[1] - d[0] - integrand /= interval_len - - return integrand + return average_function_value(pointwise_depth).ravel() @property def max(self) -> float: diff --git a/skfda/exploratory/outliers/_directional_outlyingness.py b/skfda/exploratory/outliers/_directional_outlyingness.py index 30e8338d3..91ac8ee67 100644 --- a/skfda/exploratory/outliers/_directional_outlyingness.py +++ b/skfda/exploratory/outliers/_directional_outlyingness.py @@ -98,7 +98,7 @@ def directional_outlyingness_stats( # noqa: WPS218 Method used to order the data. Defaults to :func:`projection depth `. pointwise_weights (array_like, optional): an array containing the - weights of each point of discretisation where values have been + weights of each point of discretization where values have been recorded. 
Defaults to the same weight for each of the points: 1/len(interval). @@ -183,58 +183,67 @@ def directional_outlyingness_stats( # noqa: WPS218 fdatagrid.domain_range[0][1] - fdatagrid.domain_range[0][0] ) + if not isinstance(pointwise_weights, FDataGrid): + pointwise_weights = fdatagrid.copy( + data_matrix=pointwise_weights, + sample_names=(None,), + ) + depth_pointwise = multivariate_depth(fdatagrid.data_matrix) assert depth_pointwise.shape == fdatagrid.data_matrix.shape[:-1] + depth_pointwise_fd = fdatagrid.copy( + data_matrix=depth_pointwise, + ) + # Obtaining the pointwise median sample Z, to calculate # v(t) = {X(t) − Z(t)}/|| X(t) − Z(t) || median_index = np.argmax(depth_pointwise, axis=0) - pointwise_median = fdatagrid.data_matrix[ - median_index, - range(fdatagrid.data_matrix.shape[1]), - ] - assert pointwise_median.shape == fdatagrid.data_matrix.shape[1:] - v = fdatagrid.data_matrix - pointwise_median - assert v.shape == fdatagrid.data_matrix.shape - v_norm = la.norm(v, axis=-1, keepdims=True) + pointwise_median = fdatagrid.copy( + data_matrix=fdatagrid.data_matrix[ + median_index, + range(fdatagrid.data_matrix.shape[1]), + ][np.newaxis], + sample_names=(None,), + ) + + v = fdatagrid - pointwise_median + v_norm = la.norm(v.data_matrix, axis=-1, keepdims=True) # To avoid ZeroDivisionError, the zeros are substituted by ones (the # reference implementation also does this). v_norm[np.where(v_norm == 0)] = 1 - v_unitary = v / v_norm + v.data_matrix /= v_norm - # Calculation directinal outlyingness - dir_outlyingness = (1 / depth_pointwise[..., np.newaxis] - 1) * v_unitary + # Calculation directional outlyingness + dir_outlyingness = ( + 1 / depth_pointwise_fd - 1 + ) * v # Calculation mean directional outlyingness weighted_dir_outlyingness = ( - dir_outlyingness * pointwise_weights[:, np.newaxis] + dir_outlyingness * pointwise_weights ) - assert weighted_dir_outlyingness.shape == dir_outlyingness.shape - mean_dir_outlyingness = scipy.integrate.simps( - weighted_dir_outlyingness, - fdatagrid.grid_points[0], - axis=1, - ) + mean_dir_outlyingness = weighted_dir_outlyingness.integrate() assert mean_dir_outlyingness.shape == ( fdatagrid.n_samples, fdatagrid.dim_codomain, ) # Calculation variation directional outlyingness - norm = np.square(la.norm( - dir_outlyingness - - mean_dir_outlyingness[:, np.newaxis, :], - axis=-1, - )) - weighted_norm = norm * pointwise_weights - variation_dir_outlyingness = scipy.integrate.simps( - weighted_norm, - fdatagrid.grid_points[0], - axis=1, + norm = dir_outlyingness.copy( + data_matrix=np.square( + la.norm( + dir_outlyingness.data_matrix + - mean_dir_outlyingness[:, np.newaxis, :], + axis=-1, + ) + ) ) + weighted_norm = norm * pointwise_weights + variation_dir_outlyingness = weighted_norm.integrate().ravel() assert variation_dir_outlyingness.shape == (fdatagrid.n_samples,) functional_dir_outlyingness = ( @@ -244,7 +253,7 @@ def directional_outlyingness_stats( # noqa: WPS218 assert functional_dir_outlyingness.shape == (fdatagrid.n_samples,) return DirectionalOutlyingnessStats( - directional_outlyingness=dir_outlyingness, + directional_outlyingness=dir_outlyingness.data_matrix, functional_directional_outlyingness=functional_dir_outlyingness, mean_directional_outlyingness=mean_dir_outlyingness, variation_directional_outlyingness=variation_dir_outlyingness, diff --git a/skfda/exploratory/stats/__init__.py b/skfda/exploratory/stats/__init__.py index 0a1f3a6de..345eb30ac 100644 --- a/skfda/exploratory/stats/__init__.py +++ b/skfda/exploratory/stats/__init__.py 
@@ -19,6 +19,7 @@ "gmean", "mean", "modified_epigraph_index", + "std", "trim_mean", "var", ], @@ -37,6 +38,7 @@ gmean as gmean, mean as mean, modified_epigraph_index as modified_epigraph_index, + std as std, trim_mean as trim_mean, var as var, ) diff --git a/skfda/exploratory/stats/_fisher_rao.py b/skfda/exploratory/stats/_fisher_rao.py index c897b13fc..5a3ffe6c9 100644 --- a/skfda/exploratory/stats/_fisher_rao.py +++ b/skfda/exploratory/stats/_fisher_rao.py @@ -133,7 +133,7 @@ def _fisher_rao_warping_mean( # Compute shooting vectors for psi_i in psi_data: - inner = scipy.integrate.simps(mu * psi_i, x=eval_points) + inner = scipy.integrate.simpson(mu * psi_i, x=eval_points) inner = max(min(inner, 1), -1) theta = np.arccos(inner) @@ -143,7 +143,7 @@ def _fisher_rao_warping_mean( # Mean of shooting vectors vmean /= warping.n_samples - v_norm = np.sqrt(scipy.integrate.simps(np.square(vmean))) + v_norm = np.sqrt(scipy.integrate.simpson(np.square(vmean))) # Convergence criterion if v_norm < tol: @@ -266,7 +266,7 @@ def fisher_rao_karcher_mean( # Initialize with function closest to the L2 mean with the L2 distance centered = (srsf.T - srsf.mean(axis=0, keepdims=True).T).T - distances = scipy.integrate.simps( + distances = scipy.integrate.simpson( np.square(centered, out=centered), eval_points_normalized, axis=1, @@ -304,14 +304,14 @@ def fisher_rao_karcher_mean( # Convergence criterion mu_norm = np.sqrt( - scipy.integrate.simps( + scipy.integrate.simpson( np.square(mu, out=mu_aux), eval_points_normalized, ), ) mu_diff = np.sqrt( - scipy.integrate.simps( + scipy.integrate.simpson( np.square(mu - mu_1, out=mu_aux), eval_points_normalized, ), diff --git a/skfda/exploratory/stats/_stats.py b/skfda/exploratory/stats/_stats.py index 20d2443e2..5bf81b3b9 100644 --- a/skfda/exploratory/stats/_stats.py +++ b/skfda/exploratory/stats/_stats.py @@ -1,6 +1,7 @@ """Functional data descriptive statistics.""" from __future__ import annotations +import functools from builtins import isinstance from typing import Callable, TypeVar, Union @@ -8,8 +9,10 @@ from scipy import integrate from scipy.stats import rankdata +from skfda._utils.ndfunction import average_function_value + from ...misc.metrics._lp_distances import l2_distance -from ...representation import FData, FDataGrid +from ...representation import FData, FDataBasis, FDataGrid from ...typing._metric import Metric from ...typing._numpy import NDArrayFloat from ..depth import Depth, ModifiedBandDepth @@ -41,22 +44,22 @@ def mean( return (X * weight).sum() -def var(X: FData, ddof: int = 1) -> FDataGrid: +def var(X: FData, correction: int = 0) -> FDataGrid: """ Compute the variance of a set of samples in a FData object. Args: X: Object containing all the set of samples whose variance is desired. - ddof: "Delta Degrees of Freedom": the divisor used in the calculation - is `N - ddof`, where `N` represents the number of elements. By - default `ddof` is 1. + correction: degrees of freedom adjustment. The divisor used in the + calculation is `N - correction`, where `N` represents the number of + elements. Default: `0`. Returns: Variance of all the samples in the original object, as a :term:`functional data object` with just one sample. 
""" - return X.var(ddof=ddof) # type: ignore[no-any-return] + return X.var(correction=correction) # type: ignore[no-any-return] def gmean(X: FDataGrid) -> FDataGrid: @@ -76,7 +79,7 @@ def gmean(X: FDataGrid) -> FDataGrid: def cov( X: FData, - ddof: int = 1 + correction: int = 0, ) -> Callable[[NDArrayFloat, NDArrayFloat], NDArrayFloat]: """ Compute the covariance. @@ -86,9 +89,9 @@ def cov( Args: X: Object containing different samples of a functional variable. - ddof: "Delta Degrees of Freedom": the divisor used in the calculation - is `N - ddof`, where `N` represents the number of elements. By - default `ddof` is 1. + correction: degrees of freedom adjustment. The divisor used in the + calculation is `N - correction`, where `N` represents the number of + elements. Default: `0`. Returns: @@ -96,7 +99,65 @@ def cov( callable. """ - return X.cov(ddof=ddof) + return X.cov(correction=correction) + + +@functools.singledispatch +def std(X: F, correction: int = 1) -> F: + r""" + Compute the standard deviation of all the samples in a FData object. + + .. math:: + \text{std}_X(t) = \sqrt{\frac{1}{N-\text{correction}} + \sum_{n=1}^{N}{\left(X_n(t) - \overline{X}(t)\right)^2}} + + Args: + X: Object containing all the samples whose standard deviation is + wanted. + correction: degrees of freedom adjustment. The divisor used in the + calculation is `N - correction`, where `N` represents the number of + elements. Default: `0`. + + Returns: + Standard deviation of all the samples in the original object, as a + :term:`functional data object` with just one sample. + + """ + raise NotImplementedError("Not implemented for this type") + + +@std.register +def std_fdatagrid(X: FDataGrid, correction: int = 1) -> FDataGrid: + """Compute the standard deviation of a FDataGrid.""" + return X.copy( + data_matrix=np.std( + X.data_matrix, axis=0, ddof=correction, + )[np.newaxis, ...], + sample_names=(None,), + ) + + +@std.register +def std_fdatabasis(X: FDataBasis, correction: int = 1) -> FDataBasis: + """Compute the standard deviation of a FDataBasis.""" + from ..._utils import function_to_fdatabasis + + basis = X.basis + coeff_cov_matrix = np.cov( + X.coefficients, rowvar=False, ddof=correction, + ).reshape((basis.n_basis, basis.n_basis)) + + def std_function(t_points: NDArrayFloat) -> NDArrayFloat: # noqa: WPS430 + basis_evaluation = basis(t_points).reshape((basis.n_basis, -1)) + std_values = np.sqrt( + np.sum( + basis_evaluation * (coeff_cov_matrix @ basis_evaluation), + axis=0, + ), + ) + return np.reshape(std_values, (1, -1, X.dim_codomain)) + + return function_to_fdatabasis(f=std_function, new_basis=X.basis) def modified_epigraph_index(X: FDataGrid) -> NDArrayFloat: @@ -108,33 +169,20 @@ def modified_epigraph_index(X: FDataGrid) -> NDArrayFloat: with all the other curves of our dataset. """ - interval_len = ( - X.domain_range[0][1] - - X.domain_range[0][0] - ) - - # Array containing at each point the number of curves + # Functions containing at each point the number of curves # are above it. 
- num_functions_above: NDArrayFloat = rankdata( - -X.data_matrix, - method='max', - axis=0, - ) - 1 - - integrand = num_functions_above - - for d, s in zip(X.domain_range, X.grid_points): - integrand = integrate.simps( - integrand, - x=s, - axis=1, - ) - interval_len = d[1] - d[0] - integrand /= interval_len - - integrand /= X.n_samples + num_functions_above = X.copy( + data_matrix=rankdata( + -X.data_matrix, + method='max', + axis=0, + ) - 1, + ) - return integrand.flatten() + return ( + average_function_value(num_functions_above) + / num_functions_above.n_samples + ).ravel() def depth_based_median( diff --git a/skfda/exploratory/visualization/_boxplot.py b/skfda/exploratory/visualization/_boxplot.py index 19aae239b..e315a92f0 100644 --- a/skfda/exploratory/visualization/_boxplot.py +++ b/skfda/exploratory/visualization/_boxplot.py @@ -11,7 +11,6 @@ from typing import Sequence, Tuple import matplotlib -import matplotlib.pyplot as plt import numpy as np from matplotlib.axes import Axes from matplotlib.colors import Colormap @@ -376,7 +375,7 @@ def __init__( self._fdatagrid = fdatagrid self._prob = prob - self._colormap = plt.cm.get_cmap('RdPu') + self._colormap = matplotlib.colormaps['RdPu'] self.barcol = "blue" self.outliercol = "red" self.mediancol = "black" @@ -701,7 +700,7 @@ def __init__( self._non_outlying_envelope = _envelopes.compute_envelope(inliers) self._fdatagrid = fdatagrid - self.colormap = plt.cm.get_cmap('Greys') + self.colormap = matplotlib.colormaps['Greys'] self._boxcol = 1.0 self._outcol = 0.7 diff --git a/skfda/exploratory/visualization/_magnitude_shape_plot.py b/skfda/exploratory/visualization/_magnitude_shape_plot.py index bb5c56d51..6fc13fbdf 100644 --- a/skfda/exploratory/visualization/_magnitude_shape_plot.py +++ b/skfda/exploratory/visualization/_magnitude_shape_plot.py @@ -10,7 +10,6 @@ from typing import Any, Sequence import matplotlib -import matplotlib.pyplot as plt import numpy as np from matplotlib.artist import Artist from matplotlib.axes import Axes @@ -193,7 +192,7 @@ def __init__( self._fdata = fdata self._outliers = outliers - self._colormap = plt.cm.get_cmap('seismic') + self._colormap = matplotlib.colormaps['seismic'] self._color = 0.2 self._outliercol = 0.8 self.xlabel = 'MO' diff --git a/skfda/inference/anova/_anova_oneway.py b/skfda/inference/anova/_anova_oneway.py index feeb18a17..d95430b71 100644 --- a/skfda/inference/anova/_anova_oneway.py +++ b/skfda/inference/anova/_anova_oneway.py @@ -182,7 +182,13 @@ def v_asymptotic_stat( .. 
footbibliography:: """ - return float(_v_asymptotic_stat_with_reps(*fd, weights=weights, p=p)) + return float( + _v_asymptotic_stat_with_reps( + *fd, + weights=weights, + p=p, + ).reshape(()) + ) def _anova_bootstrap( @@ -220,6 +226,7 @@ def _anova_bootstrap( cov_est = concatenate(fd_grouped).cov( grid_points, grid_points, + correction=1, ) k_est = [cov_est] * len(fd_grouped) else: @@ -228,6 +235,7 @@ def _anova_bootstrap( fdg.cov( grid_points, grid_points, + correction=1, ) for fdg in fd_grouped ] diff --git a/skfda/inference/hotelling/_hotelling.py b/skfda/inference/hotelling/_hotelling.py index 9f07e4944..f0aba205e 100644 --- a/skfda/inference/hotelling/_hotelling.py +++ b/skfda/inference/hotelling/_hotelling.py @@ -103,10 +103,12 @@ def hotelling_t2( k1 = fd1.cov( fd1.grid_points[0], fd1.grid_points[0], + correction=1, ) k2 = fd2.cov( fd2.grid_points[0], fd2.grid_points[0], + correction=1, ) m = m.reshape((-1, 1)) # Reshaping the mean for a proper matrix product diff --git a/skfda/misc/_math.py b/skfda/misc/_math.py index 4e02de2f8..5d63f9100 100644 --- a/skfda/misc/_math.py +++ b/skfda/misc/_math.py @@ -12,7 +12,7 @@ import numpy as np import scipy.integrate -from .._utils import nquad_vec +from .._utils import _same_domain, nquad_vec from ..representation import FData, FDataBasis, FDataGrid from ..representation.basis import Basis from ..typing._base import DomainRange @@ -243,6 +243,7 @@ def inner_product( Args: arg1: First sample. arg2: Second sample. + kwargs: Additional parameters for the function. Returns: Vector with the inner products of each pair of samples. @@ -363,7 +364,7 @@ def _inner_product_fdatagrid( # Perform quadrature inside the einsum for i, s in enumerate(arg1.grid_points[::-1]): identity = np.eye(len(s)) - weights = scipy.integrate.simps(identity, x=s) + weights = scipy.integrate.simpson(identity, x=s) index = (slice(None),) + (np.newaxis,) * (i + 1) d1 *= weights[index] @@ -388,10 +389,17 @@ def _inner_product_fdatabasis( arg2: Union[FDataBasis, Basis], *, _matrix: bool = False, + _domain_range: Optional[DomainRange] = None, inner_product_matrix: Optional[NDArrayFloat] = None, force_numerical: bool = False, ) -> NDArrayFloat: + if not _same_domain(arg1, arg2): + raise ValueError("Both Objects should have the same domain_range") + + if _domain_range and not np.array_equal(arg1.domain_range, _domain_range): + raise ValueError("_domain_range should be the same as arg objects") + if isinstance(arg1, Basis): arg1 = arg1.to_basis() @@ -479,8 +487,15 @@ def _inner_product_integrate( def integrand(*args: NDArrayFloat) -> NDArrayFloat: # noqa: WPS430 f_args = np.asarray(args) - f1 = arg1(f_args)[:, 0, :] - f2 = arg2(f_args)[:, 0, :] + try: + f1 = arg1(f_args)[:, 0, :] + except Exception: + f1 = arg1(f_args) + + try: + f2 = arg2(f_args)[:, 0, :] + except Exception: + f2 = arg2(f_args) if _matrix: ret = np.einsum('n...,m...->nm...', f1, f2) diff --git a/skfda/misc/covariances.py b/skfda/misc/covariances.py index d910024ea..64aeb44de 100644 --- a/skfda/misc/covariances.py +++ b/skfda/misc/covariances.py @@ -768,27 +768,27 @@ class Empirical(Covariance): The sample covariance function is defined as . math:: - K(t, s) = \frac{1}{N-\text{ddof}}\sum_{n=1}^N\left(x_n(t) - + K(t, s) = \frac{1}{N-\text{correction}}\sum_{n=1}^N\left(x_n(t) - \bar{x}(t)\right) \left(x_n(s) - \bar{x}(s)\right) where :math:`x_n(t)` is the n-th sample and :math:`\bar{x}(t)` is the mean of the samples. 
:math:`N` is the number of samples, - :math:`\text{ddof}` means "Delta Degrees of Freedom" and is such that - :math:`N-\text{ddof}` is the divisor used in the calculation of the - covariance function. + :math:`\text{correction}` is the degrees of freedom adjustment and is such + that :math:`N-\text{correction}` is the divisor used in the calculation of + the covariance function. """ _latex_formula = ( - r"K(t, s) = \frac{1}{N-\text{ddof}}\sum_{n=1}^N(x_n(t) - \bar{x}(t))" - r"(x_n(s) - \bar{x}(s))" + r"K(t, s) = \frac{1}{N-\text{correction}}\sum_{n=1}^N" + r"(x_n(t) - \bar{x}(t))(x_n(s) - \bar{x}(s))" ) _parameters_str = [ ("data", "data"), ] cov_fdata: FData - ddof: int + correction: int @abc.abstractmethod def __init__(self, data: FData) -> None: @@ -817,17 +817,17 @@ class EmpiricalGrid(Empirical): """Sample covariance function for FDataGrid.""" cov_fdata: FDataGrid - ddof: int + correction: int - def __init__(self, data: FDataGrid, ddof: int = 1) -> None: + def __init__(self, data: FDataGrid, correction: int = 0) -> None: super().__init__(data=data) - self.ddof = ddof + self.correction = correction self.cov_fdata = data.copy( data_matrix=np.cov( data.data_matrix[..., 0], rowvar=False, - ddof=ddof, + ddof=correction, )[np.newaxis, ...], grid_points=[ data.grid_points[0], @@ -853,16 +853,16 @@ class EmpiricalBasis(Empirical): cov_fdata: FDataBasis coeff_matrix: NDArrayFloat - ddof: int + correction: int - def __init__(self, data: FDataBasis, ddof: int = 1) -> None: + def __init__(self, data: FDataBasis, correction: int = 0) -> None: super().__init__(data=data) - self.ddof = ddof + self.correction = correction self.coeff_matrix = np.cov( data.coefficients, rowvar=False, - ddof=ddof, + ddof=correction, ) self.cov_fdata = FDataBasis( basis=TensorBasis([data.basis, data.basis]), diff --git a/skfda/misc/hat_matrix.py b/skfda/misc/hat_matrix.py index fc94adc6b..81694bd47 100644 --- a/skfda/misc/hat_matrix.py +++ b/skfda/misc/hat_matrix.py @@ -11,20 +11,21 @@ import abc import math -from typing import Callable, TypeVar, Union, overload +from typing import Callable, Final, TypeVar, Union, overload import numpy as np from .._utils._sklearn_adapter import BaseEstimator from ..representation._functional_data import FData from ..representation.basis import FDataBasis -from ..typing._base import GridPointsLike from ..typing._numpy import NDArrayFloat from . 
import kernels -Input = TypeVar("Input", bound=Union[FData, GridPointsLike]) +Input = TypeVar("Input", bound=Union[FData, NDArrayFloat]) Prediction = TypeVar("Prediction", bound=Union[NDArrayFloat, FData]) +DEFAULT_BANDWIDTH_PERCENTILE: Final = 15 + class HatMatrix( BaseEstimator, @@ -52,8 +53,8 @@ def __call__( self, *, delta_x: NDArrayFloat, - X_train: Input | None = None, - X: Input | None = None, + X_train: Input, + X: Input, y_train: None = None, weights: NDArrayFloat | None = None, _cv: bool = False, @@ -65,8 +66,8 @@ def __call__( self, *, delta_x: NDArrayFloat, - X_train: Input | None = None, - X: Input | None = None, + X_train: Input, + X: Input, y_train: Prediction | None = None, weights: NDArrayFloat | None = None, _cv: bool = False, @@ -77,8 +78,8 @@ def __call__( self, *, delta_x: NDArrayFloat, - X_train: Input | None = None, - X: Input | None = None, + X_train: Input, + X: Input, y_train: NDArrayFloat | FData | None = None, weights: NDArrayFloat | None = None, _cv: bool = False, @@ -186,7 +187,7 @@ def _hat_matrix_function_not_normalized( ) -> NDArrayFloat: bandwidth = ( - np.percentile(np.abs(delta_x), 15) + float(np.percentile(delta_x, DEFAULT_BANDWIDTH_PERCENTILE)) if self.bandwidth is None else self.bandwidth ) @@ -278,8 +279,8 @@ def __call__( self, *, delta_x: NDArrayFloat, - X_train: FData | GridPointsLike | None = None, - X: FData | GridPointsLike | None = None, + X_train: FData | NDArrayFloat, + X: FData | NDArrayFloat, y_train: None = None, weights: NDArrayFloat | None = None, _cv: bool = False, @@ -291,8 +292,8 @@ def __call__( self, *, delta_x: NDArrayFloat, - X_train: FData | GridPointsLike | None = None, - X: FData | GridPointsLike | None = None, + X_train: FData | NDArrayFloat, + X: FData | NDArrayFloat, y_train: Prediction | None = None, weights: NDArrayFloat | None = None, _cv: bool = False, @@ -303,19 +304,37 @@ def __call__( # noqa: D102 self, *, delta_x: NDArrayFloat, - X_train: FData | GridPointsLike | None = None, - X: FData | GridPointsLike | None = None, + X_train: FData | NDArrayFloat, + X: FData | NDArrayFloat, y_train: NDArrayFloat | FData | None = None, weights: NDArrayFloat | None = None, _cv: bool = False, ) -> NDArrayFloat | FData: bandwidth = ( - np.percentile(np.abs(delta_x), 15) + float(np.percentile(delta_x, DEFAULT_BANDWIDTH_PERCENTILE)) if self.bandwidth is None else self.bandwidth ) + # Smoothing for functions of one variable + if not isinstance(X_train, FDataBasis) and X_train[0].shape[0] == 1: + delta_x = np.subtract.outer( + X.flatten(), + X_train.flatten(), + ) + + assert y_train is None + + return super().__call__( # noqa: WPS503 + delta_x=delta_x, + X_train=X_train, + X=X, + y_train=y_train, + weights=weights, + _cv=_cv, + ) + # Regression if isinstance(X_train, FData): assert isinstance(X, FData) @@ -326,44 +345,41 @@ def __call__( # noqa: D102 ): raise ValueError("Only FDataBasis is supported for now.") + m1 = X_train.coefficients + m2 = X.coefficients + if y_train is None: y_train = np.identity(X_train.n_samples) - m1 = X_train.coefficients - m2 = X.coefficients + else: + m1 = X_train + m2 = X + + if y_train is None: + y_train = np.identity(X_train.shape[0]) - # Subtract previous matrices obtaining a 3D matrix - # The i-th element contains the matrix X_train - X[i] - C = m1 - m2[:, np.newaxis] + # Subtract previous matrices obtaining a 3D matrix + # The i-th element contains the matrix X_train - X[i] + C = m1 - m2[:, np.newaxis] + # Inner product matrix only is applicable in regression + if isinstance(X_train, FDataBasis): 
inner_product_matrix = X_train.basis.inner_product_matrix() # Calculate new coefficients taking into account cross-products # if the basis is orthonormal, C would not change C = C @ inner_product_matrix # noqa: WPS350 - # Adding a column of ones in the first position of all matrices - dims = (C.shape[0], C.shape[1], 1) - C = np.concatenate((np.ones(dims), C), axis=-1) - - return self._solve_least_squares( - delta_x=delta_x, - coefs=C, - y_train=y_train, - bandwidth=bandwidth, - ) - - # Smoothing - else: + # Adding a column of ones in the first position of all matrices + dims = (C.shape[0], C.shape[1], 1) + C = np.concatenate((np.ones(dims), C), axis=-1) - return super().__call__( # type: ignore[misc, type-var] # noqa: WPS503 - delta_x=delta_x, - X_train=X_train, - X=X, - y_train=y_train, # type: ignore[arg-type] - weights=weights, - _cv=_cv, - ) + return self._solve_least_squares( + delta_x=delta_x, + coefs=C, + y_train=y_train, + bandwidth=bandwidth, + ) def _solve_least_squares( self, @@ -485,7 +501,7 @@ def _hat_matrix_function_not_normalized( # For each row in the distances matrix, it calculates the furthest # point within the k nearest neighbours vec = np.sort( - np.abs(delta_x), + delta_x, axis=1, )[:, n_neighbors - 1] + tol diff --git a/skfda/misc/metrics/_fisher_rao.py b/skfda/misc/metrics/_fisher_rao.py index 7fd18e4da..bfc4e2f89 100644 --- a/skfda/misc/metrics/_fisher_rao.py +++ b/skfda/misc/metrics/_fisher_rao.py @@ -234,7 +234,7 @@ def fisher_rao_amplitude_distance( penalty = np.sqrt(penalty, out=penalty) penalty -= 1 penalty = np.square(penalty, out=penalty) - penalty = scipy.integrate.simps(penalty, x=eval_points_normalized) + penalty = scipy.integrate.simpson(penalty, x=eval_points_normalized) distance = np.sqrt(distance**2 + lam * penalty) @@ -322,7 +322,7 @@ def fisher_rao_phase_distance( derivative_warping = np.sqrt(derivative_warping, out=derivative_warping) - d = scipy.integrate.simps(derivative_warping, x=eval_points_normalized) + d = scipy.integrate.simpson(derivative_warping, x=eval_points_normalized) d = np.clip(d, -1, 1) return np.arccos(d) # type: ignore[no-any-return] @@ -394,7 +394,7 @@ def _fisher_rao_warping_distance( product = np.multiply(srsf_warping1, srsf_warping2, out=srsf_warping1) - d = scipy.integrate.simps(product, x=warping1.grid_points[0]) + d = scipy.integrate.simpson(product, x=warping1.grid_points[0]) d = np.clip(d, -1, 1) return np.arccos(d) # type: ignore[no-any-return] diff --git a/skfda/misc/metrics/_lp_norms.py b/skfda/misc/metrics/_lp_norms.py index 2a6c2f9a6..0465dc005 100644 --- a/skfda/misc/metrics/_lp_norms.py +++ b/skfda/misc/metrics/_lp_norms.py @@ -7,7 +7,7 @@ import scipy.integrate from typing_extensions import Final -from ...representation import FData, FDataBasis +from ...representation import FData, FDataBasis, FDataGrid from ...typing._metric import Norm from ...typing._numpy import NDArrayFloat @@ -135,20 +135,21 @@ def __call__(self, vector: Union[NDArrayFloat, FData]) -> NDArrayFloat: ) res = np.sqrt(integral[0]).flatten() - else: + elif isinstance(vector, FDataGrid): data_matrix = vector.data_matrix - original_shape = data_matrix.shape - data_matrix = data_matrix.reshape(-1, original_shape[-1]) - data_matrix = (np.linalg.norm( - vector.data_matrix, - ord=vector_norm, - axis=-1, - keepdims=True, - ) if isinstance(vector_norm, (float, int)) - else vector_norm(data_matrix) - ) - data_matrix = data_matrix.reshape(original_shape[:-1] + (1,)) + if isinstance(vector_norm, (float, int)): + data_matrix = np.linalg.norm( + 
vector.data_matrix, + ord=vector_norm, + axis=-1, + keepdims=True, + ) + else: + original_shape = data_matrix.shape + data_matrix = data_matrix.reshape(-1, original_shape[-1]) + data_matrix = vector_norm(data_matrix) + data_matrix = data_matrix.reshape(original_shape[:-1] + (1,)) if np.isinf(self.p): @@ -157,18 +158,19 @@ def __call__(self, vector: Union[NDArrayFloat, FData]) -> NDArrayFloat: axis=tuple(range(1, data_matrix.ndim)), ) - elif vector.dim_domain == 1: + else: + integrand = vector.copy( + data_matrix=data_matrix ** self.p, + coordinate_names=(None,), + ) # Computes the norm, approximating the integral with Simpson's # rule. - res = scipy.integrate.simps( - data_matrix[..., 0] ** self.p, - x=vector.grid_points, - ) ** (1 / self.p) - - else: - # Needed to perform surface integration - return NotImplemented + res = integrand.integrate().ravel() ** (1 / self.p) + else: + raise NotImplementedError( + f"LpNorm not implemented for type {type(vector)}", + ) if len(res) == 1: return res[0] # type: ignore[no-any-return] diff --git a/skfda/misc/metrics/_utils.py b/skfda/misc/metrics/_utils.py index 9850057a0..30d1fae7e 100644 --- a/skfda/misc/metrics/_utils.py +++ b/skfda/misc/metrics/_utils.py @@ -120,8 +120,8 @@ class NormInducedMetric(Metric[VectorType]): >>> l2_distance = NormInducedMetric(l2_norm) >>> d = l2_distance(fd, fd2) - >>> float('%.3f'% d) - 0.289 + >>> float(d[0]) + 0.288... """ diff --git a/skfda/misc/scoring.py b/skfda/misc/scoring.py index 419d25c0f..7491b92ff 100644 --- a/skfda/misc/scoring.py +++ b/skfda/misc/scoring.py @@ -74,7 +74,7 @@ def _var( from ..exploratory.stats import mean, var if weights is None: - return var(x, ddof=0) + return var(x, correction=0) return mean( # type: ignore[no-any-return] np.power(x - mean(x, weights=weights), 2), diff --git a/skfda/ml/classification/_centroid_classifiers.py b/skfda/ml/classification/_centroid_classifiers.py index dd19c2f1a..4bac8ad9e 100644 --- a/skfda/ml/classification/_centroid_classifiers.py +++ b/skfda/ml/classification/_centroid_classifiers.py @@ -127,7 +127,7 @@ class DTMClassifier(NearestCentroid[Input, Target]): Test samples are classified to the class that minimizes the distance of the observation to the trimmed mean of the group - :footcite:`fraiman+muniz_2001_trimmed`. + :footcite:`pintado+romo_2005_depthbased`. Parameters: proportiontocut: @@ -174,6 +174,7 @@ class DTMClassifier(NearestCentroid[Input, Target]): See also: :class:`~skfda.ml.classification.NearestCentroid` + :class:`~skfda.exploratory.stats.trim_mean` References: .. footbibliography:: diff --git a/skfda/ml/classification/_depth_classifiers.py b/skfda/ml/classification/_depth_classifiers.py index 47e29e239..c258d608b 100644 --- a/skfda/ml/classification/_depth_classifiers.py +++ b/skfda/ml/classification/_depth_classifiers.py @@ -71,7 +71,7 @@ class DDClassifier( Depth-versus-depth (DD) classifer for functional data. Transforms the data into a DD-plot and then classifies using a polynomial - of a chosen degree\ :footcite:p:`li++_2012_ddclassifier`. + of a chosen degree :footcite:p:`li++_2012_ddclassifier`. The polynomial passes through zero and maximizes the accuracy of the classification on the train dataset. @@ -117,7 +117,7 @@ class DDClassifier( See also: :class:`~skfda.ml.classification.DDGClassifier` :class:`~skfda.ml.classification.MaximumDepthClassifier` - :class:`~skfda.preprocessing.dim_reduction.feature_extraction._ddg_transformer` + :class:`~skfda.preprocessing.dim_reduction.feature_construction._ddg_transformer` References: .. 
footbibliography:: @@ -307,7 +307,7 @@ class DDGClassifier( See also: :class:`~skfda.ml.classification.DDClassifier` :class:`~skfda.ml.classification.MaximumDepthClassifier` - :class:`~skfda.preprocessing.dim_reduction.feature_extraction._ddg_transformer` + :class:`~skfda.preprocessing.dim_reduction.feature_construction._ddg_transformer` References: .. footbibliography:: diff --git a/skfda/ml/classification/_logistic_regression.py b/skfda/ml/classification/_logistic_regression.py index ae95a1380..37ff0e4f6 100644 --- a/skfda/ml/classification/_logistic_regression.py +++ b/skfda/ml/classification/_logistic_regression.py @@ -62,8 +62,8 @@ class LogisticRegression( >>> from skfda.datasets import make_gaussian_process >>> from skfda.ml.classification import LogisticRegression >>> - >>> n_samples = 10000 - >>> n_features = 200 + >>> n_samples = 2000 + >>> n_features = 101 >>> >>> def mean_1(t): ... return (np.abs(t - 0.25) @@ -90,7 +90,7 @@ class LogisticRegression( >>> np.allclose(sorted(lr.points_), [0.25, 0.5, 0.75], rtol=1e-2) True >>> lr.score(X[1::2],y[1::2]) - 0.7498 + 0.768 References: .. footbibliography:: @@ -132,11 +132,9 @@ def fit( # noqa: D102, WPS210 (self.max_features, n_features), ) - penalty = 'none' if self.penalty is None else self.penalty - # multivariate logistic regression mvlr = mvLogisticRegression( - penalty=penalty, + penalty=self.penalty, C=self.C, solver=self.solver, max_iter=self.max_iter, diff --git a/skfda/ml/clustering/_hierarchical.py b/skfda/ml/clustering/_hierarchical.py index c8f63cf81..75e06e8f3 100644 --- a/skfda/ml/clustering/_hierarchical.py +++ b/skfda/ml/clustering/_hierarchical.py @@ -173,7 +173,7 @@ def _init_estimator(self) -> None: self._estimator = sklearn.cluster.AgglomerativeClustering( n_clusters=self.n_clusters, - affinity='precomputed', + metric='precomputed', memory=self.memory, connectivity=self.connectivity, compute_full_tree=self.compute_full_tree, diff --git a/skfda/ml/clustering/_kmeans.py b/skfda/ml/clustering/_kmeans.py index 9d58e068a..c433eb186 100644 --- a/skfda/ml/clustering/_kmeans.py +++ b/skfda/ml/clustering/_kmeans.py @@ -147,7 +147,7 @@ def _check_clustering(self, fdata: Input) -> Input: return fdata def _tolerance(self, fdata: Input) -> float: - variance = fdata.var(ddof=0) + variance = fdata.var(correction=0) mean_variance = np.mean(variance[0].data_matrix) return float(mean_variance * self.tol) diff --git a/skfda/ml/regression/__init__.py b/skfda/ml/regression/__init__.py index d01c3fdc4..3fb0a892c 100644 --- a/skfda/ml/regression/__init__.py +++ b/skfda/ml/regression/__init__.py @@ -10,6 +10,7 @@ "_kernel_regression": ["KernelRegression"], "_linear_regression": ["LinearRegression"], "_fpca_regression": ["FPCARegression"], + "_fpls_regression": ["FPLSRegression"], "_neighbors_regression": [ "KNeighborsRegressor", "RadiusNeighborsRegressor", @@ -19,6 +20,7 @@ if TYPE_CHECKING: from ._fpca_regression import FPCARegression + from ._fpls_regression import FPLSRegression as FPLSRegression from ._historical_linear_model import ( HistoricalLinearRegression as HistoricalLinearRegression, ) diff --git a/skfda/ml/regression/_coefficients.py b/skfda/ml/regression/_coefficients.py index e9cd9f72d..fe6abde11 100644 --- a/skfda/ml/regression/_coefficients.py +++ b/skfda/ml/regression/_coefficients.py @@ -157,8 +157,12 @@ def coefficient_info_from_covariate( def _coefficient_info_from_covariate_ndarray( # type: ignore[misc] X: NDArrayFloat, y: NDArrayFloat, + basis: Basis = None, **_: Any, ) -> CoefficientInfo[NDArrayFloat]: + if basis: 
+ return CoefficientInfoNdarray(basis=basis) + return CoefficientInfoNdarray(basis=np.identity(X.shape[1], dtype=X.dtype)) diff --git a/skfda/ml/regression/_fpls_regression.py b/skfda/ml/regression/_fpls_regression.py new file mode 100644 index 000000000..ea4dd4eac --- /dev/null +++ b/skfda/ml/regression/_fpls_regression.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +from typing import Any, TypeVar, Union + +from sklearn.utils.validation import check_is_fitted + +from ..._utils._sklearn_adapter import BaseEstimator, RegressorMixin +from ...misc.regularization import L2Regularization +from ...preprocessing.dim_reduction import FPLS +from ...representation import FDataGrid +from ...representation.basis import Basis, FDataBasis +from ...typing._numpy import NDArrayFloat + +InputType = TypeVar( + "InputType", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) + +OutputType = TypeVar( + "OutputType", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) + + +class FPLSRegression( + BaseEstimator, + RegressorMixin[InputType, OutputType], +): + r""" + Regression using Functional Partial Least Squares. + + Parameters: + n_components: Number of components to keep. + By default all available components are utilized. + regularization_X: Regularization for the calculation of the X weights. + weight_basis_X: Basis to use for the X block. Only + applicable if X is a FDataBasis. Otherwise it must be None. + weight_basis_Y: Basis to use for the Y block. Only + applicable if Y is a FDataBasis. Otherwise it must be None. + + Attributes: + coef\_: Coefficients of the linear model. + fpls\_: FPLS object used to fit the model. + + Examples: + Fit a FPLS regression model with two components. + + >>> from skfda.ml.regression import FPLSRegression + >>> from skfda.datasets import fetch_tecator + >>> from skfda.representation import FDataGrid + >>> from skfda.typing._numpy import NDArrayFloat + + >>> X, y = fetch_tecator(return_X_y=True) + >>> fpls = FPLSRegression[FDataGrid, NDArrayFloat](n_components=2) + >>> fpls = fpls.fit(X, y) + + """ + + def __init__( + self, + n_components: int | None = None, + regularization_X: L2Regularization[Any] | None = None, + weight_basis_X: Basis | None = None, + weight_basis_Y: Basis | None = None, + _integration_weights_X: NDArrayFloat | None = None, + _integration_weights_Y: NDArrayFloat | None = None, + ) -> None: + self.n_components = n_components + self._integration_weights_X = _integration_weights_X + self._integration_weights_Y = _integration_weights_Y + self.regularization_X = regularization_X + self.weight_basis_X = weight_basis_X + self.weight_basis_Y = weight_basis_Y + + def fit( + self, + X: InputType, + y: OutputType, + ) -> FPLSRegression[InputType, OutputType]: + """ + Fit the model using the data for both blocks. + + Args: + X: Data of the X block + y: Data of the Y block + + Returns: + self + """ + self.fpls_ = FPLS[InputType, OutputType]( + n_components=self.n_components, + regularization_X=self.regularization_X, + component_basis_X=self.weight_basis_X, + component_basis_Y=self.weight_basis_Y, + _integration_weights_X=self._integration_weights_X, + _integration_weights_Y=self._integration_weights_Y, + _deflation_mode="reg", + ) + + self.fpls_.fit(X, y) + + self.coef_ = ( + self.fpls_.x_rotations_matrix_ + @ self.fpls_.y_loadings_matrix_.T + ) + return self + + def predict(self, X: InputType) -> OutputType: + """Predict using the model. + + Args: + X: Data to predict. + + Returns: + Predicted values. 
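To illustrate the predict method defined here, a hedged end-to-end sketch using the Tecator data already referenced in the class docstring; the train/test split is arbitrary and no scores or shapes are asserted:

>>> from skfda.datasets import fetch_tecator
>>> from skfda.ml.regression import FPLSRegression
>>> X, y = fetch_tecator(return_X_y=True)
>>> fpls = FPLSRegression(n_components=2)
>>> fpls = fpls.fit(X[:150], y[:150])
>>> y_pred = fpls.predict(X[150:])  # transform_x followed by inverse_transform_y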
+ """ + check_is_fitted(self) + return self.fpls_.inverse_transform_y(self.fpls_.transform_x(X)) diff --git a/skfda/ml/regression/_historical_linear_model.py b/skfda/ml/regression/_historical_linear_model.py index de5dc38bf..21377c77c 100644 --- a/skfda/ml/regression/_historical_linear_model.py +++ b/skfda/ml/regression/_historical_linear_model.py @@ -42,7 +42,7 @@ def _pairwise_fem_inner_product( eval_fd = fd(grid) prod = eval_fem * eval_fd - integral = scipy.integrate.simps(prod, grid, axis=1) + integral = scipy.integrate.simpson(prod, grid, axis=1) return np.sum(integral, axis=-1) # type: ignore[no-any-return] diff --git a/skfda/ml/regression/_linear_regression.py b/skfda/ml/regression/_linear_regression.py index 8b1ba5fee..f672a2923 100644 --- a/skfda/ml/regression/_linear_regression.py +++ b/skfda/ml/regression/_linear_regression.py @@ -2,17 +2,27 @@ import itertools import warnings -from typing import Any, Iterable, List, Optional, Sequence, Tuple, Union +from typing import ( + Any, + Callable, + Iterable, + List, + Optional, + Sequence, + Tuple, + Union, +) import numpy as np import pandas as pd from sklearn.utils.validation import check_is_fitted +from ..._utils import function_to_fdatabasis, nquad_vec from ..._utils._sklearn_adapter import BaseEstimator, RegressorMixin from ...misc.lstsq import solve_regularized_weighted_lstsq from ...misc.regularization import L2Regularization, compute_penalty_matrix -from ...representation import FData -from ...representation.basis import Basis +from ...representation import FData, FDataBasis +from ...representation.basis import Basis, ConstantBasis from ...typing._numpy import NDArrayFloat from ._coefficients import CoefficientInfo, coefficient_info_from_covariate @@ -56,13 +66,17 @@ class LinearRegression( NDArrayFloat, ], ): - r"""Linear regression with multivariate response. + r"""Linear regression with multivariate and functional response. This is a regression algorithm equivalent to multivariate linear regression, but accepting also functional data expressed in a basis expansion. - The model assumed by this method is: + Functional linear regression model is subdivided into three broad + categories, depending on whether the responses or the covariates, + or both, are curves. + + Particulary, when the response is scalar, the model assumed is: .. math:: y = w_0 + w_1 x_1 + \ldots + w_p x_p + \int w_{p+1}(t) x_{p+1}(t) dt \ @@ -71,24 +85,47 @@ class LinearRegression( where the covariates can be either multivariate or functional and the response is multivariate. + When the response is functional, the model assumed is: + + .. math:: + y(t) = \boldsymbol{X} \boldsymbol{\beta}(t) + + where the covariates are multivariate and the response is functional; or: + + .. math:: + y(t) = \boldsymbol{X}(t) \boldsymbol{\beta}(t) + + if the model covariates are also functional (concurrent). + + .. deprecated:: 0.8. Usage of arguments of type sequence of FData, ndarray is deprecated in methods fit, predict. Use covariate parameters of type pandas.DataFrame instead. - .. warning:: - For now, only scalar responses are supported. + Warning: + For now, only multivariate and concurrent convariates are supported + when the response is functional. + + Warning: + This functionality is still experimental. Users should be aware that + the API will suffer heavy changes in the future. Args: coef_basis (iterable): Basis of the coefficient functions of the - functional covariates. If multivariate data is supplied, their - corresponding entries should be ``None``. 
If ``None`` is provided - for a functional covariate, the same basis is assumed. If this - parameter is ``None`` (the default), it is assumed that ``None`` - is provided for all covariates. + functional covariates. + When the response is scalar, if multivariate data is supplied, + their corresponding entries should be ``None``. If ``None`` is + provided for a functional covariate, the same basis is assumed. + If this parameter is ``None`` (the default), it is assumed that + ``None`` is provided for all covariates. + When the response is functional, if ``None`` is provided, the + response basis is used for all covariates. fit_intercept: Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). + When the response is functional, a coef_basis for intercept must be + supplied. regularization (int, iterable or :class:`Regularization`): If it is not a :class:`Regularization` object, linear differential operator regularization is assumed. If it @@ -104,118 +141,160 @@ class LinearRegression( Attributes: coef\_: A list containing the weight coefficient for each - covariate. For multivariate data, the covariate is a Numpy array. - For functional data, the covariate is a FDataBasis object. + covariate. intercept\_: Independent term in the linear model. Set to 0.0 if `fit_intercept = False`. Examples: - >>> from skfda.ml.regression import LinearRegression - >>> from skfda.representation.basis import (FDataBasis, MonomialBasis, - ... ConstantBasis) - >>> import pandas as pd - Multivariate linear regression can be used with functions expressed in + Functional linear regression can be used with functions expressed in a basis. Also, a functional basis for the weights can be specified: + >>> from skfda.ml.regression import LinearRegression + >>> from skfda.representation.basis import ( + ... FDataBasis, + ... MonomialBasis, + ... ConstantBasis, + ... ) + >>> x_basis = MonomialBasis(n_basis=3) - >>> x_fd = FDataBasis(x_basis, [[0, 0, 1], - ... [0, 1, 0], - ... [0, 1, 1], - ... [1, 0, 1]]) - >>> y = [2, 3, 4, 5] - >>> linear = LinearRegression() - >>> _ = linear.fit(x_fd, y) - >>> linear.coef_[0] + >>> X_train = FDataBasis( + ... basis=x_basis, + ... coefficients=[ + ... [0, 0, 1], + ... [0, 1, 0], + ... [0, 1, 1], + ... [1, 0, 1], + ... ], + ... ) + >>> y_train = [2, 3, 4, 5] + >>> X_test = X_train # Just for illustration purposes + >>> linear_reg = LinearRegression() + >>> _ = linear_reg.fit(X_train, y_train) + >>> linear_reg.coef_[0] FDataBasis( basis=MonomialBasis(domain_range=((0.0, 1.0),), n_basis=3), coefficients=[[-15. 96. -90.]], ...) - >>> linear.intercept_ + >>> linear_reg.intercept_ array([ 1.]) - >>> linear.predict(x_fd) + >>> linear_reg.predict(X_test) array([ 2., 3., 4., 5.]) - Covariates can include also multivariate data: + Covariates can include also multivariate data. + In order to mix functional and multivariate data a dataframe can + be used: + >>> import pandas as pd >>> x_basis = MonomialBasis(n_basis=2) - >>> x_fd = FDataBasis(x_basis, [[0, 2], - ... [0, 4], - ... [1, 0], - ... [2, 0], - ... [1, 2], - ... [2, 2]]) - >>> x = [[1, 7], [2, 3], [4, 2], [1, 1], [3, 1], [2, 5]] - >>> y = [11, 10, 12, 6, 10, 13] - >>> linear = LinearRegression( - ... coef_basis=[None, ConstantBasis()]) - >>> _ = linear.fit([x, x_fd], y) - >>> linear.coef_[0] - array([ 2., 1.]) - >>> linear.coef_[1] + >>> X_train = pd.DataFrame({ + ... "functional_covariate": FDataBasis( + ... basis=x_basis, + ... coefficients=[ + ... 
[0, 2], + ... [0, 4], + ... [1, 0], + ... [2, 0], + ... [1, 2], + ... [2, 2], + ... ] + ... ), + ... "multivariate_covariate_1": [1, 2, 4, 1, 3, 2], + ... "multivariate_covariate_2": [7, 3, 2, 1, 1, 5], + ... }) + >>> y_train = [11, 10, 12, 6, 10, 13] + >>> X_test = X_train # Just for illustration purposes + >>> linear_reg = LinearRegression( + ... coef_basis=[ConstantBasis(), None, None], + ... ) + >>> _ = linear_reg.fit(X_train, y_train) + >>> linear_reg.coef_[0] FDataBasis( - basis=ConstantBasis(domain_range=((0.0, 1.0),), n_basis=1), - coefficients=[[ 1.]], - ...) - >>> linear.intercept_ + basis=ConstantBasis(domain_range=((0.0, 1.0),), n_basis=1), + coefficients=[[ 1.]], + ...) + >>> linear_reg.coef_[1] + array([ 2.]) + >>> linear_reg.coef_[2] + array([ 1.]) + >>> linear_reg.intercept_ array([ 1.]) - >>> linear.predict([x, x_fd]) + >>> linear_reg.predict(X_test) array([ 11., 10., 12., 6., 10., 13.]) - Funcionality with pandas Dataframe. - - First example: - - >>> x_basis = MonomialBasis(n_basis=3) - >>> x_fd = FDataBasis(x_basis, [[0, 0, 1], - ... [0, 1, 0], - ... [0, 1, 1], - ... [1, 0, 1]]) - >>> cov_dict = { "fd": x_fd } - >>> y = [2, 3, 4, 5] - >>> df = pd.DataFrame(cov_dict) - >>> linear = LinearRegression() - >>> _ = linear.fit(df, y) - >>> linear.coef_[0] + Response can be functional when covariates are multivariate: + + >>> y_basis = MonomialBasis(n_basis=3) + >>> X_train = pd.DataFrame({ + ... "covariate_1": [3, 5, 3], + ... "covariate_2": [4, 1, 2], + ... "covariate_3": [1, 6, 8], + ... }) + >>> y_train = FDataBasis( + ... basis=y_basis, + ... coefficients=[ + ... [47, 22, 24], + ... [43, 47, 39], + ... [40, 53, 51], + ... ], + ... ) + >>> X_test = X_train # Just for illustration purposes + >>> linear_reg = LinearRegression( + ... regularization=None, + ... fit_intercept=False, + ... ) + >>> _ = linear_reg.fit(X_train, y_train) + >>> linear_reg.coef_[0] FDataBasis( basis=MonomialBasis(domain_range=((0.0, 1.0),), n_basis=3), - coefficients=[[-15. 96. -90.]], + coefficients=[[ 6. 3. 1.]], ...) - >>> linear.intercept_ - array([ 1.]) - >>> linear.predict(df) - array([ 2., 3., 4., 5.]) + >>> linear_reg.predict(X_test) + FDataBasis( + basis=MonomialBasis(domain_range=((0.0, 1.0),), n_basis=3), + coefficients=[[ 47. 22. 24.] + [ 43. 47. 39.] + [ 40. 53. 51.]], + ...) - Second example: + Concurrent function-on-function regression is also supported: - >>> x_basis = MonomialBasis(n_basis=2) - >>> x_fd = FDataBasis(x_basis, [[0, 2], - ... [0, 4], - ... [1, 0], - ... [2, 0], - ... [1, 2], - ... [2, 2]]) - >>> mult1 = np.asarray([1, 2, 4, 1, 3, 2]) - >>> mult2 = np.asarray([7, 3, 2, 1, 1, 5]) - >>> cov_dict = {"m1": mult1, "m2": mult2, "fd": x_fd} - >>> df = pd.DataFrame(cov_dict) - >>> y = [11, 10, 12, 6, 10, 13] - >>> linear = LinearRegression( - ... coef_basis=[None, ConstantBasis(), ConstantBasis()]) - >>> _ = linear.fit(df, y) - >>> linear.coef_[0] - array([ 2.]) - >>> linear.coef_[1] - array([ 1.]) - >>> linear.coef_[2] + >>> x_basis = MonomialBasis(n_basis=3) + >>> y_basis = MonomialBasis(n_basis=2) + >>> X_train = FDataBasis( + ... basis=x_basis, + ... coefficients=[ + ... [0, 1, 0], + ... [2, 1, 0], + ... [5, 0, 1], + ... ], + ... ) + >>> y_train = FDataBasis( + ... basis=y_basis, + ... coefficients=[ + ... [1, 1], + ... [2, 1], + ... [3, 1], + ... ], + ... ) + >>> X_test = X_train # Just for illustration purposes + >>> linear_reg = LinearRegression( + ... coef_basis=[y_basis], + ... fit_intercept=False, + ... 
) + >>> _ = linear_reg.fit(X_train, y_train) + >>> linear_reg.coef_[0] FDataBasis( - basis=ConstantBasis(domain_range=((0.0, 1.0),), n_basis=1), - coefficients=[[ 1.]], + basis=MonomialBasis(domain_range=((0.0, 1.0),), n_basis=2), + coefficients=[[ 0.68250377 0.09960705]], ...) - >>> linear.intercept_ - array([ 1.]) - >>> linear.predict(df) - array([ 11., 10., 12., 6., 10., 13.]) + >>> linear_reg.predict(X_test) + FDataBasis( + basis=MonomialBasis(domain_range=((0.0, 1.0),), n_basis=2), + coefficients=[[-0.01660117 0.78211082] + [ 1.34840637 0.98132492] + [ 3.27884682 1.27018536]], + ...) """ @@ -233,7 +312,7 @@ def __init__( def fit( # noqa: D102 self, X: Union[AcceptedDataType, Sequence[AcceptedDataType], pd.DataFrame], - y: NDArrayFloat, + y: AcceptedDataType, sample_weight: Optional[NDArrayFloat] = None, ) -> LinearRegression: @@ -246,30 +325,33 @@ def fit( # noqa: D102 regularization: RegularizationIterableType = self.regularization + coef_basis = self._coef_basis + if self.fit_intercept: new_x = np.ones((len(y), 1)) X_new = [new_x] + list(X_new) + + intercept_basis = self._choose_beta_basis( + coef_basis=None, + x_basis=None, + y_basis=y.basis if isinstance(y, FDataBasis) else None, + ) + coef_basis = [intercept_basis] + coef_basis + new_coef_info_list: List[AcceptedDataCoefsType] = [ - coefficient_info_from_covariate(new_x, y), + coefficient_info_from_covariate( + new_x, + y, + basis=intercept_basis, + ), ] coef_info = new_coef_info_list + list(coef_info) - if isinstance(regularization, Iterable): - regularization = itertools.chain([None], regularization) - elif regularization is not None: - regularization = (None, regularization) - - inner_products_list = [ - c.regression_matrix(x, y) # type: ignore[arg-type] - for x, c in zip(X_new, coef_info) - ] - - # This is C @ J - inner_products = np.concatenate(inner_products_list, axis=1) - - if sample_weight is not None: - inner_products = inner_products * np.sqrt(sample_weight) - y = y * np.sqrt(sample_weight) + if not self._functional_response: + if isinstance(regularization, Iterable): + regularization = itertools.chain([None], regularization) + elif regularization is not None: + regularization = (None, regularization) penalty_matrix = compute_penalty_matrix( basis_iterable=(c.basis for c in coef_info), @@ -277,25 +359,80 @@ def fit( # noqa: D102 regularization=regularization, ) - if self.fit_intercept and penalty_matrix is not None: - # Intercept is not penalized - penalty_matrix[0, 0] = 0 + if self._functional_response: + coef_lengths = [] + left_inner_products_list = [] + right_inner_products_list = [] + + Xt = self._make_transpose(X_new) + + for i, basis_i in enumerate(coef_basis): + coef_lengths.append(basis_i.n_basis) + row = [] + right_inner_products_list.append( + self._weighted_inner_product_integrate( + basis_i, + ConstantBasis(), + Xt[i], + y, + ), + ) + for j, basis_j in enumerate(coef_basis): + row.append( + self._weighted_inner_product_integrate( + basis_i, + basis_j, + Xt[i], + Xt[j], + ), + ) + left_inner_products_list.append(row) + + coef_lengths.pop() + left_inner_products = np.block(left_inner_products_list) + right_inner_products = np.concatenate(right_inner_products_list) + + if penalty_matrix is not None: + left_inner_products += penalty_matrix + + basiscoefs = np.linalg.solve( + left_inner_products, + right_inner_products, + ) + else: + if self.fit_intercept and penalty_matrix is not None: + # Intercept is not penalized + penalty_matrix[0, 0] = 0 - basiscoefs = solve_regularized_weighted_lstsq( - 
coefs=inner_products, - result=y, - penalty_matrix=penalty_matrix, - ) + inner_products_list = [ + c.regression_matrix(x, y) # type: ignore[arg-type] + for x, c in zip(X_new, coef_info) + ] + + # This is C @ J + inner_products = np.concatenate(inner_products_list, axis=1) + + if sample_weight is not None: + inner_products = inner_products * np.sqrt(sample_weight) + y = y * np.sqrt(sample_weight) + + basiscoefs = solve_regularized_weighted_lstsq( + coefs=inner_products, + result=y, + penalty_matrix=penalty_matrix, + ) + + coef_lengths = np.array([k.shape[1] for k in inner_products_list]) - coef_lengths = np.array([i.shape[1] for i in inner_products_list]) coef_start = np.cumsum(coef_lengths) basiscoef_list = np.split(basiscoefs, coef_start) # Express the coefficients in functional form - coefs = [ - c.convert_from_constant_coefs(bcoefs) - for c, bcoefs in zip(coef_info, basiscoef_list) - ] + coefs = self._convert_from_constant_coefs( + basiscoef_list, + coef_info, + coef_basis, + ) if self.fit_intercept: self.intercept_ = coefs[0] @@ -304,6 +441,7 @@ def fit( # noqa: D102 self.intercept_ = np.zeros(1) self.coef_ = coefs + self.basis_coefs = basiscoef_list self._coef_info = coef_info self._target_ndim = y.ndim @@ -311,27 +449,44 @@ def fit( # noqa: D102 def predict( # noqa: D102 self, - X: Union[AcceptedDataType, Sequence[AcceptedDataType], pd.DataFrame], + X: Union[Sequence[AcceptedDataType], pd.DataFrame], ) -> NDArrayFloat: check_is_fitted(self) X = self._argcheck_X(X) + result = [] - result = np.sum( - [ - coef_info.inner_product(coef, x) # type: ignore[arg-type] - for coef, x, coef_info - in zip(self.coef_, X, self._coef_info) - ], - axis=0, - ) + for coef, x, coef_info in zip(self.coef_, X, self._coef_info): + if self._functional_response: + def prediction(arg, x_eval=x, coef_eval=coef): # noqa: WPS430 + if isinstance(x_eval, Callable): + x_eval = x_eval(arg) # noqa: WPS220 + + # TODO: Review this carefully + x_eval = x_eval.reshape((-1, 1, 1)) - result += self.intercept_ + return coef_eval(arg) * x_eval + + result.append( + function_to_fdatabasis(prediction, self._y_basis), + ) + else: + result.append(coef_info.inner_product(coef, x)) + + result = sum(result[1:], start=result[0]) + + if self.fit_intercept: + if self._functional_response: + result = result + function_to_fdatabasis( + self.intercept_, self._y_basis, + ) + else: + result += self.intercept_ if self._target_ndim == 1: result = result.ravel() - return result # type: ignore[no-any-return] + return result def _argcheck_X( # noqa: N802 self, @@ -352,15 +507,92 @@ def _argcheck_X( # noqa: N802 elif isinstance(X, pd.DataFrame): X = self._dataframe_conversion(X) - X = [ + return ([ x if isinstance(x, FData) else self._check_and_convert(x) for x in X - ] + ]) + + def _weighted_inner_product_integrate( + self, + basis: Union[FDataBasis, Basis], + transposed_basis: Union[FDataBasis, Basis], + transposed_weight: Union[FDataBasis, NDArrayFloat], + weight: Union[FDataBasis, NDArrayFloat], + ) -> NDArrayFloat: + r""" + Return the weighted inner product matrix between its arguments. + + For two bases (:math:`\theta_1`) and (:math:`\theta_2`) the weighted + inner product is defined as: + + .. math:: + \int \boldsymbol{\theta}_1 (t) \boldsymbol{\theta}^T_2 (t) w(t) dt + + where w(t) is defined by a product of functional vectors: + + .. math:: + \boldsymbol{w}^T_1 (t) \boldsymbol{w}_2 (t) + + Args: + basis: Vertical basis vector. + transposed_basis: Horizontal basis vector. + transposed_weight: First weight, horizontal functional vector.
+ weight: Second weight, vertical functional vector. + + Returns: + Inner product matrix between basis and weight. + + """ + if isinstance(basis, Basis): + basis = basis.to_basis() + + if isinstance(transposed_basis, Basis): + transposed_basis = transposed_basis.to_basis() + + if isinstance(transposed_weight, FData) and not np.array_equal( + basis.domain_range, + transposed_weight.domain_range, + ): + raise ValueError("Domain range for weight and basis must be equal") - if all(not isinstance(i, FData) for i in X): - warnings.warn("All the covariates are scalar.") + domain_range = basis.domain_range - return X + def integrand(args): # noqa: WPS430 + eval_basis = basis(args)[:, 0, :] + eval_transposed_basis = transposed_basis(args)[:, 0, :] + + if isinstance(transposed_weight, FData): + eval_transposed_weight = transposed_weight(args)[:, 0, :] + else: + eval_transposed_weight = transposed_weight.T + + if isinstance(weight, FData): + eval_weight = weight(args)[:, 0, :] + else: + eval_weight = weight.T + + return eval_basis * eval_transposed_basis.T * np.dot( + eval_transposed_weight.T, eval_weight, + ) + + return nquad_vec( + integrand, + domain_range, + ) + + def _make_transpose( + self, + X: Sequence[AcceptedDataType], + ) -> Sequence[AcceptedDataType]: + Xt: list[AcceptedDataType] = [] + for x in X: + if isinstance(x, FData): + Xt.append(x) + else: + x_new = self._check_and_convert(x) + Xt.append(x_new.T) + + return Xt def _check_and_convert( self, @@ -379,37 +611,70 @@ def _check_and_convert( new_X = new_X[:, np.newaxis] return new_X + def _convert_from_constant_coefs( + self, + coef_list: Sequence[NDArrayFloat], + coef_info: Sequence[AcceptedDataCoefsType], + coef_basis: Sequence[Basis], + ) -> Sequence[FDataBasis]: + if self._functional_response: + return [ + FDataBasis(basis=basis, coefficients=coef.T) + for basis, coef in zip(coef_basis, coef_list) + ] + return [ + c.convert_from_constant_coefs(bcoefs) + for c, bcoefs in zip(coef_info, coef_list) + ] + def _argcheck_X_y( # noqa: N802 self, X: Union[AcceptedDataType, Sequence[AcceptedDataType], pd.DataFrame], - y: NDArrayFloat, + y: Union[AcceptedDataType, Sequence[AcceptedDataType]], sample_weight: Optional[NDArrayFloat] = None, coef_basis: Optional[BasisCoefsType] = None, ) -> ArgcheckResultType: """Do some checks to types and shapes.""" new_X = self._argcheck_X(X) - if any(isinstance(i, FData) for i in y): - raise ValueError( - "Some of the response variables are not scalar", - ) + len_new_X = len(new_X) - y = np.asarray(y) + self._y_basis = y.basis if isinstance(y, FDataBasis) else None + + if isinstance(y, FDataBasis): + self._functional_response = True + else: + if any(len(y) != len(x) for x in new_X): + raise ValueError( + "The number of samples on independent and " + "dependent variables should be the same", + ) + self._functional_response = False + y = np.asarray(y) if coef_basis is None: - coef_basis = [None] * len(new_X) + coef_basis = [None] * len_new_X + + if len(coef_basis) == 1 and len_new_X > 1: + # we assume basis objects are inmmutable + coef_basis = [coef_basis[0]] * len_new_X - if len(coef_basis) != len(new_X): + if len_new_X != len(coef_basis): raise ValueError( - "Number of regression coefficients does " - "not match number of independent variables.", + "The number of covariates and coefficients " + "basis should be the same", ) - if any(len(y) != len(x) for x in new_X): - raise ValueError( - "The number of samples on independent and " - "dependent variables should be the same", + coef_basis = [ + 
self._choose_beta_basis( + coef_basis=c_basis, + x_basis=x.basis if isinstance(x, FDataBasis) else None, + y_basis=self._y_basis, ) + for c_basis, x in zip(coef_basis, new_X) + ] + + self._coef_basis = coef_basis coef_info = [ coefficient_info_from_covariate(x, y, basis=b) @@ -421,6 +686,19 @@ def _argcheck_X_y( # noqa: N802 return new_X, y, sample_weight, coef_info + def _choose_beta_basis( + self, + coef_basis: Basis | None = None, + x_basis: Basis | None = None, + y_basis: Basis | None = None, + ) -> Basis | None: + if coef_basis is not None: + return coef_basis + elif y_basis is None: + return x_basis + else: + return y_basis + def _sample_weight_check( self, sample_weight: Optional[NDArrayFloat], diff --git a/skfda/preprocessing/dim_reduction/__init__.py b/skfda/preprocessing/dim_reduction/__init__.py index b0e01058d..5c8ae8405 100644 --- a/skfda/preprocessing/dim_reduction/__init__.py +++ b/skfda/preprocessing/dim_reduction/__init__.py @@ -13,13 +13,17 @@ ], submod_attrs={ "_fpca": ["FPCA"], - "_neighbor_transforms": ["KNeighborsTransformer"] + "_fpls": ["FPLS"], + "_neighbor_transforms": ["KNeighborsTransformer"], }, ) if TYPE_CHECKING: from ._fpca import FPCA as FPCA - from ._neighbor_transforms import KNeighborsTransformer as KNeighborsTransformer + from ._fpls import FPLS as FPLS + from ._neighbor_transforms import ( + KNeighborsTransformer as KNeighborsTransformer, + ) def __getattr__(name: str) -> Any: diff --git a/skfda/preprocessing/dim_reduction/_fpca.py b/skfda/preprocessing/dim_reduction/_fpca.py index f5c88a54a..9110f31c0 100644 --- a/skfda/preprocessing/dim_reduction/_fpca.py +++ b/skfda/preprocessing/dim_reduction/_fpca.py @@ -348,7 +348,7 @@ def _fit_grid( if self._weights is None: # grid_points is a list with one array in the 1D case identity = np.eye(len(X.grid_points[0])) - self._weights = scipy.integrate.simps(identity, X.grid_points[0]) + self._weights = scipy.integrate.simpson(identity, X.grid_points[0]) elif callable(self._weights): self._weights = self._weights(X.grid_points[0]) # if its a FDataGrid then we need to reduce the dimension to 1-D diff --git a/skfda/preprocessing/dim_reduction/_fpls.py b/skfda/preprocessing/dim_reduction/_fpls.py new file mode 100644 index 000000000..8641f037a --- /dev/null +++ b/skfda/preprocessing/dim_reduction/_fpls.py @@ -0,0 +1,886 @@ +from __future__ import annotations + +import warnings +from abc import abstractmethod +from typing import Any, Generic, Literal, Optional, Tuple, TypeVar, Union, cast + +import numpy as np +import scipy +from sklearn.utils.validation import check_is_fitted + +from ..._utils._sklearn_adapter import BaseEstimator +from ...misc.regularization import L2Regularization, compute_penalty_matrix +from ...representation import FDataGrid +from ...representation.basis import Basis, FDataBasis, _GridBasis +from ...typing._numpy import NDArrayFloat + + +def _power_solver( + X: NDArrayFloat, + tol: float, + max_iter: int, +) -> NDArrayFloat: + """Return the dominant eigenvector of a matrix using the power method.""" + t = X[:, 0] + t_prev = np.ones(t.shape) * max(np.max(t), 1) * 2 + iter_count = 0 + while np.linalg.norm(t - t_prev) > tol: + t_prev = t + t = X @ t + t /= np.linalg.norm(t) + iter_count += 1 + if iter_count > max_iter: + break + return t + + +def _calculate_weights( + X: NDArrayFloat, + Y: NDArrayFloat, + G_ww: NDArrayFloat, + G_xw: NDArrayFloat, + G_cc: NDArrayFloat, + G_yc: NDArrayFloat, + L_X_inv: NDArrayFloat, + L_Y_inv: NDArrayFloat, + tol: float, + max_iter: int, +) -> Tuple[NDArrayFloat, 
NDArrayFloat]: + """ + Calculate the weights for the PLS algorithm. + + Parameters: + - X: (n_samples, n_features) + The X block data matrix. + - Y: (n_samples, n_targets) + The Y block data matrix. + + - G_ww: (n_features, n_features) + The inner product matrix for the X block weights + (The discretization weights matrix in the case of FDataGrid). + - G_xw: (n_features, n_features) + The inner product matrix for the X block data and weights + (The discretization weights matrix in the case of FDataGrid). + + - G_cc: (n_targets, n_targets) + The inner product matrix for the Y block weights + (The discretization weights matrix in the case of FDataGrid). + - G_yc: (n_targets, n_targets) + The inner product matrix for the Y block data and weights + (The discretization weights matrix in the case of FDataGrid). + + - L_X_inv: (n_features, n_features) + The inverse of the Cholesky decomposition: + L_X @ L_X.T = G_ww + P_x, + where P_x is the penalty matrix for the X block. + - L_Y_inv: (n_targets, n_targets) + The inverse of the Cholesky decomposition: + L_Y @ L_Y.T = G_cc + P_y, + where P_y is the penalty matrix for the Y block. + + - tol: The tolerance for the power method. + - max_iter: The maximum number of iterations for the power method. + Returns: + - w: (n_features, 1) + The X block weights. + - c: (n_targets, 1) + The Y block weights. + """ + X = X @ G_xw @ L_X_inv.T + Y = Y @ G_yc @ L_Y_inv.T + S = X.T @ Y + w = _power_solver( + S @ S.T, + tol=tol, + max_iter=max_iter, + ) + + # Calculate the other weight + c = np.dot(Y.T, np.dot(X, w)) + + # Undo the transformation + w = L_X_inv.T @ w + + # Normalize + w /= np.sqrt(np.dot(w.T, G_ww @ w)) + + # Undo the transformation + c = L_Y_inv.T @ c + + # Normalize the other weight + c /= np.sqrt(np.dot(c.T, G_cc @ c)) + + return w, c + + +BlockType = TypeVar( + "BlockType", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) + + +# Ignore too many public instance attributes +class _FPLSBlock(Generic[BlockType]): # noqa: WPS230 + """ + Class to store the data of a block of a FPLS model. + + This is an internal class, intended to be used only by the FPLS class. + It provides a common interface for the different types of blocks, + simplifying the implementation of the FPLS algorithm. + + There are three types of blocks (depending on BlockType): + multivariate (NDArrayFloat), basis (FDataBasis) and grid (FDataGrid). + + In the following, n_samples is the number of samples of the block. + n_features is: + - The number of features of the block in the case of multivariate + block. + - The number of basis functions in the case of a FDataBasis block. + - The number of points in the case of a FDataGrid block. + + Parameters: + - data: The data of the block. + - label: Label of the block (X or Y). + - integration_weights: Array with shape (n_features,). + The integration weights of the block. It must be None for + multivariate or FDataBasis blocks. + - regularization: The regularization to apply to the block. + It must be None for multivariate blocks. + - weights_basis: The basis of the weights. It must be None for + multivariate or grid blocks. + + Attributes: + - label: Label of the block (X or Y). + - data: The data of the block. + - data_matrix: (n_samples, n_features) matrix. The data + matrix of the block. + - mean: The mean of the data. + - weights_basis: The basis of the weights. + - regularization_matrix: (n_features, n_features) matrix. + Inner product matrix of the regularization operator applied + to the basis or the grid.
+ - integration_matrix: (n_features, n_features) matrix. + Diagonal matrix of the integration weights. + - G_data_weights: (n_samples, n_samples) matrix. The inner + product matrix for the data and weights + (The discretization matrix in grid blocks). + - G_weights: (n_samples, n_samples) matrix. The inner product + matrix for the weights (The discretization matrix in grid blocks). + + - rotations: (n_features, n_components) matrix. The + rotations of the block. + - loadings_matrix: (n_features, n_components) matrix. The + loadings of the block. + - loadings: The loadings of the block (same type as the data). + + """ + + # Attributes that must be defined in the subclasses + mean: BlockType + data_matrix: NDArrayFloat + G_weights: NDArrayFloat + G_data_weights: NDArrayFloat + regularization_matrix: NDArrayFloat + + def set_nipals_results( + self, + rotations: NDArrayFloat, + loadings: NDArrayFloat, + ) -> None: + """Set the results of NIPALS.""" + self.rotations_matrix = rotations + self.loadings_matrix = loadings + self.rotations = self._to_block_type(self.rotations_matrix, "rotation") + self.loadings = self._to_block_type(self.loadings_matrix, "loading") + + @abstractmethod + def _to_block_type( + self, + nipals_matrix: NDArrayFloat, + title: str, + ) -> BlockType: + pass + + @abstractmethod + def transform( + self, + data: BlockType, + ) -> NDArrayFloat: + """Transform from the data space to the component space.""" + pass + + @abstractmethod + def inverse_transform( + self, + components: NDArrayFloat, + ) -> BlockType: + """Transform from the component space to the data space.""" + pass + + def get_penalty_matrix(self) -> NDArrayFloat: + """Return the penalty matrix.""" + return self.G_weights + self.regularization_matrix + + def get_cholesky_inv_penalty_matrix(self) -> NDArrayFloat: + """Return the Cholesky decomposition of the penalty matrix.""" + return np.linalg.inv(np.linalg.cholesky(self.get_penalty_matrix())) + + +class _FPLSBlockMultivariate(_FPLSBlock[NDArrayFloat]): + def __init__( + self, + data: NDArrayFloat, + label: str, + ) -> None: + self.label = label + if len(data.shape) == 1: + data = data[:, np.newaxis] + + self.G_data_weights = np.identity(data.shape[1]) + self.G_weights = np.identity(data.shape[1]) + + self.mean = np.mean(data, axis=0) + self.data_matrix = data - self.mean + self.data = data - self.mean + + self.regularization_matrix = np.zeros( + (data.shape[1], data.shape[1]), + ) + + def _to_block_type( + self, + nipals_matrix: NDArrayFloat, + title: str, + ) -> NDArrayFloat: + return nipals_matrix.T + + def transform( + self, + data: NDArrayFloat, + ) -> NDArrayFloat: + """Transform from the data space to the component space.""" + data_array = data + if len(data_array.shape) == 1: + data_array = data_array[:, np.newaxis] + data_array = data_array - self.mean + return data_array @ self.rotations_matrix + + def inverse_transform( + self, + components: NDArrayFloat, + ) -> NDArrayFloat: + """Transform from the component space to the data space.""" + reconstructed = components @ self.loadings_matrix.T + reconstructed += self.mean + return reconstructed + + +class _FPLSBlockBasis(_FPLSBlock[FDataBasis]): + def __init__( + self, + data: FDataBasis, + label: str, + regularization: Optional[L2Regularization[Any]], + weights_basis: Optional[Basis], + ) -> None: + """Initialize the data of a basis block.""" + self.label = label + self.mean = data.mean() + self.data = data - self.mean + self.data_matrix = self.data.coefficients + + # By default, use the basis of the input 
data + # for the weights + if weights_basis is None: + self.weights_basis = data.basis + else: + self.weights_basis = weights_basis + + # Compute the regularization matrix + # By default, all zeros (no regularization) + regularization_matrix = None + if regularization is not None: + regularization_matrix = compute_penalty_matrix( + basis_iterable=(self.weights_basis,), + regularization_parameter=1, + regularization=regularization, + ) + if regularization_matrix is None: + regularization_matrix = np.zeros( + (self.data_matrix.shape[1], self.data_matrix.shape[1]), + ) + + self.regularization_matrix = regularization_matrix + self.G_weights = self.weights_basis.gram_matrix() + self.G_data_weights = self.data.basis.inner_product_matrix( + self.weights_basis, + ) + + def _to_block_type( + self, + nipals_matrix: NDArrayFloat, + title: str, + ) -> FDataBasis: + # Each column of the matrix generated by NIPALS corresponds to + # an obsevation or direction. Therefore, they must be transposed + # so that each row corresponds ot an observation or direction + basis = self.weights_basis if title == "rotation" else self.data.basis + return FDataBasis( + coefficients=nipals_matrix.T, + basis=basis, + sample_names=[ + f"{title.capitalize()} {i}" + for i in range(nipals_matrix.shape[1]) + ], + coordinate_names=(f"FPLS {self.label} {title} value",), + dataset_name=f"FPLS {self.label} {title}s", + ) + + def transform( + self, + data: FDataBasis, + ) -> NDArrayFloat: + """Transform from the data space to the component space.""" + data_basis = data - self.mean + return ( + data_basis.coefficients + @ self.G_data_weights + @ self.rotations_matrix + ) + + def inverse_transform( + self, + components: NDArrayFloat, + ) -> FDataBasis: + """Transform from the component space to the data space.""" + reconstructed_basis = self.data.copy( + coefficients=components @ self.loadings_matrix.T, + sample_names=(None,) * components.shape[0], + dataset_name=f"FPLS {self.label} inverse transformed", + ) + return reconstructed_basis + self.mean + + +class _FPLSBlockGrid(_FPLSBlock[FDataGrid]): + def __init__( + self, + data: FDataGrid, + label: str, + integration_weights: Optional[np.ndarray], # type: ignore[type-arg] + regularization: Optional[L2Regularization[Any]], + ) -> None: + """Initialize the data of a grid block.""" + self.label = label + + self.mean = data.mean() + self.data = data - self.mean + self.data_matrix = data.data_matrix[..., 0] + + # Arrange the integration weights in a diagonal matrix + # By default, use Simpson's rule + if integration_weights is None: + identity = np.identity(self.data_matrix.shape[1]) + integration_weights = scipy.integrate.simps( + identity, + self.data.grid_points[0], + ) + self.integration_weights = np.diag(integration_weights) + + # Compute the regularization matrix + # By default, all zeros (no regularization) + regularization_matrix = None + if regularization is not None: + regularization_matrix = compute_penalty_matrix( + basis_iterable=( + _GridBasis(grid_points=self.data.grid_points), + ), + regularization_parameter=1, + regularization=regularization, + ) + if regularization_matrix is None: + regularization_matrix = np.zeros( + (self.data_matrix.shape[1], self.data_matrix.shape[1]), + ) + + self.regularization_matrix = regularization_matrix + self.G_data_weights = self.integration_weights + self.G_weights = self.integration_weights + + def _to_block_type( + self, + nipals_matrix: NDArrayFloat, + title: str, + ) -> FDataGrid: + # Each column of the matrix generated by NIPALS 
corresponds to + # an observation or direction. Therefore, they must be transposed + # so that each row corresponds to an observation or direction + return FDataGrid( + data_matrix=nipals_matrix.T, + sample_names=[ + f"{title.capitalize()} {i}" + for i in range(nipals_matrix.shape[1]) + ], + coordinate_names=(f"FPLS {self.label} {title} value",), + dataset_name=f"FPLS {self.label} {title}s", + grid_points=self.data.grid_points[0], + ) + + def transform( + self, + data: FDataGrid, + ) -> NDArrayFloat: + """Transform from the data space to the component space.""" + data_grid = data - self.mean + return ( + data_grid.data_matrix[..., 0] + @ self.G_data_weights + @ self.rotations_matrix + ) + + def inverse_transform( + self, + components: NDArrayFloat, + ) -> FDataGrid: + """Transform from the component space to the data space.""" + reconstructed_grid = self.data.copy( + data_matrix=components @ self.loadings_matrix.T, + sample_names=(None,) * components.shape[0], + dataset_name=f"FPLS {self.label} inverse transformed", + ) + return reconstructed_grid + self.mean + + +def _fpls_block_factory( + data: BlockType, + label: str, + integration_weights: NDArrayFloat | None, + regularization: L2Regularization[Any] | None, + weights_basis: Basis | None, +) -> _FPLSBlock[BlockType]: + if isinstance(data, np.ndarray): + return cast( + _FPLSBlock[BlockType], + _FPLSBlockMultivariate( + data=data, + label=label, + ), + ) + elif isinstance(data, FDataBasis): + return cast( + _FPLSBlock[BlockType], + _FPLSBlockBasis( + data=data, + label=label, + regularization=regularization, + weights_basis=weights_basis, + ), + ) + elif isinstance(data, FDataGrid): + return cast( + _FPLSBlock[BlockType], + _FPLSBlockGrid( + data=data, + label=label, + integration_weights=integration_weights, + regularization=regularization, + ), + ) + + raise TypeError("Invalid type for data") + + +InputTypeX = TypeVar( + "InputTypeX", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) +InputTypeY = TypeVar( + "InputTypeY", + bound=Union[FDataGrid, FDataBasis, NDArrayFloat], +) + +DeflationMode = Literal["reg", "can"] + + +# Ignore too many public instance attributes +class FPLS( # noqa: WPS230 + BaseEstimator, + Generic[InputTypeX, InputTypeY], +): + r""" + Functional Partial Least Squares. + + This is a generic class. When instantiated, the type of the + data in each block can be specified. The possibilities are: + NDArrayFloat, FDataGrid and FDataBasis. + + Parameters: + n_components: Number of components to extract. By default, the + maximum number of components is extracted. + regularization_X: Regularization to apply to the X block. + regularization_Y: Regularization to apply to the Y block. + component_basis_X: Basis to use for the X block. Only + applicable if X is an FDataBasis. Otherwise it must be None. + component_basis_Y: Basis to use for the Y block. Only + applicable if Y is an FDataBasis. Otherwise it must be None. + tol: Tolerance used to check convergence of the NIPALS algorithm. + max_iter: Maximum number of NIPALS iterations. + _deflation_mode: Mode to use for deflation. Can be "can" + (dimensionality reduction) or "reg" (regression). + + Attributes: + x_weights\_: (n_features_X, n_components) array with the X weights + extracted by NIPALS. + y_weights\_: (n_features_Y, n_components) array with the Y weights + extracted by NIPALS. + x_scores\_: (n_samples, n_components) array with the X scores + extracted by NIPALS. + y_scores\_: (n_samples, n_components) array with the Y scores + extracted by NIPALS. + x_rotations_matrix\_: (n_features_X, n_components) array with the + X rotations. 
+ y_rotations_matrix\_: (n_features_Y, n_components) array with the + Y rotations. + x_loadings_matrix\_: (n_features_X, n_components) array with the + X loadings. + y_loadings_matrix\_: (n_features_Y, n_components) array with the + Y loadings. + x_rotations\_: Projection directions for the X block (same type as X). + y_rotations\_: Projection directions for the Y block (same type as Y). + x_loadings\_: Loadings for the X block (same type as X). + y_loadings\_: Loadings for the Y block (same type as Y). + + """ + + # Ignore too many arguments + def __init__( # noqa: WPS211 + self, + n_components: int | None = None, + *, + regularization_X: L2Regularization[InputTypeX] | None = None, + regularization_Y: L2Regularization[InputTypeY] | None = None, + component_basis_X: Basis | None = None, + component_basis_Y: Basis | None = None, + tol: float = 1e-6, + max_iter: int = 500, + _deflation_mode: DeflationMode = "can", + _integration_weights_X: NDArrayFloat | None = None, + _integration_weights_Y: NDArrayFloat | None = None, + ) -> None: + self.n_components = n_components + self._integration_weights_X = _integration_weights_X + self._integration_weights_Y = _integration_weights_Y + self.regularization_X = regularization_X + self.regularization_Y = regularization_Y + self.component_basis_X = component_basis_X + self.component_basis_Y = component_basis_Y + self._deflation_mode = _deflation_mode + self.tol = tol + self.max_iter = max_iter + + def _initialize_blocks(self, X: InputTypeX, Y: InputTypeY) -> None: + self._x_block = _fpls_block_factory( + data=X, + label="X", + integration_weights=self._integration_weights_X, + regularization=self.regularization_X, + weights_basis=self.component_basis_X, + ) + self._y_block = _fpls_block_factory( + data=Y, + label="Y", + integration_weights=self._integration_weights_Y, + regularization=self.regularization_Y, + weights_basis=self.component_basis_Y, + ) + + # Ignore too many local variables + def _perform_nipals(self) -> None: # noqa: WPS210 + X = self._x_block.data_matrix + Y = self._y_block.data_matrix + X = X - np.mean(X, axis=0) + Y = Y - np.mean(Y, axis=0) + + if len(Y.shape) == 1: + Y = Y[:, np.newaxis] + + # Store the matrices as list of columns + W, C = [], [] + T, U = [], [] + P, Q = [], [] + + # Calculate the penalty matrices in advance + L_X_inv = self._x_block.get_cholesky_inv_penalty_matrix() + L_Y_inv = self._y_block.get_cholesky_inv_penalty_matrix() + + # Determine some tolerances to stop the algorithm + x_epsilon = ( + 10 + * np.finfo(X.dtype).eps + * np.abs(self._x_block.data_matrix).mean() + ) + y_epsilon = ( + 10 + * np.finfo(Y.dtype).eps + * np.abs(self._y_block.data_matrix).mean() + ) + + n_comp = 0 + while self.n_components is None or n_comp < self.n_components: + # Stop if either matrix is all zeros + if np.all(X == 0) or np.all(Y == 0): + # If the number of components was specified, warn the user + if self.n_components is not None: + warnings.warn( + f"After extracting {n_comp} components, " + "one of the matrices is completely deflated. 
" + f"The algorithm will return {n_comp} components," + f"instead of {self.n_components}.", + stacklevel=3, + ) + break + + # Increase number of components + n_comp += 1 + + w, c = _calculate_weights( + X, + Y, + G_ww=self._x_block.G_weights, + G_xw=self._x_block.G_data_weights, + G_cc=self._y_block.G_weights, + G_yc=self._y_block.G_data_weights, + L_X_inv=L_X_inv, + L_Y_inv=L_Y_inv, + tol=self.tol, + max_iter=self.max_iter, + ) + + t = np.dot(X @ self._x_block.G_data_weights, w) + u = np.dot(Y @ self._y_block.G_data_weights, c) + + p = np.dot(X.T, t) / (np.dot(t.T, t)) + + y_proyection = t if self._deflation_mode == "reg" else u + + q = np.dot(Y.T, y_proyection) / ( + np.dot(y_proyection, y_proyection) + ) + + X = X - np.outer(t, p) + Y = Y - np.outer(y_proyection, q) + + W.append(w) + C.append(c) + T.append(t) + U.append(u) + P.append(p) + Q.append(q) + + # Set to zero the values that are close to zero + X[abs(X) < x_epsilon] = 0 + Y[abs(Y) < y_epsilon] = 0 + + # Convert each list of columns to a matrix + self.x_weights_ = np.array(W).T + self.y_weights_ = np.array(C).T + self.x_scores_ = np.array(T).T + self.y_scores_ = np.array(U).T + self.x_loadings_matrix_ = np.array(P).T + self.y_loadings_matrix_ = np.array(Q).T + + def fit( + self, + X: InputTypeX, + y: InputTypeY, + ) -> FPLS[InputTypeX, InputTypeY]: + """ + Fit the model using the data for both blocks. + + Args: + X: Data of the X block + y: Data of the Y block + + Returns: + self + """ + self._initialize_blocks(X, y) + + if self.n_components is not None: + # In regression mode, the number of components is limited + # only by the rank of the X data matrix since components are + # only extracted from the X block. + if self._deflation_mode == "reg": + range_upper_bound = min(*self._x_block.data_matrix.shape) + else: + range_upper_bound = min( + *self._x_block.data_matrix.shape, + *self._y_block.data_matrix.shape, + ) + + if self.n_components > range_upper_bound: + raise ValueError( + f"n_components must be less or equal " + f"than {range_upper_bound}", + ) + + self._perform_nipals() + + self.x_rotations_matrix_ = self.x_weights_ @ np.linalg.pinv( + self.x_loadings_matrix_.T + @ self._x_block.G_data_weights + @ self.x_weights_, + ) + + self.y_rotation_matrix_ = self.y_weights_ @ np.linalg.pinv( + self.y_loadings_matrix_.T + @ self._y_block.G_data_weights + @ self.y_weights_, + ) + + self._x_block.set_nipals_results( + rotations=self.x_rotations_matrix_, + loadings=self.x_loadings_matrix_, + ) + self._y_block.set_nipals_results( + rotations=self.y_rotation_matrix_, + loadings=self.y_loadings_matrix_, + ) + + self.x_rotations_ = self._x_block.rotations + self.y_rotations_ = self._y_block.rotations + + self.x_loadings_ = self._x_block.loadings + self.y_loadings_ = self._y_block.loadings + + return self + + def transform( + self, + X: InputTypeX, + y: InputTypeY | None = None, + ) -> NDArrayFloat | Tuple[NDArrayFloat, NDArrayFloat]: + """ + Apply the dimension reduction learned on the train data. + + Args: + X: Data to transform. Must have the same number of features and + type as the data used to train the model. + y : Data to transform. Must have the same number of features and + type as the data used to train the model. + If None, only X is transformed. + + Returns: + - x_scores: Data transformed. 
+ - y_scores: Data transformed (if y is not None) + """ + check_is_fitted(self) + + x_scores = self._x_block.transform(X) + + if y is not None: + y_scores = self._y_block.transform(y) + return x_scores, y_scores + + return x_scores + + def transform_x( + self, + X: InputTypeX, + ) -> NDArrayFloat: + """ + Apply the dimension reduction learned on the train data. + + Args: + X: Data to transform. Must have the same number of features and + type as the data used to train the model. + + + Returns: + - Data transformed. + """ + check_is_fitted(self) + + return self._x_block.transform(X) + + def transform_y( + self, + Y: InputTypeY, + ) -> NDArrayFloat: + """ + Apply the dimension reduction learned on the train data. + + Args: + Y: Data to transform. Must have the same number of features and + type as the data used to train the model. + + + Returns: + - Data transformed. + """ + check_is_fitted(self) + + return self._y_block.transform(Y) + + def inverse_transform( + self, + X: NDArrayFloat, + Y: NDArrayFloat | None = None, + ) -> InputTypeX | Tuple[InputTypeX, InputTypeY]: + """ + Transform data back to its original space. + + Args: + X: Data to transform back. Must have the same number of columns + as the number of components of the model. + Y: Data to transform back. Must have the same number of columns + as the number of components of the model. + + Returns: + - X: Data reconstructed from the transformed data. + - Y: Data reconstructed from the transformed data + (if Y is not None) + + """ + check_is_fitted(self) + + x_reconstructed = self._x_block.inverse_transform(X) + + if Y is not None: + y_reconstructed = self._y_block.inverse_transform(Y) + return x_reconstructed, y_reconstructed + + return x_reconstructed + + def inverse_transform_x( + self, + X: NDArrayFloat, + ) -> InputTypeX: + """ + Transform X data back to its original space. + + Args: + X: Data to transform back. Must have the same number of columns + as the number of components of the model. + + Returns: + - Data reconstructed from the transformed data. + """ + check_is_fitted(self) + + return self._x_block.inverse_transform(X) + + def inverse_transform_y( + self, + Y: NDArrayFloat, + ) -> InputTypeY: + """ + Transform Y data back to its original space. + + Args: + Y: Data to transform back. Must have the same number of columns + as the number of components of the model. + + Returns: + - Data reconstructed from the transformed data. + """ + check_is_fitted(self) + + return self._y_block.inverse_transform(Y) diff --git a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py index 70fdefb97..d55d51315 100644 --- a/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py +++ b/skfda/preprocessing/dim_reduction/feature_extraction/__init__.py @@ -8,4 +8,5 @@ 'Please use "dim_reduction" for FPCA' 'or "feature_construction" for feature construction techniques', category=DeprecationWarning, + stacklevel=2, ) diff --git a/skfda/preprocessing/dim_reduction/projection/__init__.py b/skfda/preprocessing/dim_reduction/projection/__init__.py index f028b2627..a56b0ff83 100644 --- a/skfda/preprocessing/dim_reduction/projection/__init__.py +++ b/skfda/preprocessing/dim_reduction/projection/__init__.py @@ -5,4 +5,5 @@ warnings.warn( 'The module "projection" is deprecated. 
Please use "dim_reduction"', category=DeprecationWarning, + stacklevel=2, ) diff --git a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py index 4da5810b7..298a68fea 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/_rkvs.py @@ -83,7 +83,7 @@ def _rkhs_vs( [indexes[j]], ]) - new_means = np.atleast_2d(means[new_selection]) + new_means = means[new_selection] lstsq_solution = linalg.lstsq( variances[new_selection[:, np.newaxis], new_selection], diff --git a/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py b/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py index 638d93bf3..036663ccf 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/mrmr.py @@ -160,7 +160,7 @@ def _mrmr( redundancies[last_selected, j] = redundancy_dependence_measure( X[:, last_selected, np.newaxis], X[:, j, np.newaxis], - ) + ).item() redundancies[j, last_selected] = redundancies[last_selected, j] W = np.mean( diff --git a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py index ca7b377b8..432b65bb8 100644 --- a/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py +++ b/skfda/preprocessing/dim_reduction/variable_selection/recursive_maxima_hunting.py @@ -17,6 +17,7 @@ overload, ) +import dcor import numpy as np import numpy.linalg as linalg import numpy.ma as ma @@ -25,7 +26,6 @@ from sklearn.base import clone from typing_extensions import Literal -import dcor from skfda.exploratory.stats.covariance import ( CovarianceEstimator, EmpiricalCovariance, @@ -41,7 +41,6 @@ if TYPE_CHECKING: from ....misc.covariances import CovarianceLike - import GPy def _transform_to_2d(t: ArrayLike) -> NDArrayFloat: @@ -56,48 +55,6 @@ def _transform_to_2d(t: ArrayLike) -> NDArrayFloat: return t -class _PicklableKernel(): - """Class used to pickle GPy kernels.""" - - def __init__(self, kernel: GPy.kern.Kern) -> None: - super().__setattr__('_PicklableKernel__kernel', kernel) - - def __getattr__(self, name: str) -> Any: - if name != '__deepcopy__': - return getattr(self.__kernel, name) - - def __setattr__(self, name: str, value: Any) -> None: - setattr(self.__kernel, name, value) - - def __getstate__(self) -> Mapping[str, Any]: - return { - 'class': self.__kernel.__class__, - 'input_dim': self.__kernel.input_dim, - 'values': self.__kernel.param_array, - } - - def __setstate__(self, state: Mapping[str, Any]) -> None: - super().__setattr__('_PicklableKernel__kernel', state['class']( - input_dim=state['input_dim']), - ) - self.__kernel.param_array[...] 
= state['values'] - - def __call__(self, *args: Any, **kwargs: Any) -> NDArrayFloat: - return self.__kernel.K(*args, **kwargs) # type: ignore[no-any-return] - - -def make_kernel(k: CovarianceLike) -> CovarianceLike: - try: - import GPy - except ImportError: - return k - - if isinstance(k, GPy.kern.Kern): - return _PicklableKernel(k) - - return k - - def _absolute_argmax( function: FDataGrid, *, @@ -255,7 +212,7 @@ def __init__( super().__init__() self.mean = mean - self.cov = make_kernel(cov) + self.cov = cov def _evaluate_mean(self, t: NDArrayFloat) -> NDArrayFloat: @@ -625,7 +582,7 @@ def __call__( **kwargs: Any, ) -> bool: - score = float(dependences.data_matrix[0, selected_index, 0]) + score = float(dependences.data_matrix[(0,) + selected_index + (0,)]) return score < self.threshold diff --git a/skfda/preprocessing/feature_construction/_functions.py b/skfda/preprocessing/feature_construction/_functions.py index 3359a77af..e426afaa4 100644 --- a/skfda/preprocessing/feature_construction/_functions.py +++ b/skfda/preprocessing/feature_construction/_functions.py @@ -3,13 +3,16 @@ from __future__ import annotations import itertools -from typing import Optional, Sequence, Tuple, TypeVar, Union, cast, overload +from typing import Optional, Sequence, Tuple, TypeVar, Union, cast import numpy as np -from typing_extensions import Literal, Protocol, TypeGuard +from typing_extensions import Literal, TypeGuard -from ..._utils import nquad_vec -from ...misc.validation import check_fdata_dimensions, validate_domain_range +from ..._utils.ndfunction._functions import ( + _average_function_ufunc, + _UnaryUfunc, +) +from ...misc.validation import check_fdata_dimensions from ...representation import FData, FDataBasis, FDataGrid from ...typing._base import DomainRangeLike from ...typing._numpy import ArrayLike, NDArrayBool, NDArrayFloat, NDArrayInt @@ -17,20 +20,6 @@ T = TypeVar("T", bound=Union[NDArrayFloat, FDataGrid]) -class _BasicUfuncProtocol(Protocol): - - @overload - def __call__(self, __arg: FDataGrid) -> FDataGrid: # noqa: WPS112 - pass - - @overload - def __call__(self, __arg: NDArrayFloat) -> NDArrayFloat: # noqa: WPS112 - pass - - def __call__(self, __arg: T) -> T: # noqa: WPS112 - pass - - def _sequence_of_ints(data: Sequence[object]) -> TypeGuard[Sequence[int]]: """Check that every element is an integer.""" return all(isinstance(d, int) for d in data) @@ -332,7 +321,7 @@ def number_crossings( >>> from skfda.preprocessing.feature_construction import ( ... number_crossings, - ... ) + ... ) >>> from scipy.special import jv >>> import numpy as np >>> x_grid = np.linspace(0, 14, 14) @@ -521,7 +510,7 @@ def unconditional_moment( def unconditional_expected_value( data: FData, - function: _BasicUfuncProtocol, + function: _UnaryUfunc, *, domain: DomainRangeLike | None = None, ) -> NDArrayFloat: @@ -537,19 +526,19 @@ def unconditional_expected_value( f_p(x(t))=\frac{1}{\left( b-a\right)}\int_a^b g \left(x_p(t)\right) dt - Args: - data: FDataGrid or FDataBasis where we want to calculate - the expected value. - function: function that specifies how the expected value to is - calculated. It has to be a function of X(t). - domain: Integration domain. By default, the whole domain is used. + Args: + data: FDataGrid or FDataBasis where we want to calculate + the expected value. + function: function that specifies how the expected value to is + calculated. It has to be a function of X(t). + domain: Integration domain. By default, the whole domain is used. 
- Returns: - ndarray of shape (n_dimensions, n_samples) with the values of the - expectations. + Returns: + ndarray of shape (n_dimensions, n_samples) with the values of the + expectations. Examples: - We will use this funtion to calculate the logarithmic first moment + We will use this function to calculate the logarithmic first moment of the first 5 samples of the Berkeley Growth dataset. We will start by importing it. @@ -574,26 +563,4 @@ def unconditional_expected_value( [ 4.84]]) """ - if domain is None: - domain = data.domain_range - else: - domain = validate_domain_range(domain) - - lebesgue_measure = np.prod( - [ - (iterval[1] - iterval[0]) - for iterval in domain - ], - ) - - if isinstance(data, FDataGrid): - return function(data).integrate(domain=domain) / lebesgue_measure - - def integrand(*args: NDArrayFloat) -> NDArrayFloat: # noqa: WPS430 - f1 = data(args)[:, 0, :] - return function(f1) - - return nquad_vec( - integrand, - domain, - ) / lebesgue_measure + return _average_function_ufunc(data, ufunc=function, domain=domain) diff --git a/skfda/preprocessing/missing/_interpolate.py b/skfda/preprocessing/missing/_interpolate.py index d09ea07b2..d97dff0d8 100644 --- a/skfda/preprocessing/missing/_interpolate.py +++ b/skfda/preprocessing/missing/_interpolate.py @@ -134,7 +134,7 @@ class MissingValuesInterpolation( For multivariate functions, such as surfaces all dimensions are considered. This is currently done using - :external:class:`~scipy.interpolation.LinearNDInterpolator`, which + :external:class:`~scipy.interpolate.LinearNDInterpolator`, which triangulates the space and performs linear barycentric interpolation: >>> X = FDataGrid( diff --git a/skfda/preprocessing/smoothing/_kernel_smoothers.py b/skfda/preprocessing/smoothing/_kernel_smoothers.py index c07489235..e9b56f44d 100644 --- a/skfda/preprocessing/smoothing/_kernel_smoothers.py +++ b/skfda/preprocessing/smoothing/_kernel_smoothers.py @@ -9,9 +9,11 @@ import numpy as np -from ..._utils._utils import _to_grid_points +from ..._utils._utils import _cartesian_product, _to_grid_points from ...misc.hat_matrix import HatMatrix, NadarayaWatsonHatMatrix +from ...misc.metrics import PairwiseMetric, l2_distance from ...typing._base import GridPointsLike +from ...typing._metric import Metric from ...typing._numpy import NDArrayFloat from ._linear import _LinearSmoother @@ -114,10 +116,12 @@ def __init__( *, weights: Optional[NDArrayFloat] = None, output_points: Optional[GridPointsLike] = None, + metric: Metric[NDArrayFloat] = l2_distance, ): self.kernel_estimator = kernel_estimator self.weights = weights self.output_points = output_points + self.metric = metric self._cv = False # For testing purposes only def _hat_matrix( @@ -126,18 +130,18 @@ def _hat_matrix( output_points: GridPointsLike, ) -> NDArrayFloat: - input_points = _to_grid_points(input_points) - output_points = _to_grid_points(output_points) + input_points = _cartesian_product(_to_grid_points(input_points)) + output_points = _cartesian_product(_to_grid_points(output_points)) if self.kernel_estimator is None: self.kernel_estimator = NadarayaWatsonHatMatrix() - delta_x = np.subtract.outer(output_points[0], input_points[0]) + delta_x = PairwiseMetric(self.metric)(output_points, input_points) return self.kernel_estimator( delta_x=delta_x, weights=self.weights, - X_train=input_points, - X=output_points, + X_train=np.array(input_points), + X=np.array(output_points), _cv=self._cv, ) diff --git a/skfda/preprocessing/smoothing/_linear.py 
b/skfda/preprocessing/smoothing/_linear.py index 8759704c9..09c6fadcc 100644 --- a/skfda/preprocessing/smoothing/_linear.py +++ b/skfda/preprocessing/smoothing/_linear.py @@ -28,6 +28,7 @@ class _LinearSmoother( ``hat_matrix`` to define the smoothing or 'hat' matrix. """ + input_points_: GridPoints output_points_: GridPoints @@ -116,9 +117,25 @@ def transform( ) ) + dims = [len(e) for e in self.output_points_] + dims += [len(e) for e in self.input_points_] + + hat_matrix = np.reshape( + self.hat_matrix_, + dims, + ) + + data_matrix = np.einsum( + hat_matrix, + [Ellipsis, *range(1, len(self.output_points_) + 1)], + X.data_matrix, + [0, *range(1, len(self.output_points_) + 2)], + [0, Ellipsis, len(self.output_points_) + 1], + ) + # The matrix is cached return X.copy( - data_matrix=self.hat_matrix_ @ X.data_matrix, + data_matrix=data_matrix, grid_points=self.output_points_, ) diff --git a/skfda/preprocessing/smoothing/kernel_smoothers.py b/skfda/preprocessing/smoothing/kernel_smoothers.py index ae3b7ac85..fbe9d68ad 100644 --- a/skfda/preprocessing/smoothing/kernel_smoothers.py +++ b/skfda/preprocessing/smoothing/kernel_smoothers.py @@ -18,6 +18,7 @@ "The \"kernel_smoothers\" module is deprecated. " "Use the \"KernelSmoother\" class instead", DeprecationWarning, + stacklevel=2, ) diff --git a/skfda/representation/_functional_data.py b/skfda/representation/_functional_data.py index 2696e7b25..ee8813bbc 100644 --- a/skfda/representation/_functional_data.py +++ b/skfda/representation/_functional_data.py @@ -826,7 +826,7 @@ def cov( # noqa: WPS451 s_points: NDArrayFloat, t_points: NDArrayFloat, /, - ddof: int = 1, + correction: int = 0, ) -> NDArrayFloat: pass @@ -834,7 +834,7 @@ def cov( # noqa: WPS451 def cov( # noqa: WPS451 self: T, /, - ddof: int = 1, + correction: int = 0, ) -> Callable[[NDArrayFloat, NDArrayFloat], NDArrayFloat]: pass @@ -844,7 +844,7 @@ def cov( # noqa: WPS320, WPS451 s_points: Optional[NDArrayFloat] = None, t_points: Optional[NDArrayFloat] = None, /, - ddof: int = 1, + correction: int = 0, ) -> Union[ Callable[[NDArrayFloat, NDArrayFloat], NDArrayFloat], NDArrayFloat, @@ -864,9 +864,9 @@ def cov( # noqa: WPS320, WPS451 Args: s_points: Points where the covariance function is evaluated. t_points: Points where the covariance function is evaluated. - ddof: "Delta Degrees of Freedom": the divisor used in the - calculation is `N - ddof`, where `N` represents the number - of elements. By default `ddof` is 1. + correction: degrees of freedom adjustment. The divisor used in the + calculation is `N - correction`, where `N` represents the + number of elements. Default: `0`. Returns: Covariance function. diff --git a/skfda/representation/basis/_bspline_basis.py b/skfda/representation/basis/_bspline_basis.py index 7b4898f96..3c3981db4 100644 --- a/skfda/representation/basis/_bspline_basis.py +++ b/skfda/representation/basis/_bspline_basis.py @@ -129,11 +129,10 @@ def __init__( # noqa: WPS238 ) n_basis = len(knots) + order - 2 - if (n_basis - order + 2) < 2: + if n_basis < order: raise ValueError( - f"The number of basis ({n_basis}) minus the " - f"order of the bspline ({order}) should be " - f"greater than 3.", + f"The number of basis ({n_basis}) should not be smaller " + f"than the order of the bspline ({order}).", ) self._order = order @@ -461,6 +460,7 @@ def __init__( # noqa: WPS238 knots=knots, ) warnings.warn( - "The BSplines class is deprecated. Use BSplineBasis instead.", + "The BSpline class is deprecated. 
Use BSplineBasis instead.", DeprecationWarning, + stacklevel=2, ) diff --git a/skfda/representation/basis/_constant_basis.py b/skfda/representation/basis/_constant_basis.py index 6d7d6e24b..3305ba21a 100644 --- a/skfda/representation/basis/_constant_basis.py +++ b/skfda/representation/basis/_constant_basis.py @@ -81,5 +81,6 @@ def __init__(self, domain_range: Optional[DomainRangeLike] = None) -> None: warnings.warn( "The Constant class is deprecated. Use ConstantBasis instead.", DeprecationWarning, + stacklevel=2, ) super().__init__(domain_range=domain_range) diff --git a/skfda/representation/basis/_fdatabasis.py b/skfda/representation/basis/_fdatabasis.py index 8289471d4..945dda05d 100644 --- a/skfda/representation/basis/_fdatabasis.py +++ b/skfda/representation/basis/_fdatabasis.py @@ -445,7 +445,7 @@ def sum( # noqa: WPS125 def var( self: T, eval_points: Optional[NDArrayFloat] = None, - ddof: int = 1, + correction: int = 0, ) -> T: """Compute the variance of the functional data object. @@ -460,15 +460,17 @@ def var( numpy.linspace with bounds equal to the ones defined in self.domain_range and the number of points the maximum between 501 and 10 times the number of basis. - ddof: "Delta Degrees of Freedom": the divisor used in the - calculation is `N - ddof`, where `N` represents the number of - elements. By default `ddof` is 1. + correction: degrees of freedom adjustment. The divisor used in the + calculation is `N - correction`, where `N` represents the + number of elements. Default: `0`. Returns: Variance of the original object. """ - return self.to_grid(eval_points).var(ddof=ddof).to_basis(self.basis) + return self.to_grid( + eval_points, + ).var(correction=correction).to_basis(self.basis) @overload def cov( # noqa: WPS451 @@ -476,7 +478,7 @@ def cov( # noqa: WPS451 s_points: NDArrayFloat, t_points: NDArrayFloat, /, - ddof: int = 1, + correction: int = 0, ) -> NDArrayFloat: pass @@ -484,7 +486,7 @@ def cov( # noqa: WPS451 def cov( # noqa: WPS451 self: T, /, - ddof: int = 1, + correction: int = 0, ) -> Callable[[NDArrayFloat, NDArrayFloat], NDArrayFloat]: pass @@ -493,7 +495,7 @@ def cov( # noqa: WPS320, WPS451 s_points: Optional[NDArrayFloat] = None, t_points: Optional[NDArrayFloat] = None, /, - ddof: int = 1, + correction: int = 0, ) -> Union[ Callable[[NDArrayFloat, NDArrayFloat], NDArrayFloat], NDArrayFloat, @@ -515,9 +517,9 @@ def cov( # noqa: WPS320, WPS451 Args: s_points: Points where the covariance function is evaluated. t_points: Points where the covariance function is evaluated. - ddof: "Delta Degrees of Freedom": the divisor used in the - calculation is `N - ddof`, where `N` represents the number - of elements. By default `ddof` is 1. + correction: degrees of freedom adjustment. The divisor used in the + calculation is `N - correction`, where `N` represents the + number of elements. Default: `0`. Returns: Covariance function. 
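A minimal sketch of the new `correction` semantics introduced by the `ddof` -> `correction` rename in the `var`/`cov` signatures above (illustrative only, assuming the patched `FDataGrid.var` signature and a toy two-sample grid; `fd`, `biased` and `unbiased` are hypothetical names):

    import numpy as np
    import skfda

    fd = skfda.FDataGrid(
        data_matrix=[[1.0, 2.0, 3.0], [2.0, 3.0, 5.0]],
        grid_points=[0.0, 0.5, 1.0],
    )

    # New default: correction=0, so the divisor is N (biased estimator).
    biased = fd.var()

    # The previous behaviour (divisor N - 1) is recovered explicitly.
    unbiased = fd.var(correction=1)
    np.testing.assert_allclose(
        unbiased.data_matrix[0, ..., 0],
        np.var(fd.data_matrix[..., 0], axis=0, ddof=1),
    )

The `cov()` overloads forward the same `correction` argument to `EmpiricalBasis`/`EmpiricalGrid`.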
@@ -525,7 +527,7 @@ def cov( # noqa: WPS320, WPS451 """ # To avoid circular imports from ...misc.covariances import EmpiricalBasis - cov_function = EmpiricalBasis(self, ddof=ddof) + cov_function = EmpiricalBasis(self, correction=correction) if s_points is None or t_points is None: return cov_function return cov_function(s_points, t_points) diff --git a/skfda/representation/basis/_finite_element_basis.py b/skfda/representation/basis/_finite_element_basis.py index a2a64b0d1..6f8142b68 100644 --- a/skfda/representation/basis/_finite_element_basis.py +++ b/skfda/representation/basis/_finite_element_basis.py @@ -229,4 +229,5 @@ def __init__( "The FiniteElement class is deprecated. Use " "FiniteElementBasis instead.", DeprecationWarning, + stacklevel=2, ) diff --git a/skfda/representation/basis/_fourier_basis.py b/skfda/representation/basis/_fourier_basis.py index c8d324330..cb7b737bd 100644 --- a/skfda/representation/basis/_fourier_basis.py +++ b/skfda/representation/basis/_fourier_basis.py @@ -319,6 +319,7 @@ def __init__( warnings.warn( "The Fourier class is deprecated. Use FourierBasis instead.", DeprecationWarning, + stacklevel=2, ) super().__init__( domain_range=domain_range, diff --git a/skfda/representation/basis/_monomial_basis.py b/skfda/representation/basis/_monomial_basis.py index be6370398..35129ac3a 100644 --- a/skfda/representation/basis/_monomial_basis.py +++ b/skfda/representation/basis/_monomial_basis.py @@ -211,4 +211,5 @@ def __init__( warnings.warn( "The Monomial class is deprecated. Use MonomialBasis instead.", DeprecationWarning, + stacklevel=2, ) diff --git a/skfda/representation/basis/_tensor_basis.py b/skfda/representation/basis/_tensor_basis.py index f4f83dd2d..833395c2f 100644 --- a/skfda/representation/basis/_tensor_basis.py +++ b/skfda/representation/basis/_tensor_basis.py @@ -180,4 +180,5 @@ def __init__(self, basis_list: Iterable[Basis]): warnings.warn( "The Tensor class is deprecated. Use TensorBasis instead.", DeprecationWarning, + stacklevel=2, ) diff --git a/skfda/representation/basis/_vector_basis.py b/skfda/representation/basis/_vector_basis.py index 8c02af61b..ea5be4bb5 100644 --- a/skfda/representation/basis/_vector_basis.py +++ b/skfda/representation/basis/_vector_basis.py @@ -245,4 +245,5 @@ def __init__(self, basis_list: Iterable[Basis]) -> None: "The VectorValued class is deprecated. " "Use VectorValuedBasis instead.", DeprecationWarning, + stacklevel=2, ) diff --git a/skfda/representation/grid.py b/skfda/representation/grid.py index 5801d3a01..bdd01c3e4 100644 --- a/skfda/representation/grid.py +++ b/skfda/representation/grid.py @@ -515,7 +515,7 @@ def integrate( integrand = data.data_matrix for g in data.grid_points[::-1]: - integrand = scipy.integrate.simps( + integrand = scipy.integrate.simpson( integrand, x=g, axis=-2, @@ -582,13 +582,13 @@ def sum( # noqa: WPS125 sample_names=(None,), ) - def var(self: T, ddof: int = 1) -> T: + def var(self: T, correction: int = 0) -> T: """Compute the variance of a set of samples in a FDataGrid object. Args: - ddof: "Delta Degrees of Freedom": the divisor used in the - calculation is `N - ddof`, where `N` represents the number of - elements. By default `ddof` is 1. + correction: degrees of freedom adjustment. The divisor used in the + calculation is `N - correction`, where `N` represents the + number of elements. Default: `0`. 
Returns: A FDataGrid object with just one sample representing the @@ -599,7 +599,7 @@ def var(self: T, ddof: int = 1) -> T: data_matrix=np.array([np.var( self.data_matrix, axis=0, - ddof=ddof, + ddof=correction, )]), sample_names=("variance",), ) @@ -610,7 +610,7 @@ def cov( # noqa: WPS451 s_points: NDArrayFloat, t_points: NDArrayFloat, /, - ddof: int = 1, + correction: int = 0, ) -> NDArrayFloat: pass @@ -618,7 +618,7 @@ def cov( # noqa: WPS451 def cov( # noqa: WPS451 self: T, /, - ddof: int = 1, + correction: int = 0, ) -> Callable[[NDArrayFloat, NDArrayFloat], NDArrayFloat]: pass @@ -627,7 +627,7 @@ def cov( # noqa: WPS320, WPS451 s_points: Optional[NDArrayFloat] = None, t_points: Optional[NDArrayFloat] = None, /, - ddof: int = 1, + correction: int = 0, ) -> Union[ Callable[[NDArrayFloat, NDArrayFloat], NDArrayFloat], NDArrayFloat, @@ -643,9 +643,9 @@ def cov( # noqa: WPS320, WPS451 Args: s_points: Grid points where the covariance function is evaluated. t_points: Grid points where the covariance function is evaluated. - ddof: "Delta Degrees of Freedom": the divisor used in the - calculation is `N - ddof`, where `N` represents the number - of elements. By default `ddof` is 1. + correction: degrees of freedom adjustment. The divisor used in the + calculation is `N - correction`, where `N` represents the + number of elements. Default: `0`. Returns: Covariance function. @@ -653,7 +653,7 @@ def cov( # noqa: WPS320, WPS451 """ # To avoid circular imports from ..misc.covariances import EmpiricalGrid - cov_function = EmpiricalGrid(self, ddof=ddof) + cov_function = EmpiricalGrid(self, correction=correction) if s_points is None or t_points is None: return cov_function return cov_function(s_points, t_points) @@ -734,8 +734,9 @@ def _get_op_matrix( return other[other_index] raise ValueError( - f"Invalid dimensions in operator between FDataGrid and Numpy " - f"array: {other.shape}" + f"Invalid dimensions in operator between " + f"FDataGrid (data_matrix.shape={self.data_matrix.shape}) " + f"and Numpy array (shape={other.shape})", ) elif isinstance(other, FDataGrid): diff --git a/skfda/tests/test_fpls.py b/skfda/tests/test_fpls.py new file mode 100644 index 000000000..af322959f --- /dev/null +++ b/skfda/tests/test_fpls.py @@ -0,0 +1,311 @@ +"""Test the FPLS class.""" + +from typing import Tuple + +import numpy as np +import pytest +from sklearn.cross_decomposition import PLSCanonical + +from skfda.datasets import fetch_tecator +from skfda.preprocessing.dim_reduction import FPLS +from skfda.representation import FData, FDataGrid +from skfda.representation.basis import BSplineBasis, FDataBasis +from skfda.typing._numpy import NDArrayFloat + + +class LatentVariablesModel: + """Simulate model driven by latent variables.""" + + def create_latent_variables(self, n_latent: int, n_samples: int) -> None: + """Create latent variables for testing.""" + self.rng = np.random.RandomState(0) + self.n_latent = n_latent + self.n_samples = n_samples + self.grid_points = np.linspace(0, 1, 100) + + # Get the means of the latent variables + latent_means = self.rng.uniform(low=1, high=10, size=(n_latent)) + + # Sample the variables + self.latent_samples = np.array( + [ + self.rng.normal(loc=mean, scale=1, size=n_samples) + for mean in latent_means + ], + ).T + + def create_observed_multivariate_variable( + self, + n_features: int, + noise: float = 0, + ) -> Tuple[NDArrayFloat, NDArrayFloat]: + """Create observed multivariate variable for testing.""" + rotations = self.rng.uniform( + low=0, + high=10, + size=(n_features, 
self.n_latent), + ) + + observed_values = self.latent_samples @ rotations.T + observed_noise = noise * self.rng.normal( + size=(self.n_samples, n_features), + ) + observed_values += observed_noise + + return observed_values, rotations + + def create_observed_functional_variable( + self, + noise: float = 0, + discretized: bool = False, + ) -> Tuple[FData, FData]: + """Create observed functional variable for testing.""" + n_basis = 20 + basis = BSplineBasis(n_basis=n_basis) + + rotation_coef = self.rng.uniform( + low=0, + high=10, + size=(self.n_latent, n_basis), + ) + rotation = FDataBasis(basis, rotation_coef) + + observed_coef = self.latent_samples @ rotation_coef + observed_func = FDataBasis(basis, observed_coef) + + if discretized: + observed_func = observed_func.to_grid( + grid_points=self.grid_points, + ) + + func_noise = noise * self.rng.normal(size=(self.n_samples, 100)) + observed_func.data_matrix[..., 0] += func_noise + + else: + observed_func.coefficients += noise * self.rng.normal( + size=(self.n_samples, n_basis), + ) + + return observed_func, rotation + + +class TestFPLS(LatentVariablesModel): + """Test the FPLS class.""" + + def test_sklearn(self) -> None: + """Compare results with sklearn.""" + # Load the data + X, y = fetch_tecator( + return_X_y=True, + ) + + integration_weights = np.ones(len(X.grid_points[0])) + + # Fit the model + fpls = FPLS[FDataGrid, NDArrayFloat]( + n_components=3, + _integration_weights_X=integration_weights, + ) + fpls.fit(X, y) + + sklearnpls = PLSCanonical(n_components=3, scale=False) + sklearnpls.fit(X.data_matrix[..., 0], y) + + rtol = 2e-4 + atol = 1e-6 + + # Check that the rotations are correct + np.testing.assert_allclose( + np.abs(fpls.x_rotations_matrix_).flatten(), + np.abs(sklearnpls.x_rotations_).flatten(), + rtol=rtol, + atol=atol, + ) + + # Check the transformation of X + np.testing.assert_allclose( + fpls.transform(X, y), + sklearnpls.transform(X.data_matrix[..., 0], y), + rtol=rtol, + atol=1e-5, + ) + + comp_x, comp_y = fpls.transform(X, y) + + fpls_inv_x, fpls_inv_y = fpls.inverse_transform(comp_x, comp_y) + sklearnpls_inv_x, sklearnpls_inv_y = sklearnpls.inverse_transform( + comp_x, + comp_y, + ) + # Check the inverse transformation of X + np.testing.assert_allclose( + fpls_inv_x.data_matrix.flatten(), + sklearnpls_inv_x.flatten(), + rtol=rtol, + atol=atol, + ) + + # Check the inverse transformation of y + np.testing.assert_allclose( + fpls_inv_y.flatten(), + sklearnpls_inv_y.flatten(), + rtol=rtol, + atol=atol, + ) + + # Ignoring WPS210: Found too many local variables + def test_basis_vs_grid( # noqa: WPS210 + self, + ) -> None: + """Test that the results are the same in basis and grid.""" + n_components = 5 + self.create_latent_variables(n_latent=n_components, n_samples=100) + + # Create the observed variable as basis variables + X_observed, X_rotations = self.create_observed_functional_variable( + discretized=False, + noise=0, + ) + y_observed, y_rotations = self.create_observed_functional_variable( + discretized=False, + noise=0, + ) + + # Fit the model + fpls = FPLS[FData, FData](n_components=n_components) + fpls.fit(X_observed, y_observed) + + # Convert the observed variables to grid + grid_points = np.linspace(0, 1, 2000) + X_observed_grid = X_observed.to_grid( + grid_points=grid_points, + ) + + y_observed_grid = y_observed.to_grid( + grid_points=grid_points, + ) + + # Fit the model with the grid variables + fpls_grid = FPLS[FData, FData](n_components=n_components) + fpls_grid.fit(X_observed_grid, y_observed_grid) + + # 
Check that the results are the same + np.testing.assert_allclose( + np.abs(fpls.x_rotations_(self.grid_points)), + np.abs(fpls_grid.x_rotations_(self.grid_points)), + rtol=5e-3, + ) + + np.testing.assert_allclose( + np.abs(fpls.y_rotations_(self.grid_points)), + np.abs(fpls_grid.y_rotations_(self.grid_points)), + rtol=5e-3, + ) + + # Check that the transform is the same + np.testing.assert_allclose( + np.abs(fpls.transform(X_observed, y_observed)), + np.abs(fpls_grid.transform(X_observed_grid, y_observed_grid)), + rtol=5e-3, + ) + # Check the inverse transform + comp_x, comp_y = fpls.transform(X_observed, y_observed) + + fpls_inv_x, fpls_inv_y = fpls.inverse_transform(comp_x, comp_y) + fpls_grid_x, fpsl_grid_y = fpls_grid.inverse_transform( + comp_x, + comp_y, + ) + # Check the inverse transformation of X + np.testing.assert_allclose( + fpls_inv_x(grid_points), + fpls_grid_x(grid_points), + rtol=7e-2, + ) + + # Check the inverse transformation of y + np.testing.assert_allclose( + fpls_inv_y(grid_points), + fpsl_grid_y(grid_points), + rtol=0.13, + ) + + def _generate_random_matrix_by_rank(self, n_samples, n_features, rank): + + random_data = np.random.random(n_samples * rank).reshape( + n_samples, + rank, + ) + if rank == n_features: + return random_data + repeated_data = np.array([random_data[:, 0]] * (n_features - rank)).T + + return np.concatenate( + [random_data, repeated_data], + axis=1, + ) + + @pytest.mark.parametrize("rank", [1, 2, 5]) + def test_collinear_matrix( + self, + rank: int, + ) -> None: + """Check that the behaviour is correct with collinear matrices.""" + n_samples = 100 + n_features = 10 + np.random.seed(0) + + X = self._generate_random_matrix_by_rank( + n_samples=n_samples, + n_features=n_features, + rank=rank, + ) + y = self._generate_random_matrix_by_rank( + n_samples=n_samples, + n_features=n_features, + rank=rank, + ) + + fpls = FPLS(n_components=rank) + fpls.fit(X, y) + + fpls = FPLS(n_components=5) + + # Check that a warning is raised when the rank is lower than the + # number of components + if rank < 5: + with pytest.warns(UserWarning): + fpls.fit(X, y) + else: + fpls.fit(X, y) + + # Check that only as many components as rank are returned + assert fpls.x_weights_.shape == (n_features, rank) + + # Check that the waring is not raised when the number of components + # is not set + fpls = FPLS().fit(X, y) + # Check that only as many components as rank are returned + assert fpls.x_weights_.shape == (n_features, rank) + + def test_number_of_components( + self, + ) -> None: + """Check error when number of components is too large.""" + n_samples = 100 + n_features = 10 + np.random.seed(0) + + X = self._generate_random_matrix_by_rank( + n_samples=n_samples, + n_features=n_features, + rank=n_features, + ) + y = self._generate_random_matrix_by_rank( + n_samples=n_samples, + n_features=n_features, + rank=n_features, + ) + + with pytest.raises(ValueError): + FPLS(n_components=n_features + 1).fit(X, y) diff --git a/skfda/tests/test_fpls_regression.py b/skfda/tests/test_fpls_regression.py new file mode 100644 index 000000000..b658e8863 --- /dev/null +++ b/skfda/tests/test_fpls_regression.py @@ -0,0 +1,292 @@ +"""Test the FPLSRegression class.""" +import os + +import numpy as np +import pytest +import scipy +from sklearn.cross_decomposition import PLSRegression + +from skfda.datasets import fetch_tecator +from skfda.misc.operators import LinearDifferentialOperator +from skfda.misc.regularization import L2Regularization +from skfda.ml.regression import FPLSRegression +from 
skfda.representation import FData +from skfda.representation.basis import BSplineBasis +from skfda.tests.test_fpls import LatentVariablesModel +from skfda.typing._numpy import NDArrayFloat + + +class TestFPLSRegression(LatentVariablesModel): + """Test the FPLSRegression class.""" + + def test_sklearn(self) -> None: + """Compare results with sklearn.""" + # Load the data + X, y = fetch_tecator( + return_X_y=True, + ) + + # Fit the model + fplsr = FPLSRegression[NDArrayFloat, NDArrayFloat]( + n_components=5, + _integration_weights_X=np.ones(len(X.grid_points[0])), + ) + fplsr.fit(X, y) + + sklearnpls = PLSRegression(n_components=5, scale=False) + sklearnpls.fit(X.data_matrix[..., 0], y) + + rtol = 3e-5 + atol = 1e-6 + # Compare the results + np.testing.assert_allclose( + fplsr.coef_.flatten(), + sklearnpls.coef_.T.flatten(), + rtol=rtol, + atol=atol, + ) + + # Compare predictions + np.testing.assert_allclose( + fplsr.predict(X).flatten(), + sklearnpls.predict(X.data_matrix[..., 0]).flatten(), + rtol=rtol, + atol=atol, + ) + + # Check that the rotations are correc + np.testing.assert_allclose( + np.abs(fplsr.fpls_.x_rotations_matrix_).flatten(), + np.abs(sklearnpls.x_rotations_).flatten(), + rtol=rtol, + atol=atol, + ) + + def test_fda_usc_no_reg(self) -> None: + """ + Test a multivariate regression with no regularization. + + Replication Code: + data(tecator) + x <- tecator$absorp.fdata + y <- tecator$y + res2 <- fdata2pls(x, y, ncomp=5) + for (i in res2$l) { + c <- res2$loading.weigths$data[i, ] + c <- c / c(sqrt(crossprod(c))) + print( + paste( + round(c, 8), + collapse = ", " + ) # components + ) + } + """ + # Results from fda.usc: + path = os.path.join( + f"{__file__[:-3]}_data", # Trim .py ending + "test_fda_usc_no_reg_data.npy", + ) + with open(path, "rb") as f: + r_results = np.load(f, allow_pickle=False) + + signs = np.array([1, -1, 1, -1, 1]) + + r_results *= signs + + X, y = fetch_tecator( + return_X_y=True, + ) + + plsReg = FPLSRegression[FData, NDArrayFloat]( + n_components=5, + _integration_weights_X=np.ones(len(X.grid_points[0])), + ) + plsReg.fit(X, y) + + W = plsReg.fpls_.x_weights_ + np.testing.assert_allclose(W, r_results, atol=1e-8) + + def test_fda_usc_reg(self) -> None: + """ + Test the FPLSRegression with regularization against fda.usc. + + Replication Code: + data(tecator) + x=tecator$absorp.fdata + y=tecator$y + res2=fdata2pls(x,y,ncomp=5,P=c(0,0,1),lambda = 1000) + for(i in res2$l){ + c = res2$loading.weigths$data[i,] + c = c/ c(sqrt(crossprod(c))) + print( + paste( + round(c, 8), + collapse=", " + ) # components + ) + } + """ + X, y = fetch_tecator( + return_X_y=True, + ) + + pen_order = 2 + reg_param = 1000 + # This factor compensates for the fact that the difference + # matrices in fda.usc are scaled by the mean of the deltas + # between grid points. This diference is introduced in + # the function D.penalty (fdata2pc.R:479) in fda.usc. 
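# For intuition: with pen_order = 2 the factor computed below is
# grid_step ** 3, so a mean grid step of 0.5 would rescale the
# regularization parameter by 0.125 before the comparison against fda.usc.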
+ + grid_points = X[0].grid_points[0] + grid_step = np.mean(np.diff(grid_points)) + factor1 = grid_step ** (2 * pen_order - 1) + + reg_param *= factor1 + + regularization = L2Regularization( + LinearDifferentialOperator(pen_order), + regularization_parameter=reg_param, + ) + + # Fit the model + fplsr = FPLSRegression[FData, NDArrayFloat]( + n_components=5, + regularization_X=regularization, + _integration_weights_X=np.ones(len(X.grid_points[0])), + ) + fplsr.fit(X, y) + + path = os.path.join( + f"{__file__[:-3]}_data", # Trim .py ending + "test_fda_usc_reg_data.npy", + ) + with open(path, "rb") as f: + r_results = np.load(f, allow_pickle=False) + + signs = np.array([1, -1, 1, -1, 1]) + + w_mat = fplsr.fpls_.x_weights_ @ np.diag(signs) + np.testing.assert_allclose(w_mat, r_results, atol=6e-6, rtol=6e-4) + + def test_basis_vs_grid(self) -> None: + """Test that the results are the same in basis and grid.""" + X, y = fetch_tecator( + return_X_y=True, + ) + + original_grid_points = X.grid_points[0] + # Express the data in a FDataBasis + X = X.to_basis(BSplineBasis(n_basis=20)) + + # Fit the model + fplsr = FPLSRegression[FData, NDArrayFloat](n_components=5) + fplsr.fit(X, y) + basis_components = fplsr.fpls_.x_rotations_(original_grid_points) + + # Go back to grid + new_grid_points = np.linspace( + original_grid_points[0], + original_grid_points[-1], + 1000, + ) + X = X.to_grid(grid_points=new_grid_points) + + # Get the weights for the Simpson's rule + identity = np.eye(len(X.grid_points[0])) + ss_weights = scipy.integrate.simps(identity, X.grid_points[0]) + + # Fit the model with weights + fplsr = FPLSRegression( + n_components=5, + _integration_weights_X=ss_weights, + ) + fplsr.fit(X, y) + + grid_components = fplsr.fpls_.x_rotations_(original_grid_points) + + np.testing.assert_allclose( + basis_components, + grid_components, + rtol=3e-3, + ) + + @pytest.mark.parametrize("y_features", [1, 5]) + def test_multivariate_regression(self, y_features: int) -> None: + """Test the multivariate regression. + + Consider both scalar and multivariate responses. 
+ """ + self.create_latent_variables(n_latent=5, n_samples=100) + + # Check that the model is able to recover the latent variables + # if it has enough components + y_observed, y_rotations = self.create_observed_multivariate_variable( + n_features=y_features, + ) + X_observed, X_rotations = self.create_observed_multivariate_variable( + n_features=10, + ) + + plsreg = FPLSRegression[NDArrayFloat, NDArrayFloat](n_components=5) + plsreg.fit(X_observed, y_observed) + + minimum_score = 0.99 + assert plsreg.score(X_observed, y_observed) > minimum_score + + @pytest.mark.parametrize("discretized", [True, False]) + @pytest.mark.parametrize("n_features", [1, 5]) + @pytest.mark.parametrize("noise_std", [0, 1]) + def test_simple_regresion( + self, + discretized: bool, + n_features: int, + noise_std: float, + ) -> None: + """Test multivariate regressor and functional response.""" + self.create_latent_variables(n_latent=5, n_samples=100) + + # Check that the model is able to recover the latent variables if + # it has enough components + y_observed, y_rotations = self.create_observed_multivariate_variable( + n_features=n_features, + noise=noise_std, + ) + X_observed, X_rotations = self.create_observed_functional_variable( + discretized=discretized, + noise=noise_std, + ) + + plsreg = FPLSRegression[FData, NDArrayFloat](n_components=5) + plsreg.fit(X_observed, y_observed) + + minimum_score = 0.99 if noise_std == 0 else 0.98 + assert plsreg.score(X_observed, y_observed) > minimum_score + + @pytest.mark.parametrize("discretized_observed", [True, False]) + @pytest.mark.parametrize("discretized_response", [True, False]) + @pytest.mark.parametrize("noise_std", [0, 1]) + def test_simple_regresion_dataset_functional( + self, + discretized_observed: bool, + discretized_response: bool, + noise_std: float, + ) -> None: + """Test multivariate regressor and functional response.""" + self.create_latent_variables(n_latent=5, n_samples=100) + + # Check that the model is able to recover the latent variables if + # it has enough components + X_observed, X_rotations = self.create_observed_functional_variable( + discretized=discretized_observed, + noise=noise_std, + ) + y_observed, y_rotations = self.create_observed_functional_variable( + discretized=discretized_response, + noise=noise_std, + ) + + plsreg = FPLSRegression[FData, FData](n_components=5) + plsreg.fit(X_observed, y_observed) + minimum_score = 0.99 if noise_std == 0 else 0.98 + assert plsreg.score(X_observed, y_observed) > minimum_score diff --git a/skfda/tests/test_fpls_regression_data/test_fda_usc_no_reg_data.npy b/skfda/tests/test_fpls_regression_data/test_fda_usc_no_reg_data.npy new file mode 100644 index 000000000..36b54860f Binary files /dev/null and b/skfda/tests/test_fpls_regression_data/test_fda_usc_no_reg_data.npy differ diff --git a/skfda/tests/test_fpls_regression_data/test_fda_usc_reg_data.npy b/skfda/tests/test_fpls_regression_data/test_fda_usc_reg_data.npy new file mode 100644 index 000000000..d1606e5e7 Binary files /dev/null and b/skfda/tests/test_fpls_regression_data/test_fda_usc_reg_data.npy differ diff --git a/skfda/tests/test_metrics.py b/skfda/tests/test_metrics.py index 4e4746901..440806379 100644 --- a/skfda/tests/test_metrics.py +++ b/skfda/tests/test_metrics.py @@ -89,10 +89,11 @@ def test_lp_norm_surface_inf(self) -> None: ) def test_lp_norm_surface(self) -> None: - """Test that integration of surfaces has not been implemented.""" - # Integration of surfaces not implemented, add test case after - # implementation - 
self.assertEqual(lp_norm(self.fd_surface, p=1), NotImplemented) + """Test the integration of surfaces.""" + np.testing.assert_allclose( + lp_norm(self.fd_surface, p=1), + [0.12566256, 0.12563755, 0.12566133], + ) def test_lp_error_dimensions(self) -> None: """Test error on metric between different kind of objects.""" diff --git a/skfda/tests/test_pandas_fdatabasis.py b/skfda/tests/test_pandas_fdatabasis.py index 6192ee7b7..96be90119 100644 --- a/skfda/tests/test_pandas_fdatabasis.py +++ b/skfda/tests/test_pandas_fdatabasis.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Callable, Iterable, NoReturn +from typing import Any, Callable, Iterable, NoReturn, Sequence import numpy as np import pandas @@ -271,6 +271,20 @@ def all_numeric_reductions(request: Any) -> Any: """ return request.param + +_all_boolean_reductions: Sequence[str] = [ + # "all", + # "any", +] + + +@pytest.fixture(params=_all_boolean_reductions) +def all_boolean_reductions(request: Any) -> Any: + """ + Fixture for boolean reduction names. + """ + return request.param + ############################################################################## # Tests ############################################################################## diff --git a/skfda/tests/test_pandas_fdatagrid.py b/skfda/tests/test_pandas_fdatagrid.py index 47db6c383..80818a1b3 100644 --- a/skfda/tests/test_pandas_fdatagrid.py +++ b/skfda/tests/test_pandas_fdatagrid.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Callable, Generator, NoReturn, Union +from typing import Any, Callable, Generator, NoReturn, Sequence, Union import numpy as np import pandas @@ -290,6 +290,20 @@ def all_numeric_reductions(request: Any) -> Any: """ return request.param + +_all_boolean_reductions: Sequence[str] = [ + # "all", + # "any", +] + + +@pytest.fixture(params=_all_boolean_reductions) +def all_boolean_reductions(request: Any) -> Any: + """ + Fixture for boolean reduction names. + """ + return request.param + ############################################################################## # Tests ############################################################################## diff --git a/skfda/tests/test_recursive_maxima_hunting.py b/skfda/tests/test_recursive_maxima_hunting.py index 9a435b8f9..d781a69b9 100644 --- a/skfda/tests/test_recursive_maxima_hunting.py +++ b/skfda/tests/test_recursive_maxima_hunting.py @@ -25,8 +25,8 @@ def test_gaussian_homoscedastic(self) -> None: join. """ - n_samples = 10000 - n_features = 100 + n_samples = 1000 + n_features = 101 def mean_1( # noqa: WPS430 t: np.typing.NDArray[np.float_], @@ -66,7 +66,7 @@ def mean_1( # noqa: WPS430 rmh.fit(X, y) point_mask = rmh.get_support() points = X.grid_points[0][point_mask] - np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-1) + np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-2) @pytest.mark.filterwarnings( 'ignore::sklearn.exceptions.ConvergenceWarning' @@ -82,8 +82,8 @@ def test_fit_exponential(self) -> None: join. 
""" - n_samples = 10000 - n_features = 100 + n_samples = 1000 + n_features = 101 def mean_1( # noqa: WPS430 t: np.typing.NDArray[np.float_], @@ -129,7 +129,7 @@ def mean_1( # noqa: WPS430 rmh.fit(X, y) point_mask = rmh.get_support() points = X.grid_points[0][point_mask] - np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-1) + np.testing.assert_allclose(points, [0.25, 0.5, 0.75], rtol=1e-2) if __name__ == '__main__': diff --git a/skfda/tests/test_regression.py b/skfda/tests/test_regression.py index 6bab0950f..1bb4fc0c4 100644 --- a/skfda/tests/test_regression.py +++ b/skfda/tests/test_regression.py @@ -1,3 +1,4 @@ +"""Tests for scalar and functional response linear regression.""" from __future__ import annotations import unittest @@ -6,8 +7,9 @@ import numpy as np import pandas as pd from scipy.integrate import cumtrapz +from sklearn.preprocessing import OneHotEncoder -from skfda.datasets import make_gaussian, make_gaussian_process +from skfda.datasets import fetch_weather, make_gaussian, make_gaussian_process from skfda.misc.covariances import Gaussian from skfda.misc.operators import LinearDifferentialOperator from skfda.misc.regularization import L2Regularization @@ -22,16 +24,56 @@ from skfda.representation.grid import FDataGrid +def _test_linear_regression_common( + X_train, + y_train, + expected_coefs, + X_test, + y_test, + coef_basis=None, + regularization=None, +) -> None: + """Execute a test of linear regression, given the parameters.""" + linear_regression = LinearRegression( + fit_intercept=False, + coef_basis=coef_basis, + regularization=regularization, + ) + linear_regression.fit(X_train, y_train) + + for coef, expected in zip(linear_regression.coef_, expected_coefs): + assert isinstance(coef, FDataBasis) + assert coef.basis == expected.basis + np.testing.assert_allclose( + coef.coefficients, + expected.coefficients, + atol=1e-6, + ) + + y_pred = linear_regression.predict(X_test) + assert isinstance(y_pred, FDataBasis) + assert y_pred.basis == y_test.basis + np.testing.assert_allclose( + y_pred.coefficients, + y_test.coefficients, + rtol=1e-5, + ) + + class TestScalarLinearRegression(unittest.TestCase): + """Tests for linear regression with scalar response.""" - def test_regression_single_explanatory(self) -> None: + def test_single_explanatory(self) -> None: + """Test a basic example of functional regression. + Scalar response with functional covariates. 
+ """ x_basis = MonomialBasis(n_basis=7) x_fd = FDataBasis(x_basis, np.identity(7)) beta_basis = FourierBasis(n_basis=5) beta_fd = FDataBasis(beta_basis, [1, 1, 1, 1, 1]) - y = np.array([ + y = [ 0.9999999999999993, 0.162381381441085, 0.08527083481359901, @@ -39,7 +81,7 @@ def test_regression_single_explanatory(self) -> None: 0.09532291032042489, 0.10550022969639987, 0.11382675064746171, - ]) + ] scalar = LinearRegression(coef_basis=[beta_basis]) scalar.fit(x_fd, y) @@ -49,9 +91,7 @@ def test_regression_single_explanatory(self) -> None: beta_fd.coefficients, ) np.testing.assert_allclose( - scalar.intercept_, - 0.0, - atol=1e-6, + scalar.intercept_, 0.0, atol=1e-6, ) y_pred = scalar.predict(x_fd) @@ -68,15 +108,18 @@ def test_regression_single_explanatory(self) -> None: beta_fd.coefficients, ) np.testing.assert_equal( - scalar.intercept_, - 0.0, + scalar.intercept_, 0.0, ) y_pred = scalar.predict(x_fd) np.testing.assert_allclose(y_pred, y) - def test_regression_multiple_explanatory(self) -> None: - y = np.array([1, 2, 3, 4, 5, 6, 7]) + def test_multiple_explanatory(self) -> None: + """Test a example of functional regression. + + Scalar response with functional covariates. + """ + y = [1, 2, 3, 4, 5, 6, 7] X = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) @@ -87,9 +130,7 @@ def test_regression_multiple_explanatory(self) -> None: scalar.fit(X, y) np.testing.assert_allclose( - scalar.intercept_.round(4), - np.array([32.65]), - rtol=1e-3, + scalar.intercept_.round(4), np.array([32.65]), rtol=1e-3, ) assert isinstance(scalar.coef_[0], FDataBasis) @@ -101,39 +142,38 @@ def test_regression_multiple_explanatory(self) -> None: -188.587, 236.5832, -481.3449, - ]]), - rtol=1e-3, + ]]), rtol=1e-3, ) y_pred = scalar.predict(X) np.testing.assert_allclose(y_pred, y, atol=0.01) - def test_regression_mixed(self) -> None: + def test_mixed(self) -> None: + """Test a example of functional regression. - multivariate = np.array([ - [0, 0], [2, 7], [1, 7], [3, 9], - [4, 16], [2, 14], [3, 5], + Scalar response with multivariate and functional covariates. 
+ """ + multivariate = np.array( + [[0, 0], [2, 7], [1, 7], [3, 9], [4, 16], [2, 14], [3, 5]], + ) + + x_fd = FDataBasis(MonomialBasis(n_basis=3), [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [1, 0, 1], + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], ]) - X: Sequence[ - np.typing.NDArray[np.float_] | FDataBasis, - ] = [ - multivariate, - FDataBasis( - MonomialBasis(n_basis=3), - [ - [1, 0, 0], [0, 1, 0], [0, 0, 1], - [1, 0, 1], [1, 0, 0], [0, 1, 0], - [0, 0, 1], - ], - ), - ] + X = [multivariate, x_fd] + # y = 2 + sum([3, 1] * array) + int(3 * function) # noqa: E800 intercept = 2 coefs_multivariate = np.array([3, 1]) coefs_functions = FDataBasis( - MonomialBasis(n_basis=3), - [[3, 0, 0]], + MonomialBasis(n_basis=3), [[3, 0, 0]], ) y_integral = np.array([3, 3 / 2, 1, 4, 3, 3 / 2, 1]) y_sum = multivariate @ coefs_multivariate @@ -143,15 +183,11 @@ def test_regression_mixed(self) -> None: scalar.fit(X, y) np.testing.assert_allclose( - scalar.intercept_, - intercept, - atol=0.01, + scalar.intercept_, intercept, atol=0.01, ) np.testing.assert_allclose( - scalar.coef_[0], - coefs_multivariate, - atol=0.01, + scalar.coef_[0], coefs_multivariate, atol=0.01, ) assert isinstance(scalar.coef_[1], FDataBasis) @@ -203,16 +239,25 @@ def test_same_result_1d_2d_multivariate_arrays(self) -> None: linear2.coef_[2], ) - def test_regression_df_multivariate(self) -> None: # noqa: D102 + def test_df_multivariate(self) -> None: + """Test a example of functional regression with Dataframe input. + Scalar response with multivariate and functional covariates. + """ multivariate1 = [0, 2, 1, 3, 4, 2, 3] multivariate2 = [0, 7, 7, 9, 16, 14, 5] multivariate = [list(obs) for obs in zip(multivariate1, multivariate2)] x_basis = MonomialBasis(n_basis=3) - x_fd = FDataBasis(x_basis, [[1, 0, 0], [0, 1, 0], [0, 0, 1], - [1, 0, 1], [1, 0, 0], [0, 1, 0], - [0, 0, 1]]) + x_fd = FDataBasis(x_basis, [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [1, 0, 1], + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + ]) cov_dict = {"fd": x_fd, "mult1": multivariate1, "mult2": multivariate2} @@ -252,26 +297,31 @@ def test_regression_df_multivariate(self) -> None: # noqa: D102 y_pred = scalar.predict(df) np.testing.assert_allclose(y_pred, y, atol=0.01) - def test_regression_mixed_regularization(self) -> None: + def test_mixed_regularization(self) -> None: + """Test a example of functional regression. + Scalar response with multivariate and functional covariates + using regularization. 
+ """ multivariate = np.array([ - [0, 0], [2, 7], [1, 7], [3, 9], - [4, 16], [2, 14], [3, 5], + [0, 0], [2, 7], [1, 7], [3, 9], [4, 16], [2, 14], [3, 5], + ]) + + x_fd = FDataBasis(MonomialBasis(n_basis=3), [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [1, 0, 1], + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], ]) X: Sequence[ np.typing.NDArray[np.float_] | FDataBasis, - ] = [ - multivariate, - FDataBasis( - MonomialBasis(n_basis=3), - [ - [1, 0, 0], [0, 1, 0], [0, 0, 1], - [1, 0, 1], [1, 0, 0], [0, 1, 0], - [0, 0, 1], - ]), - ] + ] = [multivariate, x_fd] + # y = 2 + sum([3, 1] * array) + int(3 * function) # noqa: E800 intercept = 2 coefs_multivariate = np.array([3, 1]) y_integral = np.array([3, 3 / 2, 1, 4, 3, 3 / 2, 1]) @@ -289,9 +339,7 @@ def test_regression_mixed_regularization(self) -> None: scalar.fit(X, y) np.testing.assert_allclose( - scalar.intercept_, - intercept, - atol=0.01, + scalar.intercept_, intercept, atol=0.01, ) np.testing.assert_allclose( @@ -311,23 +359,28 @@ def test_regression_mixed_regularization(self) -> None: np.testing.assert_allclose( y_pred, [ - 5.349035, 16.456464, 13.361185, 23.930295, - 32.650965, 23.961766, 16.29029, + 5.349035, + 16.456464, + 13.361185, + 23.930295, + 32.650965, + 23.961766, + 16.29029, ], atol=0.01, ) - def test_regression_regularization(self) -> None: + def test_regularization(self) -> None: + """Test a example of functional regression. + Scalar response with functional covariates using regularization. + """ x_basis = MonomialBasis(n_basis=7) x_fd = FDataBasis(x_basis, np.identity(7)) beta_basis = FourierBasis(n_basis=5) - beta_fd = FDataBasis( - beta_basis, - [1.0403, 0, 0, 0, 0], - ) - y = np.array([ + beta_fd = FDataBasis(beta_basis, [1.0403, 0, 0, 0, 0]) + y = [ 1.0000684777229512, 0.1623672257830915, 0.08521053851548224, @@ -335,7 +388,7 @@ def test_regression_regularization(self) -> None: 0.09529138749665378, 0.10549625973303875, 0.11384314859153018, - ]) + ] y_pred_compare = [ 0.890341, @@ -361,23 +414,19 @@ def test_regression_regularization(self) -> None: atol=1e-3, ) np.testing.assert_allclose( - scalar.intercept_, - -0.15, - atol=1e-4, + scalar.intercept_, -0.15, atol=1e-4, ) y_pred = scalar.predict(x_fd) np.testing.assert_allclose(y_pred, y_pred_compare, atol=1e-4) x_basis = MonomialBasis(n_basis=3) - x_fd = FDataBasis( - x_basis, - [ - [1, 0, 0], - [0, 1, 0], - [0, 0, 1], - [2, 0, 1], - ]) + x_fd = FDataBasis(x_basis, [ + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + [2, 0, 1], + ]) beta_fd = FDataBasis(x_basis, [3, 2, 1]) y = np.array([1 + 13 / 3, 1 + 29 / 12, 1 + 17 / 10, 1 + 311 / 30]) @@ -385,10 +434,8 @@ def test_regression_regularization(self) -> None: # Non regularized scalar = LinearRegression() scalar.fit(x_fd, y) - assert isinstance(scalar.coef_[0], FDataBasis) np.testing.assert_allclose( - scalar.coef_[0].coefficients, - beta_fd.coefficients, + scalar.coef_[0].coefficients, beta_fd.coefficients, ) np.testing.assert_allclose(scalar.intercept_, 1) @@ -412,63 +459,19 @@ def test_regression_regularization(self) -> None: atol=0.001, ) np.testing.assert_allclose( - scalar_reg.intercept_, - 0.998, - atol=0.001, + scalar_reg.intercept_, 0.998, atol=0.001, ) y_pred = scalar_reg.predict(x_fd) np.testing.assert_allclose(y_pred, y_reg, atol=0.001) - def test_error_X_not_FData(self) -> None: - """Tests that at least one variable is an FData object.""" - x_fd = np.identity(7) - y = np.zeros(7) - - scalar = LinearRegression(coef_basis=[FourierBasis(n_basis=5)]) - - with np.testing.assert_warns(UserWarning): - scalar.fit([x_fd], y) - - def 
test_error_y_is_FData(self) -> None: - """Tests that none of the explained variables is an FData object.""" - x_fd = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) - y = list(FDataBasis(MonomialBasis(n_basis=7), np.identity(7))) - - scalar = LinearRegression(coef_basis=[FourierBasis(n_basis=5)]) - - with np.testing.assert_raises(ValueError): - scalar.fit([x_fd], y) # type: ignore[arg-type] - - def test_error_X_beta_len_distinct(self) -> None: - """Test that the number of beta bases and explanatory variables - are not different """ - x_fd = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) - y = np.array([1 for _ in range(7)]) - beta = FourierBasis(n_basis=5) - - scalar = LinearRegression(coef_basis=[beta]) - with np.testing.assert_raises(ValueError): - scalar.fit([x_fd, x_fd], y) - - scalar = LinearRegression(coef_basis=[beta, beta]) - with np.testing.assert_raises(ValueError): - scalar.fit([x_fd], y) - - def test_error_y_X_samples_different(self) -> None: - """Test that the number of response samples and explanatory samples - are not different """ + def test_error_y_X_samples_different(self) -> None: # noqa: N802 + """Number of response samples and explanatory samples are not different. + Raises ValueError when response is scalar. + """ x_fd = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) - y = np.array([1 for _ in range(8)]) - beta = FourierBasis(n_basis=5) - - scalar = LinearRegression(coef_basis=[beta]) - with np.testing.assert_raises(ValueError): - scalar.fit([x_fd], y) - - x_fd = FDataBasis(MonomialBasis(n_basis=8), np.identity(8)) - y = np.array([1 for _ in range(7)]) + y = [1 for _ in range(8)] beta = FourierBasis(n_basis=5) scalar = LinearRegression(coef_basis=[beta]) @@ -476,7 +479,7 @@ def test_error_y_X_samples_different(self) -> None: scalar.fit([x_fd], y) def test_error_beta_not_basis(self) -> None: - """Test that all beta are Basis objects. """ + """Test that all beta are Basis objects.""" x_fd = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) y = np.array([1 for _ in range(7)]) beta = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) @@ -486,7 +489,7 @@ def test_error_beta_not_basis(self) -> None: scalar.fit([x_fd], y) def test_error_weights_lenght(self) -> None: - """Test that the number of weights is equal to n_samples.""" + """Number of weights is equal to the number of samples.""" x_fd = FDataBasis(MonomialBasis(n_basis=7), np.identity(7)) y = np.array([1 for _ in range(7)]) weights = np.array([1 for _ in range(8)]) @@ -508,6 +511,446 @@ def test_error_weights_negative(self) -> None: scalar.fit([x_fd], y, weights) +class TestFunctionalLinearRegression(unittest.TestCase): + """Tests for linear regression with functional response.""" + + def test_multivariate_covariates_constant_basic(self) -> None: + """ + Univariate with functional response and constant coefficient one. 
+ """ + y_basis = ConstantBasis() + + X_train = pd.DataFrame({ + "covariate": [1, 3, 5], + }) + y_train = FDataBasis( + basis=y_basis, + coefficients=[ + [1], + [3], + [5], + ], + ) + + expected_coefs = [ + FDataBasis( + basis=y_basis, + coefficients=[[1]], + ) + ] + + X_test = pd.DataFrame({ + "covariate": [2, 4, 6], + }) + y_test = FDataBasis( + basis=y_basis, + coefficients=[ + [2], + [4], + [6], + ], + ) + + _test_linear_regression_common( + X_train=X_train, + y_train=y_train, + expected_coefs=expected_coefs, + X_test=X_test, + y_test=y_test, + ) + + # Check also without dataframes + _test_linear_regression_common( + X_train=X_train.to_numpy(), + y_train=y_train, + expected_coefs=expected_coefs, + X_test=X_test.to_numpy(), + y_test=y_test, + ) + + def test_multivariate_covariates_monomial_basic(self) -> None: + """ + Multivariate with functional response and identity coefficients. + """ + y_basis = MonomialBasis(n_basis=2) + + X_train = pd.DataFrame({ + "covariate_1": [1, 3, 5], + "covariate_2": [2, 4, 6], + }) + y_train = FDataBasis( + basis=y_basis, + coefficients=[ + [1, 2], + [3, 4], + [5, 6], + ], + ) + + expected_coefs = [ + FDataBasis( + basis=y_basis, + coefficients=[[1, 0]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[0, 1]], + ), + ] + + X_test = pd.DataFrame({ + "covariate_1": [2, 4, 6], + "covariate_2": [1, 3, 5], + }) + y_test = FDataBasis( + basis=y_basis, + coefficients=[ + [2, 1], + [4, 3], + [6, 5], + ], + ) + + _test_linear_regression_common( + X_train=X_train, + y_train=y_train, + expected_coefs=expected_coefs, + X_test=X_test, + y_test=y_test, + ) + + # Check also without dataframes + expected_coefs = [ + FDataBasis( + basis=y_basis, + coefficients=[ + [1, 0], + [0, 1], + ], + ), + ] + + # Currently broken. + + # _test_linear_regression_common( + # X_train=X_train.to_numpy(), + # y_train=y_train, + # expected_coefs=expected_coefs, + # X_test=X_test.to_numpy(), + # y_test=y_test, + # ) + + def test_multivariate_3_covariates(self) -> None: + """Test a more complex example involving 3 covariates.""" + y_basis = MonomialBasis(n_basis=3) + + X_train = pd.DataFrame({ + "covariate_1": [3, 5, 3], + "covariate_2": [4, 1, 2], + "covariate_3": [1, 6, 8], + }) + y_train = FDataBasis( + basis=y_basis, + coefficients=[ + [47, 22, 24], + [43, 47, 39], + [40, 53, 51], + ], + ) + + expected_coefs = [ + FDataBasis( + basis=y_basis, + coefficients=[[6, 3, 1]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[7, 2, 4]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[1, 5, 5]], + ), + ] + + X_test = pd.DataFrame({ + "covariate_1": [3], + "covariate_2": [2], + "covariate_3": [1], + }) + y_test = FDataBasis( + basis=y_basis, + coefficients=[[33, 18, 16]], + ) + + _test_linear_regression_common( + X_train=X_train, + y_train=y_train, + expected_coefs=expected_coefs, + X_test=X_test, + y_test=y_test, + ) + + def test_multivariate_covariates_regularization(self) -> None: + """Test a example of functional regression. + + Functional response with multivariate covariates and + beta regularization. 
+ """ + y_basis = MonomialBasis(n_basis=3) + + X_train = pd.DataFrame({ + "covariate_1": [3, 5, 3], + "covariate_2": [4, 1, 2], + "covariate_3": [1, 6, 8], + }) + y_train = FDataBasis( + basis=y_basis, + coefficients=[ + [47, 22, 24], + [43, 47, 39], + [40, 53, 51], + ], + ) + + expected_coefs = [ + FDataBasis( + basis=y_basis, + coefficients=[[5.769441, 3.025921, 1.440655]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[6.688267, 1.938523, 3.579894]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[1.198499, 4.952166, 4.811818]], + ), + ] + + X_test = pd.DataFrame({ + "covariate_1": [3], + "covariate_2": [2], + "covariate_3": [1], + }) + y_test = FDataBasis( + basis=y_basis, + coefficients=[[31.883356, 17.906975, 16.293571]], + ) + + _test_linear_regression_common( + X_train=X_train, + y_train=y_train, + expected_coefs=expected_coefs, + X_test=X_test, + y_test=y_test, + regularization=[L2Regularization()] * 3, + ) + + def test_multivariate_covariates_R_fda(self) -> None: # noqa: N802 + """Test a example with Canadian Weather comparing with R fda package. + + Code used in R: + daybasis65 <- create.fourier.basis( + rangeval=c(0, 365), nbasis=65, axes=list('axesIntervals')) + Temp.fd <- with(CanadianWeather, smooth.basisPar(day.5, + dailyAv[,,'Temperature.C'], daybasis65)$fd) + TempRgn.f <- fRegress(Temp.fd ~ region, CanadianWeather) + write.table( + t(round( + TempRgn.f$betaestlist$const$fd$coefs, + digits=4)), + file="", sep = ",", col.names = FALSE, row.names = FALSE + ) + write.table( + t(round( + TempRgn.f$betaestlist$region.Atlantic$fd$coefs, + digits=4)), + file="", sep = ",", col.names = FALSE, row.names = FALSE + ) + write.table( + t(round( + TempRgn.f$betaestlist$region.Continental$fd$coefs, + digits=4)), + file="", sep = ",", col.names = FALSE, row.names = FALSE) + write.table( + t(round( + TempRgn.f$betaestlist$region.Pacific$fd$coefs, + digits=4)), + file="", sep = ",", col.names = FALSE, row.names = FALSE) + """ + X_weather, y_weather = fetch_weather( + return_X_y=True, as_frame=True, + ) + fd = X_weather.iloc[:, 0].values + + y_basis = FourierBasis(n_basis=65) + y_fd = fd.coordinates[0].to_basis(y_basis) + + enc = OneHotEncoder(handle_unknown='ignore') + enc.fit([['Atlantic'], ['Continental'], ['Pacific']]) + X = np.array(y_weather).reshape(-1, 1) + X = enc.transform(X).toarray() + + cov_dict = {"mult1": X[:, 0], "mult2": X[:, 1], "mult3": X[:, 2]} + df = pd.DataFrame(cov_dict) + + beta_const_R = [ # noqa: WPS317 + -225.5085, -110.817, -243.4708, 4.6815, 21.4488, 10.3784, 2.6317, + 1.7571, 2.4812, -1.5179, 1.4451, -0.6581, 2.8287, 0.4106, 1.5839, + -1.711, 0.5587, -2.2268, 2.4745, -0.5179, -0.8376, -3.1504, + -0.1357, -0.1186, 1.1512, 0.7343, 1.842, -0.5258, 1.2263, -0.576, + -0.6754, -0.6952, -0.416, -1.0292, 1.6742, 0.4276, 0.5185, -0.2135, + 0.3239, 1.6598, 1.0682, 2.2478, 0.2692, 1.8589, -0.5416, 0.5256, + -1.6838, -1.1174, 0.1842, -0.3521, 0.1809, -1.6302, 0.6676, + -0.3356, 1.036, -0.6297, 0.4227, -0.3096, 1.1373, 0.6317, 0.3608, + -0.9949, -0.709, -0.4588, -0.5694, + ] + + beta_atlantic_R = [ # noqa: WPS317 + 312.966, 35.9273, 67.7156, -12.9111, -27.3945, -18.3422, + -6.6074, -0.0203, -4.5716, 3.3142, -1.8419, 2.2008, -3.1554, + -0.8167, -1.6248, 1.4791, -0.8676, 2.9854, -2.5819, -0.239, 0.6418, + 2.2211, 1.4992, -2.2746, 0.6767, -2.8692, 1.478, 0.5988, -0.3434, + -0.2574, 2.3693, -0.016, 1.4911, 3.2798, -0.6508, 1.3326, -0.6729, + 1.0736, -0.7874, -1.2653, -1.8837, -3.1971, 0.0166, -1.298, 0.1403, + -1.2479, 0.593, 0.715, 0.1659, 0.8047, -1.2938, 0.7217, 
-1.1323, + -0.9719, -1.256, 0.8089, -0.1986, 0.7974, -0.4129, -0.6855, + -0.6397, 3.2471, 0.4686, 1.3593, 0.9434, + ] + + beta_continental_R = [ # noqa: WPS317 + 214.8319, 41.1702, 6.2763, -11.5837, -40.6003, -10.9865, -6.6548, + 4.2589, -3.5174, 0.9494, 1.5624, -3.1435, -1.3242, -1.6431, + -1.0234, 2.0606, -1.1042, -0.1723, -4.2717, -0.9321, 1.2331, + 2.0911, -1.0444, -1.757, -1.9564, -2.3117, -3.0405, -1.3801, + -1.7431, -2.0031, 0.7171, -0.6877, 0.7969, -1.01, -0.1761, -2.7614, + 0.8308, -0.7232, 1.671, 0.0118, 1.8239, 0.5399, 1.8575, 0.9313, + 1.6813, 0.834, 2.1028, 1.8707, -0.147, -0.6401, -0.165, 1.5439, + -0.4666, 0.2153, -0.8795, 0.4695, 0.0417, 0.7045, -1.1045, 0.0166, + -0.7447, 1.4645, 1.5654, -0.3106, 0.7647, + ] + + beta_pacific_R = [ # noqa: WPS317 + 375.1732, 78.6384, 127.8782, 6.0014, -29.3124, -11.4446, -5.3623, + -1.1054, -5.4936, 0.5137, 0.0086, -0.7174, -5.2713, -1.2635, + -1.6654, -0.5359, -2.4626, 1.8152, -4.0212, 0.8431, -1.7737, + 3.7342, -2.0556, 0.0382, -2.4436, -1.9431, -3.6757, -0.6956, + -2.8307, -0.7396, 1.6465, -0.3534, 0.903, 0.0484, -1.6763, -1.6237, + -0.9657, -1.6763, -0.2481, -1.3371, -0.6295, -2.4142, 0.9318, + -1.1531, 0.8854, -0.966, 1.6884, 1.6327, -0.1843, 0.1531, -0.7279, + 0.8348, -0.4336, -0.1253, -1.0069, 0.2815, -0.3406, 1.4044, + -1.6412, 0.4354, -1.2269, 0.9194, 1.0373, 0.7552, 1.088, + ] + + linear_regression = LinearRegression() + linear_regression.fit(df, y_fd) + + np.testing.assert_allclose( + linear_regression.basis_coefs[0].ravel(), beta_const_R, atol=0.001, + ) + np.testing.assert_allclose( + linear_regression.basis_coefs[1].ravel(), beta_atlantic_R, atol=0.001, + ) + np.testing.assert_allclose( + linear_regression.basis_coefs[2].ravel(), beta_continental_R, atol=0.001, + ) + np.testing.assert_allclose( + linear_regression.basis_coefs[3].ravel(), beta_pacific_R, atol=0.001, + ) + np.testing.assert_equal(linear_regression.coef_[0].basis, y_fd.basis) + + def test_functional_covariates_concurrent(self) -> None: # noqa: N802 + """ + Test a example of concurrent functional regression. + + Functional response with functional and multivariate covariates. + Concurrent model. + """ + y_basis = MonomialBasis(n_basis=2) + x_basis = MonomialBasis(n_basis=3) + + X_train = pd.DataFrame({ + "covariate_1": FDataBasis( + basis=x_basis, + coefficients=[ + [0, 1, 0], + [0, 1, 0], + [0, 0, 1], + ], + ), + "covariate_2": [3, 1, 4], + "covariate_3": [9, 2, 7], + }) + y_train = FDataBasis( + basis=y_basis, + coefficients=[ + [1, 1], + [2, 1], + [3, 1], + ], + ) + + expected_coefs = [ + FDataBasis( + basis=y_basis, + coefficients=[[5.608253, -2.866976]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[1.842478, -0.507984]], + ), + FDataBasis( + basis=y_basis, + coefficients=[[-0.55036, -0.032797]], + ), + ] + + X_test = pd.DataFrame({ + "covariate_1": FDataBasis( + basis=x_basis, + coefficients=[ + [0, 0, 1], + [0, 0, 1], + [1, 1, 0], + ], + ), + "covariate_2": [2, 1, 1], + "covariate_3": [0, 2, 1], + }) + y_test = FDataBasis( + basis=y_basis, + coefficients=[ + [3.323643, 2.012006], + [0.380445, 2.454396], + [7.378201, -0.666481], + ], + ) + + _test_linear_regression_common( + X_train=X_train, + y_train=y_train, + expected_coefs=expected_coefs, + X_test=X_test, + y_test=y_test, + coef_basis=[y_basis, y_basis, y_basis], + ) + + def test_error_y_X_samples_different(self) -> None: # noqa: N802 + """Number of response samples and explanatory samples are not different. + + Raises ValueError when response is functional. 
+ """ + y_basis = MonomialBasis(n_basis=2) + X = [[1, 2], [3, 4], [5, 6], [1, 0]] + + y_fd = FDataBasis(y_basis, [[1, 2], [3, 4], [5, 6]]) + + funct_reg = LinearRegression() + with np.testing.assert_raises(ValueError): + funct_reg.fit(X, y_fd) + + class TestHistoricalLinearRegression(unittest.TestCase): """Tests for historical linear regression.""" @@ -620,7 +1063,7 @@ def test_historical(self) -> None: np.testing.assert_allclose( regression.coef_.data_matrix[0, ..., 0], np.triu(self.coefficients.data_matrix[0, ..., 0]), - atol=0.3, + atol=0.35, rtol=0, ) diff --git a/skfda/tests/test_smoothing.py b/skfda/tests/test_smoothing.py index a76825640..315d2b3a5 100644 --- a/skfda/tests/test_smoothing.py +++ b/skfda/tests/test_smoothing.py @@ -4,6 +4,7 @@ import numpy as np import sklearn +from sklearn.datasets import load_digits from typing_extensions import Literal import skfda @@ -173,6 +174,39 @@ def test_knn(self) -> None: np.testing.assert_allclose(hat_matrix, hat_matrix_r) +class TestSurfaceSmoother(unittest.TestCase): + """Test that smoothing works on surfaces.""" + + def test_knn(self) -> None: + """Check 2D smoothing using digits dataset.""" + X, y = load_digits(return_X_y=True) + X = X.reshape(-1, 8, 8) + + fd = skfda.FDataGrid(X) + + ks = KernelSmoother( + kernel_estimator=KNeighborsHatMatrix(n_neighbors=1)) + fd_trans = ks.fit_transform(fd) + + np.testing.assert_allclose(fd_trans.data_matrix, fd.data_matrix) + + ks = KernelSmoother( + kernel_estimator=KNeighborsHatMatrix(n_neighbors=3)) + fd_trans = ks.fit_transform(fd) + + res = [ + [0, 1.25, 7.75, 10.5, 8.25, 6.25, 1.5, 0], + [0, 3.2, 9.6, 10.6, 9.8, 8.4, 5.6, 1.25], + [0.75, 4.4, 9, 6.4, 4.6, 8.4, 6.4, 2], + [1, 4.8, 7.8, 2.8, 1.6, 7.2, 6.4, 2], + [1.25, 4.2, 7.2, 1.6, 2, 7.4, 6.4, 2], + [1, 4.4, 7.4, 3.4, 4.6, 8.2, 5.4, 1.75], + [0.5, 4, 7.6, 8.4, 7.6, 6.8, 3.8, 0], + [0, 2, 8.25, 8.5, 8.25, 5.5, 0, 0], + ] + np.testing.assert_allclose(fd_trans.data_matrix[0, ..., 0], res) + + class TestBasisSmoother(unittest.TestCase): """Test Basis Smoother.""" diff --git a/skfda/tests/test_stats_std.py b/skfda/tests/test_stats_std.py new file mode 100644 index 000000000..24f6687af --- /dev/null +++ b/skfda/tests/test_stats_std.py @@ -0,0 +1,188 @@ +"""Test stats functions.""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import pytest + +from skfda import FDataBasis, FDataGrid +from skfda.exploratory.stats import std +from skfda.representation.basis import ( + Basis, + FourierBasis, + MonomialBasis, + TensorBasis, + VectorValuedBasis, +) + +# Fixtures for test_std_fdatabasis_vector_valued_basis + + +@pytest.fixture(params=[3, 5]) +def vv_n_basis1(request: Any) -> int: + """n_basis for 1st coordinate of vector valued basis.""" + return request.param # type: ignore[no-any-return] + + +@pytest.fixture +def vv_basis1(vv_n_basis1: int) -> Basis: + """1-dimensional basis to test for vector valued basis.""" + # First element of the basis is assumed to be the 1 function + return MonomialBasis( + n_basis=vv_n_basis1, domain_range=(0, 1), + ) + + +@pytest.fixture(params=[FourierBasis, MonomialBasis]) +def vv_basis2(request: Any, vv_n_basis2: int = 3) -> Basis: + """1-dimensional basis to test for vector valued basis.""" + # First element of the basis is assumed to be the 1 function + return request.param( # type: ignore[no-any-return] + domain_range=(0, 1), n_basis=vv_n_basis2, + ) + + +# Fixtures for test_std_fdatabasis_tensor_basis + +@pytest.fixture(params=[FourierBasis]) +def t_basis1(request: Any, 
t_n_basis1: int = 3) -> Basis: + """1-dimensional basis to test for tensor basis.""" + # First element of the basis is assumed to be the 1 function + return request.param( # type: ignore[no-any-return] + domain_range=(0, 1), n_basis=t_n_basis1, + ) + + +@pytest.fixture(params=[MonomialBasis]) +def t_basis2(request: Any, t_n_basis2: int = 5) -> Basis: + """1-dimensional basis to test for tensor basis.""" + # First element of the basis is assumed to be the 1 function + return request.param( # type: ignore[no-any-return] + domain_range=(0, 1), n_basis=t_n_basis2, + ) + + +# Tests + +def test_std_fdatagrid_1d_to_2d() -> None: + """Test std_fdatagrid with R to R^2 functions.""" + fd = FDataGrid( + data_matrix=[ + [[0, 1, 2, 3, 4, 5], [0, -1, -2, -3, -4, -5]], + [[2, 3, 4, 5, 6, 7], [-2, -3, -4, -5, -6, -7]], + ], + grid_points=[ + [-2, -1], + [0, 1, 2, 3, 4, 5], + ], + ) + expected_std_data_matrix = np.full((1, 2, 6, 1), np.sqrt(2)) + np.testing.assert_allclose( + std(fd).data_matrix, + expected_std_data_matrix, + ) + + +def test_std_fdatagrid_2d_to_2d() -> None: + """Test std_fdatagrid with R to R^2 functions.""" + fd = FDataGrid( + data_matrix=[ + [ + [[10, 11], [10, 12], [11, 14]], + [[15, 16], [12, 15], [20, 13]], + ], + [ + [[11, 12], [11, 13], [12, 13]], + [[14, 15], [11, 16], [21, 12]], + ], + ], + grid_points=[ + [0, 1], + [0, 1, 2], + ], + ) + expected_std_data_matrix = np.full((1, 2, 3, 2), np.sqrt(1 / 2)) + np.testing.assert_allclose( + std(fd).data_matrix, + expected_std_data_matrix, + ) + + +def test_std_fdatabasis_vector_valued_basis( + vv_basis1: Basis, + vv_basis2: Basis, +) -> None: + """Test std_fdatabasis with a vector valued basis.""" + basis = VectorValuedBasis([vv_basis1, vv_basis2]) + + # coefficients of the function===(1, 1) + one_coefficients = np.concatenate(( + np.pad([1], (0, vv_basis1.n_basis - 1)), + np.pad([1], (0, vv_basis2.n_basis - 1)), + )) + + fd = FDataBasis( + basis=basis, + coefficients=[np.zeros(basis.n_basis), one_coefficients], + ) + + np.testing.assert_allclose( + std(fd).coefficients, + np.array([np.sqrt(1 / 2) * one_coefficients]), + rtol=1e-7, + atol=1e-7, + ) + + +def test_std_fdatabasis_tensor_basis( + t_basis1: Basis, + t_basis2: Basis, +) -> None: + """Test std_fdatabasis with a vector valued basis.""" + basis = TensorBasis([t_basis1, t_basis2]) + + # coefficients of the function===1 + one_coefficients = np.pad([1], (0, basis.n_basis - 1)) + + fd = FDataBasis( + basis=basis, + coefficients=[np.zeros(basis.n_basis), one_coefficients], + ) + + np.testing.assert_allclose( + std(fd).coefficients, + np.array([np.sqrt(1 / 2) * one_coefficients]), + rtol=1e-7, + atol=1e-7, + ) + + +def test_std_fdatabasis_2d_to_2d() -> None: + """Test std_fdatabasis with R^2 to R^2 basis.""" + basis = VectorValuedBasis([ + TensorBasis([ + MonomialBasis(domain_range=(0, 1), n_basis=2), + MonomialBasis(domain_range=(0, 1), n_basis=2), + ]), + TensorBasis([ + MonomialBasis(domain_range=(0, 1), n_basis=2), + MonomialBasis(domain_range=(0, 1), n_basis=2), + ]), + ]) + fd = FDataBasis( + basis=basis, + coefficients=[ + [0, 0, 0, 0, 0, 0, 0, 0], + [1, 0, 0, 0, 1, 0, 0, 0], + ], + ) + expected_coefficients = np.array([[np.sqrt(1 / 2), 0, 0, 0] * 2]) + + np.testing.assert_allclose( + std(fd).coefficients, + expected_coefficients, + rtol=1e-7, + atol=1e-7, + )