Source code for asgardpy.stats.stats

"""
Module for performing some statistic functions.
"""

import numpy as np
from gammapy.stats.fit_statistics import cash, wstat
from scipy.stats import chi2, norm

__all__ = [
    "check_model_preference_aic",
    "check_model_preference_lrt",
    "fetch_pivot_energy",
    "get_chi2_sig_pval",
    "get_goodness_of_fit_stats",
    "get_ts_target",
]



[docs]
def get_chi2_sig_pval(test_stat, ndof):
    """
    Using the log-likelihood value for a model fitting to data, along with the
    total degrees of freedom, evaluate the significance value in terms of gaussian
    distribution along with one-tailed p-value for the fitting statistics.

    In Gammapy, for 3D analysis, cash statistics is used, while for 1D analysis,
    wstat statistics is used. Check the documentation for more details
    https://docs.gammapy.org/1.3/user-guide/stats/index.html

    Parameters
    ----------
    test_stat: float
        The test statistic (-2 ln L) value of the fitting.
    ndof: int
        Total number of degrees of freedom.

    Returns
    -------
    chi2_sig: float
        significance (Chi2) of the likelihood of fit model estimated in
        Gaussian distribution.
    pval: float
        p-value for the model fitting

    """
    pval = chi2.sf(test_stat, ndof)
    chi2_sig = norm.isf(pval / 2)

    return chi2_sig, pval




[docs]
def check_model_preference_lrt(test_stat_1, test_stat_2, ndof_1, ndof_2):
    """
    Log-likelihood ratio test. Checking the preference of a "nested" spectral
    model2 (observed), over a primary model1.

    Parameters
    ----------
    test_stat_1: float
        The test statistic (-2 ln L) of the Fit result of the primary spectral model.
    test_stat_2: float
        The test statistic (-2 ln L) of the Fit result of the nested spectral model.
    ndof_1: int
        Number of degrees of freedom for the primary model
    ndof_2: int
        Number of degrees of freedom for the nested model

    Returns
    -------
    p_value: float
        p-value for the ratio of the likelihoods
    gaussian_sigmas: float
        significance (Chi2) of the ratio of the likelihoods estimated in
        Gaussian distribution.
    n_dof: int
        number of degrees of freedom or free parameters between primary and
        nested model.
    """
    n_dof = ndof_1 - ndof_2

    if n_dof < 1:
        print(f"DoF is lower in {ndof_1} compared to {ndof_2}")

        return np.nan, np.nan, n_dof

    gaussian_sigmas, p_value = get_chi2_sig_pval(test_stat_1 - test_stat_2, n_dof)

    return p_value, gaussian_sigmas, n_dof




[docs]
def check_model_preference_aic(list_stat, list_dof):
    """
    Akaike Information Criterion (AIC) preference over a list of stat and DoF
    (degree of freedom) to get relative likelihood of a given list of best-fit
    models.

    Parameters
    ----------
    list_wstat: list
        List of stat or -2 Log likelihood values for a list of models.
    list_dof: list
        List of degrees of freedom or list of free parameters, for a list of models.

    Returns
    -------
    list_rel_p: list
        List of relative likelihood probabilities, for a list of models.
    """
    list_aic_stat = []
    for stat, dof in zip(list_stat, list_dof, strict=True):
        aic_stat = stat + 2 * dof
        list_aic_stat.append(aic_stat)
    list_aic_stat = np.array(list_aic_stat)

    aic_stat_min = np.min(list_aic_stat)

    list_b_stat = []
    for aic in list_aic_stat:
        b_stat = np.exp((aic_stat_min - aic) / 2)
        list_b_stat.append(b_stat)
    list_b_stat = np.array(list_b_stat)

    list_rel_p = []
    for b_stat in list_b_stat:
        rel_p = b_stat / np.sum(list_b_stat)
        list_rel_p.append(rel_p)
    list_rel_p = np.array(list_rel_p)

    return list_rel_p




[docs]
def get_goodness_of_fit_stats(datasets, instrument_spectral_info):
    """
    Evaluating the Goodness of Fit statistics of the fitting of the model to
    the dataset.

    We first use the get_ts_target function to get the total test statistic for
    the (observed) best fit of the model to the data, and the (expected)
    perfect fit of model and data (model = data), for the given target source
    region/pixel.

    We then evaluate the total number of Degrees of Freedom for the Fit as the
    difference between the number of relevant energy bins used in the evaluation
    and the number of free model parameters.

    The fit statistics difference is used as the test statistic value for
    get_chi2_sig_pval function along with the total number of degrees of freedom
    to get the final statistics for the goodness of fit.

    The fit statistics information is updated in the dict object provided and
    a logging message is passed.

    Parameter
    ---------
    datasets: `gammapy.datasets.Datasets`
        List of Datasets object, which can contain 3D and/or 1D datasets
    instrument_spectral_info: dict
        Dict of information for storing relevant fit stats

    Return
    ------
    instrument_spectral_info: dict
        Filled Dict of information with relevant fit statistics
    stat_message: str
        String for logging the fit statistics
    """
    stat_best_fit, stat_max_fit = get_ts_target(datasets)

    instrument_spectral_info["max_fit_stat"] = stat_max_fit
    instrument_spectral_info["best_fit_stat"] = stat_best_fit
    ndof = instrument_spectral_info["DoF"]
    stat_diff_gof = stat_best_fit - stat_max_fit

    fit_chi2_sig, fit_pval = get_chi2_sig_pval(stat_diff_gof, ndof)

    instrument_spectral_info["fit_chi2_sig"] = fit_chi2_sig
    instrument_spectral_info["fit_pval"] = fit_pval

    stat_message = "The Chi2/dof value of the goodness of Fit is "
    stat_message += f"{stat_diff_gof:.2f}/{ndof}\nand the p-value is {fit_pval:.3e} "
    stat_message += f"and in Significance {fit_chi2_sig:.2f} sigmas"
    stat_message += f"\nwith best fit TS (Observed) as {stat_best_fit:.3f} "
    stat_message += f"and max fit TS (Expected) as {stat_max_fit:.3f}"

    return instrument_spectral_info, stat_message




[docs]
def get_ts_target(datasets):
    """
    From a given list of DL4 datasets, with assumed associated models, estimate
    the total test statistic values, in the given target source region/pixel,
    for the (observed) best fit of the model to the data, and the (expected)
    perfect fit of model and data (model = data).

    For consistency in the evaluation of the statistic values, we will use the
    basic Fit Statistic functions in Gammapy for Poisson Data:

    * `cash <https://docs.gammapy.org/1.3/api/gammapy.stats.cash.html>`_

    * `wstat <https://docs.gammapy.org/1.3/api/gammapy.stats.wstat.html>`_

    For the different type of Statistics used in Gammapy for 3D/1D datasets,
    and for our use case of getting the best fit and perfect fit, we will pass
    the appropriate values, by adapting to the following methods,

    * Best Fit (Observed):

        * `Cash stat_array <https://docs.gammapy.org/1.3/api/gammapy.datasets.MapDataset.html#gammapy.datasets.MapDataset.stat_array # noqa>`_

        * `Wstat stat_array <https://docs.gammapy.org/1.3/api/gammapy.datasets.MapDatasetOnOff.html#gammapy.datasets.MapDatasetOnOff.stat_array # noqa>`_

    * Perfect Fit (Expected):

        * `Cash stat_max <https://docs.gammapy.org/1.3/api/gammapy.stats.CashCountsStatistic.html#gammapy.stats.CashCountsStatistic.stat_max # noqa>`_

        * `Wstat stat_max <https://docs.gammapy.org/1.3/api/gammapy.stats.WStatCountsStatistic.html#gammapy.stats.WStatCountsStatistic.stat_max # noqa>`_

    Parameter
    ---------
    datasets: `gammapy.datasets.Datasets`
        List of Datasets object, which can contain 3D and/or 1D datasets

    Return
    ------
    stat_best_fit: float
        Total sum of test statistic of the best fit of model to data, summed
        over all energy bins.
    stat_max_fit: float
        Test statistic difference of the perfect fit of model to data summed
        over all energy bins.
    """  # noqa
    stat_best_fit = 0
    stat_max_fit = 0

    for data in datasets:
        match data.stat_type:
            case "cash":
                # Assuming that the Counts Map is created with the target source as its center
                region = data.counts.geom.center_skydir

                counts_on = (data.counts.copy() * data.mask).get_spectrum(region).data
                mu_on = (data.npred() * data.mask).get_spectrum(region).data

                stat_best_fit += np.nansum(cash(n_on=counts_on, mu_on=mu_on).ravel())
                stat_max_fit += np.nansum(cash(n_on=counts_on, mu_on=counts_on).ravel())

            case "wstat":
                # Assuming that the Counts Map is created with the target source as its center
                region = data.counts.geom.center_skydir

                counts_on = (data.counts.copy() * data.mask).get_spectrum(region).data
                counts_off = np.nan_to_num((data.counts_off * data.mask).get_spectrum(region)).data

                # alpha is evaluated by acceptance ratios, and
                # Background is evaluated with given alpha and counts_off,
                # but for alpha to be of the same shape (in the target region),
                # it will be reevaluated
                bkg = np.nan_to_num((data.background * data.mask).get_spectrum(region))

                with np.errstate(invalid="ignore", divide="ignore"):
                    alpha = bkg / counts_off
                mu_signal = np.nan_to_num((data.npred_signal() * data.mask).get_spectrum(region)).data
                max_pred = counts_on - bkg

                stat_best_fit += np.nansum(wstat(n_on=counts_on, n_off=counts_off, alpha=alpha, mu_sig=mu_signal))
                stat_max_fit += np.nansum(wstat(n_on=counts_on, n_off=counts_off, alpha=alpha, mu_sig=max_pred))

            case "chi2":
                # For FluxxPointsDataset
                stat_best_fit += np.nansum(data.stat_array())
                stat_max_fit += len(data.data.dnde.data)

    return stat_best_fit, stat_max_fit




[docs]
def fetch_pivot_energy(analysis):
    """
    Using an 'AsgardpyAnalysis' object to get the pivot energy for a given dataset
    and fit model, using the pivot_energy function.

    Returns
    -------
    pivot energy : `~astropy.units.Quantity`
        The energy at which the statistical error in the computed flux is smallest.
        If no minimum is found, NaN will be returned.
    """
    # Check if DL4 datasets are created, and if not, only run steps till Fit
    if len(analysis.datasets) == 0:
        steps = [step for step in analysis.config.general.steps if step != "flux-points"]

        analysis.run(steps)
    else:
        analysis.run(["fit"])

    analysis.get_correct_intrinsic_model()
    pivot = analysis.model_deabs.spectral_model.pivot_energy

    return pivot