Source code for ppi_py.ppi_power_analysis

import numpy as np
import warnings
from .utils import reshape_to_2d, construct_weight_vector
from .ppi import _ols_get_stats, _logistic_get_stats, _poisson_get_stats, _wls
from sklearn.linear_model import LogisticRegression, PoissonRegressor


"""
    PPI POWER ANALYSIS

"""



[docs]
def ppi_power(
    ppi_corr,
    cost_X,
    cost_Y,
    cost_Yhat,
    budget=None,
    effective_n=None,
    n_max=None,
):
    """
    Finds the optimal PPI sample sizes (n, N) for a known PPI correlation.

    Depending on which constraint is supplied, the result is either the most
    powerful pair affordable within `budget`, or the cheapest pair reaching
    `effective_n`.

    Args:
        ppi_corr (float): PPI correlation as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__. Must lie strictly between -1 and 1.
        cost_X (float): Cost per unlabeled data point.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.
        budget (float, optional): Total budget. When given, the most powerful pair within the budget is returned.
        effective_n (int, optional): Target effective sample size. Used only when `budget` is None, to find the cheapest pair.
        n_max (int, optional): Maximum number of samples allowed. If provided, the returned pair is chosen subject to n + N <= n_max.

    Returns:
        dict: Dictionary containing the following items:
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples.
            - cost (float): Total cost.
            - effective_n (int): Effective number of samples as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__.
            - ppi_corr (float): PPI correlation as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__.

    Raises:
        ValueError: If neither `budget` nor `effective_n` is provided, or if `ppi_corr` is not strictly between -1 and 1.

    Notes:
        At least one of `budget` and `effective_n` must be provided. If both are provided, `budget` takes precedence and the most powerful pair is returned.

        `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__ Broska, D., Howes, M., & van Loon, A. (2024, August 22). The Mixed Subjects Design: Treating Large Language Models as  (Potentially) Informative Observations. https://doi.org/10.31235/osf.io/j3bnt

    """
    if budget is None and effective_n is None:
        raise ValueError(
            "At least one of `budget` and `effective_n` must be provided."
        )

    if ppi_corr <= -1 or ppi_corr >= 1:
        raise ValueError("`ppi_corr` must be strictly between -1 and 1.")

    gamma, ppi_cost, classical_cost = _get_costs(
        ppi_corr, cost_X, cost_Y, cost_Yhat
    )

    # Both solvers take the same leading positional arguments.
    shared_args = (
        ppi_corr,
        gamma,
        ppi_cost,
        classical_cost,
        cost_X,
        cost_Y,
        cost_Yhat,
    )

    # `budget` wins when both constraints are supplied.
    if budget is None:
        return _get_cheap_pair(
            *shared_args, effective_n=effective_n, n_max=n_max
        )
    return _get_powerful_pair(*shared_args, budget=budget, n_max=n_max)



def _get_costs(
    ppi_corr,
    cost_X,
    cost_Y,
    cost_Yhat,
):
    """
    Computes the per-classical-sample cost of the most efficient PPI estimator and of the classical estimator.

    Args:
        ppi_corr (float): PPI correlation.
        cost_X (float): Cost per unlabeled data point.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.

    Returns:
        gamma (float): Ratio of the cost of a prediction plus unlabeled data to the cost of a gold-standard label.
        ppi_cost (float): Cost of the most efficient PPI estimator per classical sample.
        classical_cost (float): Cost of the classical estimator per classical sample.
    """
    gamma = (cost_Yhat + cost_X) / cost_Y

    rho_sq = ppi_corr**2
    unexplained = 1 - rho_sq
    explained_scaled = gamma * rho_sq
    cross_term = 2 * (explained_scaled * unexplained) ** 0.5

    ppi_cost = cost_Y * (unexplained + explained_scaled + cross_term)
    classical_cost = cost_Y + cost_X
    return gamma, ppi_cost, classical_cost


def _get_powerful_pair(
    ppi_corr,
    gamma,
    ppi_cost,
    classical_cost,
    cost_X,
    cost_Y,
    cost_Yhat,
    budget,
    n_max=None,
):
    """
    Finds the most powerful pair of sample sizes affordable within a budget.

    The unconstrained PPI optimum is tried first. It is replaced by a purely
    classical design (N = 0) when the classical estimator is cheaper per
    classical sample or when the optimum has a negative N. If the chosen pair
    exceeds `n_max`, the pair is recomputed with n + N fixed to `n_max`.

    Args:
        ppi_corr (float): PPI correlation.
        gamma (float): Ratio of the cost of a prediction plus unlabeled data to the cost of a gold-standard label.
        ppi_cost (float): Cost of the most efficient PPI estimator per classical sample.
        classical_cost (float): Cost of the classical estimator per classical sample.
        cost_X (float): Cost per unlabeled data point.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.
        budget (float): Total budget.
        n_max (int, optional): Maximum number of samples allowed. If provided, the returned pair is chosen subject to n + N <= n_max.

    Returns:
        dict: Dictionary containing the following items
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples.
            - cost (float): Total cost.
            - effective_n (int): Effective sample size.
            - ppi_corr (float): PPI correlation.
    """
    # Classical sample size whose PPI-equivalent design spends the budget.
    classical_equiv = budget / ppi_cost
    best = _optimal_pair(
        classical_equiv, ppi_corr, gamma, cost_X, cost_Y, cost_Yhat
    )

    if classical_cost < ppi_cost or best["N"] < 0:
        # PPI does not pay off: spend the whole budget on labeled samples.
        n_labeled = round(budget / classical_cost)
        best = {
            "n": n_labeled,
            "N": 0,
            "cost": n_labeled * classical_cost,
            "effective_n": n_labeled,
            "ppi_corr": ppi_corr,
        }

    if n_max is None or best["n"] + best["N"] <= n_max:
        return best

    # The sample cap binds. If labeling every allowed sample is affordable,
    # do that.
    all_labeled_cost = n_max * (cost_Y + cost_X)
    if all_labeled_cost <= budget:
        return {
            "n": n_max,
            "N": 0,
            "cost": all_labeled_cost,
            "effective_n": n_max,
            "ppi_corr": ppi_corr,
        }

    # Otherwise fix n + N = n_max and solve the budget equation for n.
    n_gold = round(budget / cost_Y - n_max * gamma)
    n_unlabeled = n_max - n_gold
    eff = round(
        n_gold
        * (n_gold + n_unlabeled)
        / (n_gold + (1 - ppi_corr**2) * n_unlabeled)
    )
    total_cost = n_gold * (cost_Y + cost_Yhat + cost_X) + n_unlabeled * (
        cost_Yhat + cost_X
    )
    return {
        "n": n_gold,
        "N": n_unlabeled,
        "cost": total_cost,
        "effective_n": eff,
        "ppi_corr": ppi_corr,
    }


def _get_cheap_pair(
    ppi_corr,
    gamma,
    ppi_cost,
    classical_cost,
    cost_X,
    cost_Y,
    cost_Yhat,
    effective_n,
    n_max=None,
):
    """
    Finds the cheapest pair of sample sizes that reaches a target effective sample size.

    The unconstrained PPI optimum is tried first. It is replaced by a purely
    classical design (N = 0) when the classical estimator is cheaper per
    classical sample or when the optimum has a negative N. If the chosen pair
    exceeds `n_max`, the pair is recomputed with n + N fixed to `n_max`.

    Args:
        ppi_corr (float): PPI correlation.
        gamma (float): Ratio of the cost of a prediction plus unlabeled data to the cost of a gold-standard label.
        ppi_cost (float): Cost of the most efficient PPI estimator per classical sample.
        classical_cost (float): Cost of the classical estimator per classical sample.
        cost_X (float): Cost per unlabeled data point.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.
        effective_n (int): Target effective sample size.
        n_max (int, optional): Maximum number of samples allowed. If provided, the returned pair is chosen subject to n + N <= n_max.

    Returns:
        dict: Dictionary containing the following items
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples.
            - cost (float): Total cost.
            - effective_n (int): Effective sample size.
            - ppi_corr (float): PPI correlation.

    Notes:
        If effective_n > n_max and the cap binds, the target cannot be reached within n_max samples. A `UserWarning` is issued and the fully labeled design n = n_max, N = 0 is returned.
    """
    target = effective_n
    pair = _optimal_pair(target, ppi_corr, gamma, cost_X, cost_Y, cost_Yhat)

    if classical_cost < ppi_cost or pair["N"] < 0:
        # PPI does not pay off: use labeled samples only.
        n_labeled = round(target)
        pair = {
            "n": n_labeled,
            "N": 0,
            "cost": n_labeled * classical_cost,
            "effective_n": n_labeled,
            "ppi_corr": ppi_corr,
        }

    if n_max is None or pair["n"] + pair["N"] <= n_max:
        return pair

    if effective_n > n_max:
        # Target unreachable under the cap; fall back to labeling everything.
        warnings.warn(
            "The desired effective sample size is too large for the given number of unlabeled samples. \nReturning n = n_max and N = 0. To achieve the desired effective sample size, increase n_max or decrease effective_n.",
            UserWarning,
        )
        return {
            "n": n_max,
            "N": 0,
            "cost": n_max * classical_cost,
            "effective_n": n_max,
            "ppi_corr": ppi_corr,
        }

    # Fix n + N = n_max and solve the effective-sample-size equation for n.
    n_gold = round(
        target * n_max * (1 - ppi_corr**2) / (n_max - ppi_corr**2 * target)
    )
    n_unlabeled = n_max - n_gold
    achieved = round(
        n_gold
        * (n_gold + n_unlabeled)
        / (n_gold + (1 - ppi_corr**2) * n_unlabeled)
    )
    total_cost = n_gold * (cost_Y + cost_Yhat + cost_X) + n_unlabeled * (
        cost_Yhat + cost_X
    )
    return {
        "n": n_gold,
        "N": n_unlabeled,
        "cost": total_cost,
        "effective_n": achieved,
        "ppi_corr": ppi_corr,
    }


def _optimal_pair(n0, ppi_corr, gamma, cost_X, cost_Y, cost_Yhat):
    """
    Compute the optimal pair of PPI samples achieving the same standard error as a classical estimator with n0 samples.

    Args:
        n0 (float): Number of samples for the classical estimator.
        ppi_corr (float): PPI correlation.
        gamma (float): Ratio of the cost of a prediction plus unlabeled data to the cost of a gold-standard label.
        cost_X (float): Cost per unlabeled data point.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.

    Returns:
        dict: Dictionary containing the following items
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples. May be negative; callers treat that as "PPI is not worthwhile".
            - cost (float): Total cost.
            - effective_n (int): Effective sample size of the rounded pair. Reported as 0 when the rounded pair is empty (n = N = 0).
            - ppi_corr (float): PPI correlation.
    """
    ppi_corr_sq = ppi_corr**2
    n = n0 * (
        1 - ppi_corr_sq + np.sqrt(gamma * ppi_corr_sq * (1 - ppi_corr_sq))
    )
    if ppi_corr != 0:
        # NOTE: this denominator equals n0 * sqrt(gamma * rho^2 * (1 - rho^2)),
        # so it is zero when gamma == 0 (free predictions and unlabeled
        # data); no finite N exists in that case.
        N = n * (n0 - n) / (n - (1 - ppi_corr_sq) * n0)
    else:
        # With zero correlation the predictions carry no information.
        N = 0

    n = round(n)
    N = round(N)

    cost = n * cost_Y + (n + N) * (cost_Yhat + cost_X)

    # For very small n0 both sizes can round to 0, making the ratio 0/0.
    # An empty design has no effective samples, so report 0 rather than
    # raising ZeroDivisionError.
    ess_denominator = n + (1 - ppi_corr_sq) * N
    if ess_denominator != 0:
        effective_n = round(n * (n + N) / ess_denominator)
    else:
        effective_n = 0

    return {
        "n": n,
        "N": N,
        "cost": cost,
        "effective_n": effective_n,
        "ppi_corr": ppi_corr,
    }


"""
    MEAN POWER CALCULATION

"""



[docs]
def ppi_mean_power(
    Y,
    Yhat,
    cost_Y,
    cost_Yhat,
    budget=None,
    effective_n=None,
    n_max=None,
    w=None,
):
    """
    Finds the optimal PPI sample sizes (n, N) for estimating a mean.

    The PPI correlation is estimated from the labeled pairs (Y, Yhat) and
    passed to `ppi_power` with the cost of unlabeled data set to zero.

    Args:
        Y (ndarray): Gold-standard labels.
        Yhat (ndarray): Predictions corresponding to the gold-standard labels.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.
        budget (float, optional): Total budget. Used to compute the most powerful pair given the budget.
        effective_n (int, optional): Effective sample size. Used to compute the cheapest pair.
        n_max (int, optional): Maximum number of samples allowed. If provided, the optimal pair will satisfy the additional constraint that n + N <= n_max.
        w (ndarray, optional): Sample weights for the labeled data set. Defaults to all ones vector.

    Returns:
        dict: Dictionary containing the following items
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples.
            - cost (float): Total cost.
            - effective_n (int): Effective sample size as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__.
            - ppi_corr (float): PPI correlation as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__.

    Raises:
        ValueError: If neither `budget` nor `effective_n` is provided, or if `Y` or `Yhat` has more than one column.

    Notes:
        At least one of `budget` and `effective_n` must be provided. If both are provided, `budget` will be used and the most powerful pair will be returned.

        `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__ Broska, D., Howes, M., & van Loon, A. (2024, August 22). The Mixed Subjects Design: Treating Large Language Models as  (Potentially) Informative Observations. https://doi.org/10.31235/osf.io/j3bnt
    """

    def _has_several_columns(arr):
        # True for arrays with a second axis longer than one.
        dims = arr.shape
        return len(dims) > 1 and dims[1] > 1

    if budget is None and effective_n is None:
        raise ValueError(
            "At least one of `budget` and `effective_n` must be provided."
        )
    if _has_several_columns(Y):
        raise ValueError("Y must be a 1D array.")
    if _has_several_columns(Yhat):
        raise ValueError("Yhat must be a 1D array.")

    labels = reshape_to_2d(Y)
    preds = reshape_to_2d(Yhat)
    weights = construct_weight_vector(labels.shape[0], w, vectorized=True)

    # Weighted mean of the gold-standard labels.
    weighted_mean = np.sum(weights * labels) / np.sum(weights)

    # Weighted residuals of the labels and of the predictions around that
    # mean; the inverse Hessian passed alongside them is the 1x1 identity.
    label_grads = weights * (labels - weighted_mean)
    pred_grads = weights * (preds - weighted_mean)

    corr = _get_ppi_corr(label_grads, pred_grads, np.eye(1))

    return ppi_power(
        corr,
        cost_X=0,
        cost_Y=cost_Y,
        cost_Yhat=cost_Yhat,
        budget=budget,
        effective_n=effective_n,
        n_max=n_max,
    )



def _get_ppi_corr(grads, grads_hat, inv_hessian, coord=None):
    """
    Calculates the PPI correlation for one coordinate of the estimand.

    Args:
        grads (ndarray): Gradient of the loss function with respect to the parameter evaluated at the labeled data.
        grads_hat (ndarray): Gradient of the loss function with respect to the model parameter evaluated using predictions on the labeled data.
        inv_hessian (ndarray): Inverse of the Hessian of the loss function with respect to the parameter.
        coord (int, optional): Zero-based coordinate of the estimand, in {0, ..., d-1} where d is the shape of the estimand. Defaults to the first coordinate.

    Returns:
        float: PPI correlation for the selected coordinate, clipped to [-(1 - 1/n), 1 - 1/n]. It is 0 when the correlation is undefined because one of the two variances is zero.

    Raises:
        ValueError: If the number of gradient columns does not match the size of the inverse Hessian.
    """
    grads = reshape_to_2d(grads)
    grads_hat = reshape_to_2d(grads_hat)

    n = grads.shape[0]
    d = inv_hessian.shape[0]

    if grads.shape[1] != d:
        raise ValueError(
            "Dimension mismatch between the gradient and the inverse Hessian."
        )
    grads_cent = grads - grads.mean(axis=0)
    grads_hat_cent = grads_hat - grads_hat.mean(axis=0)
    cov_grads = (1 / n) * grads_cent.T @ grads_hat_cent

    var_grads_hat = grads_hat_cent.T @ grads_hat_cent / n
    var_grads = grads_cent.T @ grads_cent / n

    sigma_sq = np.diag(inv_hessian @ var_grads @ inv_hessian)

    num = np.diag(inv_hessian @ cov_grads @ inv_hessian)
    denom = np.sqrt(
        sigma_sq * np.diag(inv_hessian @ var_grads_hat @ inv_hessian)
    )

    # A zero denominator (e.g. constant predictions) makes the correlation
    # undefined. Treat it as 0 instead of letting a NaN propagate into the
    # sample-size formulas, where it would fail inside round().
    ppi_corr = np.divide(
        num,
        denom,
        out=np.zeros_like(denom, dtype=float),
        where=denom > 0,
    )

    # Keep the estimate strictly inside (-1, 1) on both sides: ppi_power
    # rejects +/-1, and a perfect negative correlation is as plausible in a
    # finite sample as a perfect positive one.
    bound = 1 - 1 / n
    ppi_corr = np.clip(ppi_corr, -bound, bound)

    if coord is not None:
        return float(ppi_corr[coord])
    else:
        return float(ppi_corr[0])


"""
    ORDINARY LEAST SQUARES POWER CALCULATION

"""



[docs]
def ppi_ols_power(
    X,
    Y,
    Yhat,
    cost_X,
    cost_Y,
    cost_Yhat,
    coord,
    budget=None,
    effective_n=None,
    n_max=None,
    w=None,
):
    """
    Finds the optimal PPI sample sizes (n, N) for estimating one OLS coefficient.

    The PPI correlation of the selected coefficient is estimated from the
    labeled data and then passed to `ppi_power`.

    Args:
        X (ndarray): Covariates corresponding to the gold-standard labels.
        Y (ndarray): Gold-standard labels.
        Yhat (ndarray): Predictions corresponding to the gold-standard labels.
        cost_X (float): Cost per unlabeled data point.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.
        coord (int): Coordinate to perform power analysis on. Must be in {0, ..., d-1} where d is the shape of the estimand.
        budget (float, optional): Total budget. Used to compute the most powerful pair given the budget.
        effective_n (int, optional): Effective sample size. Used to compute the cheapest pair.
        n_max (int, optional): Maximum number of samples allowed. If provided, the optimal pair will satisfy the additional constraint that n + N <= n_max.
        w (ndarray, optional): Sample weights for the labeled data set.

    Returns:
        dict: Dictionary containing the following items
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples.
            - cost (float): Total cost.
            - effective_n (int): Effective sample size as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__
            - ppi_corr (float): PPI correlation as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__

    Raises:
        ValueError: If neither `budget` nor `effective_n` is provided.

    Notes:
        At least one of `budget` and `effective_n` must be provided. If both are provided, `budget` will be used.

        `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__ Broska, D., Howes, M., & van Loon, A. (2024, August 22). The Mixed Subjects Design: Treating Large Language Models as  (Potentially) Informative Observations. https://doi.org/10.31235/osf.io/j3bnt
    """
    if budget is None and effective_n is None:
        raise ValueError(
            "At least one of `budget` and `effective_n` must be provided."
        )

    # Weighted least-squares fit on the labeled data.
    theta = _wls(X, Y, w=w)

    # The labeled covariates and predictions are also passed in the
    # unlabeled slots, with use_unlabeled=False.
    X_labeled = X.astype(float)
    X_unlabeled_slot = X.astype(float)
    stats = _ols_get_stats(
        theta,
        X_labeled,
        Y,
        Yhat,
        X_unlabeled_slot,
        Yhat,
        w=w,
        use_unlabeled=False,
    )
    grads, grads_hat, _, inv_hessian = stats

    corr = _get_ppi_corr(grads, grads_hat, inv_hessian, coord=coord)

    return ppi_power(
        corr,
        cost_X,
        cost_Y,
        cost_Yhat,
        budget=budget,
        effective_n=effective_n,
        n_max=n_max,
    )



"""
    LOGISTIC REGRESSION POWER CALCULATION
"""



[docs]
def ppi_logistic_power(
    X,
    Y,
    Yhat,
    cost_X,
    cost_Y,
    cost_Yhat,
    coord,
    budget=None,
    effective_n=None,
    n_max=None,
    w=None,
):
    """
    Computes the optimal pair of sample sizes for estimating logistic regression coefficients with PPI.

    Args:
        X (ndarray): Covariates corresponding to the gold-standard labels.
        Y (ndarray): Gold-standard labels.
        Yhat (ndarray): Predictions corresponding to the gold-standard labels.
        cost_X (float): Cost per unlabeled data point.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.
        coord (int): Coordinate to perform power analysis on. Must be in {0, ..., d-1} where d is the shape of the estimand.
        budget (float, optional): Total budget. Used to compute the most powerful pair given the budget.
        effective_n (int, optional): Effective sample size. Used to compute the cheapest pair.
        n_max (int, optional): Maximum number of samples allowed. If provided, the optimal pair will satisfy the additional constraint that n + N <= n_max.
        w (ndarray, optional): Sample weights for the labeled data set. Used both when fitting the point estimate and when computing the gradient statistics.

    Returns:
        dict: Dictionary containing the following items
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples.
            - cost (float): Total cost.
            - effective_n (int): Effective sample size as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__
            - ppi_corr (float): PPI correlation as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__

    Raises:
        ValueError: If neither `budget` nor `effective_n` is provided.

    Notes:
        At least one of `budget` and `effective_n` must be provided. If both are provided, `budget` will be used.

        `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__ Broska, D., Howes, M., & van Loon, A. (2024, August 22). The Mixed Subjects Design: Treating Large Language Models as  (Potentially) Informative Observations. https://doi.org/10.31235/osf.io/j3bnt
    """
    if budget is None and effective_n is None:
        raise ValueError(
            "At least one of `budget` and `effective_n` must be provided."
        )

    # Fit with the same weights that the gradient statistics use below, so
    # the point estimate and the gradients refer to the same weighted
    # problem. scikit-learn expects a 1D weight vector, hence the ravel.
    sample_weight = None if w is None else np.ravel(w)

    pointest = (
        LogisticRegression(
            penalty=None,
            solver="lbfgs",
            max_iter=10000,
            tol=1e-15,
            fit_intercept=False,
        )
        .fit(X, Y, sample_weight=sample_weight)
        .coef_.squeeze()
    )

    # The labeled covariates and predictions are also passed in the
    # unlabeled slots, with use_unlabeled=False.
    grads, grads_hat, _, inv_hessian = _logistic_get_stats(
        pointest,
        X.astype(float),
        Y,
        Yhat,
        X.astype(float),
        Yhat,
        w=w,
        use_unlabeled=False,
    )

    ppi_corr = _get_ppi_corr(grads, grads_hat, inv_hessian, coord=coord)

    return ppi_power(
        ppi_corr, cost_X, cost_Y, cost_Yhat, budget, effective_n, n_max
    )



"""
    POISSON REGRESSION POWER CALCULATION
"""



[docs]
def ppi_poisson_power(
    X,
    Y,
    Yhat,
    cost_X,
    cost_Y,
    cost_Yhat,
    coord,
    budget=None,
    effective_n=None,
    n_max=None,
    w=None,
):
    """
    Computes the optimal pair of sample sizes for estimating Poisson regression coefficients with PPI.

    Args:
        X (ndarray): Covariates corresponding to the gold-standard labels.
        Y (ndarray): Gold-standard labels.
        Yhat (ndarray): Predictions corresponding to the gold-standard labels.
        cost_X (float): Cost per unlabeled data point.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.
        coord (int): Coordinate to perform power analysis on. Must be in {0, ..., d-1} where d is the shape of the estimand.
        budget (float, optional): Total budget. Used to compute the most powerful pair given the budget.
        effective_n (int, optional): Effective sample size. Used to compute the cheapest pair.
        n_max (int, optional): Maximum number of samples allowed. If provided, the optimal pair will satisfy the additional constraint that n + N <= n_max.
        w (ndarray, optional): Sample weights for the labeled data set. Used both when fitting the point estimate and when computing the gradient statistics.

    Returns:
        dict: Dictionary containing the following items
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples.
            - cost (float): Total cost.
            - effective_n (int): Effective sample size as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__.
            - ppi_corr (float): PPI correlation as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__.

    Raises:
        ValueError: If neither `budget` nor `effective_n` is provided.

    Notes:
        At least one of `budget` and `effective_n` must be provided. If both are provided, `budget` will be used.

        `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__ Broska, D., Howes, M., & van Loon, A. (2024, August 22). The Mixed Subjects Design: Treating Large Language Models as  (Potentially) Informative Observations. https://doi.org/10.31235/osf.io/j3bnt
    """
    if budget is None and effective_n is None:
        raise ValueError(
            "At least one of `budget` and `effective_n` must be provided."
        )

    # Fit with the same weights that the gradient statistics use below, so
    # the point estimate and the gradients refer to the same weighted
    # problem. scikit-learn expects a 1D weight vector, hence the ravel.
    sample_weight = None if w is None else np.ravel(w)

    pointest = (
        PoissonRegressor(
            alpha=0,
            fit_intercept=False,
            max_iter=10000,
            tol=1e-15,
        )
        .fit(X, Y, sample_weight=sample_weight)
        .coef_
    )

    # The labeled covariates and predictions are also passed in the
    # unlabeled slots, with use_unlabeled=False.
    grads, grads_hat, _, inv_hessian = _poisson_get_stats(
        pointest,
        X.astype(float),
        Y,
        Yhat,
        X.astype(float),
        Yhat,
        w=w,
        use_unlabeled=False,
    )

    ppi_corr = _get_ppi_corr(grads, grads_hat, inv_hessian, coord=coord)

    return ppi_power(
        ppi_corr, cost_X, cost_Y, cost_Yhat, budget, effective_n, n_max
    )
Source code for ppi_py.ppi_power_analysis

Navigation

Related Topics