Source code for ppi_py.ppi_power_analysis

import numpy as np
import warnings
from .utils import reshape_to_2d, construct_weight_vector
from .ppi import _ols_get_stats, _logistic_get_stats, _poisson_get_stats, _wls
from sklearn.linear_model import LogisticRegression, PoissonRegressor


"""
    PPI POWER ANALYSIS

"""


def ppi_power(
    ppi_corr,
    cost_X,
    cost_Y,
    cost_Yhat,
    budget=None,
    effective_n=None,
    n_max=None,
):
    """
    Computes the optimal pair of sample sizes for PPI when the PPI correlation is known.

    Args:
        ppi_corr (float): PPI correlation as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__. Must lie strictly between -1 and 1.
        cost_X (float): Cost per unlabeled data point.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.
        budget (float, optional): Total budget. Used to compute the most powerful pair given the budget.
        effective_n (int, optional): Effective sample size. Used to compute the cheapest pair.
        n_max (int, optional): Maximum number of samples allowed. If provided, the optimal pair will satisfy n + N <= n_max.

    Returns:
        dict: Dictionary containing the following items:
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples.
            - cost (float): Total cost.
            - effective_n (int): Effective number of samples as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__.
            - ppi_corr (float): PPI correlation as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__.

    Raises:
        ValueError: If neither `budget` nor `effective_n` is provided, or if
            `ppi_corr` is not strictly between -1 and 1 (this includes NaN).

    Notes:
        At least one of `budget` and `effective_n` must be provided. If both are provided, `budget` will be used and the most powerful pair will be returned.

        `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__ Broska, D., Howes, M., & van Loon, A. (2024, August 22). The Mixed Subjects Design: Treating Large Language Models as (Potentially) Informative Observations. https://doi.org/10.31235/osf.io/j3bnt
    """
    if budget is None and effective_n is None:
        raise ValueError(
            "At least one of `budget` and `effective_n` must be provided."
        )
    # Written as a negated chained comparison so that NaN (for which every
    # comparison is False) is rejected here instead of failing later when a
    # sample size is rounded to an integer.
    if not -1 < ppi_corr < 1:
        raise ValueError("`ppi_corr` must be strictly between -1 and 1.")

    gamma, ppi_cost, classical_cost = _get_costs(
        ppi_corr,
        cost_X,
        cost_Y,
        cost_Yhat,
    )

    # `budget` takes precedence when both targets are supplied.
    if budget is not None:
        return _get_powerful_pair(
            ppi_corr,
            gamma,
            ppi_cost,
            classical_cost,
            cost_X,
            cost_Y,
            cost_Yhat,
            budget=budget,
            n_max=n_max,
        )
    else:
        return _get_cheap_pair(
            ppi_corr,
            gamma,
            ppi_cost,
            classical_cost,
            cost_X,
            cost_Y,
            cost_Yhat,
            effective_n=effective_n,
            n_max=n_max,
        )
def _get_costs( ppi_corr, cost_X, cost_Y, cost_Yhat, ): """ Computes the cost of the most efficient PPI and classical estimators per classical sample. Args: ppi_corr (ndarray): PPI correlation. cost_X (float): Cost per unlabeled data point. cost_Y (float): Cost per gold-standard label. cost_Yhat (float): Cost per prediction. Returns: gamma (float): Ratio of the cost of a prediction plus unlabled data to the cost of a gold-standard label. ppi_cost (float): Cost of the most efficient PPI estimator per classical sample. classical_cost (float): Cost of the classical estimator per classical sample. """ gamma = (cost_Yhat + cost_X) / cost_Y ppi_corr_sq = ppi_corr**2 ppi_cost = cost_Y * ( 1 - ppi_corr_sq + gamma * ppi_corr_sq + 2 * (gamma * ppi_corr_sq * (1 - ppi_corr_sq)) ** 0.5 ) classical_cost = cost_Y + cost_X return gamma, ppi_cost, classical_cost def _get_powerful_pair( ppi_corr, gamma, ppi_cost, classical_cost, cost_X, cost_Y, cost_Yhat, budget, n_max=None, ): """ Computes the most powerful pair of sample sizes given a budget. Args: ppi_corr (ndarray): PPI correlation. gamma (float): Ratio of the cost of a prediction plus unlabled data to the cost of a gold-standard label. ppi_cost (float): Cost of the most efficient PPI estimator per classical sample. classical_cost (float): Cost of the classical estimator per classical sample. cost_X (float): Cost per unlabeled data point. cost_Y (float): Cost per gold-standard label. cost_Yhat (float): Cost per prediction. budget (float): Total budget. n_max (int, optional): Maximum number of samples allowed. If provided, the optimal pair will satisfy n + N <= n_max. Returns: dict: Dictionary containing the following items - n (int): Optimal number of gold-labeled samples. - N (int): Optimal number of unlabeled samples. - cost (float): Total cost. - effective_n (int): Effective sample size. - ppi_corr (float): PPI correlation. 
""" n0 = budget / ppi_cost result = _optimal_pair(n0, ppi_corr, gamma, cost_X, cost_Y, cost_Yhat) if classical_cost < ppi_cost or result["N"] < 0: n = round(budget / classical_cost) result = { "n": n, "N": 0, "cost": n * classical_cost, "effective_n": n, "ppi_corr": ppi_corr, } if n_max is None: return result if result["n"] + result["N"] <= n_max: return result if n_max * (cost_Y + cost_X) <= budget: return { "n": n_max, "N": 0, "cost": n_max * (cost_Y + cost_X), "effective_n": n_max, "ppi_corr": ppi_corr, } n = round(budget / cost_Y - n_max * gamma) N = n_max - n effective_n = round(n * (n + N) / (n + (1 - ppi_corr**2) * N)) return { "n": n, "N": N, "cost": n * (cost_Y + cost_Yhat + cost_X) + N * (cost_Yhat + cost_X), "effective_n": effective_n, "ppi_corr": ppi_corr, } def _get_cheap_pair( ppi_corr, gamma, ppi_cost, classical_cost, cost_X, cost_Y, cost_Yhat, effective_n, n_max=None, ): """ Computes the most powerful pair of sample sizes given a budget. Args: ppi_corr (ndarray): PPI correlation. gamma (float): Ratio of the cost of a prediction plus unlabled data to the cost of a gold-standard label. ppi_cost (float): Cost of the most efficient PPI estimator per classical sample. classical_cost (float): Cost of the classical estimator per classical sample. cost_X (float): Cost per unlabeled data point. cost_Y (float): Cost per gold-standard label. cost_Yhat (float): Cost per prediction. effective_n (int): Effective sample size. n_max (int, optional): Maximum number of samples allowed. If provided, the optimal pair will satisfy n + N <= n_max. Returns: dict: Dictionary containing the following items - n (int): Optimal number of gold-labeled samples. - N (int): Optimal number of unlabeled samples. - cost (float): Total cost. - effective_n (int): Effective sample size. - ppi_corr (float): PPI correlation. Notes: If effective_n > n_max, then there is no pair of sample sizes (n, N) with n + N <= n_max that has a standard error of se or smaller. 
In this case, the function will give a warning and will return n = n_max and N = 0. This is the most powerful pair of sample sizes that can be achieved with n_max unlabeled samples. """ n0 = effective_n result = _optimal_pair(n0, ppi_corr, gamma, cost_X, cost_Y, cost_Yhat) if classical_cost < ppi_cost or result["N"] < 0: n = round(n0) result = { "n": n, "N": 0, "cost": n * classical_cost, "effective_n": n, "ppi_corr": ppi_corr, } if n_max is None: return result if result["n"] + result["N"] <= n_max: return result if effective_n > n_max: warnings.warn( "The desired effective sample size is too large for the given number of unlabeled samples. \nReturning n = n_max and N = 0. To achieve the desired effective sample size, increase n_max or decrease effective_n.", UserWarning, ) n = n_max return { "n": n, "N": 0, "cost": n * classical_cost, "effective_n": n, "ppi_corr": ppi_corr, } else: n = round(n0 * n_max * (1 - ppi_corr**2) / (n_max - ppi_corr**2 * n0)) N = n_max - n effective_n = round(n * (n + N) / (n + (1 - ppi_corr**2) * N)) return { "n": n, "N": N, "cost": n * (cost_Y + cost_Yhat + cost_X) + N * (cost_Yhat + cost_X), "effective_n": effective_n, "ppi_corr": ppi_corr, } def _optimal_pair(n0, ppi_corr, gamma, cost_X, cost_Y, cost_Yhat): """ " Compute the optimal pair of PPI samples achieving the same standard error as a classical estimator with n0 samples. Args: n0 (float): Number of samples for the classical estimator. ppi_corr (float): PPI correlation. gamma (float): Ratio of the cost of a prediction plus unlabled data to the cost of a gold-standard label. cost_X (float): Cost per unlabeled data point. cost_Y (float): Cost per gold-standard label. cost_Yhat (float): Cost per prediction. Returns: dict: Dictionary containing the following items - n (int): Optimal number of gold-labeled samples. - N (int): Optimal number of unlabeled samples. - cost (float): Total cost. - effective_n (int): Effective sample size. - ppi_corr (float): PPI correlation. 
""" ppi_corr_sq = ppi_corr**2 n = n0 * ( 1 - ppi_corr_sq + np.sqrt(gamma * ppi_corr_sq * (1 - ppi_corr_sq)) ) if ppi_corr != 0: N = n * (n0 - n) / (n - (1 - ppi_corr_sq) * n0) else: N = 0 n = round(n) N = round(N) cost = n * cost_Y + (n + N) * (cost_Yhat + cost_X) effective_n = round(n * (n + N) / (n + (1 - ppi_corr_sq) * N)) return { "n": n, "N": N, "cost": cost, "effective_n": effective_n, "ppi_corr": ppi_corr, } """ MEAN POWER CALCULATION """
def ppi_mean_power(
    Y,
    Yhat,
    cost_Y,
    cost_Yhat,
    budget=None,
    effective_n=None,
    n_max=None,
    w=None,
):
    """
    Computes the optimal pair of sample sizes for estimating the mean with ppi.

    The PPI correlation is estimated from the labeled data and then handed to
    `ppi_power` with the cost of unlabeled data set to zero.

    Args:
        Y (ndarray): Gold-standard labels.
        Yhat (ndarray): Predictions corresponding to the gold-standard labels.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.
        budget (float, optional): Total budget. Used to compute the most powerful pair given the budget.
        effective_n (int, optional): Effective sample size. Used to compute the cheapest pair.
        n_max (int, optional): Maximum number of samples allowed. If provided, the optimal pair will satisfy the additional constraint that n + N <= n_max.
        w (ndarray, optional): Sample weights for the labeled data set. Defaults to all ones vector.

    Returns:
        dict: Dictionary containing the following items
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples.
            - cost (float): Total cost.
            - effective_n (int): Effective sample size as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__.
            - ppi_corr (float): PPI correlation as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__.

    Raises:
        ValueError: If neither `budget` nor `effective_n` is given, or if `Y`
            or `Yhat` has more than one column.

    Notes:
        At least one of `budget` and `effective_n` must be provided. If both are provided, `budget` will be used and the most powerful pair will be returned.

        `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__ Broska, D., Howes, M., & van Loon, A. (2024, August 22). The Mixed Subjects Design: Treating Large Language Models as (Potentially) Informative Observations. https://doi.org/10.31235/osf.io/j3bnt
    """
    if budget is None and effective_n is None:
        raise ValueError(
            "At least one of `budget` and `effective_n` must be provided."
        )

    # Column vectors are accepted; anything with more than one column is not.
    for label, arr in (("Y", Y), ("Yhat", Yhat)):
        if len(arr.shape) > 1 and arr.shape[1] > 1:
            raise ValueError(f"{label} must be a 1D array.")

    labels = reshape_to_2d(Y)
    preds = reshape_to_2d(Yhat)
    weights = construct_weight_vector(labels.shape[0], w, vectorized=True)

    # Weighted mean of the labels, and the per-sample deviations from it
    # evaluated with the labels and with the predictions.
    weighted_mean = np.sum(weights * labels) / np.sum(weights)
    label_grads = weights * (labels - weighted_mean)
    pred_grads = weights * (preds - weighted_mean)

    # The estimand is one-dimensional, so the inverse Hessian is the 1x1 identity.
    corr = _get_ppi_corr(label_grads, pred_grads, np.eye(1))

    return ppi_power(
        corr,
        cost_X=0,
        cost_Y=cost_Y,
        cost_Yhat=cost_Yhat,
        budget=budget,
        effective_n=effective_n,
        n_max=n_max,
    )
def _get_ppi_corr(grads, grads_hat, inv_hessian, coord=None):
    """
    Calculates the PPI correlation needed for power analysis.

    Args:
        grads (ndarray): Gradient of the loss function with respect to the parameter evaluated at the labeled data.
        grads_hat (ndarray): Gradient of the loss function with respect to the model parameter evaluated using predictions on the labeled data.
        inv_hessian (ndarray): Inverse of the Hessian of the loss function with respect to the parameter.
        coord (int, optional): Coordinate for regression coefficients. Used as a zero-based index, so it must be in {0, ..., d-1} where d is the shape of the estimand. Defaults to the first coordinate.

    Returns:
        float: PPI correlation for the requested coordinate, clipped to
        [-(1 - 1/n), 1 - 1/n] where n is the number of labeled samples.

    Raises:
        ValueError: If the number of gradient columns does not match the
            dimension of the inverse Hessian.
    """
    grads = reshape_to_2d(grads)
    grads_hat = reshape_to_2d(grads_hat)
    n = grads.shape[0]
    d = inv_hessian.shape[0]
    if grads.shape[1] != d:
        raise ValueError(
            "Dimension mismatch between the gradient and the inverse Hessian."
        )

    grads_cent = grads - grads.mean(axis=0)
    grads_hat_cent = grads_hat - grads_hat.mean(axis=0)

    # Empirical (co)variances of the centered gradients.
    cov_grads = (1 / n) * grads_cent.T @ grads_hat_cent
    var_grads_hat = grads_hat_cent.T @ grads_hat_cent / n
    var_grads = grads_cent.T @ grads_cent / n

    # Sandwich each matrix with the inverse Hessian and keep the per-coordinate
    # diagonal entries.
    sigma_sq = np.diag(inv_hessian @ var_grads @ inv_hessian)
    num = np.diag(inv_hessian @ cov_grads @ inv_hessian)
    denom = np.sqrt(
        sigma_sq * np.diag(inv_hessian @ var_grads_hat @ inv_hessian)
    )
    ppi_corr = num / denom

    # Keep the estimate strictly inside (-1, 1) on both sides: `ppi_power`
    # rejects +/-1, and only the squared correlation matters downstream, so
    # the negative end is bounded the same way as the positive end.
    bound = 1 - 1 / n
    ppi_corr = np.clip(ppi_corr, -bound, bound)

    index = 0 if coord is None else coord
    return float(ppi_corr[index])


"""
    ORDINARY LEAST SQUARES POWER CALCULATION

"""
def ppi_ols_power(
    X,
    Y,
    Yhat,
    cost_X,
    cost_Y,
    cost_Yhat,
    coord,
    budget=None,
    effective_n=None,
    n_max=None,
    w=None,
):
    """
    Computes the optimal pair of sample sizes for estimating OLS coefficients with PPI.

    Args:
        X (ndarray): Covariates corresponding to the gold-standard labels.
        Y (ndarray): Gold-standard labels.
        Yhat (ndarray): Predictions corresponding to the gold-standard labels.
        cost_X (float): Cost per unlabeled data point.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.
        coord (int): Coordinate to perform power analysis on. Must be in {0, ..., d-1} where d is the shape of the estimand.
        budget (float, optional): Total budget. Used to compute the most powerful pair given the budget.
        effective_n (int, optional): Effective sample size. Used to compute the cheapest pair.
        n_max (int, optional): Maximum number of samples allowed. If provided, the optimal pair will satisfy the additional constraint that n + N <= n_max.
        w (ndarray, optional): Sample weights for the labeled data set.

    Returns:
        dict: Dictionary containing the following items
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples.
            - cost (float): Total cost.
            - effective_n (int): Effective sample size as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__
            - ppi_corr (float): PPI correlation as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__

    Raises:
        ValueError: If neither `budget` nor `effective_n` is provided.

    Notes:
        At least one of `budget` and `effective_n` must be provided. If both are provided, `budget` will be used.

        `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__ Broska, D., Howes, M., & van Loon, A. (2024, August 22). The Mixed Subjects Design: Treating Large Language Models as (Potentially) Informative Observations. https://doi.org/10.31235/osf.io/j3bnt
    """
    if budget is None and effective_n is None:
        raise ValueError(
            "At least one of `budget` and `effective_n` must be provided."
        )

    # Weighted least-squares fit on the labeled data.
    coef = _wls(X, Y, w=w)

    # The labeled covariates and predictions are also passed in the
    # "unlabeled" argument slots; `use_unlabeled=False` is forwarded as-is.
    X_labeled = X.astype(float)
    X_unlabeled_slot = X.astype(float)
    grads, grads_hat, _, inv_hessian = _ols_get_stats(
        coef,
        X_labeled,
        Y,
        Yhat,
        X_unlabeled_slot,
        Yhat,
        w=w,
        use_unlabeled=False,
    )

    corr = _get_ppi_corr(grads, grads_hat, inv_hessian, coord=coord)
    return ppi_power(
        corr,
        cost_X,
        cost_Y,
        cost_Yhat,
        budget=budget,
        effective_n=effective_n,
        n_max=n_max,
    )
""" LOGISTIC REGRESSION POWER CALCULATION """
def ppi_logistic_power(
    X,
    Y,
    Yhat,
    cost_X,
    cost_Y,
    cost_Yhat,
    coord,
    budget=None,
    effective_n=None,
    n_max=None,
    w=None,
):
    """
    Computes the optimal pair of sample sizes for estimating logistic regression coefficients with PPI.

    Args:
        X (ndarray): Covariates corresponding to the gold-standard labels.
        Y (ndarray): Gold-standard labels.
        Yhat (ndarray): Predictions corresponding to the gold-standard labels.
        cost_X (float): Cost per unlabeled data point.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.
        coord (int): Coordinate to perform power analysis on. Must be in {0, ..., d-1} where d is the shape of the estimand.
        budget (float, optional): Total budget. Used to compute the most powerful pair given the budget.
        effective_n (int, optional): Effective sample size. Used to compute the cheapest pair.
        n_max (int, optional): Maximum number of samples allowed. If provided, the optimal pair will satisfy the additional constraint that n + N <= n_max.
        w (ndarray, optional): Sample weights for the labeled data set. Used both for the point estimate and for the gradient statistics.

    Returns:
        dict: Dictionary containing the following items
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples.
            - cost (float): Total cost.
            - effective_n (int): Effective sample size as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__
            - ppi_corr (float): PPI correlation as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__

    Raises:
        ValueError: If neither `budget` nor `effective_n` is provided.

    Notes:
        At least one of `budget` and `effective_n` must be provided. If both are provided, `budget` will be used.

        `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__ Broska, D., Howes, M., & van Loon, A. (2024, August 22). The Mixed Subjects Design: Treating Large Language Models as (Potentially) Informative Observations. https://doi.org/10.31235/osf.io/j3bnt
    """
    if budget is None and effective_n is None:
        raise ValueError(
            "At least one of `budget` and `effective_n` must be provided."
        )

    # Unpenalized, intercept-free fit. The sample weights are forwarded so the
    # point estimate matches the weighted statistics computed below (the OLS
    # variant likewise fits with `w`); `sample_weight=None` is the unweighted fit.
    pointest = (
        LogisticRegression(
            penalty=None,
            solver="lbfgs",
            max_iter=10000,
            tol=1e-15,
            fit_intercept=False,
        )
        .fit(X, Y, sample_weight=w)
        .coef_.squeeze()
    )

    # The labeled covariates and predictions are also passed in the
    # "unlabeled" argument slots; `use_unlabeled=False` is forwarded as-is.
    grads, grads_hat, _, inv_hessian = _logistic_get_stats(
        pointest,
        X.astype(float),
        Y,
        Yhat,
        X.astype(float),
        Yhat,
        w=w,
        use_unlabeled=False,
    )
    ppi_corr = _get_ppi_corr(grads, grads_hat, inv_hessian, coord=coord)

    return ppi_power(
        ppi_corr, cost_X, cost_Y, cost_Yhat, budget, effective_n, n_max
    )
""" POISSON REGRESSION POWER CALCULATION """
def ppi_poisson_power(
    X,
    Y,
    Yhat,
    cost_X,
    cost_Y,
    cost_Yhat,
    coord,
    budget=None,
    effective_n=None,
    n_max=None,
    w=None,
):
    """
    Computes the optimal pair of sample sizes for estimating Poisson regression coefficients with PPI.

    Args:
        X (ndarray): Covariates corresponding to the gold-standard labels.
        Y (ndarray): Gold-standard labels.
        Yhat (ndarray): Predictions corresponding to the gold-standard labels.
        cost_X (float): Cost per unlabeled data point.
        cost_Y (float): Cost per gold-standard label.
        cost_Yhat (float): Cost per prediction.
        coord (int): Coordinate to perform power analysis on. Must be in {0, ..., d-1} where d is the shape of the estimand.
        budget (float, optional): Total budget. Used to compute the most powerful pair given the budget.
        effective_n (int, optional): Effective sample size. Used to compute the cheapest pair.
        n_max (int, optional): Maximum number of samples allowed. If provided, the optimal pair will satisfy the additional constraint that n + N <= n_max.
        w (ndarray, optional): Sample weights for the labeled data set. Used both for the point estimate and for the gradient statistics.

    Returns:
        dict: Dictionary containing the following items
            - n (int): Optimal number of gold-labeled samples.
            - N (int): Optimal number of unlabeled samples.
            - cost (float): Total cost.
            - effective_n (int): Effective sample size as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__.
            - ppi_corr (float): PPI correlation as defined in `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__.

    Raises:
        ValueError: If neither `budget` nor `effective_n` is provided.

    Notes:
        At least one of `budget` and `effective_n` must be provided. If both are provided, `budget` will be used.

        `[BHvL24] <https://osf.io/preprints/socarxiv/j3bnt>`__ Broska, D., Howes, M., & van Loon, A. (2024, August 22). The Mixed Subjects Design: Treating Large Language Models as (Potentially) Informative Observations. https://doi.org/10.31235/osf.io/j3bnt
    """
    if budget is None and effective_n is None:
        raise ValueError(
            "At least one of `budget` and `effective_n` must be provided."
        )

    # Unpenalized (alpha=0), intercept-free fit. The sample weights are
    # forwarded so the point estimate matches the weighted statistics computed
    # below (the OLS variant likewise fits with `w`); `sample_weight=None` is
    # the unweighted fit.
    pointest = (
        PoissonRegressor(
            alpha=0,
            fit_intercept=False,
            max_iter=10000,
            tol=1e-15,
        )
        .fit(X, Y, sample_weight=w)
        .coef_
    )

    # The labeled covariates and predictions are also passed in the
    # "unlabeled" argument slots; `use_unlabeled=False` is forwarded as-is.
    grads, grads_hat, _, inv_hessian = _poisson_get_stats(
        pointest,
        X.astype(float),
        Y,
        Yhat,
        X.astype(float),
        Yhat,
        w=w,
        use_unlabeled=False,
    )
    ppi_corr = _get_ppi_corr(grads, grads_hat, inv_hessian, coord=coord)

    return ppi_power(
        ppi_corr, cost_X, cost_Y, cost_Yhat, budget, effective_n, n_max
    )