import numpy as np
from scipy.stats import norm
from scipy.special import expit
from statsmodels.regression.linear_model import OLS
from statsmodels.stats.weightstats import _zconfint_generic, _zstat_generic
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.isotonic import IsotonicRegression
from .utils import dataframe_decorator, bootstrap
from .ppi import _ols, _wls
from sklearn.linear_model import PoissonRegressor
"""
MEAN ESTIMATION
"""
[docs]
def classical_mean_ci(Y, w=None, alpha=0.1, alternative="two-sided"):
"""Classical mean confidence interval using the central limit theorem.
Args:
Y (ndarray): Array of observations.
w (ndarray, optional): Sample weights for the data set. Must be positive and will be normalized to sum to the size of the dataset.
alpha (float, optional): Error level. Confidence interval will target a coverage of 1 - alpha. Defaults to 0.1. Must be in (0, 1).
alternative (str, optional): One of "two-sided", "larger", or "smaller". Defaults to "two-sided".
Returns:
tuple: (lower, upper) confidence interval bounds.
"""
n = Y.shape[0]
if w is None:
return _zconfint_generic(
Y.mean(), Y.std() / np.sqrt(n), alpha, alternative
)
else:
w = w / w.sum() * n
return _zconfint_generic(
(w * Y).mean(), (w * Y).std() / np.sqrt(n), alpha, alternative
)
[docs]
def semisupervised_mean_ci(
X,
Y,
X_unlabeled,
K,
alpha=0.1,
alternative="two-sided",
add_intercept=True,
):
"""Semisupervised mean confidence interval from `[ZB22] <https://arxiv.org/abs/1902.00772>`__.
Args:
X (ndarray): Labeled covariates.
Y (ndarray): Labeled responses.
X_unlabeled (ndarray): Unlabeled covariates.
K (int): Number of folds for cross-fitting.
alpha (float, optional): Error level. Confidence interval will target a coverage of 1 - alpha. Defaults to 0.1. Must be in (0, 1).
alternative (str, optional): One of "two-sided", "larger", or "smaller". Defaults to "two-sided".
add_intercept (bool, optional): Whether to add an intercept to the covariates. Defaults to True.
Returns:
tuple: (lower, upper) confidence interval bounds.
Notes:
`[ZB22] <https://arxiv.org/abs/1902.00772>`__ Y. Zhang and J. Bradic, High-dimensional semi-supervised learning: in search of optimal inference of the mean. arxiv:1902.00772, 2022.
"""
if add_intercept:
X = np.concatenate([np.ones((X.shape[0], 1)), X], axis=1)
X_unlabeled = np.concatenate(
[np.ones((X_unlabeled.shape[0], 1)), X_unlabeled], axis=1
)
n = Y.shape[0]
N = X_unlabeled.shape[0]
fold_size = int(n / K)
Yhat = np.zeros(n)
Yhat_unlabeled = np.zeros(N)
muhat = X_unlabeled.mean(axis=0)
Vhat = X_unlabeled - muhat[None, :]
Chat = Vhat.T.dot(Vhat) / N
epsilon_hats = np.zeros(n)
bhat_squareds = np.zeros(K)
betahat_transpose_Vhat_fold_avg = np.zeros((K,))
bhat_squared_fold = np.zeros((K,))
for j in range(K):
fold_indices = range(j * fold_size, (j + 1) * fold_size)
train_indices = np.delete(range(n), fold_indices)
X_train = X[train_indices, :]
Y_train = Y[train_indices]
beta_fold = _ols(X_train, Y_train)
X_fold = X[fold_indices, :]
Y_fold = Y[fold_indices]
Vhat_fold = Vhat[fold_indices]
Yhat[fold_indices] = X_fold @ beta_fold
Yhat_unlabeled += (X_unlabeled @ beta_fold) / K
epsilon_hats[fold_indices] = Y_fold - beta_fold.dot(Vhat_fold.T)
bhat_squared_fold[j] = (
beta_fold.dot(Chat).dot(beta_fold)
+ 2
* (beta_fold.dot(Vhat_fold.T) * epsilon_hats[fold_indices]).mean()
)
betahat_transpose_Vhat_fold_avg[j] = (
2 * beta_fold.dot(Vhat_fold.T).mean()
)
semisupervised_pointest = Yhat_unlabeled.mean() + (Y - Yhat).mean()
epsilon_hats -= semisupervised_pointest
bhat_squared_fold -= (
betahat_transpose_Vhat_fold_avg * semisupervised_pointest
)
bhat_squared = bhat_squared_fold.mean()
sigmahat_squared_epsilon = (epsilon_hats**2).mean()
se = np.sqrt(sigmahat_squared_epsilon + n / N * bhat_squared)
return _zconfint_generic(
semisupervised_pointest, se / np.sqrt(n), alpha, alternative
)
# Make a conformal interval for each unlabeled sample and average. Only valid with bonferroni=True
"""
QUANTILE ESTIMATION
"""
[docs]
def classical_quantile_ci(Y, q, alpha=0.1):
"""Confidence interval for a quantile using the classical method.
Args:
Y (ndarray): Labeled responses.
q (float): Quantile to estimate. Must be in (0, 1).
alpha (float, optional): Error level. Confidence interval will target a coverage of 1 - alpha. Defaults to 0.1. Must be in (0, 1).
Returns:
tuple: (lower, upper) confidence interval bounds.
"""
n = Y.shape[0]
lower, upper = _zconfint_generic(
q * n, np.sqrt(q * (1 - q) * n), alpha, "two-sided"
)
sorted_Y = np.sort(Y)
return sorted_Y[int(lower)], sorted_Y[int(upper)]
"""
ORDINARY LEAST SQUARES
"""
[docs]
def classical_ols_ci(X, Y, w=None, alpha=0.1, alternative="two-sided"):
"""Confidence interval for the OLS coefficients using the classical method.
Args:
X (ndarray): Labeled features.
Y (ndarray): Labeled responses.
w (ndarray, optional): Sample weights for the data set. Must be positive and will be normalized to sum to the size of the dataset.
alpha (float, optional): Error level. Confidence interval will target a coverage of 1 - alpha. Defaults to 0.1. Must be in (0, 1).
alternative (str, optional): One of "two-sided", "less", or "greater". Defaults to "two-sided".
Returns:
tuple: (lower, upper) confidence interval bounds.
"""
n = Y.shape[0]
if w is None:
pointest, se = _ols(X, Y, return_se=True)
else:
w = w / w.sum() * Y.shape[0]
pointest, se = _wls(X, Y, w, return_se=True)
return _zconfint_generic(pointest, se, alpha, alternative)
[docs]
def postprediction_ols_ci(
Y,
Yhat,
X_unlabeled,
Yhat_unlabeled,
bootstrap_samples=50,
alpha=0.1,
alternative="two-sided",
):
"""Confidence interval for the OLS coefficients using the PostPI method from `[WML20] <https://www.pnas.org/doi/full/10.1073/pnas.2001238117>`__.
This method does not possess any coverage guarantees unless the model is perfect, but predates Prediction-Powered Inference.
It is included for comparison purposes.
Args:
Y (ndarray): Labeled responses.
Yhat (ndarray): Predicted responses for labeled samples.
X_unlabeled (ndarray): Unlabeled features.
Yhat_unlabeled (ndarray): Predicted responses for unlabeled samples.
bootstrap_samples (int, optional): Number of bootstrap samples to use. Defaults to 50.
alpha (float, optional): Error level. Confidence interval will target a coverage of 1 - alpha. Defaults to 0.1. Must be in (0, 1).
alternative (str, optional): One of "two-sided", "less", or "greater". Defaults to "two-sided".
Returns:
tuple: (lower, upper) confidence interval bounds.
Notes:
`[WML20] <https://www.pnas.org/doi/full/10.1073/pnas.2001238117>`__ S. Wang, T. H. McCormick, and J. T. Leek, Methods for correcting inference based on outcomes predicted by machine learning. Proceedings of the National Academy of Sciences, 117(48): 30266-30275, 2020.
"""
N, d = X_unlabeled.shape
# fit map to debias predictions
regression = IsotonicRegression(out_of_bounds="clip").fit(
Yhat, Y
) # LinearRegression().fit(Yhat[:,None], Y)
# debias predictions on unlabeled data
Yhat_unlabeled_debiased = regression.predict(Yhat_unlabeled[:, None])
# obtain beta and std err via bootstrap
bootstrap_betas = np.zeros((bootstrap_samples, d))
bootstrap_ses = np.zeros((bootstrap_samples, d))
for b in range(bootstrap_samples):
idx = np.random.choice(range(N), N)
X_ols = X_unlabeled[idx, :]
Y_ols = Yhat_unlabeled_debiased[idx]
bootstrap_betas[b, :], bootstrap_ses[b, :] = _ols(
X_ols, Y_ols, return_se=True
)
pointest = np.median(bootstrap_betas, axis=0)
se = np.median(bootstrap_ses, axis=0)
return _zconfint_generic(pointest, se, alpha, alternative)
"""
LOGISTIC REGRESSION
"""
[docs]
def logistic(X, Y):
"""Compute the logistic regression coefficients.
Args:
X (ndarray): Labeled features.
Y (ndarray): Labeled responses.
Returns:
ndarray: Logistic regression coefficients.
"""
regression = LogisticRegression(
penalty=None,
solver="lbfgs",
max_iter=10000,
tol=1e-15,
fit_intercept=False,
).fit(X, Y)
return regression.coef_.squeeze()
[docs]
def classical_logistic_ci(X, Y, alpha=0.1, alternative="two-sided"):
"""Confidence interval for the logistic regression coefficients using the classical method.
Args:
X (ndarray): Labeled
Y (ndarray): Labeled responses.
alpha (float, optional): Error level. Confidence interval will target a coverage of 1 - alpha. Defaults to 0.1. Must be in (0, 1).
alternative (str, optional): One of "two-sided", "less", or "greater". Defaults to "two-sided".
Returns:
tuple: (lower, upper) confidence interval bounds.
"""
n = Y.shape[0]
d = X.shape[1]
pointest = logistic(X, Y)
mu = expit(X @ pointest)
V = np.zeros((d, d))
grads = np.zeros((n, d))
for i in range(n):
V += 1 / n * mu[i] * (1 - mu[i]) * X[i : i + 1, :].T @ X[i : i + 1, :]
grads[i] += (mu[i] - Y[i]) * X[i]
V_inv = np.linalg.inv(V)
cov_mat = V_inv @ np.cov(grads.T) @ V_inv
return _zconfint_generic(
pointest, np.sqrt(np.diag(cov_mat) / n), alpha, alternative
)
"""
POISSON REGRESSION
"""
def poisson(X, Y):
"""Compute the Poisson regression coefficients.
Args:
X (ndarray): Labeled features.
Y (ndarray): Labeled responses (count data).
Returns:
ndarray: Poisson regression coefficients.
"""
regression = PoissonRegressor(
alpha=0,
fit_intercept=False,
max_iter=10000,
tol=1e-15,
).fit(X, Y)
return regression.coef_
def classical_poisson_ci(X, Y, alpha=0.1, alternative="two-sided"):
"""Confidence interval for the Poisson regression coefficients using the classical method.
Args:
X (ndarray): Labeled features.
Y (ndarray): Labeled responses (count data).
alpha (float, optional): Error level. Confidence interval will target a coverage of 1 - alpha. Defaults to 0.1. Must be in (0, 1).
alternative (str, optional): One of "two-sided", "less", or "greater". Defaults to "two-sided".
Returns:
tuple: (lower, upper) confidence interval bounds.
"""
n = Y.shape[0]
d = X.shape[1]
pointest = poisson(X, Y)
mu = np.exp(X @ pointest) # Expected value for Poisson regression
V = np.zeros((d, d))
grads = np.zeros((n, d))
for i in range(n):
V += 1 / n * mu[i] * X[i : i + 1, :].T @ X[i : i + 1, :]
grads[i] += (mu[i] - Y[i]) * X[i]
V_inv = np.linalg.inv(V)
cov_mat = V_inv @ np.cov(grads.T) @ V_inv
return _zconfint_generic(
pointest, np.sqrt(np.diag(cov_mat) / n), alpha, alternative
)
"""
BOOTSTRAP CI
"""
def classical_bootstrap_ci(
estimator,
Y,
X=None,
n_resamples=1000,
alpha=0.1,
alternative="two-sided",
method="percentile",
):
"""Classical bootstrap confidence interval for the estimator.
Args:
estimator (callable): Estimator function. Takes in (X,Y) or (Y) and returns a point estimate.
Y (ndarray): Gold-standard labels.
X (ndarray, optional): Covariates corresponding to the gold-standard labels. Defaults to `None`. If `None`, the estimator is assumed to only take in `Y`.
n_resamples (int, optional): Number of bootstrap resamples. Defaults to `1000`.
alpha (float, optional): Error level; the confidence interval will target a coverage of 1 - alpha. Must be in (0, 1). Defaults to `0.1`.
alternative (str, optional): Alternative hypothesis, either 'two-sided', 'larger' or 'smaller'. Defaults to `'two-sided'`.
method (str, optional): Method to compute the confidence interval, either 'percentile' or 'basic'. Defaults to `'percentile'`.
Returns:
float or ndarray: Lower and upper bounds of the bootstrap confidence interval for the estimator.
"""
if X is None:
pointest = estimator(Y)
bootstrap_distribution = np.array(
bootstrap([Y], estimator, n_resamples=n_resamples)
)
else:
pointest = estimator(X, Y)
bootstrap_distribution = np.array(
bootstrap(
[X, Y], estimator, n_resamples=n_resamples, paired=[[0, 1]]
)
)
# Deal with the different types of alternative hypotheses
if alternative == "two-sided":
alpha_lower = alpha / 2
alpha_upper = alpha / 2
elif alternative == "larger":
alpha_lower = alpha
alpha_upper = 0
elif alternative == "smaller":
alpha_lower = 0
alpha_upper = alpha
# Compute the lower and upper bounds depending on the method
if method == "percentile":
lower_bound = np.quantile(bootstrap_distribution, alpha_lower, axis=0)
upper_bound = np.quantile(
bootstrap_distribution, 1 - alpha_upper, axis=0
)
elif method == "basic":
lower_bound = 2 * pointest - np.quantile(
bootstrap_distribution, 1 - alpha_lower, axis=0
)
upper_bound = 2 * pointest - np.quantile(
bootstrap_distribution, alpha_upper, axis=0
)
else:
raise ValueError(
"Method must be either 'percentile' or 'basic'. The others are not implemented yet... want to contribute? ;)"
)
if alternative == "two-sided":
return lower_bound, upper_bound
elif alternative == "larger":
return -np.inf, upper_bound
elif alternative == "smaller":
return lower_bound, np.inf
else:
raise ValueError(
"Alternative must be either 'two-sided', 'larger' or 'smaller'."
)