# Source code for pystatistics.regression._linear

"""
Linear regression parameter payload and solution.

LinearParams: immutable data computed by backends.
LinearSolution: user-facing results with R-style summary.
"""

from dataclasses import dataclass
from typing import Any, TYPE_CHECKING
import numpy as np
from numpy.typing import NDArray

from pystatistics.core.result import Result
from pystatistics.regression._formatting import significance_stars

if TYPE_CHECKING:
    from pystatistics.regression.design import Design


@dataclass(frozen=True)
class LinearParams:
    """Immutable parameter payload for linear regression.

    Backends compute and return this data; user-facing accessors
    live on LinearSolution.
    """

    # Estimated coefficients (aliased terms may be NaN in
    # rank-deficient fits — see LinearSolution.summary()).
    coefficients: NDArray[np.floating[Any]]
    # Residuals of the fit.
    residuals: NDArray[np.floating[Any]]
    # Fitted (predicted) response values.
    fitted_values: NDArray[np.floating[Any]]
    # Residual sum of squares.
    rss: float
    # Total sum of squares.
    tss: float
    # Numerical rank of the design matrix.
    rank: int
    # Residual degrees of freedom.
    df_residual: int
@dataclass
class LinearSolution:
    """
    User-facing regression results.

    Wraps the backend Result and provides convenient accessors for all
    regression outputs including standard errors and t-statistics.
    """

    _result: Result[LinearParams]
    _design: 'Design'

    # Optional variable names (set by fit() via names= kwarg)
    _names: tuple[str, ...] | None = None

    # Cached computations (filled lazily on first property access)
    _standard_errors: NDArray[np.floating[Any]] | None = None
    _t_statistics: NDArray[np.floating[Any]] | None = None
    _p_values: NDArray[np.floating[Any]] | None = None

    @property
    def coefficients(self) -> NDArray[np.floating[Any]]:
        return self._result.params.coefficients

    @property
    def coef(self) -> dict[str, float]:
        """Named coefficient mapping (like R's coef() or statsmodels .params).

        Returns dict mapping variable names to coefficient values.
        Falls back to 'B[0]', 'B[1]'... when names are not available.
        """
        names = self._names or tuple(
            f"B[{i}]" for i in range(len(self.coefficients))
        )
        return dict(zip(names, self.coefficients.tolist()))

    @property
    def residuals(self) -> NDArray[np.floating[Any]]:
        return self._result.params.residuals

    @property
    def fitted_values(self) -> NDArray[np.floating[Any]]:
        return self._result.params.fitted_values

    @property
    def rss(self) -> float:
        return self._result.params.rss

    @property
    def tss(self) -> float:
        return self._result.params.tss

    @property
    def r_squared(self) -> float:
        # Degenerate case (constant response): a perfect fit reports 1.0,
        # anything else 0.0, avoiding division by zero.
        if self.tss == 0:
            return 1.0 if self.rss == 0 else 0.0
        return 1.0 - (self.rss / self.tss)

    @property
    def adjusted_r_squared(self) -> float:
        n = self._design.n
        p = self._result.params.rank
        # Without residual degrees of freedom (or with zero TSS) the
        # adjustment is undefined; fall back to plain R-squared.
        if n - p <= 0 or self.tss == 0:
            return self.r_squared
        return 1.0 - (1.0 - self.r_squared) * (n - 1) / (n - p)

    @property
    def residual_std_error(self) -> float:
        df = self._result.params.df_residual
        if df <= 0:
            return 0.0
        return float(np.sqrt(self.rss / df))

    @property
    def standard_errors(self) -> NDArray[np.floating[Any]]:
        """
        Standard errors of coefficients.

        For the CPU QR backend: uses R^{-1} from the QR decomposition to
        compute diag((X'X)^{-1}) as the row sums of R^{-1} squared,
        matching R's backsolve(R, diag(p)) exactly.

        For the GPU backend (no R available): uses np.linalg.inv(X'X).
        NOT A FALLBACK: mathematically equivalent to the QR path, just a
        different computation route since GPU doesn't store QR factors.

        For rank-deficient matrices, aliased coefficients get NaN
        standard errors.
        """
        if self._standard_errors is not None:
            return self._standard_errors

        df = self._result.params.df_residual
        rank = self._result.params.rank
        p = len(self.coefficients)

        if df <= 0:
            # No residual degrees of freedom: SEs are undefined.
            self._standard_errors = np.full(p, np.nan, dtype=np.float64)
            return self._standard_errors

        sigma_sq = self.rss / df
        pivot = self._result.info.get('pivot')
        R = self._result.info.get('R')

        if R is not None and pivot is not None:
            # QR-based SE computation (matches R exactly). The full-rank
            # case (rank == p) is the rank-deficient path with every
            # pivoted position filled in, so one code path handles both.
            from scipy.linalg import solve_triangular

            pivot = np.array(pivot)
            R_active = R[:rank, :rank]
            try:
                R_inv = solve_triangular(
                    R_active, np.eye(rank), lower=False
                )
                # diag((X'X)^{-1}) in pivoted order.
                XtX_inv_diag = np.sum(R_inv ** 2, axis=1)
                se_active = np.sqrt(sigma_sq * XtX_inv_diag)
            except np.linalg.LinAlgError:
                se_active = np.full(rank, np.nan, dtype=np.float64)
            # Aliased (non-pivoted) coefficients keep NaN.
            se = np.full(p, np.nan, dtype=np.float64)
            se[pivot[:rank]] = se_active
            self._standard_errors = se
        else:
            # NOT A FALLBACK: mathematically equivalent to the QR path.
            # GPU backends don't store QR factors, so we compute
            # (X'X)^{-1} directly. Same result, different route.
            try:
                XtX_inv = np.linalg.inv(self._design.XtX())
                self._standard_errors = np.sqrt(sigma_sq * np.diag(XtX_inv))
            except np.linalg.LinAlgError:
                self._standard_errors = np.full(p, np.nan, dtype=np.float64)

        return self._standard_errors

    @property
    def t_statistics(self) -> NDArray[np.floating[Any]]:
        """t-statistics for coefficients."""
        if self._t_statistics is not None:
            return self._t_statistics
        se = self.standard_errors
        # Suppress divide-by-zero / invalid warnings; non-finite ratios
        # (zero or NaN SEs) become NaN.
        with np.errstate(divide='ignore', invalid='ignore'):
            t = self.coefficients / se
            t = np.where(np.isfinite(t), t, np.nan)
        self._t_statistics = t
        return self._t_statistics

    @property
    def p_values(self) -> NDArray[np.floating[Any]]:
        """Two-sided p-values for coefficient t-tests."""
        if self._p_values is not None:
            return self._p_values
        from scipy import stats
        t = self.t_statistics
        df = self.df_residual
        if df <= 0:
            self._p_values = np.full(len(t), np.nan, dtype=np.float64)
            return self._p_values
        pv = 2.0 * stats.t.sf(np.abs(t), df=df)
        self._p_values = np.where(np.isfinite(t), pv, np.nan)
        return self._p_values

    @property
    def rank(self) -> int:
        return self._result.params.rank

    @property
    def df_residual(self) -> int:
        return self._result.params.df_residual

    @property
    def info(self) -> dict[str, Any]:
        return self._result.info

    @property
    def timing(self) -> dict[str, float] | None:
        return self._result.timing

    @property
    def backend_name(self) -> str:
        return self._result.backend_name

    @property
    def warnings(self) -> tuple[str, ...]:
        return self._result.warnings

    def summary(self) -> str:
        """Generate R-style summary output."""
        p = len(self.coefficients)
        names = self._names or tuple(f"B[{i}]" for i in range(p))
        max_name_len = max(len(n) for n in names)
        col_w = max(max_name_len + 2, 15)
        rq = np.quantile(self.residuals, [0.0, 0.25, 0.5, 0.75, 1.0])
        lines = [
            "Linear Regression Results",
            "=" * 72,
            f"Observations: {self._design.n}",
            f"Predictors: {self._design.p}",
            f"Rank: {self.rank}",
            "",
            "Residuals:",
            f" {'Min':>10s} {'1Q':>10s} {'Median':>10s} {'3Q':>10s} {'Max':>10s}",
            f" {rq[0]:10.4f} {rq[1]:10.4f} {rq[2]:10.4f} {rq[3]:10.4f} {rq[4]:10.4f}",
            "",
            "Coefficients:",
            "-" * 72,
            f" {'':<{col_w}s} {'Estimate':>14s} {'Std.Error':>12s} {'t value':>10s} {'Pr(>|t|)':>12s}",
            "-" * 72,
        ]
        for name, coef, se, t, pv in zip(
            names, self.coefficients, self.standard_errors,
            self.t_statistics, self.p_values
        ):
            if np.isnan(coef):
                # Aliased (rank-deficient) coefficient: no estimate.
                lines.append(f" {name:<{col_w}s} (aliased)")
            else:
                se_str = f"{se:12.6f}" if not np.isnan(se) else " NA"
                t_str = f"{t:10.3f}" if not np.isnan(t) else " NA"
                pv_str = f"{pv:12.4e}" if not np.isnan(pv) else " NA"
                sig = significance_stars(pv)
                lines.append(
                    f" {name:<{col_w}s} {coef:14.6f} {se_str} {t_str} {pv_str} {sig}"
                )
        lines.append("-" * 72)
        lines.append("Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1")
        lines.append("")
        lines.append(
            f"Residual standard error: {self.residual_std_error:.4f} on {self.df_residual} degrees of freedom"
        )
        lines.append(
            f"Multiple R-squared: {self.r_squared:.6f}, "
            f"Adjusted R-squared: {self.adjusted_r_squared:.6f}"
        )
        df_model = self.rank - 1
        # F-statistic is only meaningful with at least one model df,
        # residual df, and non-degenerate TSS.
        if df_model > 0 and self.df_residual > 0 and self.tss > 0:
            from scipy import stats as sp_stats
            f_stat = (self.r_squared / df_model) / (
                (1.0 - self.r_squared) / self.df_residual
            )
            f_pval = sp_stats.f.sf(f_stat, df_model, self.df_residual)
            lines.append(
                f"F-statistic: {f_stat:.2f} on {df_model} and {self.df_residual} DF, "
                f"p-value: {f_pval:.4e}"
            )
        lines.append(f"Backend: {self.backend_name}")
        if self.timing:
            lines.append(f"Time: {self.timing.get('total_seconds', 0):.4f}s")
        return "\n".join(lines)

    def __repr__(self) -> str:
        return (
            f"LinearSolution(n={self._design.n}, p={self._design.p}, "
            f"rank={self.rank}, r_squared={self.r_squared:.4f})"
        )