Source code for episia.stats.descriptive

"""
descriptive.py - Descriptive statistics for epidemiological data.

This module provides functions for calculating confidence intervals
for proportions, means, and other descriptive statistics commonly
used in epidemiological analysis.
"""

import numpy as np
from typing import Union, Tuple, Optional, Dict
from dataclasses import dataclass
from enum import Enum
import warnings


[docs] class CI_Method(Enum): """Methods for calculating confidence intervals.""" WALD = "wald" WILSON = "wilson" AGRESTI_COULL = "agresti_coull" JEFFREYS = "jeffreys" CLOPPER_PEARSON = "clopper_pearson" DELTA = "delta"
[docs] @dataclass class ProportionResult: """Rich result object for proportion calculations.""" proportion: float ci_lower: float ci_upper: float sample_size: int numerator: int denominator: int method: str
[docs] def __repr__(self) -> str: return f"Proportion: {self.proportion:.4f} ({self.ci_lower:.4f}-{self.ci_upper:.4f})"
[docs] def to_dict(self) -> Dict: """Convert result to dictionary.""" return { "measure": "proportion", "proportion": self.proportion, "ci_lower": self.ci_lower, "ci_upper": self.ci_upper, "sample_size": self.sample_size, "numerator": self.numerator, "denominator": self.denominator, "method": self.method }
[docs] @dataclass class MeanResult: """Rich result object for mean calculations.""" mean: float ci_lower: float ci_upper: float sample_size: int std_dev: float method: str
[docs] def __repr__(self) -> str: return f"Mean: {self.mean:.4f} ({self.ci_lower:.4f}-{self.ci_upper:.4f})"
[docs] def to_dict(self) -> Dict: """Convert result to dictionary.""" return { "measure": "mean", "mean": self.mean, "ci_lower": self.ci_lower, "ci_upper": self.ci_upper, "sample_size": self.sample_size, "std_dev": self.std_dev, "method": self.method }
@dataclass class IncidenceRateResult: """Rich result object for incidence rate calculations.""" rate: float ci_lower: float ci_upper: float cases: int person_time: float multiplier: int confidence: float method: str def __repr__(self) -> str: unit = f"per {self.multiplier:,} person-time" if self.multiplier != 1 else "per person-time" scaled = self.rate * self.multiplier lo = self.ci_lower * self.multiplier hi = self.ci_upper * self.multiplier return f"Rate: {scaled:.4f} ({lo:.4f}-{hi:.4f}) {unit}" def to_dict(self) -> Dict: """Convert result to dictionary.""" return { "measure": "incidence_rate", "rate": self.rate, "ci_lower": self.ci_lower, "ci_upper": self.ci_upper, "cases": self.cases, "person_time": self.person_time, "multiplier": self.multiplier, "confidence": self.confidence, "method": self.method }
[docs] def proportion_ci( numerator: int = None, denominator: int = None, method: CI_Method = CI_Method.WILSON, confidence: float = 0.95, *, k: int = None, n: int = None, **kwargs ) -> ProportionResult: """Wrapper accepting both proportion_ci(45, 200) and proportion_ci(k=45, n=200).""" # Resolve aliases k/n → numerator/denominator if numerator is None and k is not None: numerator = k if denominator is None and n is not None: denominator = n if numerator is None or denominator is None: raise TypeError( "proportion_ci() requires numerator and denominator. " "Use proportion_ci(45, 200) or proportion_ci(k=45, n=200)." ) return _proportion_ci_impl(numerator, denominator, method, confidence, **kwargs)
def _proportion_ci_impl( numerator: int, denominator: int, method = CI_Method.WILSON, confidence: float = 0.95, **kwargs ) -> ProportionResult: """ Calculate proportion with confidence interval. Args: numerator: Number of events/cases denominator: Total sample size method: Method for CI calculation (default: Wilson) confidence: Confidence level (default: 0.95) **kwargs: Additional method-specific parameters Returns: ProportionResult object Raises: ValueError: If denominator <= 0 or numerator < 0 ValueError: If numerator > denominator Example: >>> result = proportion_ci(45, 100) >>> print(result.proportion) 0.45 """ # Accept string method names e.g. method="wilson" if isinstance(method, str): try: method = CI_Method(method.lower()) except ValueError: method = CI_Method.WILSON # Input validation if denominator <= 0: raise ValueError("Denominator must be positive") if numerator < 0: raise ValueError("Numerator must be non-negative") if numerator > denominator: raise ValueError("Numerator cannot exceed denominator") p = numerator / denominator if denominator > 0 else 0.0 # Select CI calculation method if method == CI_Method.WALD: ci_lower, ci_upper = _wald_ci(p, denominator, confidence, **kwargs) elif method == CI_Method.WILSON: ci_lower, ci_upper = _wilson_ci(numerator, denominator, confidence, **kwargs) elif method == CI_Method.AGRESTI_COULL: ci_lower, ci_upper = _agresti_coull_ci(numerator, denominator, confidence, **kwargs) elif method == CI_Method.JEFFREYS: ci_lower, ci_upper = _jeffreys_ci(numerator, denominator, confidence, **kwargs) elif method == CI_Method.CLOPPER_PEARSON: ci_lower, ci_upper = _clopper_pearson_ci(numerator, denominator, confidence, **kwargs) else: # Default to Wilson ci_lower, ci_upper = _wilson_ci(numerator, denominator, confidence, **kwargs) return ProportionResult( proportion=p, ci_lower=ci_lower, ci_upper=ci_upper, sample_size=denominator, numerator=numerator, denominator=denominator, method=method.value )
[docs] def mean_ci( data: np.ndarray, confidence: float = 0.95, method: str = "t_distribution", population_std: Optional[float] = None ) -> MeanResult: """ Calculate mean with confidence interval. Args: data: Array-like numeric data confidence: Confidence level (default: 0.95) method: 't_distribution' (small samples) or 'normal' (large samples) population_std: Known population standard deviation (optional) Returns: MeanResult object Example: >>> data = np.array([1.2, 1.5, 1.8, 2.1, 1.9]) >>> result = mean_ci(data) >>> print(result.mean) """ # Convert to numpy array if needed if not isinstance(data, np.ndarray): data = np.array(data) # Remove NaN values data = data[~np.isnan(data)] n = len(data) if n == 0: raise ValueError("Data array is empty or contains only NaN values") mean = np.mean(data) std = np.std(data, ddof=1) # Sample standard deviation if population_std is not None: # Use known population standard deviation se = population_std / np.sqrt(n) z = _z_score(confidence) margin = z * se elif method == "t_distribution" and n < 30: # Use t-distribution for small samples from scipy import stats t = stats.t.ppf(1 - (1 - confidence) / 2, df=n-1) se = std / np.sqrt(n) margin = t * se else: # Use normal approximation for large samples z = _z_score(confidence) se = std / np.sqrt(n) margin = z * se ci_lower = mean - margin ci_upper = mean + margin return MeanResult( mean=mean, ci_lower=ci_lower, ci_upper=ci_upper, sample_size=n, std_dev=std, method=method )
[docs] def incidence_rate( cases: int, person_time: float, confidence: float = 0.95, multiplier: int = 1, ) -> IncidenceRateResult: """ Calculate person-time incidence rate with confidence interval. Uses Byar's approximation when cases >= 10, exact Poisson (chi-squared) otherwise consistent with OpenEpi and Rothman & Greenland. Args: cases: Number of incident cases. person_time: Total person-time at risk (any unit: years, months, days). confidence: Confidence level (default 0.95). multiplier: Scale factor for display, e.g. 100, 1_000, 100_000. Does not affect stored rate — only __repr__ scaling. Returns: IncidenceRateResult Example: >>> # HIV seroconversion cohort, 20 cases / 500 person-years >>> result = incidence_rate(20, 500, multiplier=100) >>> print(result) Rate: 4.0000 (2.4423-6.1780) per 100 person-time """ if person_time <= 0: raise ValueError("Person-time must be positive") if cases < 0: raise ValueError("cases must be non-negative.") rate = cases / person_time z = _z_score(confidence) # Byar's approximation (good for cases >= 10) if cases >= 10: # Byar's approximation ci_lower = ( cases * (1 - 1 / (9 * cases) - z / (3 * np.sqrt(cases))) ** 3 / person_time ) ci_upper = ( (cases + 1) * (1 - 1 / (9 * (cases + 1)) + z / (3 * np.sqrt(cases + 1))) ** 3 / person_time ) method = "byar" else: # Exact Poisson via chi-squared quantiles from scipy import stats as _stats alpha = 1 - confidence ci_lower = _stats.chi2.ppf(alpha / 2, 2 * cases) / (2 * person_time) ci_upper = _stats.chi2.ppf(1 - alpha / 2, 2 * (cases + 1)) / (2 * person_time) method = "exact_poisson" return IncidenceRateResult( rate=rate, ci_lower=ci_lower, ci_upper=ci_upper, cases=cases, person_time=person_time, multiplier=multiplier, confidence=confidence, method=method, )
def cumulative_incidence( cases: int, population_at_risk: int, confidence: float = 0.95, method: CI_Method = CI_Method.WILSON, ) -> ProportionResult: """ Calculate cumulative incidence (attack rate / risk) with confidence interval. The denominator must be the disease-free population at the start of the observation period, not the total population. Args: cases: Number of new cases over the period. population_at_risk: Disease-free population at period start. confidence: Confidence level (default 0.95). method: CI method (default Wilson). Returns: ProportionResult Example: >>> # Malaria cohort, rainy season, Sahel >>> result = cumulative_incidence(120, 500) >>> print(result) Proportion: 0.2400 (0.2046-0.2793) """ return _proportion_ci_impl(cases, population_at_risk, method, confidence)
[docs] def attack_rate( cases: int, population: int, confidence: float = 0.95 ) -> ProportionResult: """ Calculate attack rate (cumulative incidence) with CI. Args: cases: Number of cases population: Population at risk confidence: Confidence level Returns: ProportionResult object """ return cumulative_incidence(cases, population, confidence=confidence)
[docs] def prevalence( cases: int, population: int, confidence: float = 0.95, method: CI_Method = CI_Method.WILSON, ) -> ProportionResult: """ Calculate point prevalence with confidence interval. Args: cases: Number of prevalent cases (existing cases at time T). population: Total population examined. confidence: Confidence level (default 0.95). method: CI method (default Wilson). Returns: ProportionResult Example: >>> # HTA survey, Burkina Faso STEPS 2013 >>> result = prevalence(1056, 4800) >>> print(result) Proportion: 0.2200 (0.2085-0.2319) """ return _proportion_ci_impl(cases, population, method, confidence)
# CONFIDENCE INTERVAL METHODS def _wald_ci( p: float, n: int, confidence: float, **kwargs ) -> Tuple[float, float]: """ Wald confidence interval for a proportion. Appropriate for large samples (n > 30) with p not near 0 or 1. """ if n == 0: return 0.0, 0.0 z = _z_score(confidence) se = np.sqrt(p * (1 - p) / n) margin = z * se ci_lower = max(0, p - margin) ci_upper = min(1, p + margin) return ci_lower, ci_upper def _wilson_ci( numerator: int, denominator: int, confidence: float, **kwargs ) -> Tuple[float, float]: """ Wilson score confidence interval. Recommended for all sample sizes, especially when p is near 0 or 1. """ if denominator == 0: return 0.0, 0.0 n = denominator x = numerator p = x / n z = _z_score(confidence) z2 = z**2 center = (x + z2/2) / (n + z2) margin = z * np.sqrt((p*(1-p) + z2/(4*n)) / n) / (1 + z2/n) ci_lower = max(0, center - margin) ci_upper = min(1, center + margin) return ci_lower, ci_upper def _agresti_coull_ci( numerator: int, denominator: int, confidence: float, **kwargs ) -> Tuple[float, float]: """ Agresti-Coull confidence interval (adjusted Wald). Good alternative to Wilson, simpler calculation. """ if denominator == 0: return 0.0, 0.0 n = denominator x = numerator z = _z_score(confidence) z2 = z**2 # Add z^2/2 successes and z^2/2 failures n_adj = n + z2 p_adj = (x + z2/2) / n_adj se_adj = np.sqrt(p_adj * (1 - p_adj) / n_adj) margin = z * se_adj ci_lower = max(0, p_adj - margin) ci_upper = min(1, p_adj + margin) return ci_lower, ci_upper def _jeffreys_ci( numerator: int, denominator: int, confidence: float, **kwargs ) -> Tuple[float, float]: """ Jeffreys Bayesian confidence interval. Good properties for all sample sizes. """ if denominator == 0: return 0.0, 0.0 from scipy import stats x = numerator n = denominator # Beta distribution parameters alpha = x + 0.5 beta = n - x + 0.5 ci_lower = stats.beta.ppf((1-confidence)/2, alpha, beta) ci_upper = stats.beta.ppf(1-(1-confidence)/2, alpha, beta) return ci_lower, ci_upper def _clopper_pearson_ci( numerator: int, denominator: int, confidence: float, **kwargs ) -> Tuple[float, float]: """ Clopper-Pearson exact binomial confidence interval. Conservative - guaranteed coverage at least (1-alpha). """ if denominator == 0: return 0.0, 0.0 from scipy import stats x = numerator n = denominator if x == 0: ci_lower = 0.0 ci_upper = 1 - (confidence/2)**(1/n) elif x == n: ci_lower = (confidence/2)**(1/n) ci_upper = 1.0 else: ci_lower = stats.beta.ppf((1-confidence)/2, x, n-x+1) ci_upper = stats.beta.ppf(1-(1-confidence)/2, x+1, n-x) return ci_lower, ci_upper def _z_score(confidence: float) -> float: """Get z-score for given confidence level.""" from scipy import stats alpha = 1 - confidence return stats.norm.ppf(1 - alpha/2) def _check_sample_size( n: int, p: Optional[float] = None, min_size: int = 5 ) -> bool: """ Check if sample size is adequate for normal approximation. Returns: True if n*p and n*(1-p) are both >= min_size """ if p is None: return n >= 30 # Rule of thumb for means return n * p >= min_size and n * (1 - p) >= min_size # ADDITIONAL DESCRIPTIVE FUNCTIONS
[docs] def median_ci( data: np.ndarray, confidence: float = 0.95, method: str = "exact" ) -> Dict[str, float]: """ Calculate median with confidence interval. Args: data: Array-like numeric data confidence: Confidence level method: 'exact' or 'normal_approximation' Returns: Dictionary with median and CI """ data = np.array(data) data = data[~np.isnan(data)] n = len(data) if n == 0: raise ValueError("Data array is empty") median = np.median(data) if method == "exact": # Binomial-based exact CI for median from scipy import stats # Order statistics indices r = int(np.floor((n/2) - stats.norm.ppf(1-(1-confidence)/2) * np.sqrt(n)/2)) s = int(np.ceil(1 + (n/2) + stats.norm.ppf(1-(1-confidence)/2) * np.sqrt(n)/2)) # Adjust for array indices (0-based) r = max(0, r - 1) s = min(n-1, s - 1) sorted_data = np.sort(data) ci_lower = sorted_data[r] ci_upper = sorted_data[s] else: # Normal approximation se = 1.253 * np.std(data, ddof=1) / np.sqrt(n) z = _z_score(confidence) margin = z * se ci_lower = median - margin ci_upper = median + margin return { "measure": "median", "median": median, "ci_lower": ci_lower, "ci_upper": ci_upper, "sample_size": n, "method": method }
[docs] def interquartile_range( data: np.ndarray, return_quartiles: bool = False ) -> Union[float, Dict[str, float]]: """ Calculate interquartile range (IQR). Args: data: Array-like numeric data return_quartiles: If True, returns Q1, Q3, and IQR Returns: IQR value or dictionary with quartiles """ data = np.array(data) data = data[~np.isnan(data)] q1 = np.percentile(data, 25) q3 = np.percentile(data, 75) iqr = q3 - q1 if return_quartiles: return { "q1": q1, "q3": q3, "iqr": iqr, "lower_fence": q1 - 1.5 * iqr, "upper_fence": q3 + 1.5 * iqr } return iqr
# MODULE EXPORTS __all__ = [ 'CI_Method', 'ProportionResult', 'MeanResult', 'IncidenceRateResult', 'proportion_ci', 'mean_ci', 'incidence_rate', 'cumulative_incidence', 'attack_rate', 'prevalence', 'median_ci', 'interquartile_range' ]