Source code for episia.data.types

"""
This module provides functions for optimizing data types to reduce
memory usage while maintaining data integrity for epidemiological analysis.
"""

import pandas as pd
import numpy as np
from typing import Dict
import warnings


def optimize_dataframe_types(
    df: pd.DataFrame,
    categorical_threshold: float = 0.5,
    downcast_integers: bool = True,
    downcast_floats: bool = True
) -> pd.DataFrame:
    """
    Return a copy of a DataFrame whose columns use leaner dtypes.

    Every column is passed through ``optimize_column_type``. The input
    frame is never modified. When the deep memory footprint shrinks by more
    than 10%, a warning reporting the before/after sizes is emitted.

    Args:
        df: Input DataFrame
        categorical_threshold: Maximum unique ratio for categorical conversion
        downcast_integers: Downcast integer columns
        downcast_floats: Downcast float columns

    Returns:
        Optimized DataFrame
    """
    result = df.copy()
    bytes_before = result.memory_usage(deep=True).sum()

    column_options = dict(
        categorical_threshold=categorical_threshold,
        downcast_integers=downcast_integers,
        downcast_floats=downcast_floats,
    )
    for name in result.columns:
        result[name] = optimize_column_type(result[name], **column_options)

    bytes_after = result.memory_usage(deep=True).sum()

    # Guard the percentage against a zero-byte starting footprint.
    if bytes_before > 0:
        reduction = (bytes_before - bytes_after) / bytes_before * 100
    else:
        reduction = 0

    # Only report reductions large enough to matter.
    if reduction > 10:
        warnings.warn(
            f"Memory reduced by {reduction:.1f}% "
            f"({bytes_before/1e6:.1f}MB -> {bytes_after/1e6:.1f}MB)"
        )

    return result
def optimize_column_type(
    series: pd.Series,
    categorical_threshold: float = 0.5,
    downcast_integers: bool = True,
    downcast_floats: bool = True
) -> pd.Series:
    """
    Optimize a single column's data type.

    Decision order:
      1. A column with no non-missing values (including an empty one) is
         returned as ``category``.
      2. A column that is already categorical is returned untouched.
      3. A column whose unique ratio is below ``categorical_threshold`` and
         that has between 2 and 999 distinct values becomes ``category``.
         NOTE: this applies to every dtype, numeric columns included.
      4. Otherwise the column is handed to the numeric, datetime or object
         optimizer matching its dtype; any other dtype is returned as is.

    Args:
        series: Input Series
        categorical_threshold: Maximum unique ratio for categorical
        downcast_integers: Downcast integer columns
        downcast_floats: Downcast float columns

    Returns:
        Optimized Series
    """
    # Nothing but missing values: the smallest representation is an empty
    # categorical.
    if series.isna().all():
        return series.astype('category')

    # Already optimized
    if isinstance(series.dtype, pd.CategoricalDtype):
        return series

    n_total = len(series)
    try:
        n_unique = series.nunique()
    except TypeError:
        # Unhashable cell values (lists, dicts, ...) cannot be counted and
        # cannot become categories, so skip the categorical step entirely.
        n_unique = None

    if n_unique is not None:
        unique_ratio = n_unique / n_total if n_total > 0 else 1.0
        # Low cardinality, with a reasonable cap on the number of categories
        if unique_ratio < categorical_threshold and 1 < n_unique < 1000:
            try:
                return series.astype('category')
            except Exception:
                # Best effort: fall through to the dtype-specific optimizers.
                pass

    if pd.api.types.is_numeric_dtype(series):
        return optimize_numeric_type(
            series,
            downcast_integers=downcast_integers,
            downcast_floats=downcast_floats
        )
    if pd.api.types.is_datetime64_any_dtype(series):
        return optimize_datetime_type(series)
    if series.dtype == 'object':
        return optimize_object_type(series)

    return series
def _downcast_integer(series: pd.Series) -> pd.Series:
    """Cast an integer Series to the narrowest integer dtype covering its range."""
    low, high = series.min(), series.max()
    if pd.isna(low) or pd.isna(high):
        # Empty or all-missing: there is no range to fit.
        return series

    # Each candidate pairs a NumPy type with the matching pandas nullable
    # dtype name, ordered from narrowest to widest.
    if low >= 0:
        candidates = (
            (np.uint8, "UInt8"),
            (np.uint16, "UInt16"),
            (np.uint32, "UInt32"),
        )
    else:
        candidates = (
            (np.int8, "Int8"),
            (np.int16, "Int16"),
            (np.int32, "Int32"),
        )

    # Missing values can only occur in nullable integer columns. A plain
    # NumPy integer cannot hold them, so stay nullable in that case.
    has_missing = bool(series.isna().any())

    for numpy_type, nullable_name in candidates:
        bounds = np.iinfo(numpy_type)
        if bounds.min <= low and high <= bounds.max:
            return series.astype(nullable_name if has_missing else numpy_type)

    # Range needs the full 64 bits: leave as is.
    return series


def optimize_numeric_type(
    series: pd.Series,
    downcast_integers: bool = True,
    downcast_floats: bool = True
) -> pd.Series:
    """
    Optimize numeric column type.

    Integer columns are cast to the smallest integer dtype holding their
    min/max (unsigned when all values are non-negative). Nullable integer
    columns that contain missing values keep a nullable dtype. Other numeric
    columns are cast to float32 when the result is ``np.allclose`` to the
    original. Boolean columns and floats of 32 bits or fewer are returned
    unchanged, because converting them could only increase memory use.

    Args:
        series: Numeric Series
        downcast_integers: Downcast integer columns
        downcast_floats: Downcast float columns

    Returns:
        Optimized numeric Series
    """
    dtype = series.dtype

    # pandas reports bool as numeric; it is already one byte per value.
    if pd.api.types.is_bool_dtype(dtype):
        return series

    if pd.api.types.is_integer_dtype(dtype):
        return _downcast_integer(series) if downcast_integers else series

    if not downcast_floats:
        return series

    # Never widen: float16/float32 are already at or below the target size.
    if pd.api.types.is_float_dtype(dtype) and getattr(dtype, "itemsize", 8) <= 4:
        return series

    try:
        narrowed = series.astype(np.float32)
        # np.allclose uses its default tolerances here, so this accepts
        # float32 rounding but rejects overflow to inf.
        if np.allclose(narrowed, series, equal_nan=True):
            return narrowed
    except Exception:
        # Best effort: any conversion problem keeps the original column.
        pass

    return series
def optimize_datetime_type(series: pd.Series) -> pd.Series:
    """
    Optimize datetime column type.

    A ``datetime64[ns]`` column is returned unchanged. Anything else is
    passed through ``pd.to_datetime``; if that conversion fails the input
    is returned unchanged.

    Args:
        series: Datetime Series

    Returns:
        Optimized datetime Series
    """
    # Already the canonical datetime dtype
    if series.dtype == 'datetime64[ns]':
        return series

    try:
        return pd.to_datetime(series)
    except Exception:
        # Best effort: keep the original values when parsing fails, but let
        # KeyboardInterrupt/SystemExit propagate.
        return series
def optimize_object_type(series: pd.Series) -> pd.Series:
    """
    Optimize object (string) column type.

    Steps, in order:
      1. An empty Series is returned unchanged.
      2. If every value is a ``str`` and the column has low cardinality
         (unique ratio below 0.5 and fewer than 1000 distinct values) it
         becomes ``category``.
      3. Numeric conversion is attempted, then datetime conversion. A
         conversion is accepted only when every non-missing value converts,
         so the optimization never replaces real data with NaN/NaT.
      4. Otherwise the Series is returned unchanged.

    Args:
        series: Object Series

    Returns:
        Optimized Series
    """
    n_total = len(series)
    if n_total == 0:
        # Nothing to inspect; also avoids dividing by zero below.
        return series

    # Homogeneous strings with few distinct values -> categorical
    if series.apply(lambda value: isinstance(value, str)).all():
        n_unique = series.nunique()
        if n_unique / n_total < 0.5 and n_unique < 1000:
            return series.astype('category')

    n_present = int(series.notna().sum())
    if n_present == 0:
        return series

    # errors='coerce' turns unparseable values into NaN/NaT, so matching
    # the non-missing count proves the conversion lost nothing.
    try:
        numeric_series = pd.to_numeric(series, errors='coerce')
    except Exception:
        numeric_series = None
    if numeric_series is not None and int(numeric_series.notna().sum()) == n_present:
        return optimize_numeric_type(numeric_series)

    try:
        datetime_series = pd.to_datetime(series, errors='coerce')
    except Exception:
        datetime_series = None
    if datetime_series is not None and int(datetime_series.notna().sum()) == n_present:
        return datetime_series

    # Return as is if no lossless optimization is possible
    return series
def get_type_recommendations(df: pd.DataFrame) -> pd.DataFrame:
    """
    Report, per column, what dtype optimization would achieve.

    Each column is run through ``optimize_column_type`` with default
    settings and its deep memory usage is compared before and after. The
    input frame is not modified.

    Args:
        df: Input DataFrame

    Returns:
        DataFrame with one row per column and the fields ``column``,
        ``current_type``, ``recommended_type``, ``current_memory_kb``,
        ``optimized_memory_kb``, ``savings_kb`` and ``savings_percent``
    """
    rows = []

    for name in df.columns:
        original = df[name]
        candidate = optimize_column_type(original)

        bytes_now = original.memory_usage(deep=True)
        bytes_after = candidate.memory_usage(deep=True)
        bytes_saved = bytes_now - bytes_after

        # Avoid dividing by zero for a column with no measurable footprint.
        if bytes_now > 0:
            percent_saved = bytes_saved / bytes_now * 100
        else:
            percent_saved = 0

        rows.append({
            'column': name,
            'current_type': str(original.dtype),
            'recommended_type': str(candidate.dtype),
            'current_memory_kb': bytes_now / 1024,
            'optimized_memory_kb': bytes_after / 1024,
            'savings_kb': bytes_saved / 1024,
            'savings_percent': percent_saved
        })

    return pd.DataFrame(rows)
def convert_to_epidemiological_types(
    df: pd.DataFrame,
    column_types: Dict[str, str]
) -> pd.DataFrame:
    """
    Apply caller-specified epidemiological types to selected columns.

    Works on a copy of ``df``. Entries naming a column that is not in the
    frame, or a type other than the four listed below, are skipped.

    Args:
        df: Input DataFrame
        column_types: Dictionary of {column_name: type}
            Types: 'binary', 'categorical', 'continuous', 'date'

    Returns:
        Converted DataFrame
    """
    result = df.copy()

    for name, kind in column_types.items():
        # Requested column is absent: nothing to convert.
        if name not in result.columns:
            continue

        column = result[name]
        if kind == 'binary':
            result[name] = convert_to_binary(column)
        elif kind == 'categorical':
            result[name] = convert_to_categorical(column)
        elif kind == 'continuous':
            result[name] = convert_to_continuous(column)
        elif kind == 'date':
            result[name] = convert_to_date(column)

    return result
def _as_int8(values: pd.Series) -> pd.Series:
    """Cast 0/1 values to int8, using nullable ``Int8`` when values are missing."""
    if values.isna().any():
        # NumPy int8 cannot represent missing values; the nullable dtype can.
        return values.astype("Int8")
    return values.astype(np.int8)


def convert_to_binary(series: pd.Series) -> pd.Series:
    """
    Convert series to binary (0/1).

    Handled representations, in order:
      * ``bool`` dtype -> int8.
      * Values already in {0, 1} (ignoring missing) -> int8.
      * Common text labels, case-insensitive and ignoring surrounding
        whitespace: true/false, yes/no, y/n, 1/0, positive/negative,
        case/control.
      * Anything else: the indicator column produced by
        ``pd.get_dummies(series, drop_first=True)``, i.e. the first level in
        sorted order is the 0 reference. With more than two levels only the
        second level is flagged as 1. A column with a single level is
        returned as all zeros.

    Missing values are preserved: the result uses the nullable ``Int8``
    dtype when the input has missing entries, plain ``int8`` otherwise.
    Missing values in the get_dummies fallback become 0.

    Args:
        series: Input Series

    Returns:
        Series of 0/1 values

    Raises:
        ValueError: If some values match a known binary label while other
            non-missing values do not.
    """
    if series.dtype == 'bool':
        return series.astype(np.int8)

    # Check if already binary
    unique_vals = set(series.dropna().unique())
    if unique_vals.issubset({0, 1, 0.0, 1.0}):
        return _as_int8(series)

    # Try to convert from string representations
    binary_map = {
        'true': 1, 'false': 0,
        'yes': 1, 'no': 0,
        'y': 1, 'n': 0,
        '1': 1, '0': 0,
        'positive': 1, 'negative': 0,
        'case': 1, 'control': 0
    }
    missing = series.isna()
    text = series.astype(str).str.strip().str.lower()
    mapped = text.map(binary_map)

    if not mapped.isna().all():
        # Only original missing entries may stay unmapped. A real value with
        # no known label must not be turned silently into a missing value.
        unrecognized = mapped.isna() & ~missing
        if unrecognized.any():
            examples = sorted(set(text[unrecognized]))[:5]
            raise ValueError(
                f"Cannot convert to binary: unrecognized values {examples}"
            )
        return _as_int8(mapped)

    # Fallback: indicator of the second level, the first being the reference
    dummies = pd.get_dummies(series, drop_first=True)
    if dummies.shape[1] == 0:
        # Single level: every row is the reference level.
        return pd.Series(0, index=series.index, dtype=np.int8, name=series.name)
    return dummies.iloc[:, 0].astype(np.int8)
def convert_to_categorical(series: pd.Series, max_categories: int = 50) -> pd.Series:
    """
    Convert series to categorical type.

    A Series that is already categorical is returned as is. A Series with
    more than ``max_categories`` distinct values is returned unchanged after
    a warning.

    Args:
        series: Input Series
        max_categories: Largest number of distinct values still converted

    Returns:
        Categorical Series, or the input when it is not converted
    """
    already_categorical = series.dtype.name == 'category'
    if already_categorical:
        return series

    distinct = series.nunique()
    if distinct <= max_categories:
        return series.astype('category')

    warnings.warn(
        f"Column has {distinct} unique values, "
        f"considering as continuous instead of categorical"
    )
    return series
def convert_to_continuous(series: pd.Series) -> pd.Series:
    """
    Convert series to continuous numeric type.

    Numeric input goes straight to ``optimize_numeric_type``. Other input is
    parsed with ``pd.to_numeric(errors='coerce')``, so values that cannot be
    parsed become NaN. If nothing parses, or the conversion fails, the input
    is returned unchanged.

    Args:
        series: Input Series

    Returns:
        Numeric Series, or the input when no value could be converted
    """
    if pd.api.types.is_numeric_dtype(series):
        return optimize_numeric_type(series)

    try:
        numeric = pd.to_numeric(series, errors='coerce')
        if not numeric.isna().all():
            return optimize_numeric_type(numeric)
    except Exception:
        # Best effort: keep the original values on failure, but let
        # KeyboardInterrupt/SystemExit propagate.
        pass

    return series
def convert_to_date(series: pd.Series) -> pd.Series:
    """
    Convert series to datetime type.

    A Series that already has a datetime64 dtype is returned unchanged.
    Otherwise it is parsed with ``pd.to_datetime(errors='coerce')``, so
    values that cannot be parsed become NaT. If the conversion itself fails
    the input is returned unchanged.

    Args:
        series: Input Series

    Returns:
        Datetime Series, or the input when conversion fails
    """
    if pd.api.types.is_datetime64_any_dtype(series):
        return series

    try:
        return pd.to_datetime(series, errors='coerce')
    except Exception:
        # Best effort: keep the original values on failure, but let
        # KeyboardInterrupt/SystemExit propagate.
        return series
def detect_column_types(df: pd.DataFrame) -> Dict[str, str]:
    """
    Detect epidemiological column types automatically.

    Rules, first match wins:
      * ``'date'``: the column has a datetime64 dtype, or it is a
        text/object column where at least one value starts with a
        ``YYYY-MM-DD`` or ``YYYY/MM/DD`` pattern.
      * ``'binary'``: exactly two distinct non-missing values.
      * ``'categorical'``: fewer than 20 distinct values and fewer than 30%
        of the row count.
      * ``'continuous'``: any remaining numeric column.
      * ``'unknown'``: everything else.

    Args:
        df: Input DataFrame

    Returns:
        Dictionary of {column_name: detected_type}
    """
    date_pattern = r'\d{4}[-/]\d{1,2}[-/]\d{1,2}'
    type_map = {}

    for col in df.columns:
        series = df[col]
        dtype = series.dtype

        # Columns already parsed as datetimes are dates by definition.
        if pd.api.types.is_datetime64_any_dtype(dtype):
            type_map[col] = 'date'
            continue

        # Text may be stored as object or as a dedicated string dtype.
        is_text = (pd.api.types.is_object_dtype(dtype)
                   or pd.api.types.is_string_dtype(dtype))
        if is_text and series.dropna().astype(str).str.match(date_pattern).any():
            type_map[col] = 'date'
            continue

        # Count distinct values once; several rules below need it.
        n_unique = series.nunique()

        if n_unique == 2:
            type_map[col] = 'binary'
        elif n_unique < 20 and n_unique < len(series) * 0.3:
            type_map[col] = 'categorical'
        elif pd.api.types.is_numeric_dtype(series):
            type_map[col] = 'continuous'
        else:
            type_map[col] = 'unknown'

    return type_map