# Source code for causal_falsify.utils.simulate_data

import numpy as np
import pandas as pd


def create_polynomial_representation(X, degree):
    """
    Generate per-feature polynomial features of X up to a given degree.

    Every input column is expanded into its element-wise powers
    ``1, 2, ..., degree``. No cross-feature interaction terms and no
    constant (bias) column are produced. Output columns are grouped by
    input feature: ``x0, x0**2, ..., x0**degree, x1, x1**2, ...``.

    Parameters
    ----------
    X : np.ndarray
        Input array of shape (n_samples, n_features).
    degree : int
        Highest power to generate; must be >= 1. NumPy integer scalars
        (e.g. ``np.int64``) are accepted as well as Python ints.

    Returns
    -------
    np.ndarray
        Array of shape (n_samples, n_features * degree) containing the
        polynomial features.

    Raises
    ------
    TypeError
        If X is not a NumPy array or degree is not an integer.
    ValueError
        If X is not 2D, has no samples or no features, or degree < 1.
    """
    # Validate input types and shapes
    if not isinstance(X, np.ndarray):
        raise TypeError("X must be a NumPy array.")
    if X.ndim != 2:
        raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
    # Accept NumPy integer scalars too: degrees taken from np.arange or
    # array indexing are np.integer, which is not a subclass of int.
    if not isinstance(degree, (int, np.integer)):
        raise TypeError("degree must be an integer.")
    if degree < 1:
        raise ValueError("degree must be a positive integer (>= 1).")
    degree = int(degree)

    n_samples, n_features = X.shape
    if n_samples == 0 or n_features == 0:
        raise ValueError("X must have at least one sample and one feature.")

    # Feature-major ordering: all powers of column 0, then column 1, ...
    powers = range(1, degree + 1)
    return np.column_stack(
        [X[:, feature_idx] ** d for feature_idx in range(n_features) for d in powers]
    )
def simulate_data(
    n_samples: int,
    degree: int = 1,
    conf_strength: float = 1.0,
    transportability_violation: float = 0.0,
    n_envs: int = 50,
    n_observed_confounders: int = 5,
    seed: int = None,
) -> pd.DataFrame:
    """
    Simulate synthetic multi-environment data for causal inference experiments.

    For each environment, observed confounders X and a scalar unobserved
    confounder U are drawn from Gaussians with environment-specific means.
    Treatment A and outcome Y are then built as linear functions of the
    (optionally polynomial) representation of X, of ``|U|``, and of
    Gaussian noise with standard deviation 0.5.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate per environment.
    degree : int, optional
        Degree of the polynomial transformation applied to the observed
        confounders (default is 1, i.e. the confounders are used as-is).
    conf_strength : float, optional
        Coefficient of ``|U|`` in the outcome equation (default is 1.0).
        In the treatment equation ``|U|`` enters with coefficient 1.0
        whenever ``conf_strength`` is non-zero, and 0.0 otherwise.
    transportability_violation : float, optional
        Multiplier on the environment-specific outcome intercept
        (default is 0.0, i.e. no environment-specific intercept in Y).
    n_envs : int, optional
        Number of distinct environments to simulate (default is 50).
        Must be at least 1.
    n_observed_confounders : int, optional
        Number of observed confounders/features (default is 5).
    seed : int, optional
        Seed for the ``np.random.RandomState`` used for every draw
        (default is None).

    Returns
    -------
    pd.DataFrame
        One row per sample, ``n_samples * n_envs`` rows in total, with
        columns in this order:
        - 'A': Treatment variable
        - 'Y': Outcome variable
        - 'X_0', ..., 'X_{n_observed_confounders-1}': Observed confounders
          (untransformed, even when ``degree > 1``)
        - 'S': Environment index (0 .. n_envs - 1)

    Raises
    ------
    ValueError
        If ``n_envs`` is smaller than 1.

    Notes
    -----
    - X -> A coefficients are random signs (+1/-1) per feature and
      environment; X -> Y coefficients and the A -> Y effect are all 1.
    - The confounder covariance has 2 on the diagonal and 0.1 elsewhere,
      divided by ``sqrt(n_observed_confounders)``.
    - U is drawn with an environment-specific mean and standard deviation 2.
    """
    # Fail early with a clear message; otherwise pd.concat([]) below raises
    # an opaque "No objects to concatenate".
    if n_envs < 1:
        raise ValueError("n_envs must be a positive integer (>= 1).")

    rng = np.random.RandomState(seed)
    covar_list = [f"X_{i}" for i in range(n_observed_confounders)]

    x_transform = (
        (lambda x: x)
        if degree == 1
        else (lambda x: create_polynomial_representation(x, degree))
    )

    # Probe draw used only to learn the width of the transformed features.
    # It consumes RNG state, so it must stay to keep seeded output stable.
    test_vector = rng.multivariate_normal(
        np.zeros(n_observed_confounders), np.eye(n_observed_confounders), size=(1)
    )
    feature_dim = x_transform(test_vector).shape[1]

    # Per-environment structural parameters (one column per environment).
    x_to_a_coef = rng.choice([-1.0, 1.0], size=(feature_dim, n_envs))
    x_to_y_coef = np.ones((feature_dim, n_envs), dtype=float)
    a_to_y_effect = np.ones((1, n_envs), dtype=float)
    intercept_a = rng.normal(0, 1.0, size=(1, n_envs))
    intercept_y = rng.normal(0, 1.0, size=(1, n_envs))

    # Per-environment means of observed and unobserved confounders.
    mu_X = rng.normal(0, 1.0, size=(n_envs, n_observed_confounders))
    mu_U = rng.normal(0, 1.0, size=(n_envs, 1))

    X_cov = np.full((n_observed_confounders, n_observed_confounders), 0.1)
    np.fill_diagonal(X_cov, 2)
    sigma_U = 2.0

    # Loop invariants, computed once.
    scaled_X_cov = X_cov / np.sqrt(n_observed_confounders)
    treatment_confounding = 1.0 if conf_strength != 0.0 else 0.0

    all_data = []
    for i in range(n_envs):
        # Draw order (X, U, treatment noise, outcome noise) is part of the
        # seeded behaviour and must not be reordered.
        X = rng.multivariate_normal(mu_X[i], scaled_X_cov, size=n_samples)
        X_repr = x_transform(X)
        U = rng.normal(mu_U[i], sigma_U, size=(n_samples, 1))
        abs_U = np.sum(np.abs(U), axis=1).reshape(-1, 1)

        A = (
            intercept_a[:, i]
            + (X_repr @ x_to_a_coef[:, i]).reshape(-1, 1)
            + treatment_confounding * abs_U
            + rng.normal(0, 0.5, size=(n_samples, 1))  # Treatment noise
        )

        Y = (
            transportability_violation * intercept_y[:, i]
            + (X_repr @ x_to_y_coef[:, i]).reshape(-1, 1)
            + conf_strength * abs_U
            + (a_to_y_effect[:, i] * A).reshape(-1, 1)
            + rng.normal(0, 0.5, size=(n_samples, 1))  # Outcome noise
        )

        df_env = pd.DataFrame(
            np.concatenate([A, Y, X], axis=1),
            columns=["A", "Y"] + covar_list,
        )
        df_env["S"] = i
        all_data.append(df_env)

    return pd.concat(all_data, ignore_index=True)