Commit dc5d347

Merge pull request #263 from DoubleML/s-add-conf-irm
Update `make_confounded_irm_data`
2 parents 0511f6b + 727df17 commit dc5d347

2 files changed (+136 −73 lines)

doubleml/datasets.py

Lines changed: 121 additions & 69 deletions
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+import warnings

 from scipy.linalg import toeplitz
 from scipy.optimize import minimize_scalar
@@ -895,11 +896,11 @@ def f_ps(w, xi):
         raise ValueError('Invalid return_type.')


-def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
+def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs):
     """
     Generates confounded data from an interactive regression model.

-    The data generating process is defined as follows (similar to the Monte Carlo simulation used
+    The data generating process is defined as follows (inspired by the Monte Carlo simulation used
     in Sant'Anna and Zhao (2020)).

     Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds
@@ -924,22 +925,30 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):

     .. math::

-        m(X, A) = P(D=1|X,A) = 0.5 + \\gamma_A \\cdot A
+        m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A
+
+    where
+
+    .. math::
+
+        p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))},
+
+        f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 -0.25 \\cdot Z_3 - 0.1 \\cdot Z_4).

     and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`.
     Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as

     .. math::

-        P(D=1|X) = 0.5.
+        P(D=1|X) = p(Z).

     Further, generate the outcome of interest :math:`Y` as

     .. math::

         Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon

-        g(Z) &= 210 + 27.4 \\cdot Z_1 +13.7 \\cdot (Z_2 + Z_3 + Z_4)
+        g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4)

     where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`.
     This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of
@@ -952,13 +961,13 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
         \\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))})
         \\cdot D (Z_5 + 1) + g(Z).

-    Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`.
-    Both are chosen to obtain the desired confounding of the outcome and Riesz Representer (in sample).
+    Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be
+    set via the parameters ``gamma_a`` and ``beta_a``.

-    The observed data is given as :math:`W = (Y, D, X)`.
+    The observed data is given as :math:`W = (Y, D, Z)`.
     Further, oracle values of the confounder :math:`A`, the transformed covariates :math:`Z`,
-    the potential outcomes of :math:`Y`, the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the
-    long and short forms of the main regression and the propensity score
+    the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score, and
+    in-sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE)
     are returned in a dictionary.

     Parameters
@@ -968,13 +977,16 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
         Default is ``500``.
     theta : float or int
         Average treatment effect.
-        Default is ``5.0``.
-    cf_y : float
-        Percentage of the residual variation of the outcome explained by latent/confounding variable.
-        Default is ``0.04``.
-    cf_d : float
-        Percentage gains in the variation of the Riesz Representer generated by latent/confounding variable.
-        Default is ``0.04``.
+        Default is ``0.0``.
+    gamma_a : float
+        Coefficient of the unobserved confounder in the propensity score.
+        Default is ``0.127``.
+    beta_a : float
+        Coefficient of the unobserved confounder in the outcome regression.
+        Default is ``0.58``.
+    linear : bool
+        If ``True``, Z is set to X, such that the underlying (short) models are linear/logistic.
+        Default is ``False``.

     Returns
     -------
@@ -988,82 +1000,122 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
     doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_.
     """
     c = 0.0  # the confounding strength is only valid for c=0
-    dim_x = 5
+    xi = 0.75
+    dim_x = kwargs.get('dim_x', 5)
+    trimming_threshold = kwargs.get('trimming_threshold', 0.01)
+    var_eps_y = kwargs.get('var_eps_y', 1.0)
+
+    # Specification of main regression function
+    def f_reg(w):
+        res = 2.5 + 0.74*w[:, 0] + 0.25 * w[:, 1] + 0.137*(w[:, 2] + w[:, 3])
+        return res

+    # Specification of prop score function
+    def f_ps(w, xi):
+        res = xi*(-w[:, 0] + 0.1*w[:, 1] - 0.25*w[:, 2] - 0.1*w[:, 3])
+        return res
     # observed covariates
     cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)])
     x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ])
-
     z_tilde_1 = np.exp(0.5*x[:, 0])
     z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0]))
     z_tilde_3 = (0.6 + x[:, 0] * x[:, 2]/25)**3
     z_tilde_4 = (20 + x[:, 1] + x[:, 3])**2
-
-    z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:]))
+    z_tilde_5 = x[:, 4]
+    z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5))
     z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0)
-
     # error terms and unobserved confounder
-    var_eps_y = 5
     eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs)
-
     # unobserved confounder
     a_bounds = (-1, 1)
     a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs)
+    var_a = np.square(a_bounds[1] - a_bounds[0]) / 12

-    # get the required impact of the confounder on the propensity score
-    possible_coefs = np.arange(0.001, 0.4999, 0.001)
-    gamma_a = possible_coefs[(np.arctanh(2*possible_coefs) / (2*possible_coefs)) - 1 - cf_d/(1 - cf_d) >= 0][0]
-
-    # compute short and long form of riesz representer
-    m_long = 0.5 + gamma_a*a
-    m_short = 0.5 * np.ones_like(m_long)
+    # Choose the features used in the models
+    if linear:
+        features_ps = x
+        features_reg = x
+    else:
+        features_ps = z
+        features_reg = z

+    p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi)))
+    # compute short and long form of propensity score
+    m_long = p + gamma_a*a
+    m_short = p
+    # check propensity score bounds
+    if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold):
+        m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold)
+        m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold)
+        warnings.warn(f'Propensity score is close to 0 or 1. '
+                      f'Trimming is at {trimming_threshold} and {1.0-trimming_threshold} is applied')
+    # generate treatment based on long form
     u = np.random.uniform(low=0, high=1, size=n_obs)
     d = 1.0 * (m_long >= u)
-
-    # short and long version of g
-    g_partial_reg = 210 + 27.4*z[:, 0] + 13.7*(z[:, 1] + z[:, 2] + z[:, 3])
-
-    dx = d * (x[:, 4] + 1)
-    d1x = x[:, 4] + 1
-    var_dx = np.var(dx)
-    cov_adx = np.cov(a, dx)[0, 1]
-
-    def f_g(beta_a):
-        g_diff = beta_a * (a - cov_adx / var_dx)
-        y_diff = eps_y + g_diff
-        return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y)
-    beta_a = minimize_scalar(f_g).x
-
+    # add treatment heterogeneity
+    d1x = z[:, 4] + 1
+    var_dx = np.var(d*(d1x))
+    cov_adx = gamma_a * var_a
+    # Outcome regression
+    g_partial_reg = f_reg(features_reg)
+    # short model
     g_short_d0 = g_partial_reg
     g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg
     g_short = d * g_short_d1 + (1.0-d) * g_short_d0
-
+    # long model
     g_long_d0 = g_partial_reg + beta_a * a
     g_long_d1 = theta * d1x + g_partial_reg + beta_a * a
     g_long = d * g_long_d1 + (1.0-d) * g_long_d0
-
-    y0 = g_long_d0 + eps_y
-    y1 = g_long_d1 + eps_y
-
-    y = d * y1 + (1.0-d) * y0
-
-    oracle_values = {'g_long': g_long,
-                     'g_short': g_short,
-                     'm_long': m_long,
-                     'm_short': m_short,
-                     'gamma_a': gamma_a,
-                     'beta_a': beta_a,
-                     'a': a,
-                     'y0': y0,
-                     'y1': y1,
-                     'z': z}
-
-    res_dict = {'x': x,
-                'y': y,
-                'd': d,
-                'oracle_values': oracle_values}
-
+    # Potential outcomes
+    y_0 = g_long_d0 + eps_y
+    y_1 = g_long_d1 + eps_y
+    # Realized outcome
+    y = d * y_1 + (1.0-d) * y_0
+    # In-sample values for confounding strength
+    explained_residual_variance = np.square(g_long - g_short)
+    residual_variance = np.square(y - g_short)
+    cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance)
+    # compute the Riesz representation
+    treated_weight = d / np.mean(d)
+    untreated_weight = (1.0 - d) / np.mean(d)
+    # Odds ratios
+    propensity_ratio_long = m_long / (1.0 - m_long)
+    rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long)
+    rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long)
+    propensity_ratio_short = m_short / (1.0 - m_short)
+    rr_short_ate = d / m_short - (1.0 - d) / (1.0 - m_short)
+    rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short)
+    cf_d_ate = (np.mean(1/(m_long * (1 - m_long))) - np.mean(1/(m_short * (1 - m_short)))) / np.mean(1/(m_long * (1 - m_long)))
+    cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long)
+    if (beta_a == 0) | (gamma_a == 0):
+        rho_ate = 0.0
+        rho_atte = 0.0
+    else:
+        rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1]
+        rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1]
+    oracle_values = {
+        'g_long': g_long,
+        'g_short': g_short,
+        'm_long': m_long,
+        'm_short': m_short,
+        'gamma_a': gamma_a,
+        'beta_a': beta_a,
+        'a': a,
+        'y_0': y_0,
+        'y_1': y_1,
+        'z': z,
+        'cf_y': cf_y,
+        'cf_d_ate': cf_d_ate,
+        'cf_d_atte': cf_d_atte,
+        'rho_ate': rho_ate,
+        'rho_atte': rho_atte,
+    }
+    res_dict = {
+        'x': x,
+        'y': y,
+        'd': d,
+        'oracle_values': oracle_values
+    }
     return res_dict
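One step in the new code that may not be obvious is the closed form `cov_adx = gamma_a * var_a`, which replaces the empirical `np.cov(a, dx)[0, 1]` of the old version. A short sketch of why this holds under the DGP above (ignoring trimming), using that :math:`A \sim \mathcal{U}[-1, 1]` is independent of :math:`Z`, :math:`\mathbb{E}[A] = 0`, and :math:`Z_5` is standardized so :math:`\mathbb{E}[Z_5 + 1] = 1`:

```latex
\begin{aligned}
\mathrm{Cov}(A, D(Z_5 + 1))
  &= \mathbb{E}\big[A \, (Z_5 + 1) \, \mathbb{E}[D \mid A, Z]\big]
   = \mathbb{E}\big[A \, (Z_5 + 1) \, (p(Z) + \gamma_A A)\big] \\
  &= \underbrace{\mathbb{E}[A]}_{=0} \, \mathbb{E}\big[(Z_5 + 1) \, p(Z)\big]
   + \gamma_A \, \mathbb{E}[A^2] \, \underbrace{\mathbb{E}[Z_5 + 1]}_{=1}
   = \gamma_A \, \mathrm{Var}(A)
\end{aligned}
```

with :math:`\mathrm{Var}(A) = (1 - (-1))^2 / 12 = 1/3`, matching `var_a = np.square(a_bounds[1] - a_bounds[0]) / 12` in the code.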
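A minimal usage sketch of the updated generator, based only on the signature and dictionary keys shown in the diff above (the import path assumes the file location `doubleml/datasets.py`):

```python
import numpy as np

from doubleml.datasets import make_confounded_irm_data

np.random.seed(3141)
# gamma_a / beta_a now steer the confounding strength directly,
# instead of being solved for from target values cf_d / cf_y
res = make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False)

x, y, d = res['x'], res['y'], res['d']
oracle = res['oracle_values']
# the in-sample confounding strength is now reported rather than targeted
print(oracle['cf_y'], oracle['cf_d_ate'], oracle['cf_d_atte'])
print(oracle['rho_ate'], oracle['rho_atte'])
```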
doubleml/tests/test_datasets.py

Lines changed: 15 additions & 4 deletions
@@ -186,10 +186,16 @@ def test_make_did_SZ2020_return_types(cross_sectional, dgp_type):
     _ = make_did_SZ2020(n_obs=100, dgp_type="5", cross_sectional_data=cross_sectional, return_type='matrix')


+@pytest.fixture(scope='function',
+                params=[True, False])
+def linear(request):
+    return request.param
+
+
 @pytest.mark.ci
-def test_make_confounded_irm_data_return_types():
+def test_make_confounded_irm_data_return_types(linear):
     np.random.seed(3141)
-    res = make_confounded_irm_data()
+    res = make_confounded_irm_data(linear=linear)
     assert isinstance(res, dict)
     assert isinstance(res['x'], np.ndarray)
     assert isinstance(res['y'], np.ndarray)
@@ -203,9 +209,14 @@ def test_make_confounded_irm_data_return_types():
     assert isinstance(res['oracle_values']['gamma_a'], float)
     assert isinstance(res['oracle_values']['beta_a'], float)
     assert isinstance(res['oracle_values']['a'], np.ndarray)
-    assert isinstance(res['oracle_values']['y0'], np.ndarray)
-    assert isinstance(res['oracle_values']['y1'], np.ndarray)
+    assert isinstance(res['oracle_values']['y_0'], np.ndarray)
+    assert isinstance(res['oracle_values']['y_1'], np.ndarray)
     assert isinstance(res['oracle_values']['z'], np.ndarray)
+    assert isinstance(res['oracle_values']['cf_y'], float)
+    assert isinstance(res['oracle_values']['cf_d_ate'], float)
+    assert isinstance(res['oracle_values']['cf_d_atte'], float)
+    assert isinstance(res['oracle_values']['rho_ate'], float)
+    assert isinstance(res['oracle_values']['rho_atte'], float)


 @pytest.mark.ci
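Beyond the type checks above, the reported in-sample `cf_y` could also be verified against its definition from the long and short regressions. This is a hypothetical addition for illustration, not part of this commit:

```python
import numpy as np
from doubleml.datasets import make_confounded_irm_data

np.random.seed(3141)
res = make_confounded_irm_data(linear=False)
ov = res['oracle_values']

# cf_y is defined in sample as E[(g_long - g_short)^2] / E[(y - g_short)^2]
cf_y_manual = (np.mean(np.square(ov['g_long'] - ov['g_short']))
               / np.mean(np.square(res['y'] - ov['g_short'])))
assert np.isclose(ov['cf_y'], cf_y_manual)
```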
