import pandas as pd
import numpy as np
+import warnings

from scipy.linalg import toeplitz
from scipy.optimize import minimize_scalar
@@ -895,11 +896,11 @@ def f_ps(w, xi):
        raise ValueError('Invalid return_type.')


-def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
+def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs):
    """
    Generates confounded data from an interactive regression model.

-    The data generating process is defined as follows (similar to the Monte Carlo simulation used
+    The data generating process is defined as follows (inspired by the Monte Carlo simulation used
    in Sant'Anna and Zhao (2020)).

    Let :math:`X = (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds
@@ -924,22 +925,30 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
    .. math::

-        m(X, A) = P(D=1|X,A) = 0.5 + \\gamma_A \\cdot A
+        m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A
+
+    where
+
+    .. math::
+
+        p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))},
+
+        f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4).

    and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`.
    Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as

    .. math::

-        P(D=1|X) = 0.5.
+        P(D=1|X) = p(Z).

    Further, generate the outcome of interest :math:`Y` as

    .. math::

        Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon

-        g(Z) &= 210 + 27.4 \\cdot Z_1 + 13.7 \\cdot (Z_2 + Z_3 + Z_4)
+        g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4)

    where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`.
    This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of
@@ -952,13 +961,13 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
        \\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))})
        \\cdot D (Z_5 + 1) + g(Z).

-    Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`.
-    Both are chosen to obtain the desired confounding of the outcome and Riesz Representer (in sample).
+    Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be
+    set via the parameters ``gamma_a`` and ``beta_a``.

-    The observed data is given as :math:`W = (Y, D, X)`.
+    The observed data is given as :math:`W = (Y, D, Z)`.
    Further, oracle values of the confounder :math:`A`, the transformed covariates :math:`Z`,
-    the potential outcomes of :math:`Y`, the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the
-    long and short forms of the main regression and the propensity score
+    the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score, and
+    in-sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE)
    are returned in a dictionary.

    Parameters
@@ -968,13 +977,16 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
        Default is ``500``.
    theta : float or int
        Average treatment effect.
-        Default is ``5.0``.
-    cf_y : float
-        Percentage of the residual variation of the outcome explained by latent/confounding variable.
-        Default is ``0.04``.
-    cf_d : float
-        Percentage gains in the variation of the Riesz Representer generated by latent/confounding variable.
-        Default is ``0.04``.
+        Default is ``0.0``.
+    gamma_a : float
+        Coefficient of the unobserved confounder in the propensity score.
+        Default is ``0.127``.
+    beta_a : float
+        Coefficient of the unobserved confounder in the outcome regression.
+        Default is ``0.58``.
+    linear : bool
+        If ``True``, Z will be set to X, such that the underlying (short) models are linear/logistic.
+        Default is ``False``.

    Returns
    -------
@@ -988,82 +1000,122 @@ def make_confounded_irm_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04):
    doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_.
    """
    c = 0.0  # the confounding strength is only valid for c=0
-    dim_x = 5
+    xi = 0.75
+    dim_x = kwargs.get('dim_x', 5)
+    trimming_threshold = kwargs.get('trimming_threshold', 0.01)
+    var_eps_y = kwargs.get('var_eps_y', 1.0)
+
+    # Specification of main regression function
+    def f_reg(w):
+        res = 2.5 + 0.74 * w[:, 0] + 0.25 * w[:, 1] + 0.137 * (w[:, 2] + w[:, 3])
+        return res

+    # Specification of prop score function
+    def f_ps(w, xi):
+        res = xi * (-w[:, 0] + 0.1 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3])
+        return res
    # observed covariates
    cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)])
    x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ])
-
    z_tilde_1 = np.exp(0.5 * x[:, 0])
    z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0]))
    z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25)**3
    z_tilde_4 = (20 + x[:, 1] + x[:, 3])**2
-
-    z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:]))
+    z_tilde_5 = x[:, 4]
+    z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5))
    z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0)
-
    # error terms and unobserved confounder
-    var_eps_y = 5
    eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs)
-
    # unobserved confounder
    a_bounds = (-1, 1)
    a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs)
+    var_a = np.square(a_bounds[1] - a_bounds[0]) / 12  # variance of a uniform on a_bounds

-    # get the required impact of the confounder on the propensity score
-    possible_coefs = np.arange(0.001, 0.4999, 0.001)
-    gamma_a = possible_coefs[(np.arctanh(2 * possible_coefs) / (2 * possible_coefs)) - 1 - cf_d / (1 - cf_d) >= 0][0]
-
-    # compute short and long form of riesz representer
-    m_long = 0.5 + gamma_a * a
-    m_short = 0.5 * np.ones_like(m_long)
+    # Choose the features used in the models
+    if linear:
+        features_ps = x
+        features_reg = x
+    else:
+        features_ps = z
+        features_reg = z

+    p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi)))
+    # compute short and long form of propensity score
+    m_long = p + gamma_a * a
+    m_short = p
+    # check propensity score bounds
+    if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold):
+        m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold)
+        m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold)
+        warnings.warn(f'Propensity score is close to 0 or 1; '
+                      f'trimming at {trimming_threshold} and {1.0 - trimming_threshold} is applied.')
+    # generate treatment based on long form
    u = np.random.uniform(low=0, high=1, size=n_obs)
    d = 1.0 * (m_long >= u)
-
-    # short and long version of g
-    g_partial_reg = 210 + 27.4 * z[:, 0] + 13.7 * (z[:, 1] + z[:, 2] + z[:, 3])
-
-    dx = d * (x[:, 4] + 1)
-    d1x = x[:, 4] + 1
-    var_dx = np.var(dx)
-    cov_adx = np.cov(a, dx)[0, 1]
-
-    def f_g(beta_a):
-        g_diff = beta_a * (a - cov_adx / var_dx)
-        y_diff = eps_y + g_diff
-        return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y)
-    beta_a = minimize_scalar(f_g).x
-
+    # add treatment heterogeneity
+    d1x = z[:, 4] + 1
+    var_dx = np.var(d * (d1x))
+    cov_adx = gamma_a * var_a  # Cov(A, D*(Z_5 + 1)); see the note after the diff
+    # Outcome regression
+    g_partial_reg = f_reg(features_reg)
+    # short model
    g_short_d0 = g_partial_reg
    g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg
    g_short = d * g_short_d1 + (1.0 - d) * g_short_d0
-
+    # long model
    g_long_d0 = g_partial_reg + beta_a * a
    g_long_d1 = theta * d1x + g_partial_reg + beta_a * a
    g_long = d * g_long_d1 + (1.0 - d) * g_long_d0
-
-    y0 = g_long_d0 + eps_y
-    y1 = g_long_d1 + eps_y
-
-    y = d * y1 + (1.0 - d) * y0
-
-    oracle_values = {'g_long': g_long,
-                     'g_short': g_short,
-                     'm_long': m_long,
-                     'm_short': m_short,
-                     'gamma_a': gamma_a,
-                     'beta_a': beta_a,
-                     'a': a,
-                     'y0': y0,
-                     'y1': y1,
-                     'z': z}
-
-    res_dict = {'x': x,
-                'y': y,
-                'd': d,
-                'oracle_values': oracle_values}
-
+    # Potential outcomes
+    y_0 = g_long_d0 + eps_y
+    y_1 = g_long_d1 + eps_y
+    # Realized outcome
+    y = d * y_1 + (1.0 - d) * y_0
+    # In-sample values for confounding strength
+    explained_residual_variance = np.square(g_long - g_short)
+    residual_variance = np.square(y - g_short)
+    cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance)
+    # compute the Riesz representation
+    treated_weight = d / np.mean(d)
+    untreated_weight = (1.0 - d) / np.mean(d)
+    # Odds ratios
+    propensity_ratio_long = m_long / (1.0 - m_long)
+    rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long)
+    rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long)
+    propensity_ratio_short = m_short / (1.0 - m_short)
+    rr_short_ate = d / m_short - (1.0 - d) / (1.0 - m_short)
+    rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short)
+    cf_d_ate = (np.mean(1 / (m_long * (1 - m_long))) - np.mean(1 / (m_short * (1 - m_short)))) / np.mean(1 / (m_long * (1 - m_long)))
+    cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long)
+    if (beta_a == 0) | (gamma_a == 0):
+        rho_ate = 0.0
+        rho_atte = 0.0
+    else:
+        rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1]
+        rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1]
+    oracle_values = {
+        'g_long': g_long,
+        'g_short': g_short,
+        'm_long': m_long,
+        'm_short': m_short,
+        'gamma_a': gamma_a,
+        'beta_a': beta_a,
+        'a': a,
+        'y_0': y_0,
+        'y_1': y_1,
+        'z': z,
+        'cf_y': cf_y,
+        'cf_d_ate': cf_d_ate,
+        'cf_d_atte': cf_d_atte,
+        'rho_ate': rho_ate,
+        'rho_atte': rho_atte,
+    }
+    res_dict = {
+        'x': x,
+        'y': y,
+        'd': d,
+        'oracle_values': oracle_values
+    }

    return res_dict
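A note on the closed-form covariance used above: the old code estimated `cov_adx` empirically via `np.cov(a, dx)[0, 1]`, while the new code sets `cov_adx = gamma_a * var_a`. This is exact for the untrimmed propensity score (clipping can introduce a small deviation). A sketch of the reasoning, using that `A` is independent of `Z` with `E[A] = 0`, and that `Z_5` is standardized so `E[Z_5 + 1] = 1`:

```latex
\begin{align*}
\mathrm{Cov}(A,\, D(Z_5 + 1))
  &= \mathbb{E}[A \, D \, (Z_5 + 1)]
     && \text{since } \mathbb{E}[A] = 0 \\
  &= \mathbb{E}\big[(Z_5 + 1) \, A \, (p(Z) + \gamma_A A)\big]
     && \text{since } \mathbb{E}[D \mid A, Z] = p(Z) + \gamma_A A \\
  &= \gamma_A \, \mathbb{E}[Z_5 + 1] \, \mathbb{E}[A^2]
     = \gamma_A \, \mathrm{Var}(A).
\end{align*}
```

For `A ~ U[-1, 1]`, `Var(A) = (1 - (-1))^2 / 12 = 1/3`, which is what `var_a = np.square(a_bounds[1] - a_bounds[0]) / 12` computes.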
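For reference, a minimal usage sketch of the revised generator. The keyword arguments mirror the new signature and the `kwargs` consumed in the function body; the import path is illustrative only and not confirmed by this diff:

```python
import numpy as np
from doubleml.datasets import make_confounded_irm_data  # illustrative import path

np.random.seed(42)
res = make_confounded_irm_data(n_obs=2000, theta=5.0, gamma_a=0.127, beta_a=0.58,
                               linear=False, var_eps_y=1.0, trimming_threshold=0.01)
oracle = res['oracle_values']

# Oracle ATE: y_1 - y_0 = theta * (Z_5 + 1), which averages to theta
# because Z_5 is standardized to mean zero.
print(np.mean(oracle['y_1'] - oracle['y_0']))  # approx. 5.0

# Realized in-sample confounding strengths and confounding correlations
print(oracle['cf_y'], oracle['cf_d_ate'], oracle['cf_d_atte'])
print(oracle['rho_ate'], oracle['rho_atte'])
```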
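The returned `cf_y` can also be recomputed from the oracle values, which makes its definition concrete: the mean squared gap between the long and short outcome regressions, relative to the residual variation around the short model. A standalone check, under the same illustrative import path as above:

```python
import numpy as np
from doubleml.datasets import make_confounded_irm_data  # illustrative import path

np.random.seed(0)
res = make_confounded_irm_data(n_obs=5000, theta=0.0, gamma_a=0.127, beta_a=0.58)
y, oracle = res['y'], res['oracle_values']
g_long, g_short = oracle['g_long'], oracle['g_short']

# Matches the in-sample computation in the function body
cf_y_manual = np.mean(np.square(g_long - g_short)) / np.mean(np.square(y - g_short))
assert np.isclose(cf_y_manual, oracle['cf_y'])
```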