Skip to content

Commit 46c6cba

Browse files
committed
Merge branch 'main' into 0.7.X
2 parents baa145e + 8431daf commit 46c6cba

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

46 files changed

+3283
-769
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,4 @@ share/python-wheels/
2929
MANIFEST
3030
*.idea
3131
*.vscode
32+
.flake8

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
[![Conda Version](https://img.shields.io/conda/vn/conda-forge/doubleml.svg)](https://anaconda.org/conda-forge/doubleml)
66
[![codecov](https://codecov.io/gh/DoubleML/doubleml-for-py/branch/main/graph/badge.svg?token=0BjlFPgdGk)](https://codecov.io/gh/DoubleML/doubleml-for-py)
77
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/1c08ec7d782c451784293c996537de14)](https://www.codacy.com/gh/DoubleML/doubleml-for-py/dashboard?utm_source=github.com&utm_medium=referral&utm_content=DoubleML/doubleml-for-py&utm_campaign=Badge_Grade)
8-
[![Python version](https://img.shields.io/badge/python-3.6%20%7C%203.7%20%7C%203.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue)](https://www.python.org/)
8+
[![Python version](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue)](https://www.python.org/)
99

1010
The Python package **DoubleML** provides an implementation of the double / debiased machine learning framework of
1111
[Chernozhukov et al. (2018)](https://doi.org/10.1111/ectj.12097).

doc/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
author = 'Bach, P., Chernozhukov, V., Kurz, M. S., and Spindler, M.'
2323

2424
# The full version, including alpha/beta/rc tags
25-
release = '0.7.0'
25+
release = '0.7.1'
2626

2727

2828
# -- General configuration ---------------------------------------------------

doubleml/_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,3 +333,9 @@ def _var_est(psi, psi_deriv, apply_cross_fitting, smpls, is_cluster_data,
333333
sigma2_hat = np.multiply(scaling, gamma_hat)
334334

335335
return sigma2_hat, var_scaling_factor
336+
337+
338+
def _cond_targets(target, cond_sample):
339+
cond_target = target.astype(float)
340+
cond_target[np.invert(cond_sample)] = np.nan
341+
return cond_target

doubleml/_utils_checks.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,3 +226,53 @@ def _check_benchmarks(benchmarks):
226226
raise TypeError('benchmarks name must be of string type. '
227227
f'{str(benchmarks["name"][i])} of type {str(type(benchmarks["name"][i]))} was passed.')
228228
return
229+
230+
231+
def _check_weights(weights, score, n_obs, n_rep):
232+
if weights is not None:
233+
234+
# check general type
235+
if (not isinstance(weights, np.ndarray)) and (not isinstance(weights, dict)):
236+
raise TypeError("weights must be a numpy array or dictionary. "
237+
f"weights of type {str(type(weights))} was passed.")
238+
239+
# check shape
240+
if isinstance(weights, np.ndarray):
241+
if (weights.ndim != 1) or weights.shape[0] != n_obs:
242+
raise ValueError(f"weights must have shape ({n_obs},). "
243+
f"weights of shape {weights.shape} was passed.")
244+
if not np.all(0 <= weights):
245+
raise ValueError("All weights values must be greater or equal 0.")
246+
if weights.sum() == 0:
247+
raise ValueError("At least one weight must be non-zero.")
248+
249+
# check special form for ATTE score
250+
if score == "ATTE":
251+
if not isinstance(weights, np.ndarray):
252+
raise TypeError("weights must be a numpy array for ATTE score. "
253+
f"weights of type {str(type(weights))} was passed.")
254+
255+
is_binary = np.all((np.power(weights, 2) - weights) == 0)
256+
if not is_binary:
257+
raise ValueError("weights must be binary for ATTE score.")
258+
259+
# check general form for ATE score
260+
if isinstance(weights, dict):
261+
assert score == "ATE"
262+
expected_keys = ["weights", "weights_bar"]
263+
if not set(weights.keys()) == set(expected_keys):
264+
raise ValueError(f"weights must have keys {expected_keys}. "
265+
f"keys {str(weights.keys())} were passed.")
266+
267+
expected_shapes = [(n_obs,), (n_obs, n_rep)]
268+
if weights["weights"].shape != expected_shapes[0]:
269+
raise ValueError(f"weights must have shape {expected_shapes[0]}. "
270+
f"weights of shape {weights['weights'].shape} was passed.")
271+
if weights["weights_bar"].shape != expected_shapes[1]:
272+
raise ValueError(f"weights_bar must have shape {expected_shapes[1]}. "
273+
f"weights_bar of shape {weights['weights_bar'].shape} was passed.")
274+
if (not np.all(weights["weights"] >= 0)) or (not np.all(weights["weights_bar"] >= 0)):
275+
raise ValueError("All weights values must be greater or equal 0.")
276+
if (weights["weights"].sum() == 0) or (weights["weights_bar"].sum() == 0):
277+
raise ValueError("At least one weight must be non-zero.")
278+
return

doubleml/_utils_resampling.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,15 @@
44
from sklearn.model_selection import KFold, RepeatedKFold, RepeatedStratifiedKFold
55

66

7+
# Remove warnings in future versions
8+
def deprication_apply_cross_fitting():
    """Emit a ``DeprecationWarning`` for the ``apply_cross_fitting`` argument.

    NOTE(review): the function name misspells "deprecation"; kept as-is
    because callers reference it by this exact name.
    """
    message = ('The apply_cross_fitting argument is deprecated and will be removed in future versions. '
               'In the future, crossfitting is applied by default. '
               'To rely on sample splitting please use external predictions.')
    warnings.warn(message, DeprecationWarning)
14+
15+
716
class DoubleMLResampling:
817
def __init__(self,
918
n_folds,
@@ -14,6 +23,8 @@ def __init__(self,
1423
self.n_folds = n_folds
1524
self.n_rep = n_rep
1625
self.n_obs = n_obs
26+
if not apply_cross_fitting:
27+
deprication_apply_cross_fitting()
1728
self.apply_cross_fitting = apply_cross_fitting
1829
self.stratify = stratify
1930
if (self.n_folds == 1) & self.apply_cross_fitting:

doubleml/datasets.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,3 +1228,120 @@ def f_g(beta_a):
12281228
'oracle_values': oracle_values}
12291229

12301230
return res_dict
1231+
1232+
1233+
def make_heterogeneous_data(n_obs=200, p=30, support_size=5, n_x=1, binary_treatment=False):
    """
    Creates a simple synthetic example for heterogeneous treatment effects.
    The data generating process is based on the Monte Carlo simulation from Oprescu et al. (2019).

    The data is generated as

    .. math::

        Y_i & = \\theta_0(X_i)D_i + \\langle X_i,\\gamma_0\\rangle + \\epsilon_i

        D_i & = \\langle X_i,\\beta_0\\rangle + \\eta_i,

    where :math:`X_i\\sim\\mathcal{U}[0,1]^{p}` and :math:`\\epsilon_i,\\eta_i
    \\sim\\mathcal{U}[-1,1]`.
    If the treatment is set to be binary, the treatment is generated as

    .. math::
        D_i = 1\\{\\langle X_i,\\beta_0\\rangle \\ge \\eta_i\\}.

    The coefficient vectors :math:`\\gamma_0` and :math:`\\beta_0` both have small random (identical) support
    which values are drawn independently from :math:`\\mathcal{U}[0,1]` and :math:`\\mathcal{U}[0,0.3]`.
    Further, :math:`\\theta_0(x)` defines the conditional treatment effect, which is defined differently depending
    on the dimension of :math:`x`.

    If the heterogeneity is univariate the conditional treatment effect takes the following form

    .. math::
        \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_0),

    whereas for the two-dimensional case the conditional treatment effect is defined as

    .. math::
        \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_1).

    Parameters
    ----------
    n_obs : int
        Number of observations to simulate.
        Default is ``200``.

    p : int
        Dimension of covariates.
        Default is ``30``.

    support_size : int
        Number of relevant (confounding) covariates.
        Default is ``5``.

    n_x : int
        Dimension of the heterogeneity. Can be either ``1`` or ``2``.
        Default is ``1``.

    binary_treatment : bool
        Indicates whether the treatment is binary.
        Default is ``False``.

    Returns
    -------
    res_dict : dictionary
       Dictionary with entries ``data``, ``effects``, ``treatment_effect``.

    Raises
    ------
    ValueError
        If ``n_x`` is not 1 or 2, or if ``support_size`` exceeds ``p``.
    TypeError
        If ``binary_treatment`` is not a bool.
    """
    # simple input checks; raise instead of assert so they survive ``python -O``
    if n_x not in (1, 2):
        raise ValueError('n_x must be either 1 or 2.')
    if support_size > p:
        raise ValueError('support_size must be smaller or equal to p.')
    if not isinstance(binary_treatment, bool):
        raise TypeError('binary_treatment must be a boolean.')

    # define treatment effects
    if n_x == 1:
        def treatment_effect(x):
            return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 0])
    else:
        # n_x == 2: heterogeneity in the first two covariates
        def treatment_effect(x):
            return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 1])

    # outcome support and coefficients
    support_y = np.random.choice(np.arange(p), size=support_size, replace=False)
    coefs_y = np.random.uniform(0, 1, size=support_size)
    # treatment support and coefficients (shares the outcome support)
    support_d = support_y
    coefs_d = np.random.uniform(0, 0.3, size=support_size)

    # noise
    epsilon = np.random.uniform(-1, 1, size=n_obs)
    eta = np.random.uniform(-1, 1, size=n_obs)

    # generate controls, covariates, treatments and outcomes
    x = np.random.uniform(0, 1, size=(n_obs, p))
    # heterogeneous treatment effects
    te = treatment_effect(x)
    if binary_treatment:
        d = 1.0 * (np.dot(x[:, support_d], coefs_d) >= eta)
    else:
        d = np.dot(x[:, support_d], coefs_d) + eta
    y = te * d + np.dot(x[:, support_y], coefs_y) + epsilon

    # now we build the dataset
    y_df = pd.DataFrame({'y': y})
    d_df = pd.DataFrame({'d': d})
    x_df = pd.DataFrame(
        data=x,
        index=np.arange(x.shape[0]),
        columns=[f'X_{i}' for i in range(x.shape[1])]
    )

    data = pd.concat([y_df, d_df, x_df], axis=1)
    res_dict = {
        'data': data,
        'effects': te,
        'treatment_effect': treatment_effect}
    return res_dict

0 commit comments

Comments
 (0)