Commit 3954711

Merge branch 'master' of github.com:DoubleML/doubleml-for-py into 0.2.X

2 parents 17221d0 + ce44e84

10 files changed: 449 additions, 50 deletions

README.md

Lines changed: 11 additions & 9 deletions
````diff
@@ -90,19 +90,21 @@ Detailed [installation instructions](https://docs.doubleml.org/stable/intro/inst
 
 If you use the DoubleML package a citation is highly appreciated:
 
-Bach, P., Chernozhukov, V., Kurz, M. S., and Spindler, M. (2020),
-DoubleML - Double Machine Learning in Python.
-URL: [https://github.com/DoubleML/doubleml-for-py](https://github.com/DoubleML/doubleml-for-py),
-Python-Package version 0.2.0.
+Bach, P., Chernozhukov, V., Kurz, M. S., and Spindler, M. (2021), DoubleML - An
+Object-Oriented Implementation of Double Machine Learning in Python,
+arXiv:[2104.03220](https://arxiv.org/abs/2104.03220).
 
 Bibtex-entry:
 
 ```
-@Manual{DoubleML2020,
-  title = {DoubleML - Double Machine Learning in Python},
-  author = {Bach, P., Chernozhukov, V., Kurz, M. S., and Spindler, M.},
-  year = {2020},
-  note = {URL: \url{https://github.com/DoubleML/doubleml-for-py}, Python-Package version 0.2.0}
+@misc{DoubleML2021,
+  title={{DoubleML} -- {A}n Object-Oriented Implementation of Double Machine Learning in {P}ython},
+  author={Philipp Bach and Victor Chernozhukov and Malte S. Kurz and Martin Spindler},
+  year={2021},
+  eprint={2104.03220},
+  archivePrefix={arXiv},
+  primaryClass={stat.ML},
+  note={arXiv:\href{https://arxiv.org/abs/2104.03220}{2104.03220} [stat.ML]}
 }
 ```
````

doubleml/double_ml.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -869,7 +869,9 @@ def set_ml_nuisance_params(self, learner, treat_var, params):
             raise ValueError('Invalid treatment variable ' + treat_var + '. ' +
                              'Valid treatment variable ' + ' or '.join(self._dml_data.d_cols) + '.')
 
-        if isinstance(params, dict):
+        if params is None:
+            all_params = [None] * self.n_rep
+        elif isinstance(params, dict):
            if self.apply_cross_fitting:
                all_params = [[params] * self.n_folds] * self.n_rep
            else:
```
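
The new `params is None` branch lets externally set nuisance hyperparameters be reset to the learner defaults. A minimal sketch of the resulting call pattern; the PLR model, the `make_plr_CCDDHNR2018` data generator and the names `'ml_g'`/`'d'` follow the package conventions of this release and are illustrative, not part of this diff:

```python
# Sketch: set tuned hyperparameters for a nuisance part, then reset them via params=None.
from sklearn.linear_model import LinearRegression
from doubleml import DoubleMLPLR
from doubleml.datasets import make_plr_CCDDHNR2018

dml_data = make_plr_CCDDHNR2018(n_obs=500)  # simulated PLR data with treatment column 'd'
dml_plr = DoubleMLPLR(dml_data, ml_g=LinearRegression(), ml_m=LinearRegression())

# externally chosen hyperparameters for nuisance part ml_g and treatment variable 'd'
dml_plr.set_ml_nuisance_params('ml_g', 'd', {'fit_intercept': False})
# with this commit, passing None resets the nuisance part to the learner's defaults
dml_plr.set_ml_nuisance_params('ml_g', 'd', None)
dml_plr.fit()
```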

doubleml/double_ml_data.py

Lines changed: 53 additions & 2 deletions
```diff
@@ -56,12 +56,19 @@ def __init__(self,
                  x_cols=None,
                  z_cols=None,
                  use_other_treat_as_covariate=True):
+        if not isinstance(data, pd.DataFrame):
+            raise TypeError('data must be of pd.DataFrame type. '
+                            f'{str(data)} of type {str(type(data))} was passed.')
+        if not data.columns.is_unique:
+            raise ValueError('Invalid pd.DataFrame: '
+                             'Contains duplicate column names.')
         self._data = data
 
         self.y_col = y_col
         self.d_cols = d_cols
         self.z_cols = z_cols
         self.x_cols = x_cols
+        self._check_disjoint_sets()
         self.use_other_treat_as_covariate = use_other_treat_as_covariate
         self._binary_treats = self._check_binary_treats()
         self._set_y_z()
@@ -245,6 +252,9 @@ def x_cols(self, value):
             if not isinstance(value, list):
                 raise TypeError('The covariates x_cols must be of str or list type (or None). '
                                 f'{str(value)} of type {str(type(value))} was passed.')
+            if not len(set(value)) == len(value):
+                raise ValueError('Invalid covariates x_cols: '
+                                 'Contains duplicate values.')
             if not set(value).issubset(set(self.all_variables)):
                 raise ValueError('Invalid covariates x_cols. '
                                  'At least one covariate is no data column.')
@@ -253,13 +263,14 @@ def x_cols(self, value):
         else:
             # x_cols defaults to all columns but y_col, d_cols and z_cols
             if self.z_cols is not None:
-                y_d_z = set.union(set(self.y_col), set(self.d_cols), set(self.z_cols))
+                y_d_z = set.union({self.y_col}, set(self.d_cols), set(self.z_cols))
                 x_cols = [col for col in self.data.columns if col not in y_d_z]
             else:
-                y_d = set.union(set(self.y_col), set(self.d_cols))
+                y_d = set.union({self.y_col}, set(self.d_cols))
                 x_cols = [col for col in self.data.columns if col not in y_d]
         self._x_cols = x_cols
         if reset_value:
+            self._check_disjoint_sets()
             # by default, we initialize to the first treatment variable
             self.set_x_d(self.d_cols[0])
 
@@ -278,11 +289,15 @@ def d_cols(self, value):
         if not isinstance(value, list):
             raise TypeError('The treatment variable(s) d_cols must be of str or list type. '
                             f'{str(value)} of type {str(type(value))} was passed.')
+        if not len(set(value)) == len(value):
+            raise ValueError('Invalid treatment variable(s) d_cols: '
+                             'Contains duplicate values.')
         if not set(value).issubset(set(self.all_variables)):
             raise ValueError('Invalid treatment variable(s) d_cols. '
                              'At least one treatment variable is no data column.')
         self._d_cols = value
         if reset_value:
+            self._check_disjoint_sets()
             # by default, we initialize to the first treatment variable
             self.set_x_d(self.d_cols[0])
 
@@ -304,6 +319,7 @@ def y_col(self, value):
                              f'{value} is no data column.')
         self._y_col = value
         if reset_value:
+            self._check_disjoint_sets()
             self._set_y_z()
 
     @property
@@ -322,13 +338,17 @@ def z_cols(self, value):
             if not isinstance(value, list):
                 raise TypeError('The instrumental variable(s) z_cols must be of str or list type (or None). '
                                 f'{str(value)} of type {str(type(value))} was passed.')
+            if not len(set(value)) == len(value):
+                raise ValueError('Invalid instrumental variable(s) z_cols: '
+                                 'Contains duplicate values.')
             if not set(value).issubset(set(self.all_variables)):
                 raise ValueError('Invalid instrumental variable(s) z_cols. '
                                  'At least one instrumental variable is no data column.')
             self._z_cols = value
         else:
             self._z_cols = None
         if reset_value:
+            self._check_disjoint_sets()
             self._set_y_z()
 
     @property
@@ -368,6 +388,8 @@ def set_x_d(self, treatment_var):
             raise ValueError('Invalid treatment_var. '
                              f'{treatment_var} is not in d_cols.')
         if self.use_other_treat_as_covariate:
+            # note that the following line needs to be adapted in case an intersection of x_cols and d_cols is allowed
+            # (see https://github.com/DoubleML/doubleml-for-py/issues/83)
             xd_list = self.x_cols + self.d_cols
             xd_list.remove(treatment_var)
         else:
@@ -383,3 +405,32 @@ def _check_binary_treats(self):
             zero_one_treat = np.all((np.power(this_d, 2) - this_d) == 0)
             is_binary[treatment_var] = (binary_treat & zero_one_treat)
         return is_binary
+
+    def _check_disjoint_sets(self):
+        y_col_set = {self.y_col}
+        x_cols_set = set(self.x_cols)
+        d_cols_set = set(self.d_cols)
+
+        if not y_col_set.isdisjoint(x_cols_set):
+            raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and covariate in '
+                             '``x_cols``.')
+        if not y_col_set.isdisjoint(d_cols_set):
+            raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and treatment variable in '
+                             '``d_cols``.')
+        # note that the line xd_list = self.x_cols + self.d_cols in method set_x_d needs adaption if an intersection of
+        # x_cols and d_cols is allowed (see https://github.com/DoubleML/doubleml-for-py/issues/83)
+        if not d_cols_set.isdisjoint(x_cols_set):
+            raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and as covariate '
+                             '(``x_cols``). Consider using parameter ``use_other_treat_as_covariate``.')
+
+        if self.z_cols is not None:
+            z_cols_set = set(self.z_cols)
+            if not y_col_set.isdisjoint(z_cols_set):
+                raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and instrumental '
+                                 'variable in ``z_cols``.')
+            if not d_cols_set.isdisjoint(z_cols_set):
+                raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and '
+                                 'instrumental variable in ``z_cols``.')
+            if not x_cols_set.isdisjoint(z_cols_set):
+                raise ValueError('At least one variable/column is set as covariate (``x_cols``) and instrumental '
+                                 'variable in ``z_cols``.')
```
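
These checks make invalid column assignments fail fast at construction time. A short sketch of what now raises; the column names and data are made up for illustration:

```python
# Sketch: the new DoubleMLData input validation at construction time.
import numpy as np
import pandas as pd
from doubleml import DoubleMLData

df = pd.DataFrame(np.random.normal(size=(100, 4)), columns=['y', 'd', 'x1', 'x2'])

# non-DataFrame input now raises a TypeError
try:
    DoubleMLData(df.to_numpy(), y_col='y', d_cols='d')
except TypeError as err:
    print(err)

# overlapping variable sets now raise a ValueError via _check_disjoint_sets()
try:
    DoubleMLData(df, y_col='y', d_cols='d', x_cols=['y', 'x1', 'x2'])
except ValueError as err:
    print(err)
```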

doubleml/double_ml_iivm.py

Lines changed: 53 additions & 12 deletions
```diff
@@ -40,6 +40,12 @@ class DoubleMLIIVM(DoubleML):
         ``psi_a, psi_b = score(y, z, d, g_hat0, g_hat1, m_hat, r_hat0, r_hat1, smpls)``.
         Default is ``'LATE'``.
 
+    subgroups : dict or None
+        Dictionary with options to adapt to cases with and without the subgroups of always-takers and never-takers.
+        The logical item ``always_takers`` specifies whether there are always-takers in the sample. The logical
+        item ``never_takers`` specifies whether there are never-takers in the sample.
+        Default is ``{'always_takers': True, 'never_takers': True}``.
+
     dml_procedure : str
         A str (``'dml1'`` or ``'dml2'``) specifying the double machine learning algorithm.
         Default is ``'dml2'``.
@@ -115,6 +121,7 @@ def __init__(self,
                  n_folds=5,
                  n_rep=1,
                  score='LATE',
+                 subgroups=None,
                  dml_procedure='dml2',
                  trimming_rule='truncate',
                  trimming_threshold=1e-12,
@@ -138,6 +145,25 @@
         if trimming_rule not in valid_trimming_rule:
             raise ValueError('Invalid trimming_rule ' + trimming_rule + '. ' +
                              'Valid trimming_rule ' + ' or '.join(valid_trimming_rule) + '.')
+
+        if subgroups is None:
+            # this is the default for subgroups; via None to prevent a mutable default argument
+            subgroups = {'always_takers': True, 'never_takers': True}
+        else:
+            if not isinstance(subgroups, dict):
+                raise TypeError('Invalid subgroups ' + str(subgroups) + '. ' +
+                                'subgroups must be of type dictionary.')
+            if (not all(k in subgroups for k in ['always_takers', 'never_takers'])) \
+                    | (not all(k in ['always_takers', 'never_takers'] for k in subgroups)):
+                raise ValueError('Invalid subgroups ' + str(subgroups) + '. ' +
+                                 'subgroups must be a dictionary with keys always_takers and never_takers.')
+            if not isinstance(subgroups['always_takers'], bool):
+                raise TypeError("subgroups['always_takers'] must be True or False. "
+                                f'Got {str(subgroups["always_takers"])}.')
+            if not isinstance(subgroups['never_takers'], bool):
+                raise TypeError("subgroups['never_takers'] must be True or False. "
+                                f'Got {str(subgroups["never_takers"])}.')
+        self.subgroups = subgroups
         self.trimming_rule = trimming_rule
         self.trimming_threshold = trimming_threshold
 
@@ -196,10 +222,16 @@ def _ml_nuisance_and_score_elements(self, smpls, n_jobs_cv):
                                  est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'])
 
         # nuisance r
-        r_hat0 = _dml_cv_predict(self._learner['ml_r'], x, d, smpls=smpls_z0, n_jobs=n_jobs_cv,
-                                 est_params=self._get_params('ml_r0'), method=self._predict_method['ml_r'])
-        r_hat1 = _dml_cv_predict(self._learner['ml_r'], x, d, smpls=smpls_z1, n_jobs=n_jobs_cv,
-                                 est_params=self._get_params('ml_r1'), method=self._predict_method['ml_r'])
+        if self.subgroups['always_takers']:
+            r_hat0 = _dml_cv_predict(self._learner['ml_r'], x, d, smpls=smpls_z0, n_jobs=n_jobs_cv,
+                                     est_params=self._get_params('ml_r0'), method=self._predict_method['ml_r'])
+        else:
+            r_hat0 = np.zeros_like(d)
+        if self.subgroups['never_takers']:
+            r_hat1 = _dml_cv_predict(self._learner['ml_r'], x, d, smpls=smpls_z1, n_jobs=n_jobs_cv,
+                                     est_params=self._get_params('ml_r1'), method=self._predict_method['ml_r'])
+        else:
+            r_hat1 = np.ones_like(d)
 
         psi_a, psi_b = self._score_elements(y, z, d, g_hat0, g_hat1, m_hat, r_hat0, r_hat1, smpls)
         preds = {'ml_g0': g_hat0,
@@ -262,18 +294,27 @@ def _ml_nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune,
         m_tune_res = _dml_tune(z, x, train_inds,
                                self._learner['ml_m'], param_grids['ml_m'], scoring_methods['ml_m'],
                                n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search)
-        r0_tune_res = _dml_tune(d, x, train_inds_z0,
-                                self._learner['ml_r'], param_grids['ml_r'], scoring_methods['ml_r'],
-                                n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search)
-        r1_tune_res = _dml_tune(d, x, train_inds_z1,
-                                self._learner['ml_r'], param_grids['ml_r'], scoring_methods['ml_r'],
-                                n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search)
+
+        if self.subgroups['always_takers']:
+            r0_tune_res = _dml_tune(d, x, train_inds_z0,
+                                    self._learner['ml_r'], param_grids['ml_r'], scoring_methods['ml_r'],
+                                    n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search)
+            r0_best_params = [xx.best_params_ for xx in r0_tune_res]
+        else:
+            r0_tune_res = None
+            r0_best_params = [None] * len(smpls)
+        if self.subgroups['never_takers']:
+            r1_tune_res = _dml_tune(d, x, train_inds_z1,
+                                    self._learner['ml_r'], param_grids['ml_r'], scoring_methods['ml_r'],
+                                    n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search)
+            r1_best_params = [xx.best_params_ for xx in r1_tune_res]
+        else:
+            r1_tune_res = None
+            r1_best_params = [None] * len(smpls)
 
         g0_best_params = [xx.best_params_ for xx in g0_tune_res]
         g1_best_params = [xx.best_params_ for xx in g1_tune_res]
         m_best_params = [xx.best_params_ for xx in m_tune_res]
-        r0_best_params = [xx.best_params_ for xx in r0_tune_res]
-        r1_best_params = [xx.best_params_ for xx in r1_tune_res]
 
         params = {'ml_g0': g0_best_params,
                   'ml_g1': g1_best_params,
```
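
With the new `subgroups` option, estimation of the nuisance part `r` can be switched off for subgroups that are absent by design, e.g. under one-sided non-compliance. A hedged usage sketch; the `make_iivm_data` generator and the random-forest learners are illustrative choices, not part of this commit:

```python
# Sketch: DoubleMLIIVM in a setting assumed to have no always-takers.
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from doubleml import DoubleMLIIVM
from doubleml.datasets import make_iivm_data

dml_data = make_iivm_data(n_obs=1000)
dml_iivm = DoubleMLIIVM(dml_data,
                        ml_g=RandomForestRegressor(),
                        ml_m=RandomForestClassifier(),
                        ml_r=RandomForestClassifier(),
                        subgroups={'always_takers': False, 'never_takers': True})
# r_hat0 is set to zero instead of being estimated; r_hat1 is still cross-fitted
dml_iivm.fit()
print(dml_iivm.summary)
```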

doubleml/tests/_utils_iivm_manual.py

Lines changed: 34 additions & 21 deletions
```diff
@@ -7,7 +7,7 @@
 
 def fit_nuisance_iivm(y, x, d, z, learner_m, learner_g, learner_r, smpls,
                       g0_params=None, g1_params=None, m_params=None, r0_params=None, r1_params=None,
-                      trimming_threshold=1e-12):
+                      trimming_threshold=1e-12, always_takers=True, never_takers=True):
     ml_g0 = clone(learner_g)
     g_hat0 = []
     for idx, (train_index, test_index) in enumerate(smpls):
@@ -41,21 +41,28 @@ def fit_nuisance_iivm(y, x, d, z, learner_m, learner_g, learner_r, smpls,
         if r0_params is not None:
             ml_r0.set_params(**r0_params[idx])
         train_index0 = np.intersect1d(np.where(z == 0)[0], train_index)
-        r_hat0.append(ml_r0.fit(x[train_index0], d[train_index0]).predict_proba(x[test_index])[:, 1])
+        if always_takers:
+            r_hat0.append(ml_r0.fit(x[train_index0], d[train_index0]).predict_proba(x[test_index])[:, 1])
+        else:
+            r_hat0.append(np.zeros_like(d[test_index]))
 
     ml_r1 = clone(learner_r)
     r_hat1 = []
     for idx, (train_index, test_index) in enumerate(smpls):
         if r1_params is not None:
             ml_r1.set_params(**r1_params[idx])
         train_index1 = np.intersect1d(np.where(z == 1)[0], train_index)
-        r_hat1.append(ml_r1.fit(x[train_index1], d[train_index1]).predict_proba(x[test_index])[:, 1])
+        if never_takers:
+            r_hat1.append(ml_r1.fit(x[train_index1], d[train_index1]).predict_proba(x[test_index])[:, 1])
+        else:
+            r_hat1.append(np.ones_like(d[test_index]))
 
     return g_hat0, g_hat1, m_hat, r_hat0, r_hat1
 
 
 def tune_nuisance_iivm(y, x, d, z, ml_m, ml_g, ml_r, smpls, n_folds_tune,
-                       param_grid_g, param_grid_m, param_grid_r):
+                       param_grid_g, param_grid_m, param_grid_r,
+                       always_takers=True, never_takers=True):
     g0_tune_res = [None] * len(smpls)
     for idx, (train_index, _) in enumerate(smpls):
         g0_tune_resampling = KFold(n_splits=n_folds_tune, shuffle=True)
@@ -79,27 +86,33 @@ def tune_nuisance_iivm(y, x, d, z, ml_m, ml_g, ml_r, smpls, n_folds_tune,
                                       cv=m_tune_resampling)
         m_tune_res[idx] = m_grid_search.fit(x[train_index, :], z[train_index])
 
-    r0_tune_res = [None] * len(smpls)
-    for idx, (train_index, _) in enumerate(smpls):
-        r0_tune_resampling = KFold(n_splits=n_folds_tune, shuffle=True)
-        r0_grid_search = GridSearchCV(ml_r, param_grid_r,
-                                      cv=r0_tune_resampling)
-        train_index0 = np.intersect1d(np.where(z == 0)[0], train_index)
-        r0_tune_res[idx] = r0_grid_search.fit(x[train_index0, :], d[train_index0])
-
-    r1_tune_res = [None] * len(smpls)
-    for idx, (train_index, _) in enumerate(smpls):
-        r1_tune_resampling = KFold(n_splits=n_folds_tune, shuffle=True)
-        r1_grid_search = GridSearchCV(ml_r, param_grid_r,
-                                      cv=r1_tune_resampling)
-        train_index1 = np.intersect1d(np.where(z == 1)[0], train_index)
-        r1_tune_res[idx] = r1_grid_search.fit(x[train_index1, :], d[train_index1])
+    if always_takers:
+        r0_tune_res = [None] * len(smpls)
+        for idx, (train_index, _) in enumerate(smpls):
+            r0_tune_resampling = KFold(n_splits=n_folds_tune, shuffle=True)
+            r0_grid_search = GridSearchCV(ml_r, param_grid_r,
+                                          cv=r0_tune_resampling)
+            train_index0 = np.intersect1d(np.where(z == 0)[0], train_index)
+            r0_tune_res[idx] = r0_grid_search.fit(x[train_index0, :], d[train_index0])
+        r0_best_params = [xx.best_params_ for xx in r0_tune_res]
+    else:
+        r0_best_params = None
+
+    if never_takers:
+        r1_tune_res = [None] * len(smpls)
+        for idx, (train_index, _) in enumerate(smpls):
+            r1_tune_resampling = KFold(n_splits=n_folds_tune, shuffle=True)
+            r1_grid_search = GridSearchCV(ml_r, param_grid_r,
+                                          cv=r1_tune_resampling)
+            train_index1 = np.intersect1d(np.where(z == 1)[0], train_index)
+            r1_tune_res[idx] = r1_grid_search.fit(x[train_index1, :], d[train_index1])
+        r1_best_params = [xx.best_params_ for xx in r1_tune_res]
+    else:
+        r1_best_params = None
 
     g0_best_params = [xx.best_params_ for xx in g0_tune_res]
     g1_best_params = [xx.best_params_ for xx in g1_tune_res]
     m_best_params = [xx.best_params_ for xx in m_tune_res]
-    r0_best_params = [xx.best_params_ for xx in r0_tune_res]
-    r1_best_params = [xx.best_params_ for xx in r1_tune_res]
 
     return g0_best_params, g1_best_params, m_best_params, r0_best_params, r1_best_params
```