Merge pull request #212 from DoubleML/o-policy-tree

SvenKlaassen · web-flow · commit 0851e887df2f · 2023-09-08T15:23:34.000+02:00
Add Policy Tree class for learning policies based on IRM
diff --git a/doubleml/__init__.py b/doubleml/__init__.py
@@ -12,6 +12,7 @@
 from .double_ml_pq import DoubleMLPQ
 from .double_ml_lpq import DoubleMLLPQ
 from .double_ml_cvar import DoubleMLCVAR
+from .double_ml_policytree import DoubleMLPolicyTree
 
 __all__ = ['DoubleMLPLR',
            'DoubleMLPLIV',
@@ -25,6 +26,7 @@
            'DoubleMLPQ',
            'DoubleMLQTE',
            'DoubleMLLPQ',
-           'DoubleMLCVAR']
+           'DoubleMLCVAR',
+           'DoubleMLPolicyTree']
 
 __version__ = get_distribution('doubleml').version
diff --git a/doubleml/double_ml_irm.py b/doubleml/double_ml_irm.py
@@ -7,11 +7,12 @@
 from .double_ml import DoubleML
 
 from .double_ml_blp import DoubleMLBLP
+from .double_ml_policytree import DoubleMLPolicyTree
 from .double_ml_data import DoubleMLData
 from .double_ml_score_mixins import LinearScoreMixin
 
 from ._utils import _dml_cv_predict, _get_cond_smpls, _dml_tune, _trimm, _normalize_ipw
-from ._utils_checks import _check_score, _check_trimming, _check_finite_predictions, _check_is_propensity
+from ._utils_checks import _check_score, _check_trimming, _check_finite_predictions, _check_is_propensity, _check_integer
 
 
 class DoubleMLIRM(LinearScoreMixin, DoubleML):
@@ -472,3 +473,49 @@ def gate(self, groups):
         model = DoubleMLBLP(orth_signal, basis=groups, is_gate=True).fit()
 
         return model
+
+    def policy_tree(self, features, depth=2, **tree_params):
+        """
+        Estimate a decision tree for optimal treatment policy by weighted classification.
+
+        Parameters
+        ----------
+        depth : int
+            The depth of the estimated decision tree.
+            Has to be larger than 0. Deeper trees derive a more complex decision policy. Default is ``2``.
+
+        features : :class:`pandas.DataFrame`
+            The covariates on which the policy tree is learned.
+            Has to be of shape ``(n_obs, d)``, where ``n_obs`` is the number of observations
+            and ``d`` is the number of covariates to be included.
+
+        **tree_params : dict
+            Parameters that are forwarded to the :class:`sklearn.tree.DecisionTreeClassifier`.
+            Note that by default we perform minimal pruning by setting the ``ccp_alpha = 0.01`` and
+            ``min_samples_leaf = 8``. This can be adjusted.
+
+        Returns
+        -------
+        model : :class:`doubleml.DoubleMLPolicyTree`
+            Policy tree model.
+        """
+        valid_score = ['ATE']
+        if self.score not in valid_score:
+            raise ValueError('Invalid score ' + self.score + '. ' +
+                             'Valid score ' + ' or '.join(valid_score) + '.')
+
+        if self.n_rep != 1:
+            raise NotImplementedError('Only implemented for one repetition. ' +
+                                      f'Number of repetitions is {str(self.n_rep)}.')
+
+        _check_integer(depth, "Depth", 0)
+
+        if not isinstance(features, pd.DataFrame):
+            raise TypeError('Covariates must be of DataFrame type. '
+                            f'Covariates of type {str(type(features))} was passed.')
+
+        orth_signal = self.psi_elements['psi_b'].reshape(-1)
+
+        model = DoubleMLPolicyTree(orth_signal, depth=depth, features=features, **tree_params).fit()
+
+        return model
diff --git a/doubleml/double_ml_policytree.py b/doubleml/double_ml_policytree.py
@@ -0,0 +1,162 @@
+import numpy as np
+import pandas as pd
+
+from sklearn.tree import DecisionTreeClassifier, plot_tree
+from sklearn.utils.validation import check_is_fitted
+
+
+class DoubleMLPolicyTree:
+    """Policy Tree fitting for DoubleML.
+    Currently available for IRM models.
+
+    Parameters
+    ----------
+    orth_signal : :class:`numpy.array`
+        The orthogonal signal to be predicted. Has to be of shape ``(n_obs,)``,
+        where ``n_obs`` is the number of observations.
+
+    features : :class:`pandas.DataFrame`
+        The covariates for estimating the policy tree. Has to have the shape ``(n_obs, d)``,
+        where ``n_obs`` is the number of observations and ``d`` is the number of predictors.
+
+    depth : int
+        The depth of the policy tree that will be built. Default is ``2``.
+
+    **tree_params : dict
+        Parameters that are forwarded to the :class:`sklearn.tree.DecisionTreeClassifier`.
+        Note that by default we perform minimal pruning by setting the ``ccp_alpha = 0.01`` and
+        ``min_samples_leaf = 8``. This can be adjusted.
+
+    """
+
+    def __init__(self,
+                 orth_signal,
+                 features,
+                 depth=2,
+                 **tree_params):
+
+        if not isinstance(orth_signal, np.ndarray):
+            raise TypeError('The signal must be of np.ndarray type. '
+                            f'Signal of type {str(type(orth_signal))} was passed.')
+
+        if orth_signal.ndim != 1:
+            raise ValueError('The signal must be of one dimensional. '
+                             f'Signal of dimensions {str(orth_signal.ndim)} was passed.')
+
+        if not isinstance(features, pd.DataFrame):
+            raise TypeError('The features must be of DataFrame type. '
+                            f'Features of type {str(type(features))} was passed.')
+
+        if not features.columns.is_unique:
+            raise ValueError('Invalid pd.DataFrame: '
+                             'Contains duplicate column names.')
+
+        self._orth_signal = orth_signal
+        self._features = features
+        self._depth = depth
+        self._tree_params = tree_params
+
+        self._tree_params.setdefault("ccp_alpha", .01)
+        self._tree_params.setdefault("min_samples_leaf", 8)
+
+        # initialize tree
+        self._policy_tree = DecisionTreeClassifier(max_depth=self._depth,
+                                                   **self._tree_params)
+
+    def __str__(self):
+        class_name = self.__class__.__name__
+        header = f'================== {class_name} Object ==================\n'
+        fit_summary = str(self.summary)
+        res = header + \
+            '\n------------------ Summary ------------------\n' + fit_summary
+        return res
+
+    @property
+    def policy_tree(self):
+        """
+        Policy tree model.
+        """
+        return self._policy_tree
+
+    @property
+    def orth_signal(self):
+        """
+        Orthogonal signal.
+        """
+        return self._orth_signal
+
+    @property
+    def features(self):
+        """
+        Covariates.
+        """
+        return self._features
+
+    @property
+    def summary(self):
+        """
+        A summary for the policy tree.
+        """
+        summary = pd.DataFrame({"Decision Variables": self._features.keys(), "Max Depth": self._depth})
+        return summary
+
+    def fit(self):
+        """
+        Estimate DoubleMLPolicyTree models.
+
+        Returns
+        -------
+        self : object
+        """
+        bin_signal = (np.sign(self._orth_signal) + 1) / 2
+        abs_signal = np.abs(self._orth_signal)
+
+        # fit the tree with target binary score, sample weights absolute score and
+        # provided feature variables
+        self._policy_tree.fit(X=self._features, y=bin_signal,
+                              sample_weight=abs_signal)
+
+        return self
+
+    def plot_tree(self):
+        """
+        Plots the DoubleMLPolicyTree.
+
+        Returns
+        -------
+        artists : list
+        """
+        check_is_fitted(self._policy_tree, msg='Policy Tree not yet fitted. Call fit before plot_tree.')
+
+        artists = plot_tree(self.policy_tree, feature_names=list(self._features.keys()), filled=True,
+                            class_names=["No Treatment", "Treatment"], impurity=False)
+        return artists
+
+    def predict(self, features):
+        """
+        Predicts policy based on the DoubleMLPolicyTree.
+
+        Parameters
+        ----------
+        features : :class:`pandas.DataFrame`
+            The covariates for predicting based on the policy tree. Has to have the shape ``(n_obs, d)``,
+            where ``n_obs`` is the number of observations and ``d`` is the number of predictors. Has to
+            have the identical keys as the original covariates.
+
+        Returns
+        -------
+        predictions : :class:`pandas.DataFrame`
+        """
+        check_is_fitted(self._policy_tree, msg='Policy Tree not yet fitted. Call fit before predict.')
+
+        if not isinstance(features, pd.DataFrame):
+            raise TypeError('The features must be of DataFrame type. '
+                            f'Features of type {str(type(features))} was passed.')
+
+        if not set(features.keys()) == set(self._features.keys()):
+            raise KeyError(f'The features must have the keys {self._features.keys()}. '
+                           f'Features with keys {features.keys()} were passed.')
+
+        predictions = self.policy_tree.predict(features)
+
+        return features.assign(pred_treatment=predictions.astype(int))
diff --git a/doubleml/tests/_utils_pt_manual.py b/doubleml/tests/_utils_pt_manual.py
@@ -0,0 +1,12 @@
+import numpy as np
+from sklearn.tree import DecisionTreeClassifier
+
+
+def fit_policytree(orth_signal, features, depth):
+    policytree_model = DecisionTreeClassifier(max_depth=depth,
+                                              ccp_alpha=.01,
+                                              min_samples_leaf=8).fit(X=features,
+                                                                      y=(np.sign(orth_signal) + 1) / 2,
+                                                                      sample_weight=np.abs(orth_signal))
+
+    return policytree_model
diff --git a/doubleml/tests/test_doubleml_exceptions.py b/doubleml/tests/test_doubleml_exceptions.py
@@ -1356,3 +1356,49 @@ def eval_fct(y_pred, y_true):
         return np.nan
     with pytest.raises(ValueError, match=msg):
         dml_irm_obj.evaluate_learners(metric=eval_fct)
+
+
+@pytest.mark.ci
+def test_doubleml_exception_policytree():
+    dml_irm_obj = DoubleMLIRM(dml_data_irm,
+                              ml_g=Lasso(),
+                              ml_m=LogisticRegression(),
+                              trimming_threshold=0.05,
+                              n_folds=5)
+    dml_irm_obj.fit()
+
+    msg = "Covariates must be of DataFrame type. Covariates of type <class 'int'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        dml_irm_obj.policy_tree(features=2)
+    msg = "Depth must be larger or equal to 0. -1 was passed."
+    with pytest.raises(ValueError, match=msg):
+        dml_irm_obj.policy_tree(features=pd.DataFrame(np.random.normal(0, 1, size=(dml_data_irm.n_obs, 3))),
+                                depth=-1)
+    msg = "Depth must be an integer. 0.1 of type <class 'float'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        dml_irm_obj.policy_tree(features=pd.DataFrame(np.random.normal(0, 1, size=(dml_data_irm.n_obs, 3))),
+                                depth=.1)
+
+    dml_irm_obj = DoubleMLIRM(dml_data_irm,
+                              ml_g=Lasso(),
+                              ml_m=LogisticRegression(),
+                              trimming_threshold=0.05,
+                              n_folds=5,
+                              score='ATTE')
+    dml_irm_obj.fit()
+
+    msg = 'Invalid score ATTE. Valid score ATE.'
+    with pytest.raises(ValueError, match=msg):
+        dml_irm_obj.policy_tree(features=2, depth=1)
+
+    dml_irm_obj = DoubleMLIRM(dml_data_irm,
+                              ml_g=Lasso(),
+                              ml_m=LogisticRegression(),
+                              trimming_threshold=0.05,
+                              n_folds=5,
+                              score='ATE',
+                              n_rep=2)
+    dml_irm_obj.fit()
+    msg = 'Only implemented for one repetition. Number of repetitions is 2.'
+    with pytest.raises(NotImplementedError, match=msg):
+        dml_irm_obj.policy_tree(features=2, depth=1)
diff --git a/doubleml/tests/test_doubleml_model_defaults.py b/doubleml/tests/test_doubleml_model_defaults.py
@@ -54,6 +54,8 @@
 dml_lpq.bootstrap()
 dml_qte.bootstrap()
 
+policy_tree = dml_irm.policy_tree(features=dml_data_irm.data.drop(columns=["y", "d"]))
+
 
 def _assert_resampling_default_settings(dml_obj):
     assert dml_obj.n_folds == 5
@@ -188,3 +190,10 @@ def test_sensitivity_defaults():
 
     dml_plr.sensitivity_analysis()
     assert dml_plr._sensitivity_params['input'] == input_dict
+
+
+@pytest.mark.ci
+def test_policytree_defaults():
+    assert policy_tree.policy_tree.max_depth == 2
+    assert policy_tree.policy_tree.min_samples_leaf == 8
+    assert policy_tree.policy_tree.ccp_alpha == 0.01
diff --git a/doubleml/tests/test_doubleml_return_types.py b/doubleml/tests/test_doubleml_return_types.py
@@ -4,7 +4,7 @@
 import plotly
 
 from doubleml import DoubleMLPLR, DoubleMLIRM, DoubleMLIIVM, DoubleMLPLIV, DoubleMLData, DoubleMLClusterData, \
-    DoubleMLCVAR, DoubleMLPQ, DoubleMLLPQ, DoubleMLDID, DoubleMLDIDCS
+    DoubleMLCVAR, DoubleMLPQ, DoubleMLLPQ, DoubleMLDID, DoubleMLDIDCS, DoubleMLPolicyTree
 from doubleml.datasets import make_plr_CCDDHNR2018, make_irm_data, make_pliv_CHS2015, make_iivm_data,\
     make_pliv_multiway_cluster_CKMS2021, make_did_SZ2020
 
@@ -395,3 +395,13 @@ def test_sensitivity():
     assert isinstance(did_cs_dml1._calc_robustness_value(null_hypothesis=0.0, level=0.95, rho=1.0, idx_treatment=0), tuple)
     did_cs_benchmark = did_cs_dml1.sensitivity_benchmark(benchmarking_set=['Z1'])
     assert isinstance(did_cs_benchmark, pd.DataFrame)
+
+
+@pytest.mark.ci
+def test_policytree():
+    features = dml_data_irm.data.drop(columns=["y", "d"])
+    policy_tree = dml_irm.policy_tree(features, depth=1)
+    assert isinstance(policy_tree, DoubleMLPolicyTree)
+    assert isinstance(policy_tree.plot_tree(), list)
+    predict_features = pd.DataFrame(np.random.normal(size=(5, 20)), columns=features.keys())
+    assert isinstance(policy_tree.predict(predict_features), pd.DataFrame)
diff --git a/doubleml/tests/test_policytree.py b/doubleml/tests/test_policytree.py
diff --git a/requirements.txt b/requirements.txt