Merge branch 'master' of github.com:DoubleML/doubleml-for-py into m-prepare-release-0.1.0

MalteKurz · MalteKurz · commit f9d1e3223c23 · 2020-12-04T15:46:47.000+01:00
diff --git a/.github/workflows/deploy_docu.yml b/.github/workflows/deploy_docu.yml
@@ -15,13 +15,13 @@ on:
 jobs:
   build:
 
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04
 
     steps:
-    - uses: actions/checkout@v2
-      with:
-        persist-credentials: false
-    - name: Install SSH Client
+    - name: Check out the repo containing the Python package
+      uses: actions/checkout@v2
+
+    - name: Install SSH Client for deploying the docu to github pages
       uses: webfactory/ssh-agent@v0.4.1
       with:
         ssh-private-key: ${{ secrets.DEPLOY_KEY }}
@@ -37,14 +37,40 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install -r requirements-dev.txt
-        pip install .
+        pip install -e .
+
+    - name: Add R repository
+      run: |
+        sudo apt install dirmngr gnupg apt-transport-https ca-certificates software-properties-common
+        sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
+        sudo add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/'
     - name: Install R
-      uses: r-lib/actions/setup-r@v1
+      run: |
+        sudo apt-get update
+        sudo apt-get install r-base
+        sudo apt-get install r-base-dev
+
+    - name: Get user library folder
+      run: |
+        mkdir ${GITHUB_WORKSPACE}/tmp_r_libs_user
+        echo R_LIBS_USER=${GITHUB_WORKSPACE}/tmp_r_libs_user >> $GITHUB_ENV
+
+    - name: Query R version
+      run: |
+        writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version")
+      shell: Rscript {0}
+
+    - name: Cache R packages
+      uses: actions/cache@v2
       with:
-        r-version: 'release'
-    - name: Install R kernel for Jupyter
+        path: ${{ env.R_LIBS_USER }}
+        key: doubleml-user-guide-${{ hashFiles('.github/R-version') }}
+
+    - name: Install R kernel for Jupyter and the R package DoubleML
       run: |
         install.packages('remotes')
+        remotes::install_github("DoubleML/doubleml-for-r")
+        install.packages(c("knitr", "rmarkdown", "testthat", "patrick", "mvtnorm", "dplyr", "glmnet", "lgr", "ggplot2", "ranger", "hdm", "sandwich", "AER", "rpart"))
         install.packages('IRkernel')
         IRkernel::installspec()
       shell: Rscript {0}
diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@ It further can be readily extended with regards to
 - ... alternative resampling schemes,
 - ... 
 
-![OOP structure of the DoubleML package](/doc/oop.svg?raw=true)
+![An overview of the OOP structure of the DoubleML package is given in the graphic available at https://github.com/DoubleML/doubleml-for-py/blob/master/doc/oop.svg](/doc/oop.svg?raw=true)
 
 ## Installation
 
diff --git a/doc/guide/algorithms.rst b/doc/guide/algorithms.rst
@@ -87,7 +87,6 @@ The DML algorithm can be selected via parameter ``dml_procedure='dml1'`` vs. ``d
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
         library(mlr3)
@@ -117,7 +116,6 @@ stores the estimate :math:`\tilde{\theta}_0` in its ``coef`` attribute.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         dml_plr_obj$coef
 
@@ -135,7 +133,6 @@ are stored in the attribute ``psi``.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         dml_plr_obj$psi[1:5, ,1]
 
@@ -152,7 +149,6 @@ For the DML1 algorithm, the estimates for the different folds
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         dml_plr_obj$all_dml1_coef
 
diff --git a/doc/guide/basics.rst b/doc/guide/basics.rst
@@ -60,7 +60,6 @@ The nuisance functions are given by
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
         set.seed(1234)
@@ -126,7 +125,6 @@ efficient.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(ggplot2)
 
@@ -219,7 +217,6 @@ other half of observations indexed with :math:`i \in I`
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         non_orth_score = function(y, d, g_hat, m_hat, smpls) {
          u_hat = y - g_hat
@@ -231,7 +228,6 @@ other half of observations indexed with :math:`i \in I`
 
 
     .. jupyter-execute::
-        :raises:
 
         library(mlr3)
         library(mlr3learners)
@@ -334,7 +330,6 @@ orthogonalized regressor :math:`V = D - m(X)`. We then use the final estimate
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(data.table)
         lgr::get_logger("mlr3")$set_threshold("warn")
@@ -412,7 +407,6 @@ induced by overfitting. Cross-fitting performs well empirically.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         set.seed(3333)
 
@@ -489,7 +483,6 @@ The third term :math:`c^*` vanishes in probability if sample splitting is applie
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         g_all = ggplot(data.frame(theta_ols, theta_nonorth, theta_orth_nosplit, theta_dml)) +
                     geom_density(aes(x = theta_ols), fill = "dark blue", alpha = 0.3, color = "dark blue") +
diff --git a/doc/guide/data_backend.rst b/doc/guide/data_backend.rst
@@ -25,7 +25,6 @@ demonstrated in the following. We download the Bonus data set from the Pennsylva
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
 
@@ -70,7 +69,6 @@ serving as treatment variable :math:`D` and the columns ``x_cols=`` specifying t
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         # Specify the data and the variables for the causal model
 
@@ -132,7 +130,6 @@ variable ``y`` and a treatment variable ``d``
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         # Generate data
         set.seed(3141)
@@ -157,7 +154,6 @@ To specify the data and the variables for the causal model from arrays we call
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         obj_dml_data_sim = double_ml_data_from_matrix(X = X, y = y, d = d)
         obj_dml_data_sim
diff --git a/doc/guide/learners.rst b/doc/guide/learners.rst
@@ -216,7 +216,6 @@ package for R.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
         library(mlr3)
@@ -244,7 +243,6 @@ Setting hyperparameters:
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         set.seed(3141)
         ml_g = lrn("regr.ranger", num.trees=10)
@@ -281,7 +279,6 @@ Setting treatment-variable-specific or fold-specific hyperparameters:
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         set.seed(3141)
         ml_g = lrn("regr.ranger")
@@ -307,7 +304,6 @@ The following example illustrates how to set parameters for each fold.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         learner = lrn("regr.ranger")
         ml_g = learner$clone()
@@ -340,7 +336,6 @@ To illustrate the parameter tuning, we generate data from a sparse partially lin
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
         library(mlr3)
@@ -387,7 +382,6 @@ for tuning, each of the two folds would be split up into 5 subfolds and the erro
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
         library(mlr3)
@@ -432,7 +426,6 @@ external parameter tuning of the nuisance parts. The optimally chosen parameters
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
         library(mlr3)
@@ -458,7 +451,6 @@ as provided by the ``ranger`` package.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
         library(mlr3)
diff --git a/doc/guide/models.rst b/doc/guide/models.rst
@@ -35,7 +35,6 @@ Estimation is conducted via its ``fit()`` method:
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
         library(mlr3)
@@ -88,7 +87,6 @@ Estimation is conducted via its ``fit()`` method:
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
         library(mlr3)
@@ -137,7 +135,6 @@ Estimation is conducted via its ``fit()`` method:
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
         library(mlr3)
@@ -187,7 +184,6 @@ Estimation is conducted via its ``fit()`` method:
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
         library(mlr3)
diff --git a/doc/guide/resampling.rst b/doc/guide/resampling.rst
@@ -36,7 +36,6 @@ implemented in ``DoubleMLPLR``.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         library(DoubleML)
         library(mlr3)
@@ -71,7 +70,6 @@ The default setting is ``n_folds = 5`` and ``n_rep = 1``, i.e.,
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         dml_plr_obj = DoubleMLPLR$new(obj_dml_data, ml_g, ml_m, n_folds = 5, n_rep = 1)
         print(dml_plr_obj$n_folds)
@@ -92,7 +90,6 @@ The :math:`K`-fold random partition is stored in the ``smpls`` attribute of the
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         dml_plr_obj$smpls
 
@@ -119,7 +116,6 @@ stored in the attributes ``psi_a`` and ``psi_b``.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         dml_plr_obj$fit()
         print(dml_plr_obj$psi_a[1:5, ,1])
@@ -142,7 +138,6 @@ It results in :math:`M` random :math:`K`-fold partitions being drawn.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         dml_plr_obj = DoubleMLPLR$new(obj_dml_data, ml_g, ml_m, n_folds = 5, n_rep = 10)
         print(dml_plr_obj$n_folds)
@@ -170,7 +165,6 @@ The third dimension refers to the treatment variable and becomes non-singleton i
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         dml_plr_obj$fit()
         print(dml_plr_obj$psi_a[1:5, ,1])
@@ -199,7 +193,6 @@ and the asymptotic standard error :math:`\hat{\sigma}/\sqrt{N}` in ``se``.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         print(dml_plr_obj$coef)
         print(dml_plr_obj$se)
@@ -218,7 +211,6 @@ The parameter estimates :math:`(\tilde{\theta}_{0,m})_{m \in [M]}` and asymptoti
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         print(dml_plr_obj$all_coef)
         print(dml_plr_obj$all_se)
@@ -246,7 +238,6 @@ initialization of the ``DoubleMLPLR`` object.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         set.seed(314)
         dml_plr_obj_internal = DoubleMLPLR$new(obj_dml_data, ml_g, ml_m, n_folds = 4)
@@ -273,7 +264,6 @@ and set the partition via the ``set_sample_splitting()`` method.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         dml_plr_obj_external = DoubleMLPLR$new(obj_dml_data, ml_g, ml_m, draw_sample_splitting = FALSE)
 
@@ -312,7 +302,6 @@ Note that cross-fitting performs well empirically and is recommended to remove b
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         dml_plr_obj_external = DoubleMLPLR$new(obj_dml_data, ml_g, ml_m,
                                                n_folds = 2, apply_cross_fitting = FALSE)
@@ -339,7 +328,6 @@ via ``set_sample_splitting()`` needs to be applied, like for example:
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         dml_plr_obj_external = DoubleMLPLR$new(obj_dml_data, ml_g, ml_m,
                                                 n_folds = 2, apply_cross_fitting = FALSE,
@@ -381,7 +369,6 @@ justification, see also :ref:`bias_overfitting`.
 .. tabbed:: R
 
     .. jupyter-execute::
-        :raises:
 
         dml_plr_no_split = DoubleMLPLR$new(obj_dml_data, ml_g, ml_m,
                                            n_folds = 1, apply_cross_fitting = FALSE)
diff --git a/doc/guide/scores.rst b/doc/guide/scores.rst
diff --git a/doc/guide/se_confint.rst b/doc/guide/se_confint.rst
diff --git a/doc/intro/intro.rst b/doc/intro/intro.rst
diff --git a/setup.py b/setup.py