From 055d0ede2d7b0ca35b049baedf19965b13b7e30f Mon Sep 17 00:00:00 2001 From: "Daniel Shats Daniel.Shats1@ibm.com" Date: Tue, 10 Jan 2023 05:44:32 -0500 Subject: [PATCH 1/5] start of the cache benchmark examples --- examples/cache_test.ipynb | 166 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 examples/cache_test.ipynb diff --git a/examples/cache_test.ipynb b/examples/cache_test.ipynb new file mode 100644 index 000000000..bc6232409 --- /dev/null +++ b/examples/cache_test.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pathlib\n", + "import pandas as pd\n", + "from fuseimg.datasets.knight import KNIGHT\n", + "from torch.utils.data.dataloader import DataLoader\n", + "from fuse.data.utils.collates import CollateDefault" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "experiment_num = 0\n", + "cache_path = \"/dccstor/mm_hcls/shatz/knight_cache/\"\n", + "data_path = os.environ[\"KNIGHT_DATA\"]\n", + "cache_path = os.path.join(cache_path, f\"exp_num_{str(experiment_num)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "splits = pd.read_pickle(\"/u/shatz/repos/fuse-med-ml/examples/fuse_examples/imaging/classification/knight/baseline/splits_final.pkl\")\n", + "split = splits[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pipeline description hash for [train_cache] is: hash_9c4aa89acd27fb78ddce20428f575665\n", + "- Load and cache data:\n", + "entire samples set samples_ids_hash@d9888790bcbdb60fa9a3de33a134a706.pkl.gz already cached. Found /dccstor/mm_hcls/shatz/knight_cache/exp_num_0/train/train_cache/hash_9c4aa89acd27fb78ddce20428f575665/full_sets_info/samples_ids_hash@d9888790bcbdb60fa9a3de33a134a706.pkl.gz\n", + "- Load and cache data: Done\n", + "Train Data: Done {'attrs': 'bold'}\n", + "Validation Data: {'attrs': 'bold'}\n", + "pipeline description hash for [val_cache] is: hash_9c4aa89acd27fb78ddce20428f575665\n", + "- Load and cache data:\n", + "entire samples set samples_ids_hash@b28209a8409367ee38706d9166530826.pkl.gz already cached. Found /dccstor/mm_hcls/shatz/knight_cache/exp_num_0/val/val_cache/hash_9c4aa89acd27fb78ddce20428f575665/full_sets_info/samples_ids_hash@b28209a8409367ee38706d9166530826.pkl.gz\n", + "- Load and cache data: Done\n", + "Validation Data: Done {'attrs': 'bold'}\n" + ] + } + ], + "source": [ + "train_ds, valid_ds = KNIGHT.dataset(\n", + " data_path=data_path,\n", + " cache_dir=None,\n", + " split=split,\n", + " reset_cache=False,\n", + " resize_to=(70, 256, 256),\n", + " max_allowed_used_space=0.99,\n", + ")\n", + "\n", + "train_dl = DataLoader(\n", + " dataset=train_ds,\n", + " shuffle=False,\n", + " drop_last=False,\n", + " # batch_sampler=sampler,\n", + " collate_fn=CollateDefault(),\n", + " num_workers=0,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# train_ds, valid_ds = KNIGHT.dataset(\n", + " # data_path=data_path,\n", + " # cache_dir=cache_path,\n", + " # split=split,\n", + " # reset_cache=False,\n", + " # resize_to=(70, 256, 256),\n", + " # max_allowed_used_space=0.99,\n", + "# )\n", + "\n", + "# train_dl = DataLoader(\n", + " # dataset=train_ds,\n", + " # shuffle=False,\n", + " # drop_last=False,\n", + " # # batch_sampler=sampler,\n", + " # collate_fn=CollateDefault(),\n", + " # num_workers=0,\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "test~\n", + "CPU times: user 8min 12s, sys: 7.31 s, total: 8min 19s\n", + "Wall time: 2min 3s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "for _ in train_dl:\n", + " pass\n", + "\n", + "print(\"test~\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fuse_env1", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.15" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "478d75c6ad124e9d588857d831b6f57d098feccaed2912037e9244dbb2ef03a1" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 8b98bf0ffb6bf4341bebc0e0ec8ce594a269f947 Mon Sep 17 00:00:00 2001 From: "Daniel Shats Daniel.Shats1@ibm.com" Date: Tue, 10 Jan 2023 07:05:59 -0500 Subject: [PATCH 2/5] removed hard-coded num_workers --- fuseimg/datasets/knight.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fuseimg/datasets/knight.py b/fuseimg/datasets/knight.py index 6b86c9e63..75ddcfd46 100644 --- a/fuseimg/datasets/knight.py +++ b/fuseimg/datasets/knight.py @@ -258,6 +258,7 @@ def dataset( test: bool = False, reset_cache: bool = False, resize_to: Tuple = (70, 256, 256), + num_workers=8, ) -> DatasetDefault: """ Get cached dataset @@ -277,7 +278,7 @@ def dataset( if sample_ids is not None: static_pipeline = KNIGHT.static_pipeline(data_path, resize_to=resize_to, test=test) cacher = SamplesCacher( - "cache", static_pipeline, cache_dirs=[f"{cache_dir}/data"], restart_cache=reset_cache, workers=8 + "cache", static_pipeline, cache_dirs=[f"{cache_dir}/data"], restart_cache=reset_cache, workers=num_workers ) dataset = DatasetDefault( sample_ids=sample_ids, @@ -293,7 +294,7 @@ def dataset( static_pipeline = KNIGHT.static_pipeline(data_path, resize_to=resize_to, test=("test" in split)) if "train" in split: train_cacher = SamplesCacher( - "train_cache", static_pipeline, cache_dirs=[f"{cache_dir}/train"], restart_cache=reset_cache, workers=8 + "train_cache", static_pipeline, cache_dirs=[f"{cache_dir}/train"], restart_cache=reset_cache, workers=num_workers ) train_dataset = DatasetDefault( @@ -314,7 +315,7 @@ def dataset( print("Validation Data:", {"attrs": "bold"}) val_cacher = SamplesCacher( - "val_cache", static_pipeline, cache_dirs=[f"{cache_dir}/val"], restart_cache=reset_cache, workers=8 + "val_cache", static_pipeline, cache_dirs=[f"{cache_dir}/val"], restart_cache=reset_cache, workers=num_workers ) ## Create dataset validation_dataset = DatasetDefault( From 462bdb4fdb2ff19401c2b3b3171493640b6547e4 Mon Sep 17 00:00:00 2001 From: "Daniel Shats Daniel.Shats1@ibm.com" Date: Tue, 10 Jan 2023 10:03:06 -0500 Subject: [PATCH 3/5] made cacheing optional in KNIGHT and made a notebook to compare caching vs not caching --- examples/cache_test.ipynb | 886 +++++++++++++++++++++++++++++++++++-- fuseimg/datasets/knight.py | 42 +- 2 files changed, 870 insertions(+), 58 deletions(-) diff --git a/examples/cache_test.ipynb b/examples/cache_test.ipynb index bc6232409..07348d04c 100644 --- a/examples/cache_test.ipynb +++ b/examples/cache_test.ipynb @@ -1,11 +1,20 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 🔥 A notebook to explain why caching is great" + ] + }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ + "import time\n", + "from tqdm.notebook import tqdm\n", "import os\n", "import pathlib\n", "import pandas as pd\n", @@ -21,7 +30,7 @@ "outputs": [], "source": [ "experiment_num = 0\n", - "cache_path = \"/dccstor/mm_hcls/shatz/knight_cache/\"\n", + "cache_path = \"/dccstor/mm_hcls/shatz/knight/cache/\"\n", "data_path = os.environ[\"KNIGHT_DATA\"]\n", "cache_path = os.path.join(cache_path, f\"exp_num_{str(experiment_num)}\")" ] @@ -40,92 +49,882 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, + "outputs": [], + "source": [ + "num_epochs = range(15)\n", + "num_workers = 48\n", + "batch_size = 4" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 🚙 Dataset without cache" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": false + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "pipeline description hash for [train_cache] is: hash_9c4aa89acd27fb78ddce20428f575665\n", - "- Load and cache data:\n", - "entire samples set samples_ids_hash@d9888790bcbdb60fa9a3de33a134a706.pkl.gz already cached. Found /dccstor/mm_hcls/shatz/knight_cache/exp_num_0/train/train_cache/hash_9c4aa89acd27fb78ddce20428f575665/full_sets_info/samples_ids_hash@d9888790bcbdb60fa9a3de33a134a706.pkl.gz\n", - "- Load and cache data: Done\n", + "✋ Caching is OFF\n", + "- Load data:\n", + "- Load data: Done\n", "Train Data: Done {'attrs': 'bold'}\n", "Validation Data: {'attrs': 'bold'}\n", - "pipeline description hash for [val_cache] is: hash_9c4aa89acd27fb78ddce20428f575665\n", "- Load and cache data:\n", - "entire samples set samples_ids_hash@b28209a8409367ee38706d9166530826.pkl.gz already cached. Found /dccstor/mm_hcls/shatz/knight_cache/exp_num_0/val/val_cache/hash_9c4aa89acd27fb78ddce20428f575665/full_sets_info/samples_ids_hash@b28209a8409367ee38706d9166530826.pkl.gz\n", "- Load and cache data: Done\n", "Validation Data: Done {'attrs': 'bold'}\n" ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "74ec6acc3bce44e18b8afda773d7961a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/15 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "plt.plot(np.cumsum(times_cache), label=\"cache (without counting cache time)\")\n", + "plt.plot([t + total_cache_time for t in np.cumsum(times_cache)], label=\"cache (with cache time)\")\n", + "plt.plot(np.cumsum(times_nocache), label=\"no cache\")\n", + "plt.xlabel(\"Epochs\")\n", + "plt.ylabel(\"Cumulative Time (s)\")\n", + "plt.title(\"Fuse Cache vs No Cache - Knight Data\")\n", + "plt.legend()\n", + "plt.show()" ] }, { @@ -138,7 +937,7 @@ ], "metadata": { "kernelspec": { - "display_name": "fuse_env1", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -154,7 +953,6 @@ "pygments_lexer": "ipython3", "version": "3.8.15" }, - "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "478d75c6ad124e9d588857d831b6f57d098feccaed2912037e9244dbb2ef03a1" diff --git a/fuseimg/datasets/knight.py b/fuseimg/datasets/knight.py index 75ddcfd46..0f25cadf4 100644 --- a/fuseimg/datasets/knight.py +++ b/fuseimg/datasets/knight.py @@ -263,7 +263,7 @@ def dataset( """ Get cached dataset :param data_path: path to store the original data - :param cache_dir: path to store the cache + :param cache_dir: path to store the cache (make None to turn off caching altogether) :param split: dictionary including sample ids for (train and validation) or test. :param sample_ids: dataset including the specified sample_ids. sample_id is case_{id:05d} (for example case_00001 or case_00100). either split or sample_ids is not None. there is no need in both of them. @@ -274,28 +274,41 @@ def dataset( train_dynamic_pipeline = KNIGHT.train_dynamic_pipeline() val_dynamic_pipeline = KNIGHT.val_dynamic_pipeline() + # turn caching off if cache_dir is None + if cache_dir: + print("👍 Caching is ON") + use_caching = True + else: + print("✋ Caching is OFF") + cacher = None + val_cacher = None + train_cacher = None + use_caching = False + # Create dataset if sample_ids is not None: static_pipeline = KNIGHT.static_pipeline(data_path, resize_to=resize_to, test=test) - cacher = SamplesCacher( - "cache", static_pipeline, cache_dirs=[f"{cache_dir}/data"], restart_cache=reset_cache, workers=num_workers - ) + if use_caching: + cacher = SamplesCacher( + "cache", static_pipeline, cache_dirs=[f"{cache_dir}/data"], restart_cache=reset_cache, workers=num_workers + ) dataset = DatasetDefault( sample_ids=sample_ids, static_pipeline=static_pipeline, dynamic_pipeline=val_dynamic_pipeline if test else train_dynamic_pipeline, cacher=cacher, ) - print("- Load and cache data:") + print("- Load data:") dataset.create() - print("- Load and cache data: Done") + print("- Load data: Done") return dataset static_pipeline = KNIGHT.static_pipeline(data_path, resize_to=resize_to, test=("test" in split)) if "train" in split: - train_cacher = SamplesCacher( - "train_cache", static_pipeline, cache_dirs=[f"{cache_dir}/train"], restart_cache=reset_cache, workers=num_workers - ) + if use_caching: + train_cacher = SamplesCacher( + "train_cache", static_pipeline, cache_dirs=[f"{cache_dir}/train"], restart_cache=reset_cache, workers=num_workers + ) train_dataset = DatasetDefault( sample_ids=split["train"], @@ -304,19 +317,20 @@ def dataset( cacher=train_cacher, ) - print("- Load and cache data:") + print("- Load data:") train_dataset.create() - print("- Load and cache data: Done") + print("- Load data: Done") print("Train Data: Done", {"attrs": "bold"}) #### Validation data print("Validation Data:", {"attrs": "bold"}) - val_cacher = SamplesCacher( - "val_cache", static_pipeline, cache_dirs=[f"{cache_dir}/val"], restart_cache=reset_cache, workers=num_workers - ) + if use_caching: + val_cacher = SamplesCacher( + "val_cache", static_pipeline, cache_dirs=[f"{cache_dir}/val"], restart_cache=reset_cache, workers=num_workers + ) ## Create dataset validation_dataset = DatasetDefault( sample_ids=split["val"], From 00716f7601a7a7ca8da10c164fa685ad3c691196 Mon Sep 17 00:00:00 2001 From: "Daniel Shats Daniel.Shats1@ibm.com" Date: Tue, 10 Jan 2023 10:09:59 -0500 Subject: [PATCH 4/5] black --- fuseimg/datasets/knight.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/fuseimg/datasets/knight.py b/fuseimg/datasets/knight.py index 0f25cadf4..130c441f1 100644 --- a/fuseimg/datasets/knight.py +++ b/fuseimg/datasets/knight.py @@ -290,7 +290,11 @@ def dataset( static_pipeline = KNIGHT.static_pipeline(data_path, resize_to=resize_to, test=test) if use_caching: cacher = SamplesCacher( - "cache", static_pipeline, cache_dirs=[f"{cache_dir}/data"], restart_cache=reset_cache, workers=num_workers + "cache", + static_pipeline, + cache_dirs=[f"{cache_dir}/data"], + restart_cache=reset_cache, + workers=num_workers, ) dataset = DatasetDefault( sample_ids=sample_ids, @@ -307,7 +311,11 @@ def dataset( if "train" in split: if use_caching: train_cacher = SamplesCacher( - "train_cache", static_pipeline, cache_dirs=[f"{cache_dir}/train"], restart_cache=reset_cache, workers=num_workers + "train_cache", + static_pipeline, + cache_dirs=[f"{cache_dir}/train"], + restart_cache=reset_cache, + workers=num_workers, ) train_dataset = DatasetDefault( @@ -329,7 +337,11 @@ def dataset( if use_caching: val_cacher = SamplesCacher( - "val_cache", static_pipeline, cache_dirs=[f"{cache_dir}/val"], restart_cache=reset_cache, workers=num_workers + "val_cache", + static_pipeline, + cache_dirs=[f"{cache_dir}/val"], + restart_cache=reset_cache, + workers=num_workers, ) ## Create dataset validation_dataset = DatasetDefault( From a84ff983e21e1b58a2761ee9569dec5bd09829b4 Mon Sep 17 00:00:00 2001 From: "Daniel Shats Daniel.Shats1@ibm.com" Date: Tue, 10 Jan 2023 10:35:29 -0500 Subject: [PATCH 5/5] mypy --- fuseimg/datasets/knight.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fuseimg/datasets/knight.py b/fuseimg/datasets/knight.py index 130c441f1..e6994a858 100644 --- a/fuseimg/datasets/knight.py +++ b/fuseimg/datasets/knight.py @@ -258,7 +258,7 @@ def dataset( test: bool = False, reset_cache: bool = False, resize_to: Tuple = (70, 256, 256), - num_workers=8, + num_workers: int = 8, ) -> DatasetDefault: """ Get cached dataset