diff --git a/.gitignore b/.gitignore index 46fe01e..2807f81 100644 --- a/.gitignore +++ b/.gitignore @@ -103,3 +103,6 @@ venv.bak/ # mypy .mypy_cache/ + +# vscode stuff +.vscode/ \ No newline at end of file diff --git a/chapters/1 Introduction/Example 1 Email Spam .ipynb b/chapters/1 Introduction/Example 1 Email Spam .ipynb new file mode 100644 index 0000000..691911d --- /dev/null +++ b/chapters/1 Introduction/Example 1 Email Spam .ipynb @@ -0,0 +1,415 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
testword_freq_makeword_freq_addressword_freq_allword_freq_3dword_freq_ourword_freq_overword_freq_removeword_freq_internetword_freq_order...char_freq_;char_freq_(char_freq_[char_freq_!char_freq_$char_freq_#capital_run_length_averagecapital_run_length_longestcapital_run_length_totalspam
010.000.640.640.00.320.000.000.000.00...0.000.0000.00.7780.0000.0003.756612781
100.210.280.500.00.140.280.210.070.00...0.000.1320.00.3720.1800.0485.11410110281
210.060.000.710.01.230.190.190.120.64...0.010.1430.00.2760.1840.0109.82148522591
300.000.000.000.00.630.000.310.630.31...0.000.1370.00.1370.0000.0003.537401911
400.000.000.000.00.630.000.310.630.31...0.000.1350.00.1350.0000.0003.537401911
\n", + "

5 rows × 59 columns

\n", + "
" + ], + "text/plain": [ + " test word_freq_make word_freq_address word_freq_all word_freq_3d \\\n", + "0 1 0.00 0.64 0.64 0.0 \n", + "1 0 0.21 0.28 0.50 0.0 \n", + "2 1 0.06 0.00 0.71 0.0 \n", + "3 0 0.00 0.00 0.00 0.0 \n", + "4 0 0.00 0.00 0.00 0.0 \n", + "\n", + " word_freq_our word_freq_over word_freq_remove word_freq_internet \\\n", + "0 0.32 0.00 0.00 0.00 \n", + "1 0.14 0.28 0.21 0.07 \n", + "2 1.23 0.19 0.19 0.12 \n", + "3 0.63 0.00 0.31 0.63 \n", + "4 0.63 0.00 0.31 0.63 \n", + "\n", + " word_freq_order ... char_freq_; char_freq_( char_freq_[ char_freq_! \\\n", + "0 0.00 ... 0.00 0.000 0.0 0.778 \n", + "1 0.00 ... 0.00 0.132 0.0 0.372 \n", + "2 0.64 ... 0.01 0.143 0.0 0.276 \n", + "3 0.31 ... 0.00 0.137 0.0 0.137 \n", + "4 0.31 ... 0.00 0.135 0.0 0.135 \n", + "\n", + " char_freq_$ char_freq_# capital_run_length_average \\\n", + "0 0.000 0.000 3.756 \n", + "1 0.180 0.048 5.114 \n", + "2 0.184 0.010 9.821 \n", + "3 0.000 0.000 3.537 \n", + "4 0.000 0.000 3.537 \n", + "\n", + " capital_run_length_longest capital_run_length_total spam \n", + "0 61 278 1 \n", + "1 101 1028 1 \n", + "2 485 2259 1 \n", + "3 40 191 1 \n", + "4 40 191 1 \n", + "\n", + "[5 rows x 59 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_spam = pd.read_csv(\"../../data/Spam.txt\")\n", + "df_spam.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['test', 'word_freq_make', 'word_freq_address', 'word_freq_all',\n", + " 'word_freq_3d', 'word_freq_our', 'word_freq_over', 'word_freq_remove',\n", + " 'word_freq_internet', 'word_freq_order', 'word_freq_mail',\n", + " 'word_freq_receive', 'word_freq_will', 'word_freq_people',\n", + " 'word_freq_report', 'word_freq_addresses', 'word_freq_free',\n", + " 'word_freq_business', 'word_freq_email', 'word_freq_you',\n", + " 'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000',\n", + " 'word_freq_money', 'word_freq_hp', 'word_freq_hpl', 'word_freq_george',\n", + " 'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet',\n", + " 'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85',\n", + " 'word_freq_technology', 'word_freq_1999', 'word_freq_parts',\n", + " 'word_freq_pm', 'word_freq_direct', 'word_freq_cs', 'word_freq_meeting',\n", + " 'word_freq_original', 'word_freq_project', 'word_freq_re',\n", + " 'word_freq_edu', 'word_freq_table', 'word_freq_conference',\n", + " 'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!',\n", + " 'char_freq_$', 'char_freq_#', 'capital_run_length_average',\n", + " 'capital_run_length_longest', 'capital_run_length_total', 'spam'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "print(df_spam.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet', 'order', 'mail', 'receive', 'will', 'people', 'report', 'addresses', 'free', 'business', 'email', 'you', 'credit', 'your', 'font', '000', 'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857', 'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct', 'cs', 'meeting', 'original', 'project', 're', 'edu', 'table', 'conference']\n" + ] + } + ], + "source": [ + "words_feature = [word for word in df_spam.columns if 'word_freq' in word]\n", + "words = [word[10:] for word in words_feature]\n", + "# print(words_feature)\n", + "print(words)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
georgeyouyourhpfreehplourreeduremove000business
spam0.002.261.380.020.520.010.510.130.010.280.250.29
email1.271.270.440.900.070.430.180.420.290.010.010.05
\n", + "
" + ], + "text/plain": [ + " george you your hp free hpl our re edu remove 000 business\n", + "spam 0.00 2.26 1.38 0.02 0.52 0.01 0.51 0.13 0.01 0.28 0.25 0.29\n", + "email 1.27 1.27 0.44 0.90 0.07 0.43 0.18 0.42 0.29 0.01 0.01 0.05" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "temp = df_spam[words_feature + ['spam']].groupby('spam').mean().sort_index(ascending=False)\n", + "temp.columns = words\n", + "temp.index = ['spam','email']\n", + "with pd.option_context(\"float_format\",'{:,.2f}'.format):\n", + " display(temp[(temp.iloc[0] - temp.iloc[1]).abs().sort_values(ascending=False).index[:12]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "interpreter": { + "hash": "58dd21b98941f49f3ff1e43b315fad01188084135da91d73015e8a9aac331f94" + }, + "kernelspec": { + "display_name": "Python 3.9.5 64-bit ('env_esl': virtualenvwrapper)", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file