Unit-2 Ipynb
Unit-2 Ipynb
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Unit-2\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Loading"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Loading the dataset\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### importing the data the mail data\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X-GM-THRID\n",
"X-Gmail-Labels\n",
"MIME-Version\n",
"Date\n",
"Message-ID\n",
"Subject\n",
"From\n",
"To\n",
"Content-Type\n"
]
},
{
"data": {
"text/plain": [
"<mailbox.mbox at 0x12a9cacd490>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import mailbox\n",
"\n",
"mboxfile = \"./All mail Including Spam and Trash.mbox\"\n",
"mbox = mailbox.mbox(mboxfile)\n",
"\n",
"for key in mbox[0].keys():\n",
" print(key)\n",
"mbox"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Data transformation"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Data cleansing"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"\n",
"with open('testdata.csv','w') as outputfile:\n",
" writing = csv.writer(outputfile)\n",
" writing.writerow(['subject', 'from', 'date', 'to', 'label','Thread'])\n",
"\n",
" for message in mbox:\n",
" writing.writerow([\n",
" message['subject'],\n",
" message['from'],\n",
" message['date'],\n",
" message['to'],\n",
" message['X-Gmail-Labels'],\n",
" message['X-GM-THRID']\n",
" ])\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Loading the csv file\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject</th>\n",
" <th>from</th>\n",
" <th>date</th>\n",
" <th>to</th>\n",
" <th>label</th>\n",
" <th>Thread</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Conceptual Physics book file</td>\n",
" <td>Kadarla Sai Nithin <[email protected]></td>\n",
" <td>Wed, 11 Jan 2023 10:00:04 +0530</td>\n",
" <td>Suresh Kumar Lokhande <[email protected]></td>\n",
" <td>Archived,Sent</td>\n",
" <td>1754524927945493361</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Thank you for signing up for ThingSpeak!</td>\n",
" <td>ThingSpeak Support <[email protected]></td>\n",
" <td>Sat, 22 Apr 2023 09:56:31 +0000</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Category Updates,Unread</td>\n",
" <td>1763869869345248847</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Get the official Gmail app</td>\n",
" <td>Gmail Team <[email protected]></td>\n",
" <td>Wed, 30 Nov 2022 00:05:59 -0800</td>\n",
" <td>Kadarla Sai Nithin <[email protected]></td>\n",
" <td>Inbox,Unread</td>\n",
" <td>1750907548775931784</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Reset Your Password</td>\n",
" <td>Xmind Notifications <[email protected]....</td>\n",
" <td>Tue, 16 May 2023 15:37:20 +0000</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1766065641328425275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Security alert</td>\n",
" <td>Google <[email protected]></td>\n",
" <td>Sat, 25 Feb 2023 04:51:44 GMT</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Category Updates,Unread</td>\n",
" <td>1758777265723926184</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>Verify Email Address</td>\n",
" <td>[email protected]</td>\n",
" <td>Sat, 22 Apr 2023 09:55:12 +0000</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1763869787228555223</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>uceou.edu account email verification code</td>\n",
" <td>[email protected]</td>\n",
" <td>Sun, 19 Mar 2023 23:55:52 -0700</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1760868809297254026</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63</th>\n",
" <td>uceou.edu account email verification code</td>\n",
" <td>[email protected]</td>\n",
" <td>Mon, 13 Mar 2023 23:40:25 -0700</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1760324258667829753</td>\n",
" </tr>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>Security alert</td>\n",
" <td>Google <[email protected]></td>\n",
" <td>Fri, 19 May 2023 10:03:42 GMT</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Opened,Category Updates</td>\n",
" <td>1766316440594475561</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>[Request received]</td>\n",
" <td>\"Tech (Support)\" <[email protected]></td>\n",
" <td>Wed, 08 Feb 2023 14:48:12 +0000</td>\n",
" <td>\"sainithink.be25\" <[email protected]></td>\n",
" <td>Inbox,Category Updates,Unread</td>\n",
" <td>1757274643381080149</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>66 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" subject \\\n",
"0 Conceptual Physics book file \n",
"1 Thank you for signing up for ThingSpeak! \n",
"2 Get the official Gmail app \n",
"3 Reset Your Password \n",
"4 Security alert \n",
".. ... \n",
"61 Verify Email Address \n",
"62 uceou.edu account email verification code \n",
"63 uceou.edu account email verification code \n",
"64 Security alert \n",
"65 [Request received] \n",
"\n",
" from \\\n",
"0 Kadarla Sai Nithin <[email protected]> \n",
"1 ThingSpeak Support <[email protected]> \n",
"2 Gmail Team <[email protected]> \n",
"3 Xmind Notifications <[email protected].... \n",
"4 Google <[email protected]> \n",
".. ... \n",
"61 [email protected] \n",
"62 [email protected] \n",
"63 [email protected] \n",
"64 Google <[email protected]> \n",
"65 \"Tech (Support)\" <[email protected]> \n",
"\n",
" date \\\n",
"0 Wed, 11 Jan 2023 10:00:04 +0530 \n",
"1 Sat, 22 Apr 2023 09:56:31 +0000 \n",
"2 Wed, 30 Nov 2022 00:05:59 -0800 \n",
"3 Tue, 16 May 2023 15:37:20 +0000 \n",
"4 Sat, 25 Feb 2023 04:51:44 GMT \n",
".. ... \n",
"61 Sat, 22 Apr 2023 09:55:12 +0000 \n",
"62 Sun, 19 Mar 2023 23:55:52 -0700 \n",
"63 Mon, 13 Mar 2023 23:40:25 -0700 \n",
"64 Fri, 19 May 2023 10:03:42 GMT \n",
"65 Wed, 08 Feb 2023 14:48:12 +0000 \n",
"\n",
" to \\\n",
"0 Suresh Kumar Lokhande <[email protected]> \n",
"1 [email protected] \n",
"2 Kadarla Sai Nithin <[email protected]> \n",
"3 [email protected] \n",
"4 [email protected] \n",
".. ... \n",
"61 [email protected] \n",
"62 [email protected] \n",
"63 [email protected] \n",
"64 [email protected] \n",
"65 \"sainithink.be25\" <[email protected]> \n",
"\n",
" label Thread \n",
"0 Archived,Sent 1754524927945493361 \n",
"1 Inbox,Category Updates,Unread 1763869869345248847 \n",
"2 Inbox,Unread 1750907548775931784 \n",
"3 Inbox,Important,Opened,Category Updates 1766065641328425275 \n",
"4 Inbox,Category Updates,Unread 1758777265723926184 \n",
".. ... ... \n",
"61 Inbox,Important,Opened,Category Updates 1763869787228555223 \n",
"62 Inbox,Important,Opened,Category Updates 1760868809297254026 \n",
"63 Inbox,Important,Opened,Category Updates 1760324258667829753 \n",
"64 Inbox,Opened,Category Updates 1766316440594475561 \n",
"65 Inbox,Category Updates,Unread 1757274643381080149 \n",
"\n",
"[66 rows x 6 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfs = pd.read_csv('testdata.csv')\n",
"dfs"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Converting the data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"subject object\n",
"from object\n",
"date object\n",
"to object\n",
"label object\n",
"Thread int64\n",
"dtype: object\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject</th>\n",
" <th>from</th>\n",
" <th>date</th>\n",
" <th>to</th>\n",
" <th>label</th>\n",
" <th>Thread</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Conceptual Physics book file</td>\n",
" <td>Kadarla Sai Nithin <[email protected]></td>\n",
" <td>2023-01-11 04:30:04+00:00</td>\n",
" <td>Suresh Kumar Lokhande <[email protected]></td>\n",
" <td>Archived,Sent</td>\n",
" <td>1754524927945493361</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Thank you for signing up for ThingSpeak!</td>\n",
" <td>ThingSpeak Support <[email protected]></td>\n",
" <td>2023-04-22 09:56:31+00:00</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Category Updates,Unread</td>\n",
" <td>1763869869345248847</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Get the official Gmail app</td>\n",
" <td>Gmail Team <[email protected]></td>\n",
" <td>2022-11-30 08:05:59+00:00</td>\n",
" <td>Kadarla Sai Nithin <[email protected]></td>\n",
" <td>Inbox,Unread</td>\n",
" <td>1750907548775931784</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Reset Your Password</td>\n",
" <td>Xmind Notifications <[email protected]....</td>\n",
" <td>2023-05-16 15:37:20+00:00</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1766065641328425275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Security alert</td>\n",
" <td>Google <[email protected]></td>\n",
" <td>2023-02-25 04:51:44+00:00</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Category Updates,Unread</td>\n",
" <td>1758777265723926184</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>61</th>\n",
" <td>Verify Email Address</td>\n",
" <td>[email protected]</td>\n",
" <td>2023-04-22 09:55:12+00:00</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1763869787228555223</td>\n",
" </tr>\n",
" <tr>\n",
" <th>62</th>\n",
" <td>uceou.edu account email verification code</td>\n",
" <td>[email protected]</td>\n",
" <td>2023-03-20 06:55:52+00:00</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1760868809297254026</td>\n",
" </tr>\n",
" <tr>\n",
" <th>63</th>\n",
" <td>uceou.edu account email verification code</td>\n",
" <td>[email protected]</td>\n",
" <td>2023-03-14 06:40:25+00:00</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Important,Opened,Category Updates</td>\n",
" <td>1760324258667829753</td>\n",
" </tr>\n",
" <tr>\n",
" <th>64</th>\n",
" <td>Security alert</td>\n",
" <td>Google <[email protected]></td>\n",
" <td>2023-05-19 10:03:42+00:00</td>\n",
" <td>[email protected]</td>\n",
" <td>Inbox,Opened,Category Updates</td>\n",
" <td>1766316440594475561</td>\n",
" </tr>\n",
" <tr>\n",
" <th>65</th>\n",
" <td>[Request received]</td>\n",
" <td>\"Tech (Support)\" <[email protected]></td>\n",
" <td>2023-02-08 14:48:12+00:00</td>\n",
" <td>\"sainithink.be25\" <[email protected]></td>\n",
" <td>Inbox,Category Updates,Unread</td>\n",
" <td>1757274643381080149</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>66 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" subject \\\n",
"0 Conceptual Physics book file \n",
"1 Thank you for signing up for ThingSpeak! \n",
"2 Get the official Gmail app \n",
"3 Reset Your Password \n",
"4 Security alert \n",
".. ... \n",
"61 Verify Email Address \n",
"62 uceou.edu account email verification code \n",
"63 uceou.edu account email verification code \n",
"64 Security alert \n",
"65 [Request received] \n",
"\n",
" from \\\n",
"0 Kadarla Sai Nithin <[email protected]> \n",
"1 ThingSpeak Support <[email protected]> \n",
"2 Gmail Team <[email protected]> \n",
"3 Xmind Notifications <[email protected].... \n",
"4 Google <[email protected]> \n",
".. ... \n",
"61 [email protected] \n",
"62 [email protected] \n",
"63 [email protected] \n",
"64 Google <[email protected]> \n",
"65 \"Tech (Support)\" <[email protected]> \n",
"\n",
" date
to \\\n",
"0 2023-01-11 04:30:04+00:00 Suresh Kumar Lokhande
<[email protected]> \n",
"1 2023-04-22 09:56:31+00:00
[email protected] \n",
"2 2022-11-30 08:05:59+00:00 Kadarla Sai Nithin
<[email protected]> \n",
"3 2023-05-16 15:37:20+00:00
[email protected] \n",
"4 2023-02-25 04:51:44+00:00
[email protected] \n",