{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "!pip install emoji\n",
        "!pip install pyspellchecker\n",
        "!pip install contractions"
      ],
      "metadata": {
        "id": "3KcKMEWKeq39"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "6OZTzuPTXL9N"
      },
      "outputs": [],
      "source": [
        "corpus = [\n",
        "    \"I can't wait for the new season of my favorite show!\",\n",
        "    \"The COVID-19 pandemic has affected millions of people worldwide.\",\n",
        "    \"U.S. stocks fell on Friday after news of rising inflation.\",\n",
        "    \"<html><body>Welcome to the website!</body></html>\",\n",
        "    \"Python is a great programming language!!! ??\"\n",
        "]\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import re\n",
        "import string\n",
        "from bs4 import BeautifulSoup\n",
        "\n",
        "def clean_text(text):\n",
        "    text = text.lower()  # Lowercase\n",
        "    text = re.sub(r'\\d+', '', text)  # Remove numbers\n",
        "    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation\n",
        "    text = re.sub(r'\\W', ' ', text)  # Remove special characters\n",
        "    text = BeautifulSoup(text, \"html.parser\").get_text()  # Remove HTML tags\n",
        "    return text\n",
        "\n",
        "cleaned_corpus = [clean_text(doc) for doc in corpus]\n",
        "print(cleaned_corpus)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ag0yeKqhXW7s",
        "outputId": "1ff43c84-8024-453f-b7df-9168507aac64"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['i cant wait for the new season of my favorite show', 'the covid pandemic has affected millions of people worldwide', 'us stocks fell on friday after news of rising inflation', 'htmlbodywelcome to the websitebodyhtml', 'python is a great programming language ']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from nltk.tokenize import word_tokenize\n",
        "import nltk\n",
        "nltk.download('punkt_tab')\n",
        "\n",
        "tokenized_corpus = [word_tokenize(doc) for doc in cleaned_corpus]\n",
        "print(tokenized_corpus)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "hIeNFI1cXcLH",
        "outputId": "a658d2a7-e340-47cc-92a8-b5c96b24ad86"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[['i', 'cant', 'wait', 'for', 'the', 'new', 'season', 'of', 'my', 'favorite', 'show'], ['the', 'covid', 'pandemic', 'has', 'affected', 'millions', 'of', 'people', 'worldwide'], ['us', 'stocks', 'fell', 'on', 'friday', 'after', 'news', 'of', 'rising', 'inflation'], ['htmlbodywelcome', 'to', 'the', 'websitebodyhtml'], ['python', 'is', 'a', 'great', 'programming', 'language']]\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n",
            "[nltk_data]   Package punkt_tab is already up-to-date!\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from nltk.corpus import stopwords\n",
        "nltk.download('stopwords')\n",
        "\n",
        "stop_words = set(stopwords.words('english'))\n",
        "filtered_corpus = [[word for word in doc if word not in stop_words] for doc in tokenized_corpus]\n",
        "print(filtered_corpus)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "PRhthPMIXhmx",
        "outputId": "c85794b7-9b35-4964-d925-7ae27228cbff"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[['cant', 'wait', 'new', 'season', 'favorite', 'show'], ['covid', 'pandemic', 'affected', 'millions', 'people', 'worldwide'], ['us', 'stocks', 'fell', 'friday', 'news', 'rising', 'inflation'], ['htmlbodywelcome', 'websitebodyhtml'], ['python', 'great', 'programming', 'language']]\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data]   Unzipping corpora/stopwords.zip.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from nltk.stem import PorterStemmer, WordNetLemmatizer\n",
        "nltk.download('wordnet')\n",
        "\n",
        "stemmer = PorterStemmer()\n",
        "lemmatizer = WordNetLemmatizer()\n",
        "\n",
        "stemmed_corpus = [[stemmer.stem(word) for word in doc] for doc in filtered_corpus]\n",
        "lemmatized_corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in filtered_corpus]\n",
        "print(stemmed_corpus)\n",
        "print(lemmatized_corpus)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "FTQAqaKnYClt",
        "outputId": "fca8d07c-d5ce-437a-a898-d32296b711a9"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "[nltk_data] Downloading package wordnet to /root/nltk_data...\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[['cant', 'wait', 'new', 'season', 'favorit', 'show'], ['covid', 'pandem', 'affect', 'million', 'peopl', 'worldwid'], ['us', 'stock', 'fell', 'friday', 'news', 'rise', 'inflat'], ['htmlbodywelcom', 'websitebodyhtml'], ['python', 'great', 'program', 'languag']]\n",
            "[['cant', 'wait', 'new', 'season', 'favorite', 'show'], ['covid', 'pandemic', 'affected', 'million', 'people', 'worldwide'], ['u', 'stock', 'fell', 'friday', 'news', 'rising', 'inflation'], ['htmlbodywelcome', 'websitebodyhtml'], ['python', 'great', 'programming', 'language']]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "d60b53e4",
        "outputId": "8588b4c5-9a02-402a-8e37-27c07b579c20"
      },
      "source": [
        "import contractions\n",
        "\n",
        "expanded_corpus = [contractions.fix(doc) for doc in cleaned_corpus]\n",
        "print(expanded_corpus)"
      ],
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['i cannot wait for the new season of my favorite show', 'the covid pandemic has affected millions of people worldwide', 'us stocks fell on friday after news of rising inflation', 'htmlbodywelcome to the websitebodyhtml', 'python is a great programming language ']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "b3d41925",
        "outputId": "25c5c1b4-725e-405e-86d6-20aba968bc87"
      },
      "source": [
        "import emoji\n",
        "\n",
        "emoji_corpus = [emoji.demojize(doc) for doc in cleaned_corpus]\n",
        "print(emoji_corpus)"
      ],
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "['i cant wait for the new season of my favorite show', 'the covid pandemic has affected millions of people worldwide', 'us stocks fell on friday after news of rising inflation', 'htmlbodywelcome to the websitebodyhtml', 'python is a great programming language ']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "53041d29",
        "outputId": "96a81314-6fe0-4d14-87ab-cf569b92917e"
      },
      "source": [
        "from spellchecker import SpellChecker\n",
        "\n",
        "spell = SpellChecker()\n",
        "corrected_corpus = [[spell.correction(word) for word in doc] for doc in tokenized_corpus]\n",
        "print(corrected_corpus)"
      ],
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[['i', 'cant', 'wait', 'for', 'the', 'new', 'season', 'of', 'my', 'favorite', 'show'], ['the', 'bovid', 'pandemic', 'has', 'affected', 'millions', 'of', 'people', 'worldwide'], ['us', 'stocks', 'fell', 'on', 'friday', 'after', 'news', 'of', 'rising', 'inflation'], [None, 'to', 'the', None], ['python', 'is', 'a', 'great', 'programming', 'language']]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "zxmZXpJyeXfP"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}