DA Practical Answers

Create a 'sales' dataset having 5 columns, namely id, tv, radio, newspaper and sales (500 random entries). Build a linear regression model by identifying the independent and target variables. Split the variables into training and testing sets in a 7:3 ratio, print them, and build a simple linear regression model.
import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

# Create a function to generate the sales dataset

def generate_sales_data(num_entries=500):
    np.random.seed(0)
    data = {
        'id': range(1, num_entries + 1),
        'tv': np.random.randint(0, 100, num_entries),
        'radio': np.random.randint(0, 100, num_entries),
        'newspaper': np.random.randint(0, 100, num_entries),
        'sales': np.random.randint(50, 500, num_entries)  # Assuming sales range from 50 to 500
    }
    return pd.DataFrame(data)

# Generate the sales dataset

sales_data = generate_sales_data()

# Identify independent (X) and target (y) variables

X = sales_data[['tv', 'radio', 'newspaper']]

y = sales_data['sales']

# Split the dataset into training and testing sets (70% training, 30% testing)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Training set:")
print("X_train shape:", X_train.shape)

print("y_train shape:", y_train.shape)

print("\nTesting set:")

print("X_test shape:", X_test.shape)

print("y_test shape:", y_test.shape)

model = LinearRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print("\nMean Squared Error (MSE):", mse)


Create a 'realestate' dataset having 4 columns, namely ID, flat, houses and purchase (500 random entries). Build a linear regression model by identifying the independent and target variables. Split the variables into training and testing sets and print them. Build a simple linear regression model for predicting purchases.
import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

# Create a function to generate the realestate dataset

def generate_realestate_data(num_entries=500):
    np.random.seed(0)
    data = {
        'ID': range(1, num_entries + 1),
        'flat': np.random.randint(1, 6, num_entries),  # Assuming flat sizes range from 1 to 5
        'houses': np.random.randint(1, 6, num_entries),  # Assuming house sizes range from 1 to 5
        'purchase': np.random.randint(50000, 500000, num_entries)  # Assuming purchase prices range from 50000 to 500000
    }
    return pd.DataFrame(data)

# Generate the realestate dataset

realestate_data = generate_realestate_data()
# Identify independent (X) and target (y) variables

X = realestate_data[['flat', 'houses']]

y = realestate_data['purchase']

# Split the dataset into training and testing sets (70% training, 30% testing)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Print the shapes of the training and testing sets

print("Training set:")

print("X_train shape:", X_train.shape)

print("y_train shape:", y_train.shape)

print("\nTesting set:")

print("X_test shape:", X_test.shape)

print("y_test shape:", y_test.shape)

model = LinearRegression()

model.fit(X_train, y_train)

print("\nCoefficients:", model.coef_)

print("Intercept:", model.intercept_)

Create a 'user' dataset having 5 columns, namely UserId, gender, age, estimated salary and purchased. Build a logistic regression model that can predict, from the given parameters, whether a person will buy a car or not.
import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create a function to generate the user dataset

def generate_user_data(num_entries=500):
    np.random.seed(0)
    data = {
        'UserId': range(1, num_entries + 1),
        'Gender': np.random.choice(['Male', 'Female'], num_entries),
        'Age': np.random.randint(18, 70, num_entries),
        'EstimatedSalary': np.random.randint(20000, 150000, num_entries),  # Assuming salary range from 20000 to 150000
        'Purchased': np.random.randint(0, 2, num_entries)  # 0: Not purchased, 1: Purchased
    }
    return pd.DataFrame(data)

# Generate the user dataset

user_data = generate_user_data()

# Convert categorical variable 'Gender' to numerical

user_data['Gender'] = user_data['Gender'].map({'Male': 0, 'Female': 1})

# Identify independent (X) and target (y) variables

X = user_data[['Gender', 'Age', 'EstimatedSalary']]

y = user_data['Purchased']

# Split the dataset into training and testing sets (70% training, 30% testing)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

model = LogisticRegression()

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

print("\nClassification Report:")

print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")

print(confusion_matrix(y_test, y_pred))
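A short sketch, reusing the fitted scaler and model above, showing that a new observation must go through the same scaling step before prediction (the user values below are illustrative assumptions):

# Hypothetical user: Gender=1 (female), Age=35, EstimatedSalary=60000
new_user = pd.DataFrame({'Gender': [1], 'Age': [35], 'EstimatedSalary': [60000]})
new_user_scaled = scaler.transform(new_user)
print("Predicted class:", model.predict(new_user_scaled)[0])
print("Purchase probability:", model.predict_proba(new_user_scaled)[0][1])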

Build a simple linear regression model for fish species weight prediction.
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt

# 1. Load the dataset

fish_data = pd.read_csv('fish_data.csv') # Replace 'fish_data.csv' with your dataset file

# 2. Preprocess the data if necessary

# 3. Split data

X = fish_data[['Length', 'Width']] # Assuming 'Length' and 'Width' are features

y = fish_data['Weight'] # Assuming 'Weight' is the target variable

# 4. Split into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Build and train the model

model = LinearRegression()

model.fit(X_train, y_train)

# 6. Evaluate the model

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)

print("R-squared Score:", r2)


# 7. Make predictions

# Predicting on some new data (optional)

new_data = pd.DataFrame({'Length': [20, 25, 30], 'Width': [10, 15, 20]})

predictions = model.predict(new_data)

print("Predictions on new data:")

print(predictions)

# 8. Visualize results (optional)

plt.scatter(y_test, y_pred)

plt.xlabel('Actual Weight')

plt.ylabel('Predicted Weight')

plt.title('Actual vs Predicted Weight')

plt.show()
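The script above uses only numeric features. If the CSV also carries a categorical 'Species' column (as in the commonly used Kaggle Fish Market file; the column name here is an assumption), one-hot encoding it is a simple way to include species information. A sketch under that assumption, continuing from the variables above:

# Assumes a categorical 'Species' column exists; adjust the column name to your file
X_full = pd.get_dummies(fish_data.drop(columns=['Weight']), columns=['Species'], drop_first=True)
Xf_train, Xf_test, yf_train, yf_test = train_test_split(X_full, y, test_size=0.2, random_state=42)
species_model = LinearRegression().fit(Xf_train, yf_train)
print("R-squared with species included:", r2_score(yf_test, species_model.predict(Xf_test)))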

Use the Iris dataset. Write a Python program to view some basic statistical details like percentile, mean, std etc. of the species 'Iris-setosa', 'Iris-versicolor' and 'Iris-virginica'. Apply logistic regression on the dataset to identify the different species (setosa, versicolor, virginica) of Iris flowers given just 4 features: sepal and petal lengths and widths. Find the accuracy of the model.

import pandas as pd

from sklearn.datasets import load_iris

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the Iris dataset

iris = load_iris()

iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

iris_df['Species'] = iris.target

# Map target values to species names

iris_df['Species'] = iris_df['Species'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})


# View basic statistical details of each species

for species in iris_df['Species'].unique():
    species_data = iris_df[iris_df['Species'] == species]
    print("Species:", species)
    print(species_data.describe())
    print("\n")

# Prepare data for logistic regression

X = iris_df.iloc[:, :-1] # Features: sepal length, sepal width, petal length, petal width

y = iris_df['Species'] # Target: Species

# Split data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply logistic regression

model = LogisticRegression(max_iter=1000)

model.fit(X_train, y_train)

# Make predictions

y_pred = model.predict(X_test)

# Calculate accuracy

accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

# Print classification report and confusion matrix

print("\nClassification Report:")

print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
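The per-species loop above can also be written as a single pandas call; a compact alternative:

# Count, mean, std and percentiles for every numeric column, grouped by species
print(iris_df.groupby('Species').describe())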

Create the following dataset in Python: (tid=1, items=bread, milk), (tid=2, items=bread, diaper, beer, eggs), (tid=3, items=milk, diaper, beer, coke), (tid=4, items=bread, milk, diaper, beer), (tid=5, items=bread, milk, diaper, coke). Convert the categorical values into numeric format. Apply the Apriori algorithm on the above dataset to generate the frequent itemsets and association rules. Repeat the process with different minimum_support values.
from mlxtend.preprocessing import TransactionEncoder

from mlxtend.frequent_patterns import apriori, association_rules

import pandas as pd

# Create the dataset

dataset = [
    {'tid': 1, 'items': ['bread', 'milk']},
    {'tid': 2, 'items': ['bread', 'diaper', 'beer', 'eggs']},
    {'tid': 3, 'items': ['milk', 'diaper', 'beer', 'coke']},
    {'tid': 4, 'items': ['bread', 'milk', 'diaper', 'beer']},
    {'tid': 5, 'items': ['bread', 'milk', 'diaper', 'coke']}
]

# Convert categorical values into numeric (one-hot) format
# TransactionEncoder expects a list of item lists, so pull out the 'items' of each transaction
transactions = [entry['items'] for entry in dataset]

te = TransactionEncoder()

te_ary = te.fit(transactions).transform(transactions)

df = pd.DataFrame(te_ary, columns=te.columns_)

# Apply the Apriori algorithm

min_support_values = [0.2, 0.3] # Different minimum support values

for min_support in min_support_values:
    frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)
    print("Frequent Itemsets with minimum support =", min_support)
    print(frequent_itemsets)
    print("\n")

    # Generate association rules
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
    print("Association Rules with minimum support =", min_support)
    print(rules)
    print("\n")

Create your own transactions dataset and apply the Apriori algorithm on it (same as above).

Download the market basket dataset. Write a Python program to read the dataset and display its information. Preprocess the data (drop null values etc.), convert the categorical values into numeric format, and apply the Apriori algorithm on the above dataset to generate the frequent itemsets and association rules.
import pandas as pd

from mlxtend.preprocessing import TransactionEncoder

from mlxtend.frequent_patterns import apriori, association_rules

# Read the dataset

data = pd.read_csv('path_to_your_dataset.csv')  # Replace 'path_to_your_dataset.csv' with the actual path

# Display information about the dataset

print("Dataset information:")

print(data.info())

# Preprocess the data (drop null values, etc.)

data.dropna(inplace=True)

print("After dropping null values:")

print(data.info())

# Convert categorical values into numeric format

te = TransactionEncoder()

data_encoded = te.fit_transform(data.values)

df = pd.DataFrame(data_encoded, columns=te.columns_)
# Apply the Apriori algorithm

frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)

print("\nFrequent Itemsets:")

print(frequent_itemsets)

# Generate association rules

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

print("\nAssociation Rules:")

print(rules)
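Market basket CSVs sometimes store all items of a transaction in a single comma-separated column rather than one item per column. A hedged sketch for that layout (the column name 'Items' is an assumption):

# Assumed layout: one 'Items' column per row, e.g. "bread,milk,eggs"
transactions = data['Items'].str.split(',').tolist()
te = TransactionEncoder()
df = pd.DataFrame(te.fit(transactions).transform(transactions), columns=te.columns_)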

Download the groceries dataset. Write a Python program to read the dataset and display its information. Preprocess the data (drop null values etc.), convert the categorical values into numeric format, and apply the Apriori algorithm on the above dataset to generate the frequent itemsets and association rules.

import pandas as pd

from mlxtend.preprocessing import TransactionEncoder

from mlxtend.frequent_patterns import apriori, association_rules

# Read the dataset

data = pd.read_csv('groceries_dataset.csv')  # Replace 'groceries_dataset.csv' with the actual filename

# Display information about the dataset

print("Dataset information:")

print(data.info())

# Preprocess the data (drop null values, etc.)

data.dropna(inplace=True)

print("After dropping null values:")

print(data.info())

# Convert categorical values into numeric format

# Assuming the dataset is already in a transaction format (list of lists)


te = TransactionEncoder()

data_encoded = te.fit_transform(data.values)

df = pd.DataFrame(data_encoded, columns=te.columns_)

# Apply the Apriori algorithm

frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True)

print("\nFrequent Itemsets:")

print(frequent_itemsets)

# Generate association rules

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

print("\nAssociation Rules:")

print(rules)
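The groceries file commonly downloaded from Kaggle stores one purchased item per row (member number, date, item description), so the rows must first be grouped into transactions before encoding. A sketch assuming those column names:

# Assumed columns: 'Member_number', 'Date', 'itemDescription' (one item per row)
transactions = data.groupby(['Member_number', 'Date'])['itemDescription'].apply(list).tolist()
te = TransactionEncoder()
df = pd.DataFrame(te.fit(transactions).transform(transactions), columns=te.columns_)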

Write Python code to implement the Apriori algorithm and test the code on any standard dataset.
from collections import defaultdict

from itertools import combinations

class Apriori:
    def __init__(self, min_support=0.5, min_confidence=0.5):
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.itemsets = None
        self.transactions = None

    def _get_itemsets(self, transactions):
        # Count how many transactions contain each individual item
        itemsets = defaultdict(int)
        for transaction in transactions:
            for item in transaction:
                itemsets[frozenset([item])] += 1
        return itemsets

    def _get_frequent_itemsets(self, itemsets, num_transactions):
        # Keep only itemsets whose support meets the minimum support threshold
        frequent_itemsets = {}
        for item, count in itemsets.items():
            support = count / num_transactions
            if support >= self.min_support:
                frequent_itemsets[item] = support
        return frequent_itemsets

    def _generate_candidates(self, itemsets):
        # Join frequent k-itemsets to form candidate (k+1)-itemsets
        candidates = set()
        for itemset1 in itemsets:
            for itemset2 in itemsets:
                union_set = itemset1.union(itemset2)
                if len(union_set) == len(itemset1) + 1:
                    candidates.add(union_set)
        return candidates

    def _get_association_rules(self, frequent_itemsets, num_transactions):
        rules = []
        for itemset in frequent_itemsets.keys():
            if len(itemset) >= 2:
                for combination in combinations(itemset, len(itemset) - 1):
                    antecedent = frozenset(combination)
                    consequent = itemset - antecedent
                    if antecedent in frequent_itemsets and consequent in frequent_itemsets:
                        confidence = frequent_itemsets[itemset] / frequent_itemsets[antecedent]
                        if confidence >= self.min_confidence:
                            support = frequent_itemsets[itemset]
                            lift = support / (frequent_itemsets[antecedent] * frequent_itemsets[consequent])
                            rules.append((antecedent, consequent, support, confidence, lift))
        return rules
    def fit(self, transactions):
        self.transactions = transactions
        num_transactions = len(transactions)
        # Frequent 1-itemsets
        current = self._get_frequent_itemsets(self._get_itemsets(transactions), num_transactions)
        self.itemsets = dict(current)
        # Grow itemsets level by level so that 2-itemsets (and larger) are also found
        while current:
            counts = defaultdict(int)
            for candidate in self._generate_candidates(set(current)):
                counts[candidate] = sum(1 for t in transactions if candidate.issubset(t))
            current = self._get_frequent_itemsets(counts, num_transactions)
            self.itemsets.update(current)

    def generate_association_rules(self):
        if self.itemsets is None or self.transactions is None:
            raise ValueError("Please fit the model before generating association rules.")
        association_rules = self._get_association_rules(self.itemsets, len(self.transactions))
        return association_rules

# Example usage

if __name__ == "__main__":
    transactions = [
        {'bread', 'milk', 'eggs'},
        {'bread', 'diaper', 'beer', 'eggs'},
        {'milk', 'diaper', 'beer', 'coke'},
        {'bread', 'milk', 'diaper', 'beer'},
        {'bread', 'milk', 'diaper', 'coke'}
    ]

    apriori = Apriori(min_support=0.2, min_confidence=0.5)
    apriori.fit(transactions)

    frequent_itemsets = apriori.itemsets
    print("Frequent Itemsets:")
    for itemset, support in frequent_itemsets.items():
        print(itemset, ": Support =", support)

    association_rules = apriori.generate_association_rules()
    print("\nAssociation Rules:")
    for rule in association_rules:
        antecedent, consequent, support, confidence, lift = rule
        print(f"{antecedent} -> {consequent}: Support = {support}, Confidence = {confidence}, Lift = {lift}")

Consider any text paragraph. Preprocess the text to remove any special characters and digits, then generate a summary using an extractive summarization process.
import re

import nltk

from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics.pairwise import cosine_similarity

def preprocess_text(text):
    # Remove special characters and digits (keep letters and whitespace)
    processed_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return processed_text

def generate_summary(text, num_sentences=3):
    # Tokenize the original text into sentences first, so punctuation still marks sentence boundaries
    sentences = sent_tokenize(text)

    # Preprocess each sentence and tokenize it into lowercase words
    word_tokens = [word_tokenize(preprocess_text(sentence).lower()) for sentence in sentences]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [[word for word in tokens if word not in stop_words] for tokens in word_tokens]

    # Flatten the list of word tokens (represents the whole document)
    flattened_words = [word for sublist in filtered_words for word in sublist]

    # Build bag-of-words vectors for the whole document and for each sentence
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([' '.join(flattened_words)] + [' '.join(tokens) for tokens in filtered_words])

    # Calculate cosine similarity between each sentence vector and all vectors
    similarity_matrix = cosine_similarity(X[1:], X)

    # Importance score for each sentence = sum of its similarities
    importance_scores = similarity_matrix.sum(axis=1)

    # Sort sentences by importance score (highest first)
    sorted_indices = importance_scores.argsort()[::-1]

    # Select the top-ranking sentences for the summary
    summary_sentences = [sentences[i] for i in sorted_indices[:num_sentences]]

    # Join the summary sentences into a single string
    summary = ' '.join(summary_sentences)
    return summary

# Example text

text = """

Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural language
data. The goal is a computer capable of "understanding" the contents of documents, including the
contextual nuances of the language within them. The technology can then accurately extract
information and insights contained in the documents as well as categorize and organize the
documents themselves. This technology is very useful in a variety of applications such as machine
translation, text summarization, sentiment analysis, and more.

"""

# Generate summary

summary = generate_summary(text)

print("Summary:")

print(summary)
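The tokenizers and stopword list used above assume the corresponding NLTK resources have already been downloaded; a minimal one-time setup:

import nltk

# One-time downloads for sentence/word tokenization and the English stopword list
nltk.download('punkt')
nltk.download('stopwords')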

Consider any text paragraph. Remove the stopwords, tokenize the paragraph to extract words and sentences, calculate the word frequency distribution and plot the frequencies, and plot the wordcloud of the text.
import re

import matplotlib.pyplot as plt

from wordcloud import WordCloud

from nltk.tokenize import word_tokenize, sent_tokenize

from nltk.corpus import stopwords

from collections import Counter

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

def tokenize_text(text):
    sentences = sent_tokenize(text)
    words = [word_tokenize(sentence) for sentence in sentences]
    return sentences, words

def calculate_word_frequency(words):
    word_freq = Counter(words)
    return word_freq

def plot_word_frequency(word_freq):
    words, freqs = zip(*word_freq.items())
    plt.figure(figsize=(10, 6))
    plt.bar(words, freqs)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title('Word Frequency Distribution')
    plt.xticks(rotation=45)
    plt.show()

def plot_wordcloud(text):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud')
    plt.show()

# Example text paragraph

text = """

Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural language
data. The goal is a computer capable of "understanding" the contents of documents, including the
contextual nuances of the language within them. The technology can then accurately extract
information and insights contained in the documents as well as categorize and organize the
documents themselves. This technology is very useful in a variety of applications such as machine
translation, text summarization, sentiment analysis, and more.

"""
# Step 1: Remove stopwords

text_without_stopwords = remove_stopwords(text)

# Step 2: Tokenize the paragraph

sentences, words = tokenize_text(text_without_stopwords)

# Step 3: Calculate word frequency distribution

word_freq = calculate_word_frequency([word for sublist in words for word in sublist])

# Step 4: Plot word frequency distribution

plot_word_frequency(word_freq)

# Step 5: Generate word cloud

plot_wordcloud(text_without_stopwords)
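For longer paragraphs the bar chart can become unreadable if every token is plotted; a small variation that keeps only the most frequent words, reusing the word_freq Counter from above:

# Plot only the 20 most common words to keep the bar chart readable
top_words = Counter(dict(word_freq.most_common(20)))
plot_word_frequency(top_words)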

consider the following review messages. perform sentiment analysis on the messages. 1. i
purchased headphones online. i am very happy with the product. 2. i saw the movie
yesterday. the animation was really good but the script was ok. 3. i enjoy listening to music 4.
i take a walk in the park everyday
from textblob import TextBlob

def analyze_sentiment(text):
blob = TextBlob(text)
sentiment = blob.sentiment.polarity
if sentiment > 0:
return 'Positive'
elif sentiment < 0:
return 'Negative'
else:
return 'Neutral'

# Review messages
messages = [
"i purchased headphones online. i am very happy with the product.",
"i saw the movie yesterday. the animation was really good but the script was ok.",
"i enjoy listening to music",
"i take a walk in the park everyday"
]

# Perform sentiment analysis on each message


for i, message in enumerate(messages, start=1):
sentiment = analyze_sentiment(message)
print(f"Message {i}: Sentiment - {sentiment}")

Write a Python script for the following: 1. First export the WhatsApp chat of any group and read the exported ".txt" file using the open() and read() functions. 2. Tokenize the read data into sentences and print them. 3. Remove the stopwords from the data and perform lemmatization. 4. Plot the wordcloud for the given data.
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Step 1: Read the exported WhatsApp chat file


def read_chat_file(file_path):
with open(file_path, 'r', encoding='utf-8') as file:
return file.read()

# Step 2: Tokenize the data into sentences


def tokenize_sentences(data):
return sent_tokenize(data)

# Step 3: Remove stopwords and perform lemmatization


def preprocess_text(text):
# Remove special characters and digits
text = re.sub(r'[^a-zA-Z\s]', '', text)

# Tokenize the text into words


words = word_tokenize(text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

return lemmatized_words

# Step 4: Plot wordcloud for the given data


def plot_wordcloud(text):
wordcloud = WordCloud(width=800, height=400,
background_color='white').generate(' '.join(text))
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud')
plt.show()

# File path to the exported WhatsApp chat file


file_path = 'your_chat_file.txt'

# Step 1: Read the exported WhatsApp chat file


chat_data = read_chat_file(file_path)

# Step 2: Tokenize the data into sentences


sentences = tokenize_sentences(chat_data)
print("Sentences:")
print(sentences)
print()

# Step 3: Remove stopwords and perform lemmatization


preprocessed_text = preprocess_text(chat_data)
print("Preprocessed Text:")
print(preprocessed_text)
print()

# Step 4: Plot wordcloud for the given data


plot_wordcloud(preprocessed_text)
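Exported WhatsApp lines usually start with a date, time and sender name, which would otherwise dominate the word cloud. The exact layout varies by phone locale, so the pattern below is an assumption; a hedged sketch that strips that prefix before preprocessing:

# Assumed line layout: "DD/MM/YY, HH:MM - Sender: message" (adjust the pattern to your export)
def strip_metadata(chat_text):
    return re.sub(r'^\d{1,2}/\d{1,2}/\d{2,4},\s*\d{1,2}:\d{2}\s*-\s*[^:]+:\s*', '', chat_text, flags=re.MULTILINE)

plot_wordcloud(preprocess_text(strip_metadata(chat_data)))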

Write a Python script for the following:


i. Read the dataset and find the top 5 Instagram influencers from India.
ii. Find the Instagram account having the least number of followers.
iii. Read the column "Category", remove stopwords and plot the wordcloud to find the
keywords which will imply that in which category maximum accounts are created.
iv. Group the Instagram accounts category wise.
v. Visualize the dataset and plot the relationship between Followers and Authentic
engagement columns
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# i. Read the dataset and find the top 5 Instagram influencers from India.
def top_influencers(df):
top_5_influencers = df[df['Country'] == 'India'].nlargest(5, 'Followers')
return top_5_influencers

# ii. Find the Instagram account having the least number of followers.
def least_followers(df):
least_follower_account = df[df['Followers'] == df['Followers'].min()]
return least_follower_account

# iii. Read the column "Category", remove stopwords, and plot the wordcloud.
def plot_wordcloud(df):
    category_words = ' '.join(df['Category'])
    stop_words = set(stopwords.words('english'))
    words = [word for word in word_tokenize(category_words) if word.lower() not in stop_words]
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(words))
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud for Category')
    plt.show()

# iv. Group the Instagram accounts category-wise.
def group_by_category(df):
grouped = df.groupby('Category').size().reset_index(name='Count')
return grouped

# v. Visualize the dataset and plot the relationship between Followers and Authentic engagement columns.
def visualize_relationship(df):
plt.figure(figsize=(10, 6))
plt.scatter(df['Followers'], df['Authentic engagement'], alpha=0.5)
plt.title('Relationship between Followers and Authentic Engagement')
plt.xlabel('Followers')
plt.ylabel('Authentic Engagement')
plt.show()

# Read the dataset


df = pd.read_csv('instagram_data.csv')

# i. Top 5 Instagram influencers from India


print("Top 5 Instagram influencers from India:")
print(top_influencers(df))
print()

# ii. Instagram account with the least number of followers


print("Instagram account with the least number of followers:")
print(least_followers(df))
print()

# iii. Plot word cloud for Category


plot_wordcloud(df)

# iv. Group Instagram accounts category-wise


print("\nInstagram accounts grouped by category:")
print(group_by_category(df))
print()

# v. Visualize the relationship between Followers and Authentic Engagement


visualize_relationship(df)

Write a Python script for the following:

i. Read the dataset and perform data cleaning operations on it.

ii. Tokenize the comments in words.

iii. Perform sentiment analysis and find the percentage of positive, negative and neutral comments.

import pandas as pd

import re

from nltk.tokenize import word_tokenize

from textblob import TextBlob

# i. Read the dataset and perform data cleaning operations on it.

def clean_data(df):
    # Remove rows with missing values
    df.dropna(inplace=True)
    # Remove special characters and digits
    df['Comment'] = df['Comment'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
    # Convert text to lowercase
    df['Comment'] = df['Comment'].apply(lambda x: x.lower())
    return df

# ii. Tokenize the comments into words
def tokenize_comments(df):
    df['Tokenized_Comment'] = df['Comment'].apply(lambda x: word_tokenize(x))
    return df

# iii. Perform sentiment analysis and find the percentage of positive, negative, and neutral comments
def analyze_sentiment(comment):
    blob = TextBlob(comment)
    sentiment = blob.sentiment.polarity
    if sentiment > 0:
        return 'Positive'
    elif sentiment < 0:
        return 'Negative'
    else:
        return 'Neutral'

def sentiment_analysis(df):
    df['Sentiment'] = df['Comment'].apply(analyze_sentiment)
    sentiment_counts = df['Sentiment'].value_counts(normalize=True) * 100
    return sentiment_counts

# Read the dataset

df = pd.read_csv('your_dataset.csv')  # Replace 'your_dataset.csv' with the actual file path to your dataset

# i. Data cleaning operations

df_cleaned = clean_data(df)

# ii. Tokenize the comments into words

df_tokenized = tokenize_comments(df_cleaned)

# iii. Perform sentiment analysis and find the percentage of positive, negative, and neutral comments

sentiment_percentage = sentiment_analysis(df_tokenized)

# Print the results

print("Percentage of positive, negative, and neutral comments:")

print(sentiment_percentage)
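A quick way to present the split visually, assuming the sentiment_percentage Series computed above:

import matplotlib.pyplot as plt

# Pie chart of the positive/negative/neutral percentages
sentiment_percentage.plot(kind='pie', autopct='%1.1f%%', figsize=(6, 6))
plt.title('Sentiment Distribution of Comments')
plt.ylabel('')
plt.show()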
Write a Python script for the following:

i. Read the dataset and perform data cleaning operations on it.

ii. Find the total views, total likes, total dislikes and comment count.

iii. Find the least and topmost liked and commented videos.

iv. Perform year wise statistics for views and plot the analyzed data.

v. Plot the viewers who reacted on videos.

import pandas as pd

import matplotlib.pyplot as plt

# i. Read the dataset and perform data cleaning operations on it.

def clean_data(df):
    # Remove rows with missing values
    df.dropna(inplace=True)
    # Convert data types if necessary
    df['Views'] = pd.to_numeric(df['Views'], errors='coerce')
    df['Likes'] = pd.to_numeric(df['Likes'], errors='coerce')
    df['Dislikes'] = pd.to_numeric(df['Dislikes'], errors='coerce')
    df['Comments'] = pd.to_numeric(df['Comments'], errors='coerce')
    return df

# ii. Find the total views, total likes, total dislikes, and comment count.

def get_statistics(df):
    total_views = df['Views'].sum()
    total_likes = df['Likes'].sum()
    total_dislikes = df['Dislikes'].sum()
    total_comments = df['Comments'].sum()
    return total_views, total_likes, total_dislikes, total_comments


# iii. Find the least and topmost liked and commented videos.

def get_top_least_videos(df):
    top_liked_video = df[df['Likes'] == df['Likes'].max()]
    least_liked_video = df[df['Likes'] == df['Likes'].min()]
    top_commented_video = df[df['Comments'] == df['Comments'].max()]
    least_commented_video = df[df['Comments'] == df['Comments'].min()]
    return top_liked_video, least_liked_video, top_commented_video, least_commented_video

# iv. Perform year-wise statistics for views and plot the analyzed data.

def year_wise_statistics(df):
    df['Year'] = pd.to_datetime(df['Published_Date']).dt.year
    year_wise_views = df.groupby('Year')['Views'].sum()
    return year_wise_views

# v. Plot the viewers who reacted on videos.

def plot_reactions(df):
    reactions = df[['Likes', 'Dislikes', 'Comments']].sum()
    reactions.plot(kind='bar', figsize=(10, 6))
    plt.title('Reactions on Videos')
    plt.xlabel('Reaction Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.show()

# Read the dataset

df = pd.read_csv('your_dataset.csv')  # Replace 'your_dataset.csv' with the actual file path to your dataset

# i. Data cleaning operations


df_cleaned = clean_data(df)

# ii. Find the total views, total likes, total dislikes, and comment count.

total_views, total_likes, total_dislikes, total_comments = get_statistics(df_cleaned)

print("Total Views:", total_views)

print("Total Likes:", total_likes)

print("Total Dislikes:", total_dislikes)

print("Total Comments:", total_comments)

print()

# iii. Find the least and topmost liked and commented videos.

top_liked_video, least_liked_video, top_commented_video, least_commented_video = get_top_least_videos(df_cleaned)

print("Top Liked Video:")

print(top_liked_video)

print()

print("Least Liked Video:")

print(least_liked_video)

print()

print("Top Commented Video:")

print(top_commented_video)

print()

print("Least Commented Video:")

print(least_commented_video)

print()

# iv. Perform year-wise statistics for views and plot the analyzed data.

year_wise_views = year_wise_statistics(df_cleaned)

print("Year-wise Statistics for Views:")

print(year_wise_views)

print()
# Plot year-wise statistics

year_wise_views.plot(kind='bar', figsize=(10, 6))

plt.title('Year-wise Views')

plt.xlabel('Year')

plt.ylabel('Views')

plt.xticks(rotation=45)

plt.show()

# v. Plot the viewers who reacted on videos.

plot_reactions(df_cleaned)

Write a Python script to read the Tweets using Twitter API and tweepy library to perform the
following tasks:

i. Authenticate Twitter API (Using Bearer Token)

ii. Get the tweets using Keywords or Hash Tags.

iii. Find the total number of likes and retweets on each tweet.

iv. Find the most liked tweet and print its text.

v. Visualize the tweets and plot the time series for likes and retweets along with the dates on which the tweets were published.

import tweepy

import pandas as pd

import matplotlib.pyplot as plt

# Twitter API credentials

bearer_token = 'YOUR_BEARER_TOKEN' # Replace with your Twitter API bearer token

# i. Authenticate Twitter API (Using Bearer Token; requires tweepy 4.x)
auth = tweepy.OAuth2BearerHandler(bearer_token)

api = tweepy.API(auth, wait_on_rate_limit=True)

# ii. Get the tweets using Keywords or Hash Tags.
def search_tweets(query, max_tweets=100):
    tweets = []
    for tweet in tweepy.Cursor(api.search_tweets, q=query, tweet_mode='extended').items(max_tweets):
        tweets.append(tweet)
    return tweets

# iii. Find the total number of likes and retweets on each tweet.
def get_likes_retweets(tweets):
    data = {'Tweet': [], 'Likes': [], 'Retweets': [], 'Date': []}
    for tweet in tweets:
        data['Tweet'].append(tweet.full_text)
        data['Likes'].append(tweet.favorite_count)
        data['Retweets'].append(tweet.retweet_count)
        data['Date'].append(tweet.created_at)  # Needed later for the time-series plot
    return pd.DataFrame(data)

# iv. Find the most liked tweet and print its text
def most_liked_tweet(tweets_df):
    most_liked_index = tweets_df['Likes'].idxmax()
    most_liked_text = tweets_df.loc[most_liked_index, 'Tweet']
    return most_liked_text

# v. Visualize the tweets and plot the time series for likes and retweets along with the dates on which the tweets were published.
def plot_time_series(tweets_df):
    tweets_df['Date'] = pd.to_datetime(tweets_df['Date'])
    tweets_df.set_index('Date', inplace=True)
    tweets_df[['Likes', 'Retweets']].plot(figsize=(10, 6))
    plt.title('Likes and Retweets over Time')
    plt.xlabel('Date')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.show()

# Perform tasks

query = 'python' # Example keyword

tweets = search_tweets(query, max_tweets=100)

tweets_df = get_likes_retweets(tweets)

most_liked = most_liked_tweet(tweets_df)

print("Most Liked Tweet:")

print(most_liked)

print()

# Plot time series

plot_time_series(tweets_df)
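Standard API access today often only allows the v2 endpoints, so the v1.1 search above may be rejected. A hedged alternative sketch using tweepy.Client (tweepy 4.x), under the assumption that your bearer token has recent-search access:

client = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)

# v2 recent search: public_metrics carries like/retweet counts, created_at the publish date
response = client.search_recent_tweets(
    query='python -is:retweet',
    tweet_fields=['public_metrics', 'created_at'],
    max_results=100,
)
rows = [{'Tweet': t.text,
         'Likes': t.public_metrics['like_count'],
         'Retweets': t.public_metrics['retweet_count'],
         'Date': t.created_at} for t in (response.data or [])]
tweets_df_v2 = pd.DataFrame(rows)
print(tweets_df_v2.head())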
