DA Practical Answers
Create a 'sales' dataset (random 500 entries). Build a linear regression model by identifying the
independent and target variables. Split the variables into training and testing sets in a 7:3
ratio and print them. Build a simple linear regression model.
import pandas as pd
import numpy as np
def generate_sales_data(num_entries=500):
np.random.seed(0)
data = {
return pd.DataFrame(data)
sales_data = generate_sales_data()
y = sales_data['sales']
# Split the dataset into training and testing sets (70% training, 30% testing)
print("Training set:")
print("X_train shape:", X_train.shape)
print("\nTesting set:")
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
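The fragment above omits the generated columns, the feature selection and the train/test split call. A minimal runnable sketch is given below; the 'advertising' feature column and the random value ranges are assumptions made for illustration, while the 'sales' target and the 7:3 split follow the question.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

def generate_sales_data(num_entries=500):
    np.random.seed(0)
    advertising = np.random.uniform(10, 100, num_entries)                 # hypothetical feature
    sales = 50 + 5 * advertising + np.random.normal(0, 25, num_entries)   # target with noise
    return pd.DataFrame({'advertising': advertising, 'sales': sales})

sales_data = generate_sales_data()
X = sales_data[['advertising']]   # independent variable
y = sales_data['sales']           # target variable

# Split the dataset into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("Training set:")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("\nTesting set:")
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Simple linear regression
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)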
import numpy as np
def generate_realestate_data(num_entries=500):
np.random.seed(0)
data = {
return pd.DataFrame(data)
realestate_data = generate_realestate_data()
# Identify independent (X) and target (y) variables
X = realestate_data[['flat', 'houses']]
y = realestate_data['purchase']
# Split the dataset into training and testing sets (70% training, 30% testing)
print("Training set:")
print("\nTesting set:")
model = LinearRegression()
model.fit(X_train, y_train)
print("\nCoefficients:", model.coef_)
print("Intercept:", model.intercept_)
Create a 'user' dataset having 5 columns, namely: User ID, Gender, Age, Estimated Salary and
Purchased. Build a logistic regression model that can predict, on the given parameters, whether a
person will buy a car or not.
import pandas as pd
import numpy as np
def generate_user_data(num_entries=500):
np.random.seed(0)
data = {
return pd.DataFrame(data)
user_data = generate_user_data()
y = user_data['Purchased']
# Split the dataset into training and testing sets (70% training, 30% testing)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
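A runnable sketch of this practical with the missing imports, data columns and accuracy computation filled in. Only 'Purchased' appears in the fragment; the other column names, the value ranges and the purchase rule are assumptions, and Gender is left out of the features to avoid extra encoding.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def generate_user_data(num_entries=500):
    np.random.seed(0)
    age = np.random.randint(18, 60, num_entries)
    salary = np.random.randint(15000, 150000, num_entries)
    data = {
        'User ID': np.arange(1, num_entries + 1),
        'Gender': np.random.choice(['Male', 'Female'], num_entries),
        'Age': age,
        'EstimatedSalary': salary,
        # Assumed rule: older or higher-paid users are more likely to purchase
        'Purchased': ((age > 40) | (salary > 100000)).astype(int)
    }
    return pd.DataFrame(data)

user_data = generate_user_data()
X = user_data[['Age', 'EstimatedSalary']]
y = user_data['Purchased']

# Split the dataset into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))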
Build a simple linear regression model for fish species weight prediction.
import pandas as pd
# 3. Split data
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
predictions = model.predict(new_data)
print(predictions)
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Weight')
plt.ylabel('Predicted Weight')
plt.show()
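The fragment never shows how the data is loaded or which columns are used. A sketch assuming the Kaggle Fish Market dataset saved as 'Fish.csv', with 'Length1' as the single predictor (a hypothetical choice) and 'Weight' as the target.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# 1. Load data (file name and columns assumed from the Fish Market dataset)
fish = pd.read_csv('Fish.csv')
X = fish[['Length1']]
y = fish['Weight']

# 2-3. Split data (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# 4. Fit a simple linear regression model and evaluate it
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R^2 score:", r2)

# 5. Predict weights for new, unseen lengths (illustrative values)
new_data = pd.DataFrame({'Length1': [20.0, 30.0]})
predictions = model.predict(new_data)
print(predictions)

# 6. Plot actual vs predicted weight
plt.scatter(y_test, y_pred)
plt.xlabel('Actual Weight')
plt.ylabel('Predicted Weight')
plt.show()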
Use the Iris dataset. Write a Python program to view some basic statistical details like
percentile, mean, std etc. of the species 'Iris-setosa', 'Iris-versicolor' and 'Iris-virginica'.
Apply logistic regression on the dataset to identify the different species (setosa, versicolor,
virginica) of Iris flowers given just 4 features: sepal and petal lengths and widths. Find the
accuracy of the model.
import pandas as pd
iris = load_iris()
iris_df['Species'] = iris.target
print("Species:", species)
print(species_data.describe())
print("\n")
X = iris_df.iloc[:, :-1] # Features: sepal length, sepal width, petal length, petal width
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Calculate accuracy
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
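A complete sketch of this practical using scikit-learn's built-in copy of the Iris dataset: describe() is printed per species, then logistic regression is fit on the four features and evaluated.
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['Species'] = iris.target

# Basic statistical details (percentiles, mean, std, ...) per species
for label, species in enumerate(iris.target_names):
    species_data = iris_df[iris_df['Species'] == label]
    print("Species:", species)
    print(species_data.describe())
    print("\n")

# Logistic regression on the 4 features
X = iris_df.iloc[:, :-1]   # sepal length, sepal width, petal length, petal width
y = iris_df['Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))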
Create the following dataset in Python: (tid=1, items=bread, milk), (tid=2, items=bread, diaper,
beer, eggs), (tid=3, items=milk, diaper, beer, coke), (tid=4, items=bread, milk, diaper, beer),
(tid=5, items=bread, milk, diaper, coke). Convert the categorical values into numeric format.
Apply the Apriori algorithm on the above dataset to generate the frequent itemsets and
association rules. Repeat the process with different minimum support values.
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd
dataset = [
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
print(frequent_itemsets)
print("\n")
# Generate association rules
print(rules)
print("\n")
Create your own transactions dataset and apply the Apriori algorithm on it (same as above).
Download the market basket dataset. Write a Python program to read the dataset and display its
information. Preprocess the data (drop null values etc.), convert the categorical values into
numeric format, and apply the Apriori algorithm on the above dataset to generate the frequent
itemsets and association rules.
import pandas as pd
print("Dataset information:")
print(data.info())
data.dropna(inplace=True)
print(data.info())
te = TransactionEncoder()
data_encoded = te.fit_transform(data.values)
df = pd.DataFrame(data_encoded, columns=te.columns_)
# Apply the Apriori algorithm
print("\nFrequent Itemsets:")
print(frequent_itemsets)
print("\nAssociation Rules:")
print(rules)
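The fragment never shows how the market basket file is read or how its rows become transactions. A sketch assuming a file named 'Market_Basket_Optimisation.csv' (a common version of this dataset) in which every row is one transaction and every cell is one item; the support and confidence thresholds are illustrative.
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Read the dataset (file name assumed; each row is a transaction, each cell an item)
data = pd.read_csv('Market_Basket_Optimisation.csv', header=None)
print("Dataset information:")
print(data.info())

# Preprocess: turn each row into a list of items, dropping NaN cells
transactions = [
    [str(item) for item in row if pd.notna(item)]
    for row in data.values
]

# Convert the categorical values into numeric (boolean) format
te = TransactionEncoder()
data_encoded = te.fit(transactions).transform(transactions)
df = pd.DataFrame(data_encoded, columns=te.columns_)

# Apply the Apriori algorithm
frequent_itemsets = apriori(df, min_support=0.02, use_colnames=True)
print("\nFrequent Itemsets:")
print(frequent_itemsets)

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
print("\nAssociation Rules:")
print(rules)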
Download the groceries dataset. Write a Python program to read the dataset and display its
information. Preprocess the data (drop null values etc.), convert the categorical values into
numeric format, and apply the Apriori algorithm on the above dataset to generate the frequent
itemsets and association rules.
import pandas as pd
print("Dataset information:")
print(data.info())
data.dropna(inplace=True)
print(data.info())
data_encoded = te.fit_transform(data.values)
df = pd.DataFrame(data_encoded, columns=te.columns_)
print("\nFrequent Itemsets:")
print(frequent_itemsets)
print("\nAssociation Rules:")
print(rules)
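The same pipeline works for the groceries dataset; only the way transactions are read tends to differ. A short sketch assuming 'groceries.csv' stores one comma-separated transaction per line (the file name and format are assumptions).
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Read one comma-separated transaction per line (file name/format assumed)
with open('groceries.csv') as f:
    transactions = [line.strip().split(',') for line in f if line.strip()]

te = TransactionEncoder()
data_encoded = te.fit(transactions).transform(transactions)
df = pd.DataFrame(data_encoded, columns=te.columns_)

frequent_itemsets = apriori(df, min_support=0.02, use_colnames=True)
print("\nFrequent Itemsets:")
print(frequent_itemsets)

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
print("\nAssociation Rules:")
print(rules)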
Write Python code to implement the Apriori algorithm. Test the code on any standard dataset.
from collections import defaultdict
class Apriori:
self.min_support = min_support
self.min_confidence = min_confidence
self.itemsets = None
self.transactions = None
itemsets = defaultdict(int)
itemsets[frozenset([item])] += 1
return itemsets
def _get_frequent_itemsets(self, itemsets, num_transactions):
frequent_itemsets = {}
frequent_itemsets[item] = support
return frequent_itemsets
candidates = set()
union_set = itemset1.union(itemset2)
if len(union_set) == len(itemset1) + 1:
candidates.add(union_set)
return candidates
rules = []
if len(itemset) >= 2:
antecedent = frozenset(combination)
support = frequent_itemsets[itemset]
return rules
def fit(self, transactions):
self.transactions = transactions
num_transactions = len(transactions)
itemsets = self._get_itemsets(transactions)
self.itemsets = frequent_itemsets
def generate_association_rules(self):
return association_rules
# Example usage
if __name__ == "__main__":
transactions = [
apriori.fit(transactions)
frequent_itemsets = apriori.itemsets
print("Frequent Itemsets:")
print("\nAssociation Rules:")
import nltk
def preprocess_text(text):
return processed_text
processed_text = preprocess_text(text)
sentences = sent_tokenize(processed_text)
# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [[word for word in tokens if word not in stop_words] for tokens in word_tokens]
vectorizer = CountVectorizer()
similarity_matrix = cosine_similarity(X[1:], X)
importance_scores = similarity_matrix.sum(axis=1)
sorted_indices = importance_scores.argsort()[::-1]
return summary
# Example text
text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural language
data. The goal is a computer capable of "understanding" the contents of documents, including the
contextual nuances of the language within them. The technology can then accurately extract
information and insights contained in the documents as well as categorize and organize the
documents themselves. This technology is very useful in a variety of applications such as machine
translation, text summarization, sentiment analysis, and more.
"""
# Generate summary
summary = generate_summary(text)
print("Summary:")
print(summary)
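The summarizer above is missing its imports and most of generate_summary. A minimal extractive-summarization sketch along the same lines (sentence tokenization, stopword removal, bag-of-words vectors, cosine-similarity scores, top-N sentences); the num_sentences parameter is an assumption. It can be called exactly as in the example above.
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def generate_summary(text, num_sentences=2):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))

    # Remove stopwords and punctuation from each sentence before vectorizing
    cleaned = [
        ' '.join(w for w in word_tokenize(s.lower()) if w.isalnum() and w not in stop_words)
        for s in sentences
    ]

    # Bag-of-words vectors and pairwise cosine similarity
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(cleaned)
    similarity_matrix = cosine_similarity(X, X)

    # A sentence similar to many other sentences is treated as important
    importance_scores = similarity_matrix.sum(axis=1)
    sorted_indices = importance_scores.argsort()[::-1][:num_sentences]

    # Keep the chosen sentences in their original order
    summary = ' '.join(sentences[i] for i in sorted(sorted_indices))
    return summary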
Consider any text paragraph. Remove the stopwords, then tokenize the paragraph to extract words
and sentences. Calculate the word frequency distribution and plot the frequencies. Plot the
wordcloud of the text.
import re
def remove_stopwords(text):
stop_words = set(stopwords.words('english'))
words = word_tokenize(text)
def tokenize_text(text):
sentences = sent_tokenize(text)
word_freq = Counter(words)
return word_freq
def plot_word_frequency(word_freq):
plt.figure(figsize=(10, 6))
plt.bar(words, freqs)
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
def plot_wordcloud(text):
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud')
plt.show()
text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural language
data. The goal is a computer capable of "understanding" the contents of documents, including the
contextual nuances of the language within them. The technology can then accurately extract
information and insights contained in the documents as well as categorize and organize the
documents themselves. This technology is very useful in a variety of applications such as machine
translation, text summarization, sentiment analysis, and more.
"""
# Step 1: Remove stopwords
text_without_stopwords = remove_stopwords(text)
plot_word_frequency(word_freq)
plot_wordcloud(text_without_stopwords)
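The fragment above is missing its imports, the frequency-counting step and the word-cloud generation. A self-contained sketch; the top_n limit for the bar chart is an assumption, and the final block uses the 'text' paragraph defined in the exercise above.
import nltk
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return ' '.join(w for w in words if w.isalnum() and w.lower() not in stop_words)

def word_frequency(text):
    words = word_tokenize(text)
    return Counter(words)

def plot_word_frequency(word_freq, top_n=20):
    words, freqs = zip(*word_freq.most_common(top_n))
    plt.figure(figsize=(10, 6))
    plt.bar(words, freqs)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.show()

def plot_wordcloud(text):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud')
    plt.show()

# Usage: 'text' is the paragraph defined in the exercise above
sentences = sent_tokenize(text)
text_without_stopwords = remove_stopwords(text)
word_freq = word_frequency(text_without_stopwords)
plot_word_frequency(word_freq)
plot_wordcloud(text_without_stopwords)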
Consider the following review messages and perform sentiment analysis on them. 1. I purchased
headphones online. I am very happy with the product. 2. I saw the movie yesterday. The animation
was really good but the script was ok. 3. I enjoy listening to music. 4. I take a walk in the
park every day.
from textblob import TextBlob
def analyze_sentiment(text):
blob = TextBlob(text)
sentiment = blob.sentiment.polarity
if sentiment > 0:
return 'Positive'
elif sentiment < 0:
return 'Negative'
else:
return 'Neutral'
# Review messages
messages = [
"i purchased headphones online. i am very happy with the product.",
"i saw the movie yesterday. the animation was really good but the script was ok.",
"i enjoy listening to music",
"i take a walk in the park everyday"
]
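All that is missing above is a loop that applies analyze_sentiment to each message and prints the label:
for message in messages:
    print(message, "->", analyze_sentiment(message))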
Write a Python script for the following: 1. First export the WhatsApp chat of any group and read
the exported ".txt" file using the open() and read() functions. 2. Tokenize the read data into
sentences and print it. 3. Remove the stopwords from the data and perform lemmatization. 4. Plot
the wordcloud for the given data.
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words]
# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
return lemmatized_words
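A sketch of the full script. The file name 'whatsapp_chat.txt' is an assumption, and the regular expression that strips the export's timestamp/sender prefix is only illustrative, since the exact export format varies by phone and locale.
import re
import nltk
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# 1. Read the exported chat (file name assumed)
with open('whatsapp_chat.txt', encoding='utf-8') as f:
    data = f.read()

# Strip the "date, time - sender:" prefixes (pattern is illustrative)
data = re.sub(r'\d{1,2}/\d{1,2}/\d{2,4},? \d{1,2}:\d{2}\s?(AM|PM|am|pm)? - [^:]+: ', '', data)

# 2. Tokenize into sentences and print them
sentences = sent_tokenize(data)
print(sentences)

# 3. Remove stopwords and perform lemmatization
stop_words = set(stopwords.words('english'))
words = word_tokenize(data)
filtered_words = [w for w in words if w.isalnum() and w.lower() not in stop_words]
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(w) for w in filtered_words]

# 4. Plot the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(lemmatized_words))
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()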
# i. Read the dataset and find the top 5 Instagram influencers from India.
def top_influencers(df):
top_5_influencers = df[df['Country'] == 'India'].nlargest(5, 'Followers')
return top_5_influencers
# ii. Find the Instagram account having the least number of followers.
def least_followers(df):
least_follower_account = df[df['Followers'] == df['Followers'].min()]
return least_follower_account
# iii. Read the column "Category", remove stopwords, and plot the wordcloud.
def plot_wordcloud(df):
category_words = ' '.join(df['Category'])
stop_words = set(stopwords.words('english'))
words = [word for word in word_tokenize(category_words) if word.lower() not in stop_words]
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(words))
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud for Category')
plt.show()
# iv. Group the Instagram accounts category-wise.
def group_by_category(df):
grouped = df.groupby('Category').size().reset_index(name='Count')
return grouped
# v. Visualize the dataset and plot the relationship between Followers and Authentic engagement columns.
def visualize_relationship(df):
plt.figure(figsize=(10, 6))
plt.scatter(df['Followers'], df['Authentic engagement'], alpha=0.5)
plt.title('Relationship between Followers and Authentic Engagement')
plt.xlabel('Followers')
plt.ylabel('Authentic Engagement')
plt.show()
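The helper functions above never get called and their imports are not shown. A short driver is sketched below; the CSV file name is an assumption, while the column names ('Country', 'Followers', 'Category', 'Authentic engagement') are the ones the functions already use.
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

# File name assumed; columns must match those used by the functions above
df = pd.read_csv('instagram_influencers.csv')

print("Top 5 Instagram influencers from India:")
print(top_influencers(df))

print("\nAccount with the least followers:")
print(least_followers(df))

plot_wordcloud(df)                 # word cloud of the Category column

print("\nAccounts grouped category-wise:")
print(group_by_category(df))

visualize_relationship(df)         # Followers vs Authentic engagement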
iii. Perform sentiment analysis and find the percentage of positive, negative and neutral
comments.
import pandas as pd
import re
def clean_data(df):
df.dropna(inplace=True)
return df
def tokenize_comments(df):
return df
# iii. Perform sentiment analysis and find the percentage of positive, negative, and neutral comments
def analyze_sentiment(comment):
blob = TextBlob(comment)
sentiment = blob.sentiment.polarity
if sentiment > 0:
return 'Positive'
elif sentiment < 0:
return 'Negative'
else:
return 'Neutral'
def sentiment_analysis(df):
df['Sentiment'] = df['Comment'].apply(analyze_sentiment)
return sentiment_counts
df_cleaned = clean_data(df)
df_tokenized = tokenize_comments(df_cleaned)
# iii. Perform sentiment analysis and find the percentage of positive, negative, and neutral comments
sentiment_percentage = sentiment_analysis(df_tokenized)
print(sentiment_percentage)
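A sketch of the missing pieces: reading the comments, tokenizing them, and turning the per-comment labels into percentages. The 'comments.csv' file name is an assumption; the 'Comment' column name is the one the fragment already uses.
import nltk
import pandas as pd
from textblob import TextBlob
from nltk.tokenize import word_tokenize

nltk.download('punkt', quiet=True)

# Read the comments (file name assumed; must contain a 'Comment' column)
df = pd.read_csv('comments.csv')

# i. Clean the data
df_cleaned = df.dropna().copy()

# ii. Tokenize the comments
df_cleaned['Tokens'] = df_cleaned['Comment'].astype(str).apply(word_tokenize)

# iii. Sentiment per comment, then percentage of each label
def analyze_sentiment(comment):
    polarity = TextBlob(str(comment)).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

df_cleaned['Sentiment'] = df_cleaned['Comment'].apply(analyze_sentiment)
sentiment_percentage = df_cleaned['Sentiment'].value_counts(normalize=True) * 100
print(sentiment_percentage)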
Write a Python script for the following:
ii. Find the total views, total likes, total dislikes and comment count.
iv. Perform year-wise statistics for views and plot the analyzed data.
import pandas as pd
def clean_data(df):
df.dropna(inplace=True)
return df
# ii. Find the total views, total likes, total dislikes, and comment count.
def get_statistics(df):
total_views = df['Views'].sum()
total_likes = df['Likes'].sum()
total_dislikes = df['Dislikes'].sum()
total_comments = df['Comments'].sum()
def get_top_least_videos(df):
# iv. Perform year-wise statistics for views and plot the analyzed data.
def year_wise_statistics(df):
df['Year'] = pd.to_datetime(df['Published_Date']).dt.year
year_wise_views = df.groupby('Year')['Views'].sum()
return year_wise_views
def plot_reactions(df):
plt.title('Reactions on Videos')
plt.xlabel('Reaction Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
# ii. Find the total views, total likes, total dislikes, and comment count.
print()
# iii. Find the least and topmost liked and commented videos.
print(top_liked_video)
print()
print(least_liked_video)
print()
print(top_commented_video)
print()
print(least_commented_video)
print()
# iv. Perform year-wise statistics for views and plot the analyzed data.
year_wise_views = year_wise_statistics(df_cleaned)
print(year_wise_views)
print()
# Plot year-wise statistics
plt.title('Year-wise Views')
plt.xlabel('Year')
plt.ylabel('Views')
plt.xticks(rotation=45)
plt.show()
plot_reactions(df_cleaned)
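A consolidated sketch of this script. The 'videos.csv' file name and the 'Title' column are assumptions; the Views, Likes, Dislikes, Comments and Published_Date columns are the ones the fragment already references.
import pandas as pd
import matplotlib.pyplot as plt

# Read and clean the data (file name assumed)
df = pd.read_csv('videos.csv')
df_cleaned = df.dropna().copy()

# ii. Totals
print("Total views:", df_cleaned['Views'].sum())
print("Total likes:", df_cleaned['Likes'].sum())
print("Total dislikes:", df_cleaned['Dislikes'].sum())
print("Total comments:", df_cleaned['Comments'].sum())

# iii. Most/least liked and commented videos ('Title' column assumed)
print("Most liked video:", df_cleaned.loc[df_cleaned['Likes'].idxmax(), 'Title'])
print("Least liked video:", df_cleaned.loc[df_cleaned['Likes'].idxmin(), 'Title'])
print("Most commented video:", df_cleaned.loc[df_cleaned['Comments'].idxmax(), 'Title'])
print("Least commented video:", df_cleaned.loc[df_cleaned['Comments'].idxmin(), 'Title'])

# iv. Year-wise views and plot
df_cleaned['Year'] = pd.to_datetime(df_cleaned['Published_Date']).dt.year
year_wise_views = df_cleaned.groupby('Year')['Views'].sum()
print(year_wise_views)

year_wise_views.plot(kind='bar')
plt.title('Year-wise Views')
plt.xlabel('Year')
plt.ylabel('Views')
plt.xticks(rotation=45)
plt.show()

# Reactions bar chart (likes, dislikes, comments)
df_cleaned[['Likes', 'Dislikes', 'Comments']].sum().plot(kind='bar')
plt.title('Reactions on Videos')
plt.xlabel('Reaction Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()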
Write a Python script to read the Tweets using the Twitter API and the tweepy library to perform
the following tasks:
v. Visualize the tweets and plot the time series for likes and retweets along with the dates on
which the tweets are published.
import tweepy
import pandas as pd
auth = tweepy.AppAuthHandler(bearer_token)
tweets = []
tweets.append(tweet)
return tweets
# iii. Find the total number of likes and retweets on each tweet.
def get_likes_retweets(tweets):
data['Tweet'].append(tweet.full_text)
data['Likes'].append(tweet.favorite_count)
data['Retweets'].append(tweet.retweet_count)
return pd.DataFrame(data)
# iv. Find the most liked tweet and print its text
def most_liked_tweet(tweets_df):
most_liked_index = tweets_df['Likes'].idxmax()
most_liked_text = tweets_df.iloc[most_liked_index]['Tweet']
return most_liked_text
# v. Visualize the tweets and plot the time series for likes and retweets along with dates on which tweets are published.
def plot_time_series(tweets_df):
tweets_df['Date'] = pd.to_datetime(tweets_df['Date'])
tweets_df.set_index('Date', inplace=True)
plt.xlabel('Date')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
# Perform tasks
tweets_df = get_likes_retweets(tweets)
most_liked = most_liked_tweet(tweets_df)
print(most_liked)
print()
plot_time_series(tweets_df)
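The plotting function above never actually draws anything. A sketch of the missing body is given below; it assumes the tweets DataFrame also carries a 'Date' column built from tweet.created_at (the fragment already converts 'Date' with pd.to_datetime) alongside 'Likes' and 'Retweets'.
import pandas as pd
import matplotlib.pyplot as plt

def plot_time_series(tweets_df):
    # Assumes 'Date', 'Likes' and 'Retweets' columns collected by get_likes_retweets
    tweets_df['Date'] = pd.to_datetime(tweets_df['Date'])
    tweets_df = tweets_df.sort_values('Date').set_index('Date')

    plt.figure(figsize=(10, 6))
    plt.plot(tweets_df.index, tweets_df['Likes'], marker='o', label='Likes')
    plt.plot(tweets_df.index, tweets_df['Retweets'], marker='o', label='Retweets')
    plt.title('Likes and Retweets over Time')
    plt.xlabel('Date')
    plt.ylabel('Count')
    plt.legend()
    plt.xticks(rotation=45)
    plt.show()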