0% found this document useful (0 votes)
11 views32 pages

Ip HHW

Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
11 views32 pages

Ip HHW

Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 32

I Python Pandas: Data Handling

1) Create a pandas Series from a dictionary of values and from an ndarray.

import pandas as pd
# Series built from a mapping: the dictionary keys become the index labels.
dict1 = dict(a=10, b=20, c=30, d=40)
s1 = pd.Series(data=dict1)
print(s1)

import pandas as pd
import numpy as np
# Series built from a NumPy ndarray: pandas supplies a default 0..n-1 index.
array1 = np.array(list('abcd'))
s1 = pd.Series(data=array1)
print(s1)

2) Write a Pandas program to perform arithmetic operations on


two Pandas Series.
import pandas as pd

def series(series):
    """Prompt the user for numeric values and return them as a Series.

    Args:
        series: Display label used inside the prompts (e.g. ``"Series 1"``).
            The parameter shadows the function name within the body; the
            name is kept so existing keyword callers keep working.

    Returns:
        A float ``pd.Series`` holding the entered values in entry order.
        Entering ``0`` as the count yields an empty float Series.

    Raises:
        ValueError: If the count is not an integer or an element is not a
            number (raised by ``int()`` / ``float()``).
    """
    n = int(input(f"Enter the number of elements in {series}: "))
    list1 = [
        float(input(f"Enter element {i + 1} of {series}: "))
        for i in range(n)
    ]
    # An explicit dtype keeps an empty result numeric, so later arithmetic
    # between two Series behaves the same regardless of length.
    return pd.Series(list1, dtype=float)

# Collect both operands interactively and echo each one back.
print("Enter elements for the first Series:")
s1 = series("Series 1")
print(s1)

print("Enter elements for the second Series:")
s2 = series("Series 2")
print(s2)

# Element-wise arithmetic; pandas aligns the operands on their index labels.
addition = s1.add(s2)
subtraction = s1.sub(s2)
multiplication = s1.mul(s2)
division = s1.div(s2)

for heading, outcome in (
    ("\nAddition:\n", addition),
    ("Subtraction:\n", subtraction),
    ("Multiplication:\n", multiplication),
    ("Division:\n", division),
):
    print(heading, outcome)

3) Write a Pandas program to add some data to an existing


Series.
import pandas as pd

# Start from an existing Series and extend it with extra values.
l1 = [10, 20, 30, 40]
s1 = pd.Series(l1)  # Initial Series
print(s1)

l2 = [50, 60, 70]
s2 = pd.Series(l2)  # Additional data
print(s2)

# Series.append was removed in pandas 2.0 and `_append` is a private method
# that may change without notice; pd.concat is the public way to join Series.
# ignore_index=True renumbers the combined result 0..n-1.
s3 = pd.concat([s1, s2], ignore_index=True)  # Updated Series
print(s3)

4) Write a Pandas program to select the rows where the percentage is greater than 70.
import pandas as pd

# Sample marks table.
data = {
    'Name': ['Vishal', 'Ram', 'Shyam', 'Ravi'],
    'Percentage': [90, 85, 45, 75],
}
df = pd.DataFrame(data)
print("Original Dataframe\n", df)

# Boolean mask that keeps only rows scoring strictly above 70.
above_70 = df['Percentage'].gt(70)
print("Required Dataframe\n", df[above_70])
5) Write a Pandas program to select the rows where the percentage is
between 70 and 90 (inclusive).
import pandas as pd

# Sample marks table.
data = {
    'Name': ['Vishal', 'Ram', 'Shyam', 'Ravi'],
    'Percentage': [92, 85, 45, 75],
}
df = pd.DataFrame(data)
print("Original Dataframe\n", df)

# between() includes both end points by default, i.e. 70 <= value <= 90.
in_range = df['Percentage'].between(70, 90)
print("Required Dataframe\n", df[in_range])


6) Write a Pandas program to change the percentage in a row chosen
by the user.
import pandas as pd

# Marks table that the user is about to edit.
data = dict(
    Name=['Vishal', 'Ram', 'Shyam', 'Ravi'],
    Percentage=[92, 85, 45, 75],
)
df = pd.DataFrame(data)
print("Original Dataframe\n", df)

def change_percentage(df, name, new_percentage):
    """Set the percentage of every row whose ``Name`` equals *name*.

    The frame is modified in place and the same object is returned, so the
    result can be re-assigned by the caller.

    Args:
        df: DataFrame with ``Name`` and ``Percentage`` columns.
        name: Value matched exactly (``==``) against the ``Name`` column.
        new_percentage: Replacement value written to ``Percentage``.

    Returns:
        The DataFrame object that was passed in. When no row matches,
        nothing is changed.
    """
    df.loc[df['Name'] == name, 'Percentage'] = new_percentage
    return df

# Ask which student to update and what the replacement value is.
name = input("Enter the Name whose percentage is to be changed:- ")
prompt = f"Enter the New Percentage for {name}:- "
new_percentage = float(input(prompt))

# change_percentage returns the frame it was given, so df is rebound to it.
df = change_percentage(df, name, new_percentage)
print("Updated Dataframe\n", df)

7) Write a Pandas program to join the two given dataframes


along rows and assign all data.
import pandas as pd

# First frame.
data1 = dict(
    Name=['Vishal', 'Ram', 'Shyam'],
    Percentage=[90, 85, 75],
)
df1 = pd.DataFrame(data1)
print("DataFrame 1:\n", df1)

# Second frame with the same columns.
data2 = dict(
    Name=['Ravi', 'Atul', 'Amir'],
    Percentage=[90, 85, 75],
)
df2 = pd.DataFrame(data2)
print("DataFrame 2:\n", df2)

# Stack the frames vertically; ignore_index=True renumbers the rows 0..n-1.
frames = [df1, df2]
df3 = pd.concat(frames, ignore_index=True)
print("Joined DataFrame:\n", df3)

8) Write a Pandas program to join the two given dataframes


along columns and assign all data.
import pandas as pd

# First frame: who the students are and what they scored.
data1 = dict(
    Name=['Vishal', 'Ram', 'Shyam'],
    Percentage=[90, 85, 75],
)
df1 = pd.DataFrame(data1)
print("DataFrame 1:\n", df1)

# Second frame: extra attributes for the same row positions.
data2 = dict(
    Age=[18, 17, 19],
    Grade=['A', 'B', 'C'],
)
df2 = pd.DataFrame(data2)
print("DataFrame 2:\n", df2)

# Place the frames side by side, keeping the original column labels.
df3 = pd.concat([df1, df2], axis='columns')
print("Joined DataFrame:\n", df3)

9) Write a Pandas program to append a list of dictionaries or Series to an existing DataFrame and display the combined data.
import pandas as pd

data = {
    "Name": ["Vishal", "Ram", 'Shyam'],
    'Percentage': [90, 85, 75],
}
df = pd.DataFrame(data)
print("Original DataFrame\n", df)  # Original DataFrame

# List of dictionaries: one dictionary per new row.
new_dictionaries = [
    {'Name': 'Atul', 'Percentage': 78},
    {'Name': 'Ravi', 'Percentage': 74},
]

# List of Series: each Series is one new row, labelled by column name.
new_series = [
    pd.Series(['Amar', 89], index=['Name', 'Percentage']),
    pd.Series(['Vicky', 65], index=['Name', 'Percentage']),
]

# DataFrame.append was removed in pandas 2.0 and `_append` is a private
# method, so each list is turned into a frame and joined with pd.concat.
df = pd.concat([df, pd.DataFrame(new_dictionaries)], ignore_index=True)
print("DataFrame with appended list of dictionaries\n", df)

df = pd.concat([df, pd.DataFrame(new_series)], ignore_index=True)
print("DataFrame with appended list of series\n", df)

10) Program to select or filter rows from a DataFrame based on


values in columns in pandas ( Use of Relational and Logical
Operators)
import pandas as pd

data = {
    'Name': ['Vishal', 'Ram', 'Shyam', 'Anil', 'Raju'],
    'Age': [18, 17, 18, 15, 16],
    'Percentage': [90, 75, 84, 74, 85],
}
df = pd.DataFrame(data)

# Name the two relational tests once, then combine them logically.
is_17_plus = df['Age'].ge(17)
scored_80_plus = df['Percentage'].ge(80)

print("And Operator:-\n", df[is_17_plus & scored_80_plus])

print("OR Operator:-\n", df[is_17_plus | scored_80_plus])

11) Filter out rows based on different criteria such as duplicate


rows.
import pandas as pd

def duplicate(df):
    """Print every row of *df* that repeats an earlier row.

    A row counts as a duplicate when all of its column values equal those
    of a row that appears before it. The first occurrence is not reported,
    and each repeated row is reported exactly once, however many earlier
    rows it matches.

    Args:
        df: DataFrame to inspect. It is not modified.

    Returns:
        None. The report is written to standard output.
    """
    # duplicated(keep='first') flags each repeat a single time. The previous
    # pairwise comparison listed a row once per earlier match, so a row
    # occurring three times was printed more than once.
    repeated = df[df.duplicated(keep='first')]

    print("\nDuplicate Values:")
    for idx, row_values in repeated.iterrows():
        row_dict = dict(zip(df.columns, list(row_values)))
        print(f"Index: {idx}, Values: {row_dict}")

# Sample data in which the last two rows repeat rows 0 and 3.
data = dict(
    Name=['Vishal', 'Ram', 'Shyam', 'Anil', 'Raju', 'Vishal', 'Anil'],
    Age=[18, 17, 14, 15, 16, 18, 15],
    Percentage=[90, 75, 84, 74, 85, 90, 74],
)

df = pd.DataFrame(data)
print("Original DataFrame:\n", df)

# Report the repeated rows first ...
duplicate(df)

# ... then keep only the first occurrence of each distinct row.
df = df.drop_duplicates(keep='first')
print("\nAfter dropping duplicate values:\n", df)

12) Importing and exporting data between pandas and CSV file:
# To create and open a data frame from the ‘Student_result.csv’ file using Pandas.
import pandas as pd

# The raw string keeps the backslashes of the Windows path literal.
csv_path = r"C:\Users\Vishal Shrivastawa\Desktop\Student_Result.csv"
df = pd.read_csv(csv_path)

print(df)
# To display row labels, column labels, data types of each column and the dimensions.
# To display the shape (number of rows and columns) of the CSV file.
import pandas as pd

csv_path = r"C:\Users\Vishal Shrivastawa\Desktop\Student_Result.csv"
df = pd.read_csv(csv_path)

# Each entry pairs a heading with the structural attribute printed under it.
structure_report = (
    ("Row labels:", df.index),
    ("\nColumn labels:", df.columns),
    ("\nData types:", df.dtypes),
    ("\nDimension:", df.shape),
)
for heading, attribute in structure_report:
    print(heading)
    print(attribute)
13) Read the ‘Student_result.csv’ to create data frame and do the
following operation:
# To display Adm_No, Gender and Percentage from the ‘Student_result.csv’ file.
import pandas as pd

csv_path = r"C:\Users\Vishal Shrivastawa\Desktop\Student_Result1.csv"
df = pd.read_csv(csv_path)

# Passing a list of labels selects those columns as a new DataFrame.
wanted_columns = ['Adm_No', 'Gender', 'Percentage']
print("Filtered DataFrame:-\n", df[wanted_columns])

# To display first 5 and last 5 records from ‘student_result.csv’ file.


import pandas as pd

csv_path = r"C:\Users\Vishal Shrivastawa\Desktop\Student_Result1.csv"
df = pd.read_csv(csv_path)

# head()/tail() default to 5 rows; the count is spelled out for clarity.
first_records = df.head(5)
last_records = df.tail(5)
print("First 5 Records\n", first_records)
print("\nLast 5 Records\n", last_records)
14) Read the ‘Student_result.csv’ to create data frame and do the
following operation:
# To display Student_result file with new column names.

import pandas as pd

# header=0 marks the file's first line as the header row; names= then
# supplies the labels used in place of that row.
new_names = ['Admission no.', 'Student Name', 'Gender', 'Age', 'Percentage']
df = pd.read_csv(
    r"C:\Users\Vishal Shrivastawa\Desktop\Student_Result1.csv",
    header=0,
    names=new_names,
)
print(df)

# To replace the Percentage of students scoring below 40 with NaN in the dataframe.
import pandas as pd
import numpy as np

df = pd.read_csv(r"C:\Users\Vishal Shrivastawa\Desktop\Student_Result1.csv")

# Blank out only the Percentage cell of rows scoring below 40.
# The previous `df[mask] = np.NaN` overwrote every column of those rows
# (name, gender, ... were lost), and the `np.NaN` alias no longer exists in
# NumPy 2.0. Series.mask replaces the flagged values with NaN and leaves the
# other columns untouched.
df['Percentage'] = df['Percentage'].mask(df['Percentage'] < 40)

print("Required Datafram:\n", df)

15) Read the ‘Student_result.csv’ to create data frame and do the


following operation:
# To create a duplicate file of ‘student_result.csv’ containing Adm_No, Name and Percentage.

import pandas as pd

# Read only the three columns the copy should contain.
df = pd.read_csv(
    r"C:\Users\Vishal Shrivastawa\Desktop\Student_Result1.csv",
    usecols=['Adm_No', 'Name', 'Percentage'],
)

# index=False stops pandas from writing the row index as an extra unnamed
# first column, so the copy holds exactly Adm_No, Name and Percentage.
df.to_csv(
    r"C:\Users\Vishal Shrivastawa\Desktop\Student_Result1_Copy.csv",
    index=False,
)
# Write the statement in Pandas to find the highest percentage and also print the student’s name and percentage.
import pandas as pd

df = pd.read_csv(r"C:\Users\Vishal Shrivastawa\Desktop\Student_Result1.csv")

# Keep every row that ties for the top score, but show only the name and
# percentage, which is what the exercise asks for (previously all columns
# were printed).
is_top = df['Percentage'] == df['Percentage'].max()
print("Maximum Percentage\n", df.loc[is_top, ['Name', 'Percentage']])

16) Importing and exporting data between pandas and MySQL


database.
import pandas as pd
from sqlalchemy import create_engine

# NOTE(review): the database user and password are embedded in this URL;
# consider loading them from the environment before sharing the script.
engine = create_engine('mysql+mysqldb://root:vishal@localhost/pandas_sql')

data = dict(
    Adm_No=[101, 102, 103],
    Name=['Vishal', 'Ram', 'Shyam'],
    Percentage=[90, 75, 84],
)
df = pd.DataFrame(data)

# if_exists='replace' drops an existing table of that name before writing;
# index=False keeps the DataFrame index out of the table.
df.to_sql(
    'student_results',
    con=engine,
    if_exists='replace',
    index=False,
)


import pandas as pd
from sqlalchemy import create_engine

# NOTE(review): credentials are hard-coded in the connection URL.
engine = create_engine('mysql+mysqldb://root:vishal@localhost/pandas_sql')

# Fetch two columns of the exported table back into a DataFrame.
query = "Select Name, Percentage from student_results"
df = pd.read_sql(query, con=engine)

print(df)

17) Find the sum of each column, or find the column with the
lowest mean.
import pandas as pd

data = dict(
    A=[24, 10, 15],
    B=[1, 25, 18],
    C=[7, 48, 12],
)

df = pd.DataFrame(data)

print("Sum of columns")
column_sums = df.sum()
print(column_sums)

print("\nMean of columns")
column_means = df.mean()
print(column_means)

# idxmin() returns the label (column name) that holds the smallest mean.
print("\nLowest Mean")
print(column_means.idxmin())

18) Locate the 3 largest values in a data frame.


import pandas as pd

data = {
    'A': [10, 5, 8, 12],
    'B': [20, 15, 25, 18],
    'C': [30, 25, 35, 22],
}
df = pd.DataFrame(data)

# Flatten every cell into one list of plain Python numbers and sort it
# ascending; the three largest values are then the last three entries.
# Repeated values are kept, so ties can appear more than once.
rows = sorted(df.to_numpy().ravel().tolist())

print("Three largest values in the datafram:\n")
# Negative-step slice: last three items, largest first.
print(rows[-1:-4:-1])
19) Subtract the mean of a row from each element of the row in a
Data Frame.
import pandas as pd

data = {
    'A': [10, 5, 8, 12],
    'B': [20, 15, 25, 18],
    'C': [30, 25, 35, 22],
}
df = pd.DataFrame(data)

# The exercise asks for the mean of each ROW. df.mean() without an axis
# gives one mean per column, which is what the previous code subtracted.
# axis=1 yields one mean per row instead.
mean = df.mean(axis=1)

# sub(..., axis=0) aligns the per-row means with the row index, so every
# element has its own row's mean subtracted.
df = df.sub(mean, axis=0)
print(df)

20) Replace all negative values in a data frame with a 0.


import pandas as pd

data = {
    'A': [1.25, -3.75, -0.75, 3.25],
    'B': [0.5, -4.5, 5.5, -1.5],
    'C': [2.0, -3.0, 7.0, -6.0],
}
df = pd.DataFrame(data)

# clip(lower=0) raises every value below 0 up to 0 in one vectorised call,
# replacing the cell-by-cell loop; non-negative values are left as they are.
df = df.clip(lower=0)

print("Required Dataframe")
print(df)
21) Replace all missing values in a data frame with a 999.
import pandas as pd
import numpy as np

# np.nan is the supported spelling; the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
data = {
    'A': [10, np.nan, 8, 12],
    'B': [np.nan, 15, 25, np.nan],
    'C': [30, 25, 35, np.nan],
}
df = pd.DataFrame(data)

# fillna replaces every missing cell in one call instead of visiting each
# cell with a nested loop.
df = df.fillna(999)

print("Required DataFrame")
print(df)

22) Given a Series, print all the elements that are above the 75th
percentile.
import pandas as pd

data = [74, 21, 78, 89, 45, 81, 92, 63, 49, 84]
s1 = pd.Series(data)

# The 75th percentile is a statistic of the data, not the literal number 75:
# quantile(0.75) is the value below which three quarters of the elements lie.
percentile_75 = s1.quantile(0.75)

print("Above 75 Percentile:")
print(s1[s1 > percentile_75])
23) Create a Data Frame quarterly sale where each row contains
the item category, item name, and expenditure. Group the
rows by the category, and print the total expenditure per
category.
import pandas as pd

data = dict(
    Category=['Electronics', 'Electronics', 'Electronics', 'Furniture', 'Furniture',
              'Grocery', 'Grocery', 'Grocery'],
    Item=['Laptop', 'Smartphone', 'Tablet', 'Chair', 'Table', 'Milk', 'Bread', 'Eggs'],
    Expenditure=[1200, 800, 300, 150, 200, 50, 20, 30],
)
df = pd.DataFrame(data)

print("Original DataFrame:\n", df)

# Sum the expenditure within each category; reset_index() turns the group
# labels back into an ordinary column.
per_category = df.groupby('Category')['Expenditure'].sum()
category_totals = per_category.reset_index()
print("\nTotal Expenditure per Category:\n", category_totals)
24) Create a data frame based on ecommerce data and generate
descriptive statistics (mean, median, mode, quartile, and
variance)
import pandas as pd
import numpy as np
from scipy import stats

data = dict(
    OrderID=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    CustomerID=[101, 102, 103, 104, 105, 101, 102, 103, 104, 105],
    Category=['Electronics', 'Electronics', 'Clothing', 'Clothing', 'Books',
              'Electronics', 'Books', 'Clothing', 'Books', 'Electronics'],
    Sales=[250, 150, 50, 200, 300, 400, 100, 150, 200, 350],
    Quantity=[1, 2, 1, 4, 3, 1, 2, 3, 1, 2],
)
df = pd.DataFrame(data)

print("Original DataFrame:\n", df)

# All statistics describe the Sales column, so pull it out once.
sales = df['Sales']

mean_sales = sales.mean()
median_sales = sales.median()
# With keepdims=True the result's first field is an array of modes; its
# first element is taken as the reported mode.
mode_result = stats.mode(sales, keepdims=True)
mode_sales = mode_result[0][0]
quartiles_sales = sales.quantile([0.25, 0.5, 0.75])
variance_sales = sales.var()

print("\nDescriptive Statistics for Sales:")
print(f"Mean: {mean_sales}")
print(f"Median: {median_sales}")
print(f"Mode: {mode_sales}")
print(f"Quartiles: \n{quartiles_sales}")
print(f"Variance: {variance_sales}")
II. Visualization
25) Given the school result data, analyse the performance of the
students on different parameters, e.g. subject-wise or
class-wise.
import pandas as pd
import matplotlib.pyplot as plt

data = {
    'StudentID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Name': ['Vishal', 'Anil', 'Shyam', 'Ram', 'Raju', 'Deepa', 'Sita', 'Gita', 'Krishna',
             'Meena'],
    'Class': ['12th', '11th', '11th', '11th', '11th', '12th', '12th', '12th', '12th',
              '11th'],
    'Math': [95, 78, 92, 88, 76, 89, 90, 85, 87, 85],
    'Computer': [92, 82, 94, 89, 78, 85, 91, 88, 86, 90],
    'English': [93, 80, 85, 90, 82, 91, 87, 89, 88, 83],
}
df = pd.DataFrame(data)

# Subject-wise view: one average per subject.
Mean_Subject = df[['Math', 'Computer', 'English']].mean()
# Class-wise view: one row of subject averages per class.
Mean_Class = df.groupby('Class')[['Math', 'Computer', 'English']].mean()

# Chart 1: a Series plot draws on the current figure, so the figure is
# created first to control its size.
plt.figure(figsize=(10, 5))
Mean_Subject.plot(kind='bar', color=['blue', 'green', 'orange'])
plt.title('Average Score Subject Wise')
plt.xlabel('Subjects')
plt.ylabel('Average Scores')
plt.show()

# Chart 2: DataFrame.plot opens its own figure, sized through figsize=.
Mean_Class.plot(kind='bar', figsize=(10, 5))
plt.title('Average Scores by Class')
plt.xlabel('Class')
plt.ylabel('Average Scores')
plt.show()

# Chart 3: no plt.figure() call here. DataFrame.plot creates a new figure
# anyway, so an explicit one would only show up as an extra blank window.
df.plot(x='Name', y=['Math', 'Computer', 'English'], kind='bar', figsize=(10, 5))
plt.title('Individual Performance in Each Subject')
plt.xlabel('Name')
plt.ylabel('Scores')
plt.show()
26) Write a program to plot a bar chart in python to display the
result of a school for five consecutive years.
import matplotlib.pyplot as plt
import pandas as pd

data = {
    'Years': [2019, 2020, 2021, 2022, 2023],
    'Maths': [85, 88, 90, 87, 92],
    'Computer': [82, 84, 86, 83, 89],
    'English': [78, 80, 82, 79, 85],
}

df = pd.DataFrame(data)

# DataFrame.plot creates its own figure (sized through figsize=). The
# plt.figure() call that used to precede it only produced an extra blank
# window, so it has been dropped.
df.plot(x='Years', y=['Maths', 'Computer', 'English'], kind='bar', figsize=(10, 5))
# The title previously read 'Individual Performance in Each Subject', which
# was carried over from the per-student chart and did not describe this one.
plt.title('School Result for Five Consecutive Years')
plt.xlabel('Years')
plt.ylabel('Scores')
plt.show()

27) For the Data frames created above, analyze and plot
appropriate charts with title and legend: number of students
against scores in all the 7 subjects, showing the highest score of
each subject.
import pandas as pd
import matplotlib.pyplot as plt
data = {
    'Student ID': range(101, 111),
    'Name': ['Vishal', 'Anil', 'Shyam', 'Ram', 'Raju', 'Deepa', 'Sita', 'Gita',
             'Krishna', 'Meena'],
    'Class': ['12th', '11th', '11th', '11th', '11th', '12th', '12th', '12th', '12th',
              '11th'],
    'Math': [85, 78, 92, 88, 76, 89, 90, 85, 87, 95],
    'Science': [90, 82, 94, 89, 78, 85, 91, 88, 86, 92],
    'English': [88, 80, 85, 90, 82, 91, 87, 89, 88, 93],
    'History': [70, 75, 68, 74, 69, 77, 72, 78, 74, 80],
    'Geography': [72, 78, 74, 76, 71, 79, 75, 80, 77, 81],
    'Hindi': [82, 85, 84, 83, 80, 86, 88, 87, 85, 89],
    'Sanskrit': [78, 76, 80, 79, 75, 81, 77, 82, 80, 84],
}

df = pd.DataFrame(data)

subjects = ['Math', 'Science', 'English', 'History', 'Geography', 'Hindi', 'Sanskrit']

# Highest score per subject, indexed by subject name.
highest_scores = df[subjects].max()

# One histogram per subject on a 3x3 grid (seven cells are used).
plt.figure(figsize=(15, 10))
for i, subject in enumerate(subjects):
    plt.subplot(3, 3, i + 1)
    df[subject].plot(kind='hist', bins=10, alpha=0.7, label='Students')
    # The label is attached to the line itself. Passing a bare list to
    # plt.legend() labels artists in drawing order, which put the text
    # 'Highest Score' on the histogram bars instead of on this line.
    plt.axvline(highest_scores[subject], color='red', linestyle='dashed',
                linewidth=2, label='Highest Score')
    plt.title(subject)
    plt.xlabel('Scores')
    plt.ylabel('Number of Students')
    plt.legend()

plt.suptitle('Number of Students Against Scores in All Subjects', fontsize=16)
# rect leaves room at the top so the overall title does not overlap the grid.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

# Summary chart: the top score of every subject side by side.
plt.figure(figsize=(10, 5))
highest_scores.plot(kind='bar', color='skyblue')
plt.title('Highest Scores in Each Subject')
plt.xlabel('Subjects')
plt.ylabel('Scores')
plt.xticks(rotation=45)
plt.show()
28) For the Data frames created above, analyze, and plot
appropriate charts with title and legend. Show the Average
score of each subject
import pandas as pd
import matplotlib.pyplot as plt

data = dict(
    Name=['Vishal', 'Anil', 'Rajesh', 'Sunita', 'Amit'],
    Math=[88, 92, 85, 78, 90],
    Physics=[84, 89, 80, 75, 95],
    English=[90, 85, 87, 92, 88],
    Chemistry=[76, 80, 83, 85, 78],
    Hindi=[82, 87, 88, 90, 85],
    Computer=[95, 98, 94, 93, 97],
)

df = pd.DataFrame(data)

# numeric_only=True skips the text column (Name) when averaging.
average_marks = df.mean(numeric_only=True)
# Mean of the per-subject means, drawn as a reference line.
overall_average = average_marks.mean()

plt.figure(figsize=(10, 6))
average_marks.plot(kind='bar', color='skyblue', alpha=0.7)
plt.axhline(
    y=overall_average,
    color='red',
    linestyle='dashed',
    linewidth=2,
    label='Overall Average',
)

plt.title('Average Scores of Each Subject')
plt.xlabel('Subjects')
plt.ylabel('Average Score')
plt.legend()

plt.show()

29) For the Data frames created above, analyze and plot
appropriate charts with title and legend: the number of females
and males, and the average percentage of females and males.
import pandas as pd
import matplotlib.pyplot as plt

data = {
    'Name': ['Vishal', 'Anil', 'Rajesh', 'Sunita', 'Amit', 'Deepa', 'Kiran', 'Nita',
             'Rahul', 'Meena'],
    'Gender': ['M', 'M', 'M', 'F', 'M', 'F', 'F', 'F', 'M', 'F'],
    'Percentage': [88, 92, 85, 78, 90, 95, 88, 82, 87, 91],
}

df = pd.DataFrame(data)

# sort_index() puts the counts in label order, the same order groupby uses
# below, so both charts list the genders identically.
gender_counts = df['Gender'].value_counts().sort_index()

average_percentage_by_gender = df.groupby('Gender')['Percentage'].mean()

# Colours are looked up by gender label rather than by bar position. A
# positional list gave the first bar 'skyblue' in both charts even when the
# first bar stood for a different gender in each of them.
gender_colors = {'F': 'pink', 'M': 'skyblue'}
count_colors = [gender_colors[g] for g in gender_counts.index]
average_colors = [gender_colors[g] for g in average_percentage_by_gender.index]

plt.figure(figsize=(14, 6))

# Left panel: head count per gender.
plt.subplot(1, 2, 1)
gender_counts.plot(kind='bar', color=count_colors, alpha=0.7)
plt.title('Number of Females and Males')
plt.xlabel('Gender')
plt.ylabel('Number of Students')
plt.xticks(rotation=0)

# Right panel: mean percentage per gender.
plt.subplot(1, 2, 2)
average_percentage_by_gender.plot(kind='bar', color=average_colors, alpha=0.7)
plt.title('Average Percentage of Females and Males')
plt.xlabel('Gender')
plt.ylabel('Average Percentage')
plt.xticks(rotation=0)

plt.tight_layout()
plt.show()
30) Take data of your interest from an open source (e.g.
data.gov.in), aggregate and summarize it. Then plot it using
different plotting functions of the Matplotlib library.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv(r"C:\Users\Vishal Shrivastawa\Desktop\datafile.csv")

# Every column from the third onward is plotted as one point on the x-axis.
# NOTE(review): assumes the first two columns are SECTORS and Economic
# Activity and the rest are years - confirm against datafile.csv.
year_columns = df.columns[2:]

# These labels were split across two source lines, which left unterminated
# string literals (a SyntaxError). They are rejoined with a single space.
# NOTE(review): confirm the exact spelling used in the CSV.
rd_activity = "R&D expenditure Rs. Crores"
gdp_activity = "GDP by industry of origin"

# Chart 1: R&D expenditure, one line per sector.
plt.figure(figsize=(15, 10))
for sector in df["SECTORS"].unique():
    sector_df = df[(df["SECTORS"] == sector)
                   & (df["Economic Activity"] == rd_activity)]
    # Sectors without a matching row are skipped; otherwise the first
    # matching row is plotted.
    if not sector_df.empty:
        plt.plot(year_columns, sector_df.iloc[0, 2:], label=sector)

plt.xlabel('Year')
plt.ylabel('R&D Expenditure Rs. Crores')
plt.title('R&D Expenditure by Sector Over the Years')
plt.legend()
plt.show()

# Chart 2: GDP by industry of origin, one line per sector. The activity is
# matched as a substring rather than by exact equality.
plt.figure(figsize=(15, 10))
for sector in df["SECTORS"].unique():
    sector_df = df[(df["SECTORS"] == sector)
                   & (df["Economic Activity"].str.contains(gdp_activity))]
    if not sector_df.empty:
        plt.plot(year_columns, sector_df.iloc[0, 2:], label=sector)

plt.xlabel('Year')
plt.ylabel('GDP by Industry of origin (at factor cost Rs. Crores)')
plt.title('GDP by Sector Over the Years')
plt.legend()
plt.show()

You might also like