# Program 1: correlation heatmap of randomly generated subject marks.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fifty random integer marks per subject (no seed is set, so every run differs).
data = {
    "Maths": np.random.randint(50, 100, 50),
    "Science": np.random.randint(50, 100, 50),
    "History": np.random.randint(50, 100, 50),
    "English": np.random.randint(50, 100, 50),
    "Geography": np.random.randint(50, 100, 50),
}
df = pd.DataFrame(data)

# NOTE(review): the file name was garbled in the source ('student [Link]');
# "student_marks.csv" is assumed here -- confirm.
csv_path = "student_marks.csv"
df.to_csv(csv_path, index=False)

# Read the marks back as a plain numeric array; skiprows=1 drops the header.
data_array = np.loadtxt(csv_path, delimiter=",", skiprows=1)
# rowvar=False: columns (subjects) are the variables, rows are observations.
correlation_matrix = np.corrcoef(data_array, rowvar=False)

subjects = ["Maths", "Science", "History", "English", "Geography"]
plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    xticklabels=subjects,
    yticklabels=subjects,
    cmap="coolwarm",
    cbar=True,
    linewidths=0.5,
    fmt=".2f",
)
plt.title("Correlation Matrix Heatmap")
plt.xlabel("Subjects")
plt.ylabel("Subjects")
plt.show()

print("Correlation Matrix:")
print(correlation_matrix)

# NOTE(review): statements 3 and 4 are hard-coded, but the data is random and
# unseeded, so they are not guaranteed to match the matrix printed above.
print("\nInterpretation:")
print("1. Diagonal values are 1.00, indicating perfect correlation with itself.")
print("2. Most off-diagonal values are close to zero, indicating weak relationships.")
print("3. Slight positive correlations exist between Maths and English or History.")
print("4. Negative correlation exists between English and Geography.")
# Program 2: logistic regression on the heart-disease dataset.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): the original declared this path but then read
# 'Heart_Disease_UCI.xls' with read_csv; the declared .csv path is used as the
# single source of truth -- confirm the real file name.
file_path = "Heart_Disease_UCI.csv"
data = pd.read_csv(file_path)

print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()

df = pd.DataFrame(data)
print("\nMissing Values:\n", df.isnull().sum())

# Every column except "Target" is a feature; "Target" is the label.
X = df.drop("Target", axis=1)
y = df["Target"]

# stratify=y keeps the class ratio the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then reuse it on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

plt.figure(figsize=(6, 5))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=["No Disease", "Disease"],
    yticklabels=["No Disease", "Disease"],
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Value")
plt.ylabel("Actual value")
plt.show()
# Program 3: linear regression of cancer-related deaths.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): the original file name was lost in extraction ("[Link]").
# "cancer_deaths.csv" is a placeholder -- replace it with the real data file.
file_path = "cancer_deaths.csv"
data = pd.read_csv(file_path)

print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()

df = pd.DataFrame(data)
print("\nmissing Values:")
print(df.isnull().sum())

# Trend of deaths over time.
plt.figure(figsize=(12, 6))
sns.lineplot(
    x="Year",
    y="CancerRelatedDeaths",
    data=data,
    label="Cancer-Related Deaths",
    marker="o",
)
plt.title("Trend of Cancer-Related Deaths Over Time")
plt.xlabel("Year")
plt.ylabel("Number of Deaths")
# NOTE(review): the three calls below were garbled; legend/grid/show is assumed.
plt.legend()
plt.grid()
plt.show()

feature_columns = ["Year", "Population", "HealthExpenditure"]
X = data[feature_columns]
y = data["CancerRelatedDeaths"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("\nModel Performance:")
print(f"Mean Squared Error(MSE):{mse:.2f}")
print(f"R-squared(R^2):{r2:.2f}")

# Sort by year before drawing the fitted values so the line does not zig-zag
# when the rows are not already in chronological order.
ordered = data.sort_values("Year")
plt.figure(figsize=(12, 6))
plt.scatter(
    data["Year"],
    data["CancerRelatedDeaths"],
    color="blue",
    label="Actual Data",
    alpha=0.6,
)
plt.plot(
    ordered["Year"],
    model.predict(ordered[feature_columns]),
    color="red",
    label="Regression Line",
)
plt.title("Cancer-Related Deaths and Regression Line")
plt.xlabel("Year")
plt.ylabel("Number of Deaths")
plt.legend()
plt.grid()
plt.show()
# Program 4: build an employee table, save it, reload it and draw a 25% sample.
import pandas as pd
import numpy as np

# 100 employees with sequential IDs/names and a randomly chosen designation.
data = {
    "EmpID": np.arange(1, 101),
    "EmpName": [f"Employee_{i}" for i in range(1, 101)],
    "Designation": np.random.choice(
        ["Manager", "Team Lead", "Developer", "Analyst", "Intern"], size=100
    ),
}
employee_df = pd.DataFrame(data)
employee_df.to_csv("employee_details.csv", index=False)
print("Dataset saved to 'employee_details.csv'.")

loaded_df = pd.read_csv("employee_details.csv")
# random_state fixes which rows are drawn, so the sample is reproducible.
sampled_df = loaded_df.sample(frac=0.25, random_state=42)
print("\n25% Random Sample of the Employee Dataset: ")
print(sampled_df)
# One-sample, right-tailed Z-test on simulated heights.
import numpy as np
from scipy.stats import norm

# Seeded so the simulated sample (and therefore the decision) is reproducible.
np.random.seed(42)
sample_heights = np.random.normal(loc=150, scale=10, size=40)

sample_mean = np.mean(sample_heights)
# ddof=1 gives the sample (n - 1) standard deviation.
sample_std_dev = np.std(sample_heights, ddof=1)
sample_size = len(sample_heights)

null_hypothesis_mean = 140
significance_level = 0.05

standard_error = sample_std_dev / np.sqrt(sample_size)
z_score = (sample_mean - null_hypothesis_mean) / standard_error
# Upper-tail critical value: reject only when the sample mean is significantly
# greater than the hypothesised mean.
critical_z = norm.ppf(1 - significance_level)

if z_score > critical_z:
    decision = "Reject the Null Hypothesis (Ho)"
else:
    decision = "Fail to Reject the Null Hypothesis (Ho)"

print("Z-Test Results:")
print(f"Sample Mean: {sample_mean:.2f}")
print(f"Sample Standard Deviation: {sample_std_dev:.2f}")
print(f"Z-Score: {z_score:.2f}")
print(f"Critical Z Value: {critical_z:.2f}")
print(f"Decision: {decision} ")
# Monthly sales: summary statistics, then a reshape/transpose demonstration.
import pandas as pd
import numpy as np

file_path = "monthly_sales.csv"
data = pd.read_csv(file_path)

print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()
print("Original Dataset Shape:", data.shape)

# Reduce over numeric columns only: a month-name or date column would make a
# whole-frame mean/sum/prod fail or give meaningless results.
numeric_data = data.select_dtypes(include="number")
mean_sales = numeric_data.mean()
print("Mean of Monthly Sales:", mean_sales)
sum_sales = numeric_data.sum()
print("Sum of Monthly Sales:", sum_sales)
product_sales = numeric_data.prod()
print("Product of Monthly Sales:", product_sales)

# NOTE(review): the generator call was garbled in the source ("[Link](60)");
# np.arange(60) is assumed -- confirm.
data = pd.DataFrame({'sales': np.arange(60)})
# Convert DataFrame column to NumPy array and reshape
reshaped_sales = data['sales'].values.reshape(60, 1)
print("Reshaped Array (60 rows, 1 column):\n", reshaped_sales)
# Transpose the reshaped array
transposed_sales = reshaped_sales.T
print("Transposed Array:\n", transposed_sales)
# Monthly sales, column-aware version: statistics are computed on 'sales' only.
import pandas as pd
import numpy as np

# Read dataset
file_path = "monthly_sales.csv"
data = pd.read_csv(file_path)

# Display dataset overview
print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
# info() prints its own report and returns None, so it is not wrapped in print().
data.info()

# Ensure numerical operations are performed on relevant columns
print("Original Dataset Shape:", data.shape)

# Assuming 'sales' is the numeric column in the dataset
if 'sales' in data.columns:
    mean_sales = data['sales'].mean()
    print("Mean of Monthly Sales:", mean_sales)
    sum_sales = data['sales'].sum()
    print("Sum of Monthly Sales:", sum_sales)
    product_sales = data['sales'].prod()
    print("Product of Monthly Sales:", product_sales)
else:
    print("Column 'sales' not found in the dataset.")

# Create a new DataFrame with 60 values
# NOTE(review): the generator call was garbled in the source ("[Link](60)");
# np.arange(60) is assumed -- confirm.
data = pd.DataFrame({'sales': np.arange(60)})
# Convert DataFrame column to NumPy array and reshape
reshaped_sales = data['sales'].values.reshape(60, 1)
print("Reshaped Array (60 rows, 1 column):\n", reshaped_sales)
# Transpose the reshaped array
transposed_sales = reshaped_sales.T
print("Transposed Array:\n", transposed_sales)
# Program 8: cleaning a NumPy array that mixes numbers, NaN and a bad string.
import numpy as np

# dtype=object keeps the Python ints, the NaN floats and the "invalid" string
# side by side without NumPy coercing everything to one type.
data = np.array([
    [10, 20, 30],
    [40, np.nan, 60],
    [70, 80, "invalid"],
    [90, 100, 110],
    [120, np.nan, 150],
], dtype=object)
print("Original Array:")
print(data)
def replace_nan_with_mean(array):
    """Replace each NaN in a 2-D array with the mean of its column.

    The input is cast to float with ``copy=False``: an array that is already
    float is modified in place and returned, any other dtype is converted to
    a new float array first. The cast raises ``ValueError`` if a cell cannot
    be converted to a number.

    Column means ignore the NaN entries themselves. A column that is entirely
    NaN has no mean to fill in and is left unchanged.
    """
    numeric_array = array.astype("float", copy=False)
    for col in range(numeric_array.shape[1]):
        # Basic slicing yields a view, so assigning into it updates numeric_array.
        col_values = numeric_array[:, col]
        nan_mask = np.isnan(col_values)
        if nan_mask.any() and not nan_mask.all():
            col_values[nan_mask] = np.nanmean(col_values)
    return numeric_array
# Try NaN imputation on a float copy of `data`; the float cast raises
# ValueError when a cell cannot be converted to a number.
try:
    numeric_data = replace_nan_with_mean(data.astype("float"))
except ValueError:
    print("\nArray contains non-numeric values. NaN replacement skipped.")
else:
    print("\nArray After Replacing NaN with Column Averages:")
    print(numeric_data)
def remove_non_numeric_rows(array):
    """Return only the rows of a 2-D array whose cells are all usable numbers.

    A row is kept when every cell is an ``int`` or ``float`` and is not NaN;
    rows containing strings, other objects or NaN are dropped.
    """
    # dtype=bool keeps the mask usable as an index even when `array` has no
    # rows (an empty list would otherwise become a float array).
    numeric_mask = np.array(
        [
            all(isinstance(x, (int, float)) and not np.isnan(x) for x in row)
            for row in array
        ],
        dtype=bool,
    )
    return array[numeric_mask]
# Filter `data` through remove_non_numeric_rows and show what is left.
cleaned_data = remove_non_numeric_rows(data)
print("\nArray After Removing Rows with Non-Numeric values:", cleaned_data, sep="\n")
def contains_row(array, row):
    """Return True if any row of the 2-D ``array`` equals ``row`` element-wise."""
    # Compare every row against `row`, then collapse each row to one flag.
    row_matches = (array == row).all(axis=1)
    return bool(row_matches.any())
# Check whether one specific row is among the cleaned rows (cast to int first).
row_to_check = [90, 100, 110]
is_present = contains_row(cleaned_data.astype("int"), row_to_check)
answer = "Yes" if is_present else "No"
print(f"\n Does the array contain the row {row_to_check}?{answer}")
# Student marks analysis: averages, pass rates, correlations and a trend plot.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the CSV file
# assumes every column holds numeric marks -- TODO confirm against the file
file_path = "student_marks.csv"
scores = pd.read_csv(file_path)

# Display the dataset overview and info
print("Dataset Overview:")
print(scores.head())
print("\nDataset Info:")
# info() prints its own report and returns None, so it is not wrapped in print().
scores.info()

# 1. Average score for each subject (column-wise average)
subject_avg_scores = scores.mean(axis=0)
print("\nAverage Score for Each Subject Across All Students (by Subject):")
print(subject_avg_scores)

# 2. Average score per student (row-wise average)
student_avg_scores = scores.mean(axis=1)
highest_avg_student = student_avg_scores.idxmax()
lowest_avg_student = student_avg_scores.idxmin()
print("\nStudent with Highest Average Score (Index):", highest_avg_student)
print("Student with Lowest Average Score (Index):", lowest_avg_student)

# 3. Pass rate per subject (score >= 60 considered pass)
# The mean of a boolean column is the fraction of True values.
pass_rate = (scores >= 60).mean(axis=0)
print("\nPass Rate for Each Subject:")
print(pass_rate)

# 4. Correlation matrix between subjects
correlation_matrix = scores.corr()
print("\nCorrelation Matrix Between Subjects:")
print(correlation_matrix)

# 5. Overall average score for each semester (assuming each subject is a semester)
semester_avg_scores = scores.mean(axis=0)
print("\nOverall Average Score for Each Semester:")
print(semester_avg_scores)

# 6. Plotting the average scores
x_values = range(1, len(semester_avg_scores) + 1)
plt.figure(figsize=(8, 5))
plt.plot(x_values, semester_avg_scores, marker="o", linestyle="-", color="blue")
plt.title("Overall Average Scores Across Semesters")
plt.xlabel("Semester")
plt.ylabel("Average Score")
plt.grid(True)
plt.show()

# 7. Standard deviation of scores per subject
subject_std_dev = scores.std(axis=0)
print("\nStandard Deviation of Scores for Each Subject:")
print(subject_std_dev)
# Program 10: retail store sales analysis.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
file_path = "retail_store.csv"
store = pd.read_csv(file_path)
df = pd.DataFrame(store)

# Calculate revenue
df["Revenue"] = df["Quantity"] * df["Unit_Price"]
print("Sample Dataset:")
print(df.head())

# Total revenue
total_revenue = df["Revenue"].sum()
print(f"\nTotal revenue Generated: ${total_revenue:.2f}")

# Product with highest sales revenue
product_revenue = df.groupby("Product_ID")["Revenue"].sum()
highest_revenue_product = product_revenue.idxmax()
print(f"\nProduct with Highest Sales Revenue: Product_ID {highest_revenue_product}")

# Average quantity sold
average_quantity = df["Quantity"].mean()
print(f"\nAverage Quantity Sold per Transaction: {average_quantity:.2f}")

# Create a Month column for grouping
df["Month"] = pd.to_datetime(df["Date"]).dt.to_period("M")

# Monthly sales trend
monthly_sales = df.groupby("Month")["Revenue"].sum()
plt.figure(figsize=(10, 6))
monthly_sales.plot(kind="line", marker="o", color="blue")
plt.title("Monthly Sales Trend")
plt.xlabel("Month")
plt.ylabel("Total Revenue")
# NOTE(review): the two calls below were garbled; grid/show is assumed.
plt.grid()
plt.show()

# Correlation matrix
correlation = df[["Quantity", "Unit_Price"]].corr()
print("Correlation Matrix:")
print(correlation)

# Scatter plot
plt.figure(figsize=(8, 6))
sns.scatterplot(x="Unit_Price", y="Quantity", data=df, alpha=0.7)
plt.title("Correlation Between Quantity and Unit Price")
plt.xlabel("Unit Price")
plt.ylabel("Quantity Sold")
# NOTE(review): the two calls below were garbled; grid/show is assumed.
plt.grid()
plt.show()