Week2 lab
Week2 lab
[]
import numpy as np
from scipy import stats
import math as mt
import random as rnd
[]
# Survey data: how many pets each respondent owns.
sample = [1, 3, 2, 5, 7, 0, 2, 3]

# Arithmetic mean = total of the observations / number of observations.
n_obs = len(sample)
mean = sum(sample) / n_obs
print(mean)  # prints 2.875
[]
# Three exams of .20 weight each and a final exam of .40 weight.
sample = [90, 80, 63, 87]
weights = [.20, .20, .20, .40]

# Weighted mean: scale every score by its weight, then divide by the total
# weight so the result is still correct when the weights do not sum to 1.
weighted_mean = sum(s * w for s, w in zip(sample, weights)) / sum(weights)
print(weighted_mean)  # prints 81.4
Solve the following question using the sample code above
Calculating median
[]
# Number of pets each person owns
sample = [0, 1, 5, 7, 9, 10, 14]


def median(values):
    """Return the median of *values*.

    The input is sorted into a new list (the caller's data is not modified)
    and that sorted list is printed so the lab output shows which element(s)
    sit in the middle.

    Args:
        values: a non-empty iterable of numbers.

    Returns:
        The middle element when the count is odd; the average of the two
        middle elements (as a float) when the count is even.

    Raises:
        ValueError: if *values* is empty.
    """
    ordered = sorted(values)
    if not ordered:
        # Without this guard the index arithmetic below would silently
        # wrap around to ordered[-1] and fail with a confusing IndexError.
        raise ValueError("median() requires at least one value")
    print(ordered)
    n = len(ordered)
    mid = n // 2
    if n % 2 == 0:
        # Even count: average the two elements straddling the centre.
        return (ordered[mid - 1] + ordered[mid]) / 2.0
    return ordered[mid]


print(median(sample))  # prints 7
Question: Find the median for the following data set: 102, 56, 34, 99, 89,
101, 10.
Calculating Mode
[]
# Number of pets each person owns
from collections import defaultdict

sample = [1, 3, 2, 5, 7, 0, 2, 3]


def mode(values):
    """Return every value that occurs most often in *values*.

    Args:
        values: a re-iterable collection (e.g. a list) of hashable items.

    Returns:
        A list of the item(s) sharing the highest occurrence count; more
        than one item is returned when there is a tie.  The list is built
        by iterating a set, so its ordering is not guaranteed.  An empty
        input yields an empty list.
    """
    counts = defaultdict(int)
    for s in values:
        counts[s] += 1
    if not counts:
        # max() would raise on an empty sequence; no data means no mode.
        return []
    max_count = max(counts.values())
    return [v for v in set(values) if counts[v] == max_count]


print(mode(sample))  # [2, 3]
[]
# Number of pets each person owns
data = [0, 1, 5, 7, 9, 10, 14]


def variance(values):
    """Return the population variance of *values*.

    The squared deviations from the mean are averaged over ``len(values)``
    (not ``len(values) - 1``), i.e. this is the population formula.

    Args:
        values: a non-empty sequence of numbers.

    Returns:
        The mean squared deviation from the mean, as a float.

    Raises:
        ZeroDivisionError: if *values* is empty.
    """
    mean = sum(values) / len(values)
    return sum((v - mean) ** 2 for v in values) / len(values)


def std_dev(values):
    """Return the population standard deviation of *values*.

    This is the square root of :func:`variance`, so the same input
    requirements apply.
    """
    # `mt` is the file-level alias for the math module (import math as mt).
    return mt.sqrt(variance(values))
[4]
import pandas as pd

# Toy roster used by the pandas examples: one row per student.
student_df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Carol', 'Dan', 'Eli', 'Fran'],
    'gender': ['female', 'male', 'female', 'male', 'male', 'female'],
    # Class codes must not carry stray whitespace, otherwise grouping or
    # filtering on 'class' treats ' JR' and 'JR' as different values.
    'class': ['FY', 'SO', 'SR', 'SO', 'JR', 'SR'],
    'gpa': [90, 93, 97, 89, 95, 92],
    'num_classes': [4, 3, 4, 4, 3, 2],
})
student_df  # bare expression: displays the frame when run as a notebook cell
[6]
# Encode gender as a boolean column, then discard the original text column.
student_df['female_flag'] = student_df['gender'].eq('female')
student_df = student_df.drop(columns=['gender'])
student_df  # bare expression: displays the frame when run as a notebook cell
Using group-by
[]
Showing number of students of each gender
[9]
# NOTE(review): `gender_group` is not defined in any visible cell; presumably
# it is student_df.groupby('gender') created in a cell that was not exported.
# Confirm before running. If so, this sums 'num_classes' within each group.
gender_group['num_classes'].sum()
[]
>>> import matplotlib.pyplot as plt
[10]
import matplotlib.pyplot as plt

# Paired coordinates: the i-th point is (x[i], y[i]).
x = [1, 2, 3, 1.5, 2]
y = [-1, 5, 2, 3, 0]

# Draw one marker per (x, y) pair and render the figure.
plt.scatter(x, y)
plt.show()
Say we have an attribute in our dataset that contains the sample data
stored in x. We can call plt.hist() on x to plot the distribution of the
values in the attribute like so:
[12]
# Draw 100 samples from the standard normal distribution and plot how
# their values are distributed.
x = np.random.standard_normal(100)
plt.hist(x)
plt.show()
[13]
# Histogram of the 'gpa' column, drawn through the pandas plotting wrapper.
student_df['gpa'].plot(kind='hist')
plt.show()
In this activity, we will practice some basic data processing and analysis
techniques on a dataset available online called Communities and Crime, with
the hope of consolidating our knowledge and techniques. Specifically, we will
process missing values in the dataset, iterate through the attributes, and
visualize the distribution of their values.
First, we need to download this dataset to our local environment; it can be
accessed on this page: https://packt.live/31C5yrZ