Big Data Analytics Notes
Big Data Analytics Notes
# Matplotlib library
# Quick scatter plot of five points; the first list holds the x values and the
# second list the y values.
# NOTE(review): `plt` is only imported further down in these notes, so the
# matplotlib import has to be run before this line.
plt.scatter([1,2,3,4,5],[10,20,30,40,50])
<matplotlib.collections.PathCollection at 0x17254053ee0>
import numpy as np
import matplotlib.pyplot as plt
# Sample data reused by the plots below: eight x positions and one y value for
# each of them.
x= np.array([1,2,3,4,5,6,7,8])
y= np.array([15,32,66,45,90,153,170,200])
# Set the title and axis labels, then draw y against x as a scatter plot.
plt.title("Graph")
plt.xlabel("X axis")
plt.ylabel("Y axis")
plt.scatter(x,y)
<matplotlib.collections.PathCollection at 0x172540b2c40>
# Same x/y data as a styled scatter plot: red ('r') star ('*') markers of
# size 50, with a legend entry taken from the `label` argument.
plt.title("Scatter Plot")
plt.xlabel("X axis")
plt.ylabel("Y axis")
plt.scatter(x,y,label= 'nothing', s=50, color = 'r', marker ='*')
plt.legend()  # displays the 'nothing' label given to plt.scatter above
plt.show()
# A scatter plot is used to check the relationship between two
# variables (correlation).
# Line plot of the same x/y data, drawn in red.
plt.title("Graph")
plt.xlabel("X axis")
plt.ylabel("Y axis")
plt.plot(x,y, color='r')
[<matplotlib.lines.Line2D at 0x17254141760>]
# We can also visualize the distribution of each variable with a histogram.
# No `bins` argument is given here; the output recorded below shows the
# counts for 10 bins spanning 15 to 200.
plt.hist(y)
(array([2., 1., 1., 0., 1., 0., 0., 1., 1., 1.]),
array([ 15. , 33.5, 52. , 70.5, 89. , 107.5, 126. , 144.5, 163. ,
181.5, 200. ]),
<BarContainer object of 10 artists>)
# Histogram of x followed by a histogram of y (only the output of the last
# call is echoed below).
plt.hist(x)
plt.hist(y)
(array([2., 1., 1., 0., 1., 0., 0., 1., 1., 1.]),
array([ 15. , 33.5, 52. , 70.5, 89. , 107.5, 126. , 144.5, 163. ,
181.5, 200. ]),
<BarContainer object of 10 artists>)
# Bar positions (x2) and bar heights (y2) for the bar chart below.
x2= [6,9,11]
y2= [6,15,7]
plt.show()  # display whatever has been drawn so far before starting the bar chart
# Bar graph: bar graphs are used to compare things between different groups.
# They are especially well suited to showing changes over time.
# NOTE(review): the plt.bar call was missing from these notes, so x2/y2 were
# never drawn and plt.legend() had no labelled artist to list. The label and
# colour below are placeholders - adjust as needed.
plt.bar(x2, y2, label='Bars 2', color='c')
plt.legend()
plt.title("BAR CHART ")
plt.xlabel("bar number")
plt.ylabel("bar height")
plt.show()
# HISTOGRAM:
# Ages to be binned, and the bin edges: 0 to 110 in steps of 10.
age = np.array([22,55,62,45,75,21,22,34,42,4,99,100,104,52])
bins= np.array([0,10,20,30,40,50,60,70,80,90,100,110])
# NOTE(review): the plt.hist call was missing from these notes, so `age` and
# `bins` were never used and the figure was empty. rwidth leaves a small gap
# between bars; the label feeds plt.legend() below.
plt.hist(age, bins=bins, histtype='bar', rwidth=0.8, label='age')
plt.title("Histogram")
plt.xlabel("x axis")
plt.ylabel("y axis")
plt.legend()
# Optional: uncomment to draw a black grid behind the bars.
#plt.grid(True,color='k')
plt.show()
# Difference between a Bar Chart & Histogram?
# A histogram shows a quantitative variable - e.g. how much each age group
# contributes towards GDP in a specific country.
# A bar chart shows categorical variables - e.g. GDP per country.
# Stack plot (area plot): used to track changes over time for one or more
# related groups that together make up one whole category.
# Time spent on each activity over five days; the four values for any one day
# add up to 24 (presumably hours - confirm).
days= [1,2,3,4,5]
sleeping= [7,8,6,11,7]
eating = [2,3,4,3,2]
working = [7,8,7,2,2]
playing = [8,5,7,8,13]
# NOTE(review): the plt.stackplot call was missing from these notes, so the
# data above was never drawn. Labels and colours are spelled out here so this
# cell does not depend on names defined later.
plt.stackplot(days, sleeping, eating, working, playing,
              labels=['sleeping', 'eating', 'working', 'playing'],
              colors=['c', 'm', 'r', 'b'])
plt.title("Stack Plot")
plt.xlabel("day")
plt.ylabel("time spent")
plt.legend()
plt.show()
# Pie chart of a single day's split across the four activities. The values
# match day 5 of the sleeping/eating/working/playing lists above.
slices = [7,2,2,13]
activities = ['sleeping', 'eating', 'working', 'playing']
col = ['c','m','r','b']  # one colour code per activity, in the same order
plt.pie(
slices,
labels = activities,
colors = col,
startangle = 90,
#shadow = True,
explode = (0,0.1,0,0),  # only the second wedge ('eating') is offset
autopct = '%1.1f%%'  # print each wedge's percentage with one decimal place
)
plt.title("Pie Plot")
#plt.legend()
plt.show()
from sklearn import datasets
# Load two of the example datasets exposed by sklearn.datasets.
# NOTE(review): `iris` is not used anywhere in the visible part of these notes.
iris = datasets.load_iris()
digits = datasets.load_digits()
# Print the `data` attribute of the digits dataset (output shown below).
print(digits.data)
[[ 0. 0. 5. ... 0. 0. 0.]
[ 0. 0. 0. ... 10. 0. 0.]
[ 0. 0. 0. ... 16. 9. 0.]
...
[ 0. 0. 1. ... 6. 0. 0.]
[ 0. 0. 2. ... 12. 0. 0.]
[ 0. 0. 10. ... 12. 1. 0.]]
# NUMPY Library
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14]
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14]
import numpy as np
# Build a 2x2 array from a nested list: each inner list becomes one row.
a = np.array([[1,2],[3,4]])
print(a)
[[1 2]
[3 4]]
import numpy as np
# Rows can also be given as tuples instead of lists.
a= np.array([(1,2),(3,4)]) # same output as above
print(a)
[[1 2]
[3 4]]
import numpy as np # same output as above
# Build the same 2x2 values from a string: ',' separates the entries within a
# row and ';' separates the rows.
# NOTE(review): this produces an np.matrix object, not a plain ndarray; NumPy's
# documentation reportedly discourages np.matrix for new code - confirm before
# reusing this form.
a = np.matrix('1,2;3,4')
print(a)
[[1 2]
[3 4]]
(2, 2)
# Numpy operations
# Numpy operation 1: Reshaping arrays
import numpy as np
# 2x4 array: 2 rows and 4 columns (8 elements in total), used as the starting
# point for the reshaping examples whose outputs follow.
a= np.array([(1,2,3,4),(3,4,5,6)])
a
array([[1, 2, 3, 4],
[3, 4, 5, 6]])
array([[1, 2],
[3, 4],
[3, 4],
[5, 6]])
array([[1, 3, 3, 5],
[2, 4, 4, 6]])
import numpy as np
# 2x4 array used for the indexing example below.
a= np.array([(1,2,3,4),(3,4,5,6)])
a[0,2] # Indexing a single element: row 0, column 2 -> 3
# 3x4 array used for the slicing examples below.
a= np.array([(1,2,3,4),(3,4,5,6),(7,8,9,10)])
a[0:,3] # Slicing: every row (0:) of column 3 -> [4, 6, 10]
array([ 4, 6, 10])
a[0:2,3] # Slicing: rows 0 and 1 (the stop index 2 is excluded) of column 3 -> [4, 6]
array([4, 6])
62
0.6321412059754348
[[ 1 4 9 16]
[ 9 16 25 36]
[ 49 64 81 100]]