data science programs
data science programs
C:\Program Files\Python311\Scripts
Type
import numpy as np

# Build a 1-D array from a Python list. Without this assignment `arr` is
# undefined and both prints below raise NameError.
arr = np.array([1, 2, 3, 4, 5])
print(arr)
# type() shows that the object is a NumPy ndarray, not a plain list.
print(type(arr))
import numpy as np

# np.array accepts any array-like object; a tuple is used here. The
# assignment was missing, so print(arr) raised NameError when run on its own.
arr = np.array((1, 2, 3, 4, 5))
print(arr)
import numpy as np

# A single scalar produces a 0-D array (no axes at all).
scalar_value = 42
arr = np.array(scalar_value)
print(arr)
1D array
import numpy as np

# A flat list of scalars produces a 1-D array. The assignment was missing,
# so print(arr) raised NameError when the snippet was run on its own.
arr = np.array([1, 2, 3, 4, 5])
print(arr)
2D array
import numpy as np

# Two rows of three values each give a 2-D array of shape (2, 3).
rows = [[1, 2, 3], [4, 5, 6]]
arr = np.array(rows)
print(arr)
3D array
import numpy as np

# Two 2-D blocks stacked together give a 3-D array of shape (2, 2, 3).
# The assignment was missing, so print(arr) raised NameError on its own.
arr = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])
print(arr)
Check the number of dimensions
import numpy as np

# One sample array per rank, from a scalar (0-D) up to 3-D.
row_pair = [[1, 2, 3], [4, 5, 6]]
a = np.array(42)
b = np.array([1, 2, 3, 4, 5])
c = np.array(row_pair)
d = np.array([row_pair, row_pair])

# ndim reports how many axes each array has; print them in order a..d.
for sample in (a, b, c, d):
    print(sample.ndim)
PANDAS
import pandas as pd

# Load the CSV and render every row; to_string() avoids the truncated
# preview that a plain print(df) gives for long frames.
df = pd.read_csv('vp.csv')
full_table = df.to_string()
print(full_table)
What Can Pandas Do?
Pandas can also delete rows that are not relevant or that contain wrong values, such as empty or NULL values. This is called cleaning the data.
import pandas as pd

# Each dictionary key becomes a column; the lists supply the row values.
cars = ["BMW", "Volvo", "Ford"]
passings = [3, 7, 2]
mydataset = {'cars': cars, 'passings': passings}

myvar = pd.DataFrame(mydataset)
print(myvar)
pandas version
import pandas as pd

# Report which pandas release is installed.
installed_version = pd.__version__
print(installed_version)
What is a Series?
import pandas as pd

# A Series is a single labelled column; with no index given, the labels
# default to 0, 1, 2, ...
a = [1, 7, 2]
myvar = pd.Series(data=a)
print(myvar)
Create Labels
With the index argument, you can name your own labels.
import pandas as pd

a = [1, 7, 2]
# The index argument replaces the default 0..n-1 labels with custom ones.
# This assignment was missing, so print(myvar) raised NameError.
myvar = pd.Series(a, index=["x", "y", "z"])
print(myvar)
You can also use a key/value object, like a dictionary, when creating a Series.
import pandas as pd

# The dictionary keys become the Series labels. `calories` was never
# defined, so pd.Series(calories) raised NameError.
calories = {"day1": 420, "day2": 380, "day3": 390}
myvar = pd.Series(calories)
print(myvar)
To select only some of the items in the dictionary, use the index argument and specify only the items you want to include in the Series.
import pandas as pd

calories = {"day1": 420, "day2": 380, "day3": 390}
# Passing index keeps only the listed keys from the dictionary. Both the
# dictionary and this assignment were missing, so print(myvar) raised NameError.
myvar = pd.Series(calories, index=["day1", "day2"])
print(myvar)
DataFrames
import pandas as pd

# Column name -> list of row values; both lists must have the same length.
data = dict(
    calories=[420, 380, 390],
    duration=[50, 40, 45],
)

myvar = pd.DataFrame(data)
print(myvar)
Iris dataset
import pandas as pd
import numpy as np

# Load the iris data and preview its first rows (head() defaults to 5).
df = pd.read_csv("iris_csv.csv")
first_rows = df.head()
print(first_rows)
10 random records (sample)
import pandas as pd
import numpy as np

# Draw 10 rows at random; the selection differs on every run because no
# random_state is given.
df = pd.read_csv("iris_csv.csv")
random_rows = df.sample(n=10)
print(random_rows)
Column title
import pandas as pd
import numpy as np

# Column titles of the iris data.
df = pd.read_csv("iris_csv.csv")
column_titles = df.columns
print(column_titles)

import pandas as pd
import numpy as np

# Shape as a (rows, columns) tuple.
df = pd.read_csv("iris_csv.csv")
dimensions = df.shape
print(dimensions)

import pandas as pd
import numpy as np

# Whole frame (pandas abbreviates long output).
df = pd.read_csv("iris_csv.csv")
print(df)

import pandas as pd
import numpy as np

# First ten rows via slicing; the end position (10) is excluded.
df = pd.read_csv("iris_csv.csv")
first_ten = df[:10]
print(first_ten)
or
import pandas as pd
import numpy as np

df = pd.read_csv("iris_csv.csv")

# Rows at positions 0-9.
first_ten = df[:10]
print(first_ten)

# Rows at positions 10-20 (the end of a slice is exclusive).
sliced_data = df[10:21]
print(sliced_data)

# Reload, then keep only two columns and show their first ten rows.
df = pd.read_csv("iris_csv.csv")
wanted_columns = ["sepalwidth", "class"]
specific_data = df[wanted_columns]
print(specific_data.head(10))
___________________
method 1
import pandas as pd
import numpy as np

df = pd.read_csv("iris_csv.csv")
df2 = df["sepalwidth"]

# Named `total` rather than `sum` so the builtin sum() is not shadowed.
total = df2.sum()

# df.shape is a (rows, columns) tuple; only the row count is needed here.
row_count, _ = df.shape

# NOTE(review): Series.sum() skips missing values but the row count does
# not, so this assumes the column has no NaN entries - confirm.
avg = total / row_count
print("the average:", avg)
method 2
import pandas as pd
import numpy as np

df = pd.read_csv("iris_csv.csv")

# Double brackets keep a one-column DataFrame, so sum() and count() below
# each return a Series labelled by column name.
df1 = df[["sepalwidth"]]

# Named `col_sum` rather than `sum`: rebinding the builtin breaks any later
# code in the same session that calls sum(...).
col_sum = df1.sum()
total = df1.count()  # number of non-missing values per column
print(total)

avg = col_sum / total
print("the average:", avg)
method 3
import pandas as pd
import numpy as np

df = pd.read_csv("iris_csv.csv")
sepal_width = df["sepalwidth"]

# Named `total` rather than `sum`: rebinding the builtin breaks any later
# code in the same session that calls sum(...).
total = sepal_width.sum()
print("sum", total)

mean = sepal_width.mean()
print("mean", mean)

median = sepal_width.median()
print("median", median)
pima dataset
import pandas as pd
import numpy as np

df = pd.read_csv("pima-indians-diabetes(2).csv")
pregnancies = df["pregnancies"]

data1 = pregnancies.sum()
print("sum of pregnancies:", data1)

data2 = pregnancies.mean()
print("Mean of pregnancies:", data2)

# This value is the median; the label previously said "Mean".
data3 = pregnancies.median()
print("Median of pregnancies:", data3)

# value_counts() lists each distinct value with how often it occurs.
data4 = pregnancies.value_counts()
print("Frequencies of pregnancies:\n", data4)

# mode() returns a Series because several values can tie for most frequent.
data5 = pregnancies.mode()
print(data5)
pima dataset
import pandas as pd
import numpy as np

df = pd.read_csv("pima-indians-diabetes.csv")
blood_pressure = df["blood pressure"]

# Shape statistics of the blood-pressure column.
d1 = blood_pressure.skew()
print("Skewness=", d1)

d2 = blood_pressure.kurtosis()
print("Kurtosis=", d2)

# Variance is computed through NumPy rather than the Series method.
d3 = np.var(blood_pressure)
print("Variance=", d3)
import pandas as pd
import numpy as np
import math

df = pd.read_csv("pima-indians-diabetes.csv")

# Drop missing values up front so the manual loops below work on the same
# observations that the NaN-skipping pandas methods use.
pregnancies = df["pregnancies"].dropna()

data1 = pregnancies.sum()
print("Sum of pregnancies:", data1)
data2 = pregnancies.mean()
print("Mean of pregnancies:", data2)
data3 = pregnancies.median()
print("Median of pregnancies:", data3)
data4 = pregnancies.mode()
print("Mode of pregnancies:", data4)

# np.var defaults to ddof=0, i.e. the population variance.
data5 = np.var(pregnancies)
print("Variance of pregnancies:", data5)

# Manual population variance: mean squared deviation from the mean.
data6 = pregnancies
n = len(data6)
var = sum((x - data2) ** 2 for x in data6) / n
print("Variance of pregnancies (manual calculation):", var)

# np.std also defaults to ddof=0, so it pairs with the manual value below.
data7 = np.std(pregnancies)
print("standard dev usin fun ", data7)
data8 = var ** 0.5
print("satndard dev manual cal", data8)

data9 = pregnancies.skew()
print("skewness using fun", data9)

# Series.skew() returns the bias-adjusted Fisher-Pearson coefficient:
#     G1 = sqrt(n * (n - 1)) / (n - 2) * m3 / m2 ** 1.5
# where m2 and m3 are the second and third central moments (divided by n).
# The earlier shortcut 3 * (mean - median) / std is Pearson's second
# coefficient, a different statistic that does not match skew().
# Requires at least three observations and a non-zero variance.
m3 = sum((x - data2) ** 3 for x in data6) / n
data10 = (math.sqrt(n * (n - 1)) / (n - 2)) * m3 / var ** 1.5
print("skewness manual cal", data10)
# data9 and data10 now use the same formula, so they should agree up to
# floating-point rounding. Likewise data5/var and data7/data8 should agree.
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("pima.csv")

# Insulin goes on the horizontal axis and blood pressure on the vertical.
x, y = df["insulin"], df["bp"]
plt.scatter(x, y)
plt.show()
import pandas as pd
import statistics
import numpy as np

df = pd.read_csv("pima.csv")
df1 = df["bp"]

# Population variance by hand: average squared deviation from the mean.
length = len(df1)
mean = sum(df1) / length
squared_deviations = ((i - mean) ** 2 for i in df1)
ans = sum(squared_deviations) / length
print(f"The variance of BP is : {ans!s}")
STANDARD DEVIATION
import pandas as pd
import statistics
import numpy as np

df = pd.read_csv("pima.csv")

# statistics.stdev is the *sample* standard deviation (divides by n - 1).
bp_stdev = statistics.stdev(df["bp"])
print("Standard Deviation of the BP:", bp_stdev)
SKEWNESS
KURTOSIS
import pandas as pd
import statistics
import numpy as np

df = pd.read_csv("pima.csv")

# Show the raw column first, then its kurtosis as computed by pandas.
bp = df["bp"]
print(bp)
print("kurtosis", bp.kurtosis())