Unit3_3) Pandas.ipynb - Colab
Unit3_3) Pandas.ipynb - Colab
import pandas as pd
import numpy as np
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
a 1
b 2
c 3
dtype: int64
# Positional access: .iloc[1] returns the second element, whatever the index labels are.
print(s.iloc[1])
# Label-based access: .loc['b'] returns the element whose index label is 'b'.
s.loc['b']
NaN vs None
DataFrame
A two-dimensional, size-mutable, potentially heterogeneous tabular data structure.
# Build a DataFrame from a mapping of column label -> list of row values.
import pandas as pd

# Each keyword becomes one column; all lists must have the same length.
data = dict(
    A=[1, 2, 3, 4],
    B=[5, 6, 7, 8],
    C=[9, 10, 11, 12],
)
df = pd.DataFrame(data)
print(df)
A B C
0 1 5 9
1 2 6 10
2 3 7 11
3 4 8 12
A B C
0 1 5 9
1 2 6 10
2 3 7 11
A B C
3 4 8 12
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 4 non-null int64
1 B 4 non-null int64
2 C 4 non-null int64
dtypes: int64(3)
memory usage: 224.0 bytes
None
A B C
count 4.000000 4.000000 4.000000
mean 2.500000 6.500000 10.500000
std 1.290994 1.290994 1.290994
min 1.000000 5.000000 9.000000
25% 1.750000 5.750000 9.750000
50% 2.500000 6.500000 10.500000
75% 3.250000 7.250000 11.250000
max 4.000000 8.000000 12.000000
[[ 1 5 9]
[ 2 6 10]
[ 3 7 11]
[ 4 8 12]]
0 1
1 2
2 3
3 4
Name: A, dtype: int64
A B
0 1 5
1 2 6
2 3 7
3 4 8
df
A B C
0 1 5 9
1 2 6 10
2 3 7 11
3 4 8 12
A 2
B 6
C 10
Name: 1, dtype: int64
A 1
B 5
C 9
Name: 0, dtype: int64
print(df)
A B C
0 1 5 9
1 2 6 10
2 3 7 11
3 4 8 12
A C
1 2 10
2 3 11
A B
0 1 5
1 2 6
import pandas as pd

# Pair each column label with its values, then turn the pairs into the
# column -> values mapping that the DataFrame constructor expects.
data = dict(zip(
    ['A', 'B', 'C'],
    [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
))
df = pd.DataFrame(data)
print(df)
A B C
0 1 5 9
1 2 6 10
2 3 7 11
3 4 8 12
# Boolean mask: True on the rows where column 'A' is greater than 2.
a = df['A'].gt(2)
# Keep only the rows selected by the mask.
print(df.loc[a])
A B C
2 3 7 11
3 4 8 12
# Boolean indexing: values of column 'B' on the rows where column 'A' exceeds 2.
# One .loc[row_mask, column] lookup selects rows and column together; this is the
# form pandas recommends over chaining two [] lookups, which may operate on an
# intermediate copy.
print(df.loc[df['A'] > 2, 'B'])
2 7
3 8
Name: B, dtype: int64
# The comparison alone yields a boolean Series (one True/False per row);
# this is the mask that row filtering is built on.
print(df['A'] > 2)
0 False
1 False
2 True
3 True
Name: A, dtype: bool
A B C D
0 1 5 9 6
1 2 6 10 8
2 3 7 11 10
3 4 8 12 12
A B C D D1
0 1 5 9 6 54
1 2 6 10 8 65
2 3 7 11 10 8
3 4 8 12 12 56
A B C D D1
0 10 5 9 6 54
1 2 6 10 8 65
2 3 7 11 10 8
3 4 8 12 12 56
A B C D D1
0 10 20 9 6 54
1 2 6 10 8 65
2 3 7 11 10 8
3 4 8 12 12 56
A B C D D1
0 10 99 9 6 54
1 2 6 10 8 65
2 3 99 11 10 8
3 4 99 12 12 56
df_with_nan
A B C D D1
0 10 99 9 6 54
1 2 6 10 8 65
2 3 99 11 10 8
3 4 99 12 12 56
# Overwrite the cell at row position 0, column position 1 with a missing value.
# NaN is a float, so the column is displayed as floats (6.0, 99.0) afterwards.
df_with_nan.iloc[0, 1] = np.nan
# Bare expression: the notebook renders the modified frame.
df_with_nan
A B C D D1
0 10 NaN 9 6 54
1 2 6.0 10 8 65
2 3 99.0 11 10 8
3 4 99.0 12 12 56
A B C D D1
0 10 0.0 9 6 54
1 2 6.0 10 8 65
2 3 99.0 11 10 8
3 4 99.0 12 12 56
df_with_nan
A B C D D1
0 10 NaN 9 6 54
1 2 6.0 10 8 65
2 3 99.0 11 10 8
3 4 99.0 12 12 56
A B C D D1
1 2 6.0 10 8 65
2 3 99.0 11 10 8
3 4 99.0 12 12 56
# Element-wise missing-value mask: True wherever a cell is NaN, False elsewhere.
print(df_with_nan.isna())
A B C D D1
0 False True False False False
1 False False False False False
2 False False False False False
3 False False False False False
A B C D D1
0 10 NaN 9 6 54
Handling Duplicates
A B
0 foo 1
1 bar 1
2 foo 2
3 bar 2
4 foo 1
5 bar 1
Operations
df
A B C D D1
0 10 99 9 6 54
1 2 6 10 8 65
2 3 99 11 10 8
3 4 99 12 12 56
A 4.75
B 75.75
C 10.50
D 9.00
D1 45.75
dtype: float64
0 35.6
1 18.2
2 26.2
3 36.6
dtype: float64
df
A B C D D1
0 10 99 9 6 54
1 2 6 10 8 65
2 3 99 11 10 8
3 4 99 12 12 56
import numpy as np
import pandas as pd
def value_difference(my_list):
    """Return the spread (largest value minus smallest value) of *my_list*.

    Args:
        my_list: Any non-empty array-like of numbers, e.g. a list, a NumPy
            array, or a pandas Series such as the row or column that
            ``DataFrame.apply`` passes in.

    Returns:
        The difference between the maximum and the minimum element.

    Raises:
        ValueError: If *my_list* is empty (NumPy cannot reduce a zero-size
            array with max/min).
    """
    # asarray reuses the existing buffer when the input is already an ndarray.
    values = np.asarray(my_list)
    return values.max() - values.min()
A B C D D1
0 10 99 9 6 54
1 2 6 10 8 65
2 3 99 11 10 8
3 4 99 12 12 56
A B C D D1
0 10 99 9 6 54
1 12 105 19 14 119
2 15 204 30 24 127
3 19 303 42 36 183
0 93
1 63
2 96
3 95
dtype: int64
A 8
B 93
C 3
D 6
D1 57
dtype: int64
A B
0 foo one
1 bar two
2 baz three
0 FOO
1 BAR
2 BAZ
Name: A, dtype: object
0 3
1 3
2 5
Name: B, dtype: int64
0 True
1 False
2 False
Name: A, dtype: bool
key value
0 A 1
1 B 2
2 C 3
key value
0 B 4
1 C 5
2 D 6
# Merging DataFrames on their shared 'key' column
print(df1.merge(df2, how='inner', on='key'))  # Inner join: keys present in both frames
print()
print(df1.merge(df2, how='left', on='key'))  # Left join: every row of df1 is kept
print()
print(df1.merge(df2, how='outer', on='key'))  # Outer join: keys from either frame
# Concatenating DataFrames
print(pd.concat([df1, df2], axis='index'))  # Stack df2's rows underneath df1's
print()
print(pd.concat([df1, df2], axis='columns'))  # Place the frames side by side
key value
0 A 1
1 B 2
2 C 3
0 B 4
1 C 5
2 D 6
key data
0 A 0
1 B 1
2 A 2
3 B 3
data
key
A 1.0
B 2.0
data
key
A 1.0 2 2
B 2.0 4 2
A B C D
C large small
A B
bar one 4.0 3.0
two NaN 3.0
foo one 2.0 1.0
two 2.0 NaN
Plotting
import matplotlib.pyplot as plt
# Load the PlayTennis sample dataset from its raw GitHub gist.
# Use the direct gist.githubusercontent.com address (not a proxy-rewritten URL)
# so the download does not depend on a third-party relay being reachable.
url = 'https://gist.githubusercontent.com/DiogoRibeiro7/c6590d0cf119e87c39e31c21a9c0f3a8/raw/4a8e3da267a0c1f0d650901d8295a5153bde8b21/PlayTennis.csv'
df = pd.read_csv(url)
# Bare expression: the notebook renders the loaded frame.
df
# Build a DataFrame from `data`, which is not defined in this part of the file.
# NOTE(review): presumably a dict with Name/Age/City columns, judging by the
# output that follows - confirm against the defining cell.
df = pd.DataFrame(data)
# print() separates its arguments with a space, so the first line of the
# table is shifted right by one character relative to the other rows.
print("DataFrame from dictionary:\n", df)
Single column:
0 Alice
1 Bob
2 Charlie
3 David
Name: Name, dtype: object
Multiple columns:
Name City
0 Alice New York
1 Bob Los Angeles
2 Charlie Chicago
3 David Houston
Single row:
Name Bob
Age 27
City Los Angeles
Name: 1, dtype: object
Multiple rows:
Name Age City
1 Bob 27 Los Angeles
2 Charlie 22 Chicago
Rows with age > 25:
Name Age City
1 Bob 27 Los Angeles
3 David 32 Houston
Updated DataFrame:
Name Age City Salary
0 Alice 25 New York 50000
1 Bob 27 Los Angeles 60000
2 Charlie 22 Chicago 45000
3 David 32 Houston 80000
DataFrame after updating values:
Name Age City Salary
0 Alice 25 NYC 50000
1 Bob 27 Los Angeles 60000
2 Charlie 22 Chicago 45000
3 David 32 Houston 80000
# Delete a column: axis=1 makes drop() look the label up among the columns.
# drop() returns a new frame, so the result is bound back to df.
df = df.drop(['Salary'], axis=1)
print("DataFrame after deleting a column:\n", df)
# Delete a row: axis=0 makes drop() look the label up in the index.
df = df.drop(1, axis=0)
print("DataFrame after deleting a row:\n", df)
Sorted by Age:
Name Age City
2 Charlie 22 Chicago
0 Alice 25 NYC
3 David 32 Houston
Sorted by City and Age:
Name Age City
2 Charlie 22 Chicago
3 David 32 Houston
0 Alice 25 NYC
# Total of the 'Age' column
sum_age = df['Age'].agg('sum')
print("Sum of Ages:", sum_age)
# Middle value of the 'Age' column
median_age = df['Age'].agg('median')
print("Median Age:", median_age)
# Inspect the data: first five rows, then per-column summary statistics.
print("First few rows:\n", df.head(n=5))
print("Summary statistics:\n", df.describe())
# Update values: one cell by label, then every row whose City is 'New York'.
df.at[0, 'Age'] = 25
df.loc[df['City'].eq('New York'), 'City'] = 'NYC'
print("Updated DataFrame:\n", df)
# Sort into a new frame, ordered by Age ascending; df itself is left as is.
df_sorted = df.sort_values('Age', ascending=True)
print("Sorted by Age:\n", df_sorted)
From CSV
# Location of the CSV file; /content is presumably the Colab runtime's
# working directory, so the file must be uploaded there first.
file_path = r"/content/uber-raw-data-apr14.csv"
# Parse the CSV into a DataFrame using read_csv's default settings.
df = pd.read_csv(file_path)
# Preview the first five rows.
print(df.head())
From Excel
# Earlier local Windows location, left commented out for reference:
# excel_file_path = r"C:\Users\Nileaysh\Downloads\BCADS17.xlsx"
# Location of the workbook; /content is presumably the Colab runtime's
# working directory, so the file must be uploaded there first.
file_path = r"/content/200c2e152dbe4ac7b301e8ae7a6762ce.xlsx"
# With no sheet_name given, read_excel loads the first sheet of the workbook.
df = pd.read_excel(file_path)
# Preview the first five rows.
print(df.head())
table column Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 \
0 movie id NaN NaN NaN NaN
1 movie title NaN NaN NaN NaN
2 movie year NaN NaN NaN NaN
3 movie date_published NaN NaN NaN NaN
4 movie duration NaN NaN NaN NaN
Unnamed: 12
0 ratings
1 * movie_id
2 avg_rating
3 total_votes
4 median_rating