Pandas Cheat Sheet
Pandas Cheat Sheet
import pandas as pd
#Python
/sumitkhanna
Python
Basic Operations
In [ ]: # Viewing the first few rows of the DataFrame
df.head() # Default shows the first 5 rows, but you can specify the number of rows as an argument
Out[ ]: Name Age City Salary
Out[ ]: (5, 4)
Selecting Data
In [ ]: # Selecting a single column
df['Name'] # Returns a Series
Out[ ]: 0 Alice
1 Bob
2 Charlie
3 David
4 Edward
Name: Name, dtype: object
In [ ]: # Selecting multiple columns
df[['Name', 'City']] # Returns a DataFrame with specified columns
2 Charlie Chicago
3 David Houston
4 Edward Phoenix
In [ ]: # Conditional selection
df[df['Age'] > 25] # Returns a DataFrame where the age is greater than 25
In [ ]: # Dropping a column
df.drop('Experience', axis=1, inplace=True) # Drops the 'Experience' column
df
Out[ ]: Name Age City Salary
In [ ]: # Renaming columns
df.rename(columns={'Name': 'Employee Name', 'City': 'Location'}, inplace=True) # Renames columns
df
Group Operations
In [ ]: # Grouping data by a column and calculating the mean
grouped = df.groupby('Location')['Age'].mean() # Groups by 'Location' and calculates the mean age for each group
grouped
Out[ ]: Location
Chicago 22.0
Houston 32.0
Los Angeles 27.0
New York 24.0
Phoenix 29.0
Name: Age, dtype: float64
Searching Data
In [ ]: # Searching for specific values
df[df['Employee Name'].str.contains('A')] # Returns rows where 'Employee Name' contains an uppercase 'A' (str.contains is case-sensitive by default; pass case=False to ignore case)
Statistical Functions
In [ ]: # Calculating the mean of a column
df['Salary'].mean() # Returns the mean of the 'Salary' column
Out[ ]: 81000.0
Out[ ]: 80000.0
Out[ ]: Location
New York 1
Los Angeles 1
Chicago 1
Houston 1
Phoenix 1
Name: count, dtype: int64
Applying Functions
In [ ]: # Applying a function to a column
df['Salary'].apply(lambda x: x / 1000) # Converts 'Salary' from dollars to thousands
Out[ ]: 0 75.0
1 85.0
2 70.0
3 95.0
4 80.0
Name: Salary, dtype: float64
Out[ ]: 0 48
1 54
2 44
3 64
4 58
dtype: int64
Merging and Joining Data
In [ ]: # Creating another sample DataFrame for merging
data2 = {'Name': ['Alice', 'Bob', 'Charlie', 'Fiona'],
'Department': ['HR', 'Engineering', 'Marketing', 'Finance']}
df2 = pd.DataFrame(data2)
df2
0 Alice HR
1 Bob Engineering
2 Charlie Marketing
3 Fiona Finance
In [ ]: # Concatenating DataFrames
concat_df = pd.concat([df, df2], axis=1) # Concatenates DataFrames side by side (along columns); rows are aligned on the index, not on the name columns
concat_df
Out[ ]: Employee Name Age Location Salary Name Department
Advanced Querying
In [ ]: # Querying using a boolean mask
mask = df['Age'] > 25
df[mask] # Returns rows where 'Age' is greater than 25
Out[ ]: Employee Name Age Location Salary
Advanced Statistics
In [ ]: # Calculating the cumulative sum of a column
df['Cumulative Salary'] = df['Salary'].cumsum() # Adds a column with the cumulative sum of 'Salary'
df
Out[ ]: Employee Name Age Location Salary Cumulative Salary Rolling Mean Salary Expanding Mean Salary
Advanced Grouping
In [ ]: # Applying multiple aggregate functions
grouped_agg = df.groupby('Location').agg({'Age': ['mean', 'max'], 'Salary': ['sum', 'mean']}) # Groups by 'Location' and applies multiple aggregate functions to each listed column
grouped_agg
Out[ ]: Age Salary
Location
Out[ ]: Location
Chicago 0
Houston 0
Los Angeles 0
New York 0
Phoenix 0
Name: Salary, dtype: int64
Advanced Filtering
In [ ]: # Filtering using a custom function
def filter_function(x):
    """Return whether the mean of the 'Age' column of ``x`` is greater than 25.

    Used as the predicate passed to ``groupby(...).filter``: ``x`` is the
    sub-DataFrame for one group, and a truthy result keeps that group's rows.
    """
    return x['Age'].mean() > 25
filtered = df.groupby('Location').filter(filter_function) # Keeps only the rows belonging to 'Location' groups whose mean 'Age' is greater than 25
filtered
Out[ ]: Employee Name Age Location Salary Cumulative Salary Rolling Mean Salary Expanding Mean Salary
Out[ ]: Employee Name Age Location Salary Cumulative Salary Rolling Mean Salary Expanding Mean Salary
0 0 0 0 0 0 0.0 0.0
2 0 0 0 0 0 0.0 0.0
DataFrame Styling
In [ ]: # Highlighting maximum values in a DataFrame
df.style.highlight_max(axis=0) # Highlights the maximum values in each column
Out[ ]: Employee Name Age Location Salary Cumulative Salary Rolling Mean Salary Expanding Mean Salary
Out[ ]: Employee Name Age Location Salary Cumulative Salary Rolling Mean Salary Expanding Mean Salary