Pandas 🐼

Pandas is built on top of NumPy and Matplotlib

In pandas, rectangular data is represented as a DataFrame object

Quick exploration

dataFrame.head() # Returns first few rows of the dataset
dataFrame.info() # Display the names of columns, data types, and nulls
dataFrame.shape # Returns number of rows and columns
dataFrame.describe() # Returns summary statistics

Data frames components:

dataFrame.to_numpy() # Returns data values as a 2-dimensional NumPy array
dataFrame.columns # Returns column names
dataFrame.index # Returns row numbers or row names

Sorting

dataFrame.sort_values('column_name') # Sorting the dataframe by specific column
dataFrame.sort_values('column_name', ascending = False) # Sorting in descending order
dataFrame.sort_values(['column_name_1', 'column_name_2']) # Sorting by multiple columns
dataFrame.sort_values(['column_name_1', 'column_name_2'], ascending = [True, False]) # Sorting by multiple columns and their order

Subsetting

dataFrame['column_name'] # Displaying only one column
dataFrame[['column_1', 'column_2']] # Displaying multiple columns
dataFrame['column_name'] > filter_condition # Subsetting based on a condition
# Example
dogs['weight_kg'] > 10
# Output (three dogs weigh more than 10 kg, matching the three rows below)
# -- False
# -- True
# -- True
# -- True
dataFrame[dataFrame['column_name'] > filter_condition] # Displaying subsets with logical condition
# Example
dogs[ dogs['weight_kg'] > 10 ]
# Output
# -- name | breed | color | height_cm | weight_kg | dob
# -- Bell | chihu | black | 50 | 11 | April, 2025
# -- Ball | chihu | brown | 50 | 12 | April, 2023
# -- Bill | chihu | white | 50 | 13 | April, 2024
# Subsetting based on multiple conditions
is_mexican = dataFrame['nationality'] == 'mexican'
is_female = dataFrame['gender'] == 'female'
dataFrame[is_mexican & is_female]
# or
dataFrame[ (dataFrame['nationality'] == 'mexican') & (dataFrame['gender'] == 'female') ]
#Filter on multiple values
is_mexican_or_american = dataFrame['nationality'].isin(['mexican', 'american'])
dataFrame[is_mexican_or_american]

Adding new Columns

# Adding a new column to a dataframe
dataFrame['new_column'] = dataFrame['old_column'] - 10

Summary Statistics

dataFrame['column'].mean() # Mean - where the "center" of your data is
dataFrame['column'].mode() # Mode - the value that appears most frequently
dataFrame['column'].median() # Median - middle value when data is ordered
dataFrame['column'].min()
dataFrame['column'].max()
dataFrame['column'].var() # Returns the variance of the data, which measures the spread of numbers in a dataset.
dataFrame['column'].std() # Returns the standard deviation, which is the square root of the variance
dataFrame['column'].sum()
dataFrame['column'].quantile(q) # Returns the value at quantile q (between 0 and 1); defaults to 0.5, i.e. the median

The .agg() method

def pct30(column):
    return column.quantile(0.3)
dataFrame['column'].agg(pct30)
# More than 1 column
dataFrame[["column_1", "column_2"]].agg(pct30)
def pct30(column):
    return column.quantile(0.3)
def pct40(column):
    return column.quantile(0.4)
# Multiple functions must be passed as a list
dataFrame["column_1"].agg([pct30, pct40])
dataFrame[["column_1", "column_2"]].agg([pct30, 'median'])

Cumulative Sum

dataFrame['weight']
# Output
# 0 24
# 1 24
# 2 24
# 3 17
# 4 29
dataFrame['weight'].cumsum()
# Output
# 0 24
# 1 48
# 2 72
# 3 89
# 4 118

Cumulative Statistics

dataFrame['weight'].cummax()
dataFrame['weight'].cummin()
dataFrame['weight'].cumprod()

Dropping Duplicates

dataFrame.drop_duplicates(subset='column_name')
dataFrame.drop_duplicates(subset=['column_name_1', 'column_name_2'])

Counting values

dataFrame['column_name'].value_counts()
dataFrame['column_name'].value_counts(sort=True)
# To get proportions -> %
dataFrame['column_name'].value_counts(normalize=True)

Grouped Summaries

dataFrame.groupby('col_name')['col_name_agg'].mean()
dataFrame.groupby('col_name')['col_name_agg'].agg([min, max, sum])
dataFrame.groupby(['col_name', 'col_name_2'])['col_name_agg'].mean()
dataFrame.groupby(['col_name', 'col_name_2'])[['col_name_agg', 'col_name_agg_2']].mean()

Pivot Tables

dataFrame.pivot_table(values='column_to_summarize', index='column_to_group_by') # By default, we get the mean
dataFrame.pivot_table(values='column_to_summarize', index='column_to_group_by', aggfunc="median")
dataFrame.pivot_table(values='column_to_summarize', index='column_to_group_by', aggfunc=["median", "mean"])
# To group by multiple columns
dataFrame.pivot_table(values='column_to_summarize', index='column_to_group_by', columns='new_column_for_group_by')
# To avoid missing values
dataFrame.pivot_table(values='column_to_summarize', index='column_to_group_by', columns='new_column_for_group_by', fill_value=0)
# To include row and column grand totals (an "All" row/column)
dataFrame.pivot_table(values='column_to_summarize', index='column_to_group_by', columns='new_column_for_group_by', fill_value=0, margins=True)

Slicing and Indexing Data frames

Setting an index

# Output
# 0 24 A
# 1 24 B
# 2 24 C
# 3 17 D
# 4 29 E
dataFrame_new = dataFrame.set_index('column_name')
# Output
# A 24
# B 24
# C 24
# D 17
# E 29
# Multilevel or Hierarchical Indexes
dataFrame_new = dataFrame.set_index(['column_name', 'column_name_2'])

Removing an index

dataFrame.reset_index()
# 0 24 A
# 1 24 B
# 2 24 C
# 3 17 D
# 4 29 E
dataFrame.reset_index(drop=True)
# 0 24
# 1 24
# 2 24
# 3 17
# 4 29

Why to use indexes?

# Without Indexes
dataFrame[dataFrame['column_name'].isin(['value_A', 'value_B'])]
# With indexes
dataFrame.loc[['value_A', 'value_B']]
# For multi-level indexes, you pass a list to subset the outer level
dataFrame.loc[['value_A', 'value_B']]
# For inner levels, use a list of tuples
dataFrame.loc[[("value_A", "sub_value_A"), ("value_B", "sub_value_B")]]

Sort by index values

dataFrame.sort_index()
dataFrame.sort_index(level=['column_name', 'column_name_2'], ascending=[True, False]) # level takes index level names, not index values
import pandas as pd

# Coerce the raw date column into proper datetime64 values so the
# .dt accessor is available.
df['date'] = pd.to_datetime(df['date'])

# Label every row with its calendar month (a Period such as 2025-04),
# so two dates in the same month compare equal.
df['year_month'] = df['date'].dt.to_period('M')

# How many distinct months does each department appear in?
result = df['year_month'].groupby(df['department']).nunique()
print(result)