Pandas is built on top of NumPy, and integrates well with Matplotlib for plotting
In pandas, rectangular data is represented as a DataFrame object
Quick exploration
dataFrame.head() # Returns first few rows of the dataset
dataFrame.info() # Display the names of columns, data types, and nulls
dataFrame.shape # Returns number of rows and columns
dataFrame.describe() # Returns summary statistics
Data frame components:
dataFrame.to_numpy() # Returns data values as a 2-dimensional NumPy array
dataFrame.columns # Returns column names
dataFrame.index # Returns row numbers or row names
Sorting
dataFrame.sort_values('column_name') # Sorting the dataframe by a specific column
dataFrame.sort_values('column_name', ascending=False) # Sorting in descending order
dataFrame.sort_values(['column_name_1', 'column_name_2']) # Sorting by multiple columns
dataFrame.sort_values(['column_name_1', 'column_name_2'], ascending=[True, False]) # Sorting by multiple columns, each with its own order
Subsetting
dataFrame['column_name'] # Displaying only one column
dataFrame[['column_1', 'column_2']] # Displaying multiple columns
dataFrame['column_name'] > filter_condition # Subsetting based on a condition
# Example
dogs['weight_kg'] > 10
# Output
# -- False
# -- True
# -- True
# -- False
dataFrame[dataFrame['column_name'] > filter_condition] # Displaying subsets with a logical condition
# Example
dogs[ dogs['weight_kg'] > 10 ]
# Output
# -- name | breed | color | height_cm | weight_kg | dob
# -- Bell | chihu | black | 50 | 11 | April, 2025
# -- Ball | chihu | brown | 50 | 12 | April, 2023
# -- Bill | chihu | white | 50 | 13 | April, 2024
# Subsetting based on multiple conditions
is_mexican = dataFrame['nationality'] == 'mexican'
is_female = dataFrame['gender'] == 'female'
dataFrame[is_mexican & is_female]
# or
dataFrame[ (dataFrame['nationality'] == 'mexican') & (dataFrame['gender'] == 'female') ]
# Filter on multiple values
is_mexican_or_american = dataFrame['nationality'].isin(['mexican', 'american'])
dataFrame[is_mexican_or_american]
Adding new Columns
# Adding a new column to a dataframe
dataFrame['new_column'] = dataFrame['old_column'] - 10
Summary Statistics
dataFrame['column'].mean() # Mean - where the "center" of your data is
dataFrame['column'].mode() # Mode - value that appears most frequently
dataFrame['column'].median() # Median - middle value when data is ordered
dataFrame['column'].min()
dataFrame['column'].max()
dataFrame['column'].var() # Returns the variance of the data, which measures the spread of numbers in a dataset
dataFrame['column'].std() # Returns the standard deviation, which is the square root of the variance
dataFrame['column'].sum()
dataFrame['column'].quantile() # Returns the value at a specific quantile (percentile); defaults to 0.5, the median
The .agg() method
def pct30(column):
    return column.quantile(0.3)

dataFrame['column'].agg(pct30)
# More than 1 column
dataFrame[["column_1", "column_2"]].agg(pct30)
def pct30(column):
    return column.quantile(0.3)

def pct40(column):
    return column.quantile(0.4)

# Multiple aggregations must be passed as a list
dataFrame["column_1"].agg([pct30, pct40])
dataFrame[["column_1", "column_2"]].agg([pct30, 'median'])
Cumulative Sum
dataFrame['weight']
# Output
# 0 24
# 1 24
# 2 24
# 3 17
# 4 29
dataFrame['weight'].cumsum()
# Output
# 0 24
# 1 48
# 2 72
# 3 89
# 4 118
Cumulative Statistics
dataFrame['weight'].cummax()
dataFrame['weight'].cummin()
dataFrame['weight'].cumprod()
Dropping Duplicates
dataFrame.drop_duplicates(subset='column_name')
dataFrame.drop_duplicates(subset=['column_name_1', 'column_name_2'])
Counting values
dataFrame['column_name'].value_counts()
dataFrame['column_name'].value_counts(sort=True)
# To get proportions -> %
dataFrame['column_name'].value_counts(normalize=True)
Grouped Summaries
dataFrame.groupby('col_name')['col_name_agg'].mean()
# Pass aggregation names as strings; bare builtins (min, max, sum) are deprecated here
dataFrame.groupby('col_name')['col_name_agg'].agg(['min', 'max', 'sum'])
dataFrame.groupby(['col_name', 'col_name_2'])['col_name_agg'].mean()
dataFrame.groupby(['col_name', 'col_name_2'])[['col_name_agg', 'col_name_agg_2']].mean()
Pivot Tables
dataFrame.pivot_table(values='column_to_summarize', index='column_to_group_by') # By default, we get the mean
dataFrame.pivot_table(values='column_to_summarize', index='column_to_group_by', aggfunc="median")
dataFrame.pivot_table(values='column_to_summarize', index='column_to_group_by', aggfunc=["median", "mean"])
# To group by multiple columns
dataFrame.pivot_table(values='column_to_summarize', index='column_to_group_by', columns='new_column_for_group_by')
# To avoid missing values
dataFrame.pivot_table(values='column_to_summarize', index='column_to_group_by', columns='new_column_for_group_by', fill_value=0)
# To include an "All" row and column with overall summaries
dataFrame.pivot_table(values='column_to_summarize', index='column_to_group_by', columns='new_column_for_group_by', fill_value=0, margins=True)
Slicing and Indexing Data frames
Setting an index
# Before:
# 0 24 A
# 1 24 B
# 2 24 C
# 3 17 D
# 4 29 E
dataFrame_new = dataFrame.set_index('column_name')
# After:
# A 24
# B 24
# C 24
# D 17
# E 29
# Multilevel or Hierarchical Indexes
dataFrame_new = dataFrame.set_index(['column_name', 'column_name_2'])
Removing an index
dataFrame.reset_index() # Moves the index back into a regular column
# 0 24 A
# 1 24 B
# 2 24 C
# 3 17 D
# 4 29 E
dataFrame.reset_index(drop=True) # Discards the index entirely
# 0 24
# 1 24
# 2 24
# 3 17
# 4 29
Why to use indexes?
# Without indexes — note isin() takes a single list-like argument
dataFrame[dataFrame['column_name'].isin(['value_A', 'value_B'])]
# With indexes
dataFrame.loc[['value_A', 'value_B']]
# For multi-level indexes, you pass a list to subset the outer level
dataFrame.loc[['value_A', 'value_B']]
# For inner levels, use a list of tuples
dataFrame.loc[[("value_A", "sub_value_A"), ("value_B", "sub_value_B")]]
Sort by index values
dataFrame.sort_index()
# level takes index level names (or positions), not index values
dataFrame.sort_index(level=['column_name', 'column_name_2'], ascending=[True, False])
import pandas as pd

# make sure date is datetime
df['date'] = pd.to_datetime(df['date'])

# extract year-month
df['year_month'] = df['date'].dt.to_period('M')

# count unique months per department
result = df.groupby('department')['year_month'].nunique()
print(result)