Python for Data Analysis
conda install -y numpy pandas jupyter matplotlib # /ˈnʌmpaɪ/ /ˈpændəz/ /ˈmæt.plɒt.lɪb/
# import conventions
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
np.__version__ # '2.2.4'
pd.__version__ # '2.2.3'
# pd.options.display.max_columns = 20
# pd.options.display.max_colwidth = 80
# pd.options.display.max_rows = 20
# np.set_printoptions(precision=4, suppress=True)
# pd.reset_option('all')
# pd.reset_option('display.max_rows')
- 1. NumPy
- 2. Pandas
- 2.1. Series, DataFrame, and Index
- 2.2. Essential Functionality
- 2.2.1. Reindexing
- 2.2.2. Dropping Entries from an Axis
- 2.2.3. Indexing, Selection, and Filtering
- 2.2.4. Arithmetic and Data Alignment
- 2.2.5. Function Application and Mapping
- 2.2.6. Sorting and Ranking
- 2.2.7. Axis Indexes with Duplicate Labels
- 2.2.8. Unique Values, Value Counts, and Membership
- 2.2.9. NumPy Arrays
- 2.3. Reading and Writing
- 3. Data Cleaning and Preparation
- 3.1. Handling Missing Data
- 3.2. Removing Duplicates
- 3.3. Transforming Data Using a Function or Mapping
- 3.4. Replacing Values
- 3.5. Renaming Axis Indexes
- 3.6. Discretization and Binning
- 3.7. Detecting and Filtering Outliers
- 3.8. Permutation and Random Sampling
- 3.9. Computing Indicator/Dummy Variables: One-Hot Encoding
- 3.10. String Functions in pandas
- 3.11. Categorical Encoding
- 4. Data Wrangling: Join, Combine, and Reshape
- 5. Plotting and Visualization
- 6. Data Aggregation and Group Operations
- 7. Time Series
- References
1. NumPy
NumPy (Numerical Python, /ˈnʌmpaɪ/) is a library for the Python programming language that is widely used in science and engineering. It provides multidimensional array data structures, such as the homogeneous N-dimensional array (ndarray), and a large library of functions that operate efficiently on these data structures.
import numpy as np
a = np.array([[1, 2, 3], [4, 5, 6]])
a.shape # (2, 3)
# Integer types (signed and unsigned)
int8, uint8 # i1, u1: 8-bit (1 byte) integers
int16, uint16 # i2, u2: 16-bit integers
int32, uint32 # i4, u4: 32-bit integers
int64, uint64 # i8, u8: 64-bit integers
# Floating-point types
float16 # f2: Half-precision (16-bit) floating point
float32 # f4 or f: Single-precision (32-bit) floating point; compatible with C float
float64 # f8 or d: Double-precision (64-bit) floating point; compatible with C double and Python float
float128 # f16 or g: Extended-precision (128-bit) floating point
# Complex number types
complex64 # c8: Complex number with two 32-bit floats (real and imaginary parts)
complex128 # c16: Complex number with two 64-bit floats
complex256 # c32: Complex number with two 128-bit floats
# Boolean type
bool # ?: Boolean values (True or False)
# Object type
object # O: Python object type; can store any Python object
# String and byte types
bytes_ # S: Fixed-length raw byte arrays (e.g., ASCII strings, 1 byte per character).
# Use 'S10' for a byte string of length 10.
# Each string is padded or truncated to fit the specified length.
str_ # U: Fixed-length Unicode strings using UTF-32 encoding (4 bytes per character).
# Use 'U10' for a Unicode string of length 10.
# Each string is padded or truncated to fit the specified length.
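The one-character type codes above can be passed anywhere a dtype is expected (array creation, astype); a minimal sketch:
arr = np.array([1, 2, 3], dtype=np.float64) # equivalently dtype='f8'
arr.dtype
# dtype('float64')
arr.astype('i4') # cast to 32-bit integers (returns a new array)
# array([1, 2, 3], dtype=int32)
np.array(['ab', 'cdef'], dtype='U3') # fixed-length Unicode, truncated to 3 characters
# array(['ab', 'cde'], dtype='<U3')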
1.1. arrays
NumPy’s main object is the homogeneous multidimensional array (its dimensions are called axes), known as ndarray: a table of elements (usually numbers), all of the same type, indexed by a tuple of non-negative integers.
Don’t think of NumPy’s N-d arrays solely as spatial points or coordinates; instead, picture them as structured data containers, like boxes within boxes.
# create a matrix with 2 rows and 3 columns
a = np.arange(6).reshape(2, 3)
# array([[0, 1, 2],
# [3, 4, 5]])
# the number of axes (dimensions) of the array
a.ndim # 2
# the dimensions of the array, a tuple of integers indicating the size of the array in each dimension
a.shape # (2, 3)
a.ndim == len(a.shape) # True
# the total number of elements of the array
a.size # 6 = 2 x 3
import math
a.size == math.prod(a.shape) # True
# an object describing the type of the elements in the array.
a.dtype # dtype('int64')
a.dtype.name # 'int64'
# the size in bytes of each element of the array.
a.itemsize # 8
# the buffer containing the actual elements of the array.
a.data # <memory at 0x10424e190>
# 1-dimensional array
# the single axis (axis 0) runs along the elements
a = np.arange(10)
# array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
a.ndim # 1
a.shape # (10,)
# 2-dimensional array
# axis 0 runs vertically downwards across the rows.
# axis 1 runs horizontally across the columns.
a = np.arange(6).reshape(2, 3)
# array([[0, 1, 2],
# [3, 4, 5]])
a.ndim # 2
a.shape # (2, 3)
# 3-dimensional array
# axis 0 runs along the depth (the first index).
# axis 1 runs along the rows (the second index).
# axis 2 runs along the columns (the third index).
a = np.arange(8).reshape(2, 2, 2)
# array([[[0, 1],
# [2, 3]],
#
# [[4, 5],
# [6, 7]]])
a.ndim # 3
a.shape # (2, 2, 2)
# creating arrays
a = np.array([1, 2, 3, 4, 5, 6])
# array([1, 2, 3, 4, 5, 6])
b = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
# array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
c = np.array([[1, 2], [3, 4]], dtype=complex)
# array([[1.+0.j, 2.+0.j],
# [3.+0.j, 4.+0.j]])
np.zeros((3, 4))
# array([[0., 0., 0., 0.],
# [0., 0., 0., 0.],
# [0., 0., 0., 0.]])
np.zeros_like(np.arange(12).reshape(3, 4))
# array([[0, 0, 0, 0],
# [0, 0, 0, 0],
# [0, 0, 0, 0]])
np.ones((3, 4), dtype=np.int16)
# array([[1, 1, 1, 1],
# [1, 1, 1, 1],
# [1, 1, 1, 1]], dtype=int16)
np.empty((3, 5)) # returns uninitialized memory!
# array([[0.0e+000, 4.9e-324, 9.9e-324, 1.5e-323, 2.0e-323],
# [2.5e-323, 3.0e-323, 3.5e-323, 4.0e-323, 4.4e-323],
# [4.9e-323, 5.4e-323, 5.9e-323, 6.4e-323, 6.9e-323]])
np.arange(10, 30, 5)
# array([10, 15, 20, 25])
np.arange(0, 2, 0.3)
# array([0. , 0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
from numpy import pi
np.linspace(0, 2, 9) # 9 numbers from 0 to 2
# array([0. , 0.25, 0.5 , 0.75, 1. , 1.25, 1.5 , 1.75, 2. ])
x = np.linspace(0, 2 * pi, 100)
f = np.sin(x)
# array([ 0.00000000e+00, 6.34239197e-02, 1.26592454e-01, 1.89251244e-01,
# 2.51147987e-01, 3.12033446e-01, 3.71662456e-01, 4.29794912e-01,
# . . .
# structured arrays
x = np.array(
[('Rex', 9, 81.0), ('Fido', 3, 27.0)],
dtype=[('name', 'U10'), ('age', 'i4'), ('weight', 'f4')],
)
# array([('Rex', 9, 81.), ('Fido', 3, 27.)],
# dtype=[('name', '<U10'), ('age', '<i4'), ('weight', '<f4')])
1.2. arithmetic operators, and universal functions
# arithmetic operators, and universal functions
a = np.array([20, 30, 40, 50])
np.sin(a)
# array([ 0.91294525, -0.98803162, 0.74511316, -0.26237485])
10 * np.sin(a)
# array([ 9.12945251, -9.88031624, 7.4511316 , -2.62374854])
b = np.arange(4)
# array([0, 1, 2, 3])
c = a - b
# array([20, 29, 38, 47])
a + b
# array([20, 31, 42, 53])
b**2
# array([0, 1, 4, 9])
a < 35
# array([ True, True, False, False])
A = np.array([[1, 1], [0, 1]])
B = np.array([[2, 0], [3, 4]])
A * B # element-wise product
# array([[2, 0],
# [0, 4]])
A @ B # matrix product: A.dot(B)
# array([[5, 4],
# [3, 4]])
# universal functions (ufunc): operate element-wise on an array, producing an array as output
b = np.arange(3)
# array([0, 1, 2])
np.exp(b)
# array([1. , 2.71828183, 7.3890561 ])
np.sqrt(b)
# array([0. , 1. , 1.41421356])
c = np.array([2.0, -1.0, 4.0])
np.add(b, c)
# array([2., 0., 6.])
x = np.random.standard_normal(8)
# array([-0.99796507, 1.82697075, 0.60924821, 0.27004016, -1.62010011,
# -0.43957991, -2.27936726, -1.87890946])
y = np.random.standard_normal(8)
# array([-0.16034636, 1.60381624, -0.44003017, -1.50187946, -0.66940637,
# -1.2649151 , -0.12716502, -0.15756179])
np.maximum(x, y)
# array([-0.16034636, 1.82697075, 0.60924821, 0.27004016, -0.66940637,
# -0.43957991, -0.12716502, -0.15756179])
# assign the results into an existing array rather than create a new one
out = np.zeros_like(x)
np.add(x, 1, out=out)
# array([ 2.03493046e-03, 2.82697075e+00, 1.60924821e+00, 1.27004016e+00,
# -6.20100112e-01, 5.60420090e-01, -1.27936726e+00, -8.78909462e-01])
out
# array([ 2.03493046e-03, 2.82697075e+00, 1.60924821e+00, 1.27004016e+00,
# -6.20100112e-01, 5.60420090e-01, -1.27936726e+00, -8.78909462e-01])
# mathematical and statistical methods
a = np.arange(12).reshape(3, 4)
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11]])
a.mean() # np.mean(a)
# np.float64(5.5)
a.mean(axis=1) # compute mean across the columns
# array([1.5, 5.5, 9.5])
a.sum()
# np.int64(66)
a.sum(axis=0) # compute sum down the rows
# array([12, 15, 18, 21])
a.cumsum() # computes the cumulative sum of the elements along a flattened version of the array
# array([ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66])
a.cumsum(axis=0) # computes the cumulative sum along the rows
# array([[ 0, 1, 2, 3],
# [ 4, 6, 8, 10],
# [12, 15, 18, 21]])
a.cumsum(axis=1) # computes the sums along the columns
# array([[ 0, 1, 3, 6],
# [ 4, 9, 15, 22],
# [ 8, 17, 27, 38]])
b = np.random.permutation(a.ravel()).reshape(a.shape)
# array([[ 6, 8, 11, 3],
# [ 7, 10, 4, 1],
# [ 9, 2, 0, 5]])
b.argmax() # return indices of the maximum values along the given axis
# np.int64(2)
b.argmax(axis=0)
# array([2, 1, 0, 2])
b.argmax(axis=1)
# array([2, 1, 0])
b.argmin() # return indices of the minimum values along the given axis
# np.int64(10)
b.argmin(axis=0)
# array([0, 2, 2, 1])
b.argmin(axis=1)
# array([3, 3, 2])
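Without an axis argument, argmax/argmin index into the flattened array; np.unravel_index converts such a flat position back into coordinates. A quick check against the permutation above:
np.unravel_index(b.argmax(), b.shape) # (0, 2): row 0, column 2 holds the maximum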
1.3. indexing, slicing and iterating
# indexing, slicing and iterating
# one-dimensional arrays can be indexed, sliced and iterated over, much
# like lists and other Python sequences.
a = np.arange(10)
# array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
a[2]
# np.int64(2)
a[2:5]
# array([2, 3, 4])
# select every 2nd element from the start up to position 6, exclusive
a[:6:2]
# array([0, 2, 4])
a[::-1] # reversed
# array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])
# multidimensional arrays can have one index per axis, given in a tuple separated by commas (,)
b = np.fromfunction(lambda x, y: 10 * x + y, (5, 4), dtype=int)
# array([[ 0, 1, 2, 3],
# [10, 11, 12, 13],
# [20, 21, 22, 23],
# [30, 31, 32, 33],
# [40, 41, 42, 43]])
# a comma-separated list of indices to select individual elements
b[2, 3] # b[2][3]
# np.int64(23)
# columns from index 1 onward (`1:`) of the first two rows (`:2`)
b[:2, 1:]
# array([[ 1, 2, 3],
# [11, 12, 13]])
# each row in the second column of `b`
b[0:5, 1]
# array([ 1, 11, 21, 31, 41])
# a colon (`:`) by itself to take the entire axis
b[:, 1]
# array([ 1, 11, 21, 31, 41])
b[:, :1]
# array([[ 0],
# [10],
# [20],
# [30],
# [40]])
# each column in the second and third row of `b`
b[1:3, :]
# array([[10, 11, 12, 13],
# [20, 21, 22, 23]])
# missing indices are considered complete slices `:`
b[-1] # b[-1, :]
# array([40, 41, 42, 43])
# the dots (`...`) represent as many colons as needed to produce a complete indexing tuple
b[-1, ...] # b[-1, :]
# array([40, 41, 42, 43])
b = np.arange(6).reshape(2, 3)
# array([[0, 1, 2],
# [3, 4, 5]])
# fancy indexing: indexing with arrays of indices
a = np.zeros((8, 4))
for i in range(8):
a[i] = i
# array([[0., 0., 0., 0.],
# [1., 1., 1., 1.],
# [2., 2., 2., 2.],
# [3., 3., 3., 3.],
# [4., 4., 4., 4.],
# [5., 5., 5., 5.],
# [6., 6., 6., 6.],
# [7., 7., 7., 7.]])
i = [4, 3, 0, 6]
a[i]
# array([[4., 4., 4., 4.],
# [3., 3., 3., 3.],
# [0., 0., 0., 0.],
# [6., 6., 6., 6.]])
a = np.arange(32).reshape((8, 4))
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11],
# [12, 13, 14, 15],
# [16, 17, 18, 19],
# [20, 21, 22, 23],
# [24, 25, 26, 27],
# [28, 29, 30, 31]])
i = [1, 5, 7, 2], [0, 3, 1, 2]
a[i]
# array([ 4, 23, 29, 10])
palette = np.array(
[
[0, 0, 0], # black
[255, 0, 0], # red
[0, 255, 0], # green
[0, 0, 255], # blue
[255, 255, 255], # white
]
)
image = np.array(
[
[0, 1, 2, 0], # each value corresponds to a color in the palette
[0, 3, 4, 0],
]
)
palette[image] # the (2, 4, 3) color image
# array([[[ 0, 0, 0],
# [255, 0, 0],
# [ 0, 255, 0],
# [ 0, 0, 0]],
#
# [[ 0, 0, 0],
# [ 0, 0, 255],
# [255, 255, 255],
# [ 0, 0, 0]]])
# boolean indexing
names = np.array(["Bob", "Joe", "Will", "Bob", "Will", "Joe", "Joe"])
data = np.array([[4, 7], [0, 2], [-5, 6], [0, 0], [1, 2], [-12, -4], [3, 4]])
names == 'Bob'
# array([ True, False, False, True, False, False, False])
data[names == "Bob"]
# array([[4, 7],
# [0, 0]])
data[~(names == "Bob")] # data[names != "Bob"]
# array([[ 0, 2],
# [ -5, 6],
# [ 1, 2],
# [-12, -4],
# [ 3, 4]])
# The Python keywords `and` and `or` do not work with Boolean arrays.
# Use `&` (and) and `|` (or) instead.
mask = (names == "Bob") | (names == "Will")
# array([ True, False, True, True, True, False, False])
data[mask]
# array([[ 4, 7],
# [-5, 6],
# [ 0, 0],
# [ 1, 2]])
# iterating
for r in b:
print(r)
# [0 1 2]
# [3 4 5]
# iterating on each element
for e in b.flat:
print(f'{e}', end='\t')
# 0 1 2 3 4 5
1.4. copies and views
# copies and views: array slices are views on the original array
a = np.array([[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]])
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11]])
b = a
b is a # True
def f(x: np.ndarray): # functions receive object references; no copy is made
return id(x)
id(a), f(a) # (4397168176, 4397168176)
c = a.view() # a new view of the array
c is a, c.base is a, c.flags.owndata # (False, True, False)
c = c.reshape((2, 6))
c.shape, a.shape # ((2, 6), (3, 4))
c[0, 4] = 1234 # a's data changes
a[1, 0] # np.int64(1234)
s = a[:, 1:3]
# array([[ 1, 2],
# [ 5, 6],
# [ 9, 10]])
s[:] = 10 # a view, not a copy
a
# array([[ 0, 10, 10, 3],
# [1234, 10, 10, 7],
# [ 8, 10, 10, 11]])
d = a.copy() # a copy of the array
d is a, d.base is a # (False, False)
1.5. probability distributions
# np.random: arrays of sample values from many kinds of probability distributions
samples = np.random.standard_normal(size=(4, 4))
# array([[ 0.56695748, 0.00955644, 0.08934151, 1.69336179],
# [-0.41463561, 0.81013138, 0.41954566, 0.11812052],
# [ 0.23305357, 0.26527434, 0.3752615 , 1.82930088],
# [ 0.36156569, 1.66151819, 0.78871978, -0.56163182]])
# from random import normalvariate
# N = 1_000_000
#
# %timeit samples = [normalvariate(0, 1) for _ in range(N)]
# 320 ms ± 885 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
#
# %timeit np.random.standard_normal(N)
# 15.8 ms ± 8.28 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# random numbers are pseudo-random with a configurable RNG initialized by the `seed` value, ensuring reproducibility.
rng = np.random.default_rng(seed=12345)
type(rng)
# numpy.random._generator.Generator
r1 = np.random.default_rng(seed=12345)
r2 = np.random.default_rng(seed=12345)
r1.standard_normal((2, 3))
# array([[-1.42382504, 1.26372846, -0.87066174],
# [-0.25917323, -0.07534331, -0.74088465]])
r2.standard_normal((2, 3)) # same seed, same sequence
# array([[-1.42382504, 1.26372846, -0.87066174],
# [-0.25917323, -0.07534331, -0.74088465]])
1.6. array-oriented programming
# array-oriented programming with arrays
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])
# traditional list comprehension
result = [(x if c else y) for x, y, c in zip(xarr, yarr, cond)]
np.array(result)
# array([1.1, 2.2, 1.3, 1.4, 2.5])
# numpy.where
result = np.where(cond, xarr, yarr)
# array([1.1, 2.2, 1.3, 1.4, 2.5])
a = np.random.standard_normal((4, 4))
# array([[-0.8529, -1.6429, 1.4967, -0.6435],
# [ 0.5086, 0.5242, 1.5083, 0.0487],
# [-1.2721, -0.8147, -0.3598, -1.2042],
# [ 1.1531, 1.5783, -0.9556, -0.6506]])
np.where(a > 0, 2, -2)
# array([[-2, -2, 2, -2],
# [ 2, 2, 2, 2],
# [-2, -2, -2, -2],
# [ 2, 2, -2, -2]])
# set only positive values to 2
np.where(a > 0, 2, a)
# array([[-0.8529, -1.6429, 2. , -0.6435],
# [ 2. , 2. , 2. , 2. ],
# [-1.2721, -0.8147, -0.3598, -1.2042],
# [ 2. , 2. , -0.9556, -0.6506]])
1.7. sorting
# sorting
a = np.random.permutation(np.arange(12)).reshape(3, 4)
# array([[ 2, 10, 11, 3],
# [ 0, 6, 9, 5],
# [ 7, 8, 1, 4]])
a.sort()
# array([[ 2, 3, 10, 11],
# [ 0, 5, 6, 9],
# [ 1, 4, 7, 8]])
a.sort(axis=0) # sort the values within each column
# array([[ 0, 3, 6, 8],
# [ 1, 4, 7, 9],
# [ 2, 5, 10, 11]])
a = np.random.permutation(np.arange(12)).reshape(3, 4)
# array([[ 5, 3, 11, 8],
# [ 7, 1, 0, 4],
# [10, 9, 2, 6]])
a.sort(axis=1) # sort across each row
# array([[ 3, 5, 8, 11],
# [ 0, 1, 4, 7],
# [ 2, 6, 9, 10]])
np.sort(a, axis=0) # a sorted copy of an array
# array([[ 0, 1, 4, 7],
# [ 2, 5, 8, 10],
# [ 3, 6, 9, 11]])
1.8. saving and loading
# save and load data: file input and output with arrays
np.save("a", np.arange(3))
np.load("a.npy")
# array([0, 1, 2])
a = np.arange(3)
b = np.arange(4, 7)
np.savez("x", a=a, b=b) # np.savez_compressed("x", a=a, b=b)
x = np.load("x.npz")
x["a"]
# array([0, 1, 2])
x["b"]
# array([4, 5, 6])
2. Pandas
Pandas (derived from panel data, /ˈpændəz/) is an open-source, BSD-licensed Python library built on NumPy, offering high-performance, easy-to-use tabular or heterogeneous data structures such as Series (1D labeled arrays holding any data type) and DataFrame (2D labeled tables) for powerful data analysis.
import numpy as np
import pandas as pd
BooleanDtype # Nullable Boolean data, use "boolean" when passing as string
CategoricalDtype # Categorical data type, use "category" when passing as string
DatetimeTZDtype # Datetime with time zone
Float32Dtype # 32-bit nullable floating point, use "Float32" when passing as string
Float64Dtype # 64-bit nullable floating point, use "Float64" when passing as string
Int8Dtype # 8-bit nullable signed integer, use "Int8" when passing as string
Int16Dtype # 16-bit nullable signed integer, use "Int16" when passing as string
Int32Dtype # 32-bit nullable signed integer, use "Int32" when passing as string
Int64Dtype # 64-bit nullable signed integer, use "Int64" when passing as string
UInt8Dtype # 8-bit nullable unsigned integer, use "UInt8" when passing as string
UInt16Dtype # 16-bit nullable unsigned integer, use "UInt16" when passing as string
UInt32Dtype # 32-bit nullable unsigned integer, use "UInt32" when passing as string
UInt64Dtype # 64-bit nullable unsigned integer, use "UInt64" when passing as string
StringDtype # Nullable string data (extension type), use "string" when passing as string
s = pd.Series([1, 2, 3, None], dtype=pd.Float64Dtype()) # pd.Series([1, 2, 3, None], dtype="Float64")
# 0 1.0
# 1 2.0
# 2 3.0
# 3 <NA>
# dtype: Float64
s[3] is pd.NA
# True
s = pd.Series(['one', 'two', None, 'three'], dtype=pd.StringDtype()) # pd.Series(['one', 'two', None, 'three'], dtype="string")
# 0 one
# 1 two
# 2 <NA>
# 3 three
# dtype: string
s[2] is pd.NA
# True
2.1. Series, DataFrame, and Index
-
A Series is a one-dimensional array-like object containing a sequence of values (of any type) and an associated array of labels, called its index; it can also be thought of as a fixed-length, ordered dictionary, since it maps index values to data values.
s = pd.Series([4, 7, -5, 3]) # default integer index
# 0 4
# 1 7
# 2 -5
# 3 3
# dtype: int64
s.array
# <NumpyExtensionArray>
# [np.int64(4), np.int64(7), np.int64(-5), np.int64(3)]
# Length: 4, dtype: int64
s.index
# RangeIndex(start=0, stop=4, step=1)
s.dtype
# dtype('int64')
s.to_numpy() # NumPy array
# array([ 4, 7, -5, 3])
s.to_dict() # dictionary
# {0: 4, 1: 7, 2: -5, 3: 3}
s = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"]) # label index
# d 4
# b 7
# a -5
# c 3
# dtype: int64
# select a single value or a set of values
s["a"]
# np.int64(-5)
s[["b", "c", "d"]]
# b 7
# c 3
# d 4
# dtype: int64
# purely integer-location based indexing for selection by position
s.iloc[0]
# np.int64(4)
s.iloc[:2]
# d 4
# b 7
# dtype: int64
s.iloc[[1, 2, 0]]
# b 7
# a -5
# d 4
# dtype: int64
# Series is ndarray-like
s > 0
# d True
# b True
# a False
# c True
# dtype: bool
s[s > 0]
# d 4
# b 7
# c 3
# dtype: int64
s[s > s.median()]
# d 4
# b 7
# dtype: int64
np.exp(s) # import numpy as np
# d 54.598150
# b 1096.633158
# a 0.006738
# c 20.085537
# dtype: float64
# Series is dictionary-like
s["d"]
# np.int64(4)
s[["a", "b", "c"]]
# a -5
# b 7
# c 3
# dtype: int64
"b" in s
# True
"e" in s
# False
s.get("e", np.nan)
# nan
data = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
states = ["California", "Ohio", "Oregon", "Texas"]
pd.Series(data, index=states) # override indices
# California NaN
# Ohio 35000.0
# Oregon 16000.0
# Texas 71000.0
# dtype: float64
# name attribute
s.name = "fooooobar"
s.index.name = "buzzzzzz"
s
# buzzzzzz
# d 4
# b 7
# a -5
# c 3
# Name: fooooobar, dtype: int64
# alter the index in place while preserving the original index name
s.index = pd.Index(["Bob", "Steve", "Jeff", "Ryan"], name=s.index.name) # retains the current index name
# buzzzzzz
# Bob 4
# Steve 7
# Jeff -5
# Ryan 3
# Name: fooooobar, dtype: int64
# alter the index in place without preserving the index name
s.index = ["Bob", "Steve", "Jeff", "Ryan"] # removes the index name
# Bob 4
# Steve 7
# Jeff -5
# Ryan 3
# Name: fooooobar, dtype: int64
-
A DataFrame is a two-dimensional data structure that holds data like a two-dimensional array or a table with rows and columns, a spreadsheet, or a dictionary of Series objects.
data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data) # from a dictionary of equal-length lists or NumPy arrays
# state year pop
# 0 Ohio 2000 1.5
# 1 Ohio 2001 1.7
# 2 Ohio 2002 3.6
# 3 Nevada 2001 2.4
# 4 Nevada 2002 2.9
# 5 Nevada 2003 3.2
frame.dtypes
# state object
# year int64
# pop float64
# dtype: object
frame.head() # select the first five (by default) rows
# state year pop
# 0 Ohio 2000 1.5
# 1 Ohio 2001 1.7
# 2 Ohio 2002 3.6
# 3 Nevada 2001 2.4
# 4 Nevada 2002 2.9
frame.tail() # select the last five (by default) rows
# state year pop
# 1 Ohio 2001 1.7
# 2 Ohio 2002 3.6
# 3 Nevada 2001 2.4
# 4 Nevada 2002 2.9
# 5 Nevada 2003 3.2
# specify a sequence of columns and order
pd.DataFrame(data, columns=["year", "state", "pop"])
# year state pop
# 0 2000 Ohio 1.5
# 1 2001 Ohio 1.7
# 2 2002 Ohio 3.6
# 3 2001 Nevada 2.4
# 4 2002 Nevada 2.9
# 5 2003 Nevada 3.2
pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
# year state pop debt
# 0 2000 Ohio 1.5 NaN
# 1 2001 Ohio 1.7 NaN
# 2 2002 Ohio 3.6 NaN
# 3 2001 Nevada 2.4 NaN
# 4 2002 Nevada 2.9 NaN
# 5 2003 Nevada 3.2 NaN
# retrieve columns as Series
frame["state"] # dictionary-like notation
# 0 Ohio
# 1 Ohio
# 2 Ohio
# 3 Nevada
# 4 Nevada
# 5 Nevada
# Name: state, dtype: object
frame.year # dot attribute notation
# 0 2000
# 1 2001
# 2 2002
# 3 2001
# 4 2002
# 5 2003
# Name: year, dtype: int64
# retrieve rows by position or name with the special `iloc` and `loc` attributes
frame.loc[1]
# state Ohio
# year 2001
# pop 1.7
# Name: 1, dtype: object
frame.iloc[2]
# state Ohio
# year 2002
# pop 3.6
# Name: 2, dtype: object
# columns can be modified by assignment
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2["debt"] = 16.5
# year state pop debt
# 0 2000 Ohio 1.5 16.5
# 1 2001 Ohio 1.7 16.5
# 2 2002 Ohio 3.6 16.5
# 3 2001 Nevada 2.4 16.5
# 4 2002 Nevada 2.9 16.5
# 5 2003 Nevada 3.2 16.5
frame2["debt"] = np.arange(6.)
# year state pop debt
# 0 2000 Ohio 1.5 0.0
# 1 2001 Ohio 1.7 1.0
# 2 2002 Ohio 3.6 2.0
# 3 2001 Nevada 2.4 3.0
# 4 2002 Nevada 2.9 4.0
# 5 2003 Nevada 3.2 5.0
# assigning a column that doesn’t exist will create a new column
frame2["eastern"] = frame2["state"] == "Ohio"
# year state pop debt eastern
# 0 2000 Ohio 1.5 0.0 True
# 1 2001 Ohio 1.7 1.0 True
# 2 2002 Ohio 3.6 2.0 True
# 3 2001 Nevada 2.4 3.0 False
# 4 2002 Nevada 2.9 4.0 False
# 5 2003 Nevada 3.2 5.0 False
# remove a column
del frame2["eastern"]
# year state pop debt
# 0 2000 Ohio 1.5 0.0
# 1 2001 Ohio 1.7 1.0
# 2 2002 Ohio 3.6 2.0
# 3 2001 Nevada 2.4 3.0
# 4 2002 Nevada 2.9 4.0
# 5 2003 Nevada 3.2 5.0
# return and drop an item from frame
frame2.pop("debt")
# 0 0.0
# 1 1.0
# 2 2.0
# 3 3.0
# 4 4.0
# 5 5.0
# Name: debt, dtype: float64
frame2
# year state pop
# 0 2000 Ohio 1.5
# 1 2001 Ohio 1.7
# 2 2002 Ohio 3.6
# 3 2001 Nevada 2.4
# 4 2002 Nevada 2.9
# 5 2003 Nevada 3.2
# from a nested dictionary of dictionaries
# the outer dictionary keys as the columns
# the inner keys as the row indices
populations = {
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
    "Nevada": {2001: 2.4, 2002: 2.9},
}
frame3 = pd.DataFrame(populations)
# Ohio Nevada
# 2000 1.5 NaN
# 2001 1.7 2.4
# 2002 3.6 2.9
frame3.T # transpose the DataFrame (swap rows and columns)
# 2000 2001 2002
# Ohio 1.5 1.7 3.6
# Nevada NaN 2.4 2.9
frame2.to_numpy() # a two-dimensional ndarray (heterogeneous columns give dtype=object)
# array([[2000, 'Ohio', 1.5],
#        [2001, 'Ohio', 1.7],
#        [2002, 'Ohio', 3.6],
#        [2001, 'Nevada', 2.4],
#        [2002, 'Nevada', 2.9],
#        [2003, 'Nevada', 3.2]], dtype=object)
-
A pandas Index object is responsible for holding the axis labels (including a DataFrame’s column names) and other metadata (like the axis name or names).
frame3
# Ohio Nevada
# 2000 1.5 NaN
# 2001 1.7 2.4
# 2002 3.6 2.9
frame3.index
# Index([2000, 2001, 2002], dtype='int64')
frame3.columns
# Index(['Ohio', 'Nevada'], dtype='object')
"Ohio" in frame3.columns
# True
2003 in frame3.index
# False
# duplicate labels select all occurrences of that label
s = frame3["Ohio"]
# 2000 1.5
# 2001 1.7
# 2002 3.6
# Name: Ohio, dtype: float64
i = s.index.append(pd.Index([2000, 2002]))
# Index([2000, 2001, 2002, 2000, 2002], dtype='int64')
s[i]
# 2000 1.5
# 2001 1.7
# 2002 3.6
# 2000 1.5
# 2002 3.6
# Name: Ohio, dtype: float64
2.2. Essential Functionality
2.2.1. Reindexing
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
# d 4.5
# b 7.2
# a -5.3
# c 3.6
# dtype: float64
obj.reindex(["a", "b", "c", "d", "e"])
# a -5.3
# b 7.2
# c 3.6
# d 4.5
# e NaN
# dtype: float64
frame = pd.DataFrame(
np.arange(9).reshape((3, 3)),
index=["a", "c", "d"],
columns=["Ohio", "Texas", "California"],
)
# Ohio Texas California
# a 0 1 2
# c 3 4 5
# d 6 7 8
frame.reindex(index=["a", "b", "c", "d"])
# Ohio Texas California
# a 0.0 1.0 2.0
# b NaN NaN NaN
# c 3.0 4.0 5.0
# d 6.0 7.0 8.0
# reindex with the `axis` keyword
frame.reindex(["Texas", "Utah", "California"], axis="columns")
# Texas Utah California
# a 1 NaN 2
# c 4 NaN 5
# d 7 NaN 8
frame.reindex(["Texas", "Utah", "California"], axis=1)
# Texas Utah California
# a 1 NaN 2
# c 4 NaN 5
# d 7 NaN 8
# reindex with the `columns` keyword
frame.reindex(columns=["Texas", "Utah", "California"])
# Texas Utah California
# a 1 NaN 2
# c 4 NaN 5
# d 7 NaN 8
# reindex with the `loc` operator
frame.loc[["a", "d", "c"], ["California", "Texas"]]
# California Texas
# a 2 1
# d 8 7
# c 5 4
2.2.2. Dropping Entries from an Axis
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
# a 0.0
# b 1.0
# c 2.0
# d 3.0
# e 4.0
# dtype: float64
obj.drop("c")
# a 0.0
# b 1.0
# d 3.0
# e 4.0
# dtype: float64
obj.drop(["c", "d"])
# a 0.0
# b 1.0
# e 4.0
# dtype: float64
data = pd.DataFrame(
np.arange(16).reshape((4, 4)),
index=["Ohio", "Colorado", "Utah", "New York"],
columns=["one", "two", "three", "four"],
)
# one two three four
# Ohio 0 1 2 3
# Colorado 4 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
data.drop(index=["Colorado", "Ohio"])
# one two three four
# Utah 8 9 10 11
# New York 12 13 14 15
data.drop(columns=["two"])
# one three four
# Ohio 0 2 3
# Colorado 4 6 7
# Utah 8 10 11
# New York 12 14 15
data.drop("two", axis=1) # drop values from the columns by passing axis=1
# one three four
# Ohio 0 2 3
# Colorado 4 6 7
# Utah 8 10 11
# New York 12 14 15
data.drop("two", axis="columns") # drop values from the columns by passing axis="columns"
# one three four
# Ohio 0 2 3
# Colorado 4 6 7
# Utah 8 10 11
# New York 12 14 15
2.2.3. Indexing, Selection, and Filtering
s = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
# a 0.0
# b 1.0
# c 2.0
# d 3.0
# dtype: float64
s["b"]
# np.float64(1.0)
s[1] # deprecated with a label index: ambiguous whether label- or position-based; prefer s.iloc[1]
# np.float64(1.0)
s.iloc[1] # by position
# np.float64(1.0)
s.iloc[[1, 3]]
# b 1.0
# d 3.0
# dtype: float64
s[2:4] # slicing with integers is always integer oriented
# c 2.0
# d 3.0
# dtype: float64
s[["b", "a", "d"]]
# b 1.0
# a 0.0
# d 3.0
# dtype: float64
s.loc[["b", "a", "d"]] # the preferred way to select index values by label
# b 1.0
# a 0.0
# d 3.0
# dtype: float64
s[s < 2]
# a 0.0
# b 1.0
# dtype: float64
s.loc["b":"c"] = 5
# a 0.0
# b 5.0
# c 5.0
# d 3.0
# dtype: float64
# retrieve one or more columns from a DataFrame either with a single value or sequence
# df[column] select single column or sequence of columns from the DataFrame
# df.loc[rows] select single row or subset of rows from the DataFrame by label
# df.loc[:, cols] select single column or subset of columns by label
# df.loc[rows, cols] select both row(s) and column(s) by label
# df.iloc[rows] select single row or subset of rows from the DataFrame by integer position
# df.iloc[:, cols] select single column or subset of columns by integer position
# df.iloc[rows, cols] select both row(s) and column(s) by integer position
# df.at[row, col] select a single scalar value by row and column label
# df.iat[row, col] select a single scalar value by row and column position (integers)
# df.reindex(...) select either rows or columns by labels
data = pd.DataFrame(
np.arange(16).reshape((4, 4)),
index=["Ohio", "Colorado", "Utah", "New York"],
columns=["one", "two", "three", "four"],
)
# one two three four
# Ohio 0 1 2 3
# Colorado 4 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
data["two"]
# Ohio 1
# Colorado 5
# Utah 9
# New York 13
# Name: two, dtype: int64
data[["three", "one"]]
# three one
# Ohio 2 0
# Colorado 6 4
# Utah 10 8
# New York 14 12
data[:2] # slice (slice rows)
# one two three four
# Ohio 0 1 2 3
# Colorado 4 5 6 7
data[data["three"] > 5] # boolean array (filter rows)
# one two three four
# Colorado 4 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
data < 5
# one two three four
# Ohio True True True True
# Colorado True False False False
# Utah False False False False
# New York False False False False
data[data < 5] = 0 # boolean DataFrame (set values based on some criterion)
# one two three four
# Ohio 0 0 0 0
# Colorado 0 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
# select a subset of the rows and columns using either axis labels (loc) or integers (iloc)
data.loc["Colorado"] # select a single row by label
# one 0
# two 5
# three 6
# four 7
# Name: Colorado, dtype: int64
data.loc[["Colorado", "New York"]] # select multiple rows with a sequence of labels
# one two three four
# Colorado 0 5 6 7
# New York 12 13 14 15
data.loc["Colorado", ["two", "three"]] # data.loc["Colorado"][["two", "three"]]
# two 5
# three 6
# Name: Colorado, dtype: int64
data.iloc[2] # select a single row by integer
# one 8
# two 9
# three 10
# four 11
# Name: Utah, dtype: int64
data.iloc[[2, 1]]
# one two three four
# Utah 8 9 10 11
# Colorado 0 5 6 7
data.iloc[2, [3, 0, 1]]
# four 11
# one 8
# two 9
# Name: Utah, dtype: int64
data.iloc[[1, 2], [3, 0, 1]]
# four one two
# Colorado 7 0 5
# Utah 11 8 9
data.iloc[:, :3][data.three > 5]
# one two three
# Colorado 0 5 6
# Utah 8 9 10
# New York 12 13 14
# boolean arrays can be used with `loc` but not `iloc`
data.loc[data.three >= 2] # data[data.three >= 2]
# one two three four
# Colorado 0 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
2.2.4. Arithmetic and Data Alignment
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
# a 7.3
# c -2.5
# d 3.4
# e 1.5
# dtype: float64
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
# a -2.1
# c 3.6
# e -1.5
# f 4.0
# g 3.1
# dtype: float64
s1 + s2 # label-based data alignment yields NaN for non-overlapping entries in arithmetic
# a 5.2
# c 1.1
# d NaN
# e 0.0
# f NaN
# g NaN
# dtype: float64
df1 = pd.DataFrame(
np.arange(9.0).reshape((3, 3)),
columns=list("bcd"),
index=["Ohio", "Texas", "Colorado"],
)
# b c d
# Ohio 0.0 1.0 2.0
# Texas 3.0 4.0 5.0
# Colorado 6.0 7.0 8.0
df2 = pd.DataFrame(
np.arange(12.0).reshape((4, 3)),
columns=list("bde"),
index=["Utah", "Ohio", "Texas", "Oregon"],
)
# b d e
# Utah 0.0 1.0 2.0
# Ohio 3.0 4.0 5.0
# Texas 6.0 7.0 8.0
# Oregon 9.0 10.0 11.0
df1 + df2 # alignment is performed on both rows and columns on DataFrame
# b c d e
# Colorado NaN NaN NaN NaN
# Ohio 3.0 NaN 6.0 NaN
# Oregon NaN NaN NaN NaN
# Texas 9.0 NaN 12.0 NaN
# Utah NaN NaN NaN NaN
# arithmetic methods with fill values: substitute the passed value for any missing values
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list("abcd"))
# a b c d
# 0 0.0 1.0 2.0 3.0
# 1 4.0 5.0 6.0 7.0
# 2 8.0 9.0 10.0 11.0
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list("abcde"))
# a b c d e
# 0 0.0 1.0 2.0 3.0 4.0
# 1 5.0 6.0 7.0 8.0 9.0
# 2 10.0 11.0 12.0 13.0 14.0
# 3 15.0 16.0 17.0 18.0 19.0
df2.loc[1, "b"] = np.nan
# a b c d e
# 0 0.0 1.0 2.0 3.0 4.0
# 1 5.0 NaN 7.0 8.0 9.0
# 2 10.0 11.0 12.0 13.0 14.0
# 3 15.0 16.0 17.0 18.0 19.0
df1 + df2
# a b c d e
# 0 0.0 2.0 4.0 6.0 NaN
# 1 9.0 NaN 13.0 15.0 NaN
# 2 18.0 20.0 22.0 24.0 NaN
# 3 NaN NaN NaN NaN NaN
df1.add(df2, fill_value=0.)
# a b c d e
# 0 0.0 2.0 4.0 6.0 4.0
# 1 9.0 5.0 13.0 15.0 9.0
# 2 18.0 20.0 22.0 24.0 14.0
# 3 15.0 16.0 17.0 18.0 19.0
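Each arithmetic method also has an 'r'-prefixed counterpart with the arguments flipped, and fill_value is accepted by reindex as well; a brief sketch reusing df1 and df2:
1 / df1            # element-wise division (the 0.0 entry becomes inf)
df1.rdiv(1)        # the same thing: 'r' methods have their arguments reversed
df1.reindex(columns=df2.columns, fill_value=0) # fill_value also works when reindexing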
# broadcasting: operations between DataFrame and Series
arr = np.arange(12).reshape((3, 4))
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11]])
arr[0]
# array([0, 1, 2, 3])
arr - arr[0] # the subtraction is performed once for each row
# array([[0, 0, 0, 0],
# [4, 4, 4, 4],
# [8, 8, 8, 8]])
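The same alignment-based broadcasting applies between a DataFrame and a Series; a minimal pandas sketch (frame and series are illustrative names):
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list("bde"))
series = frame.iloc[0] # b 0.0, d 1.0, e 2.0
frame - series # by default, matching is on the columns, broadcasting down the rows
# b d e
# 0 0.0 0.0 0.0
# 1 3.0 3.0 3.0
# 2 6.0 6.0 6.0
# 3 9.0 9.0 9.0
frame.sub(frame["d"], axis="index") # match on the row index and broadcast across the columns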
2.2.5. Function Application and Mapping
data = pd.DataFrame(
np.random.standard_normal((4, 3)),
columns=list("bde"),
index=["Utah", "Ohio", "Texas", "Oregon"],
)
# b d e
# Utah -0.079026 1.151062 -0.393464
# Ohio -0.731363 0.601451 -0.462457
# Texas -1.400861 0.892020 -0.010010
# Oregon 0.899501 -0.418370 -0.791160
np.abs(data) # NumPy ufuncs (element-wise array methods)
# b d e
# Utah 0.079026 1.151062 0.393464
# Ohio 0.731363 0.601451 0.462457
# Texas 1.400861 0.892020 0.010010
# Oregon 0.899501 0.418370 0.791160
# apply a function along an axis of the DataFrame.
data.apply(lambda x: x.max() - x.min())
# b 2.300362
# d 1.569433
# e 0.781149
# dtype: float64
data.apply(lambda x: x.max() - x.min(), axis="columns")
# Utah 1.544527
# Ohio 1.332813
# Texas 2.292882
# Oregon 1.690661
# dtype: float64
data.apply(lambda x: pd.Series([x.max(), x.min()], index=["max", "min"]))
# b d e
# max 0.899501 1.151062 -0.01001
# min -1.400861 -0.418370 -0.79116
# apply a function to a DataFrame element-wise
data.map(lambda x: f"{x:.2f}")
# b d e
# Utah -0.08 1.15 -0.39
# Ohio -0.73 0.60 -0.46
# Texas -1.40 0.89 -0.01
# Oregon 0.90 -0.42 -0.79
2.2.6. Sorting and Ranking
s = pd.Series([4, 7, -3, 2], index=["d", "a", "b", "c"])
# d 4
# a 7
# b -3
# c 2
# dtype: int64
# sort lexicographically by row or column label
s.sort_index()
# a 7
# b -3
# c 2
# d 4
# dtype: int64
s.sort_index(ascending=False) # descending
# d 4
# c 2
# b -3
# a 7
# dtype: int64
i = s.index.sort_values()
# Index(['a', 'b', 'c', 'd'], dtype='object')
s.reindex(i)
# a 7
# b -3
# c 2
# d 4
# dtype: int64
s.sort_values() # sort by value
# b -3
# c 2
# d 4
# a 7
# dtype: int64
# any missing values are sorted to the end of the Series by default
pd.Series([4, np.nan, 7, np.nan, -3, 2]).sort_values()
# 4 -3.0
# 5 2.0
# 0 4.0
# 2 7.0
# 1 NaN
# 3 NaN
# dtype: float64
pd.Series([4, np.nan, 7, np.nan, -3, 2]).sort_values(ascending=False)
# 2 7.0
# 0 4.0
# 5 2.0
# 4 -3.0
# 1 NaN
# 3 NaN
# dtype: float64
pd.Series([4, np.nan, 7, np.nan, -3, 2]).sort_values(na_position="first")
# 1 NaN
# 3 NaN
# 4 -3.0
# 5 2.0
# 0 4.0
# 2 7.0
# dtype: float64
data = pd.DataFrame(
[[4, 7, -3, 2], [0, 1, 0, 1]], index=["three", "one"], columns=["d", "a", "b", "c"]
)
# d a b c
# three 4 7 -3 2
# one 0 1 0 1
data.sort_index()
# d a b c
# one 0 1 0 1
# three 4 7 -3 2
data.sort_index(axis="columns")
# a b c d
# three 7 -3 2 4
# one 1 0 1 0
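To order a DataFrame by its values rather than its labels, sort_values takes one or more column names; a short sketch against the frame above:
data.sort_values("d") # ascending by the values in column 'd'
# d a b c
# one 0 1 0 1
# three 4 7 -3 2
data.sort_values(["a", "b"], ascending=False) # sort by multiple columns, descending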
# compute numerical data ranks (1 through n) along axis.
s = pd.Series([7, -5, 7, 4, 2, 0, 4])
s.rank() # by default, average rank of the group
# 0 6.5
# 1 1.0
# 2 6.5
# 3 4.5
# 4 3.0
# 5 2.0
# 6 4.5
# dtype: float64
s.rank(method="first") # ranks assigned in order they appear in the array
# 0 6.0
# 1 1.0
# 2 7.0
# 3 4.0
# 4 3.0
# 5 2.0
# 6 5.0
# dtype: float64
s.rank(ascending=False) # rank in descending order
# 0 1.5
# 1 7.0
# 2 1.5
# 3 3.5
# 4 5.0
# 5 6.0
# 6 3.5
# dtype: float64
# DataFrame can compute ranks over the rows or the columns
data = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})
# b a c
# 0 4.3 0 -2.0
# 1 7.0 1 5.0
# 2 -3.0 0 8.0
# 3 2.0 1 -2.5
data.rank()
# b a c
# 0 3.0 1.5 2.0
# 1 4.0 3.5 3.0
# 2 1.0 1.5 4.0
# 3 2.0 3.5 1.0
data.rank(method="first", axis="columns")
# b a c
# 0 3.0 2.0 1.0
# 1 3.0 1.0 2.0
# 2 1.0 2.0 3.0
# 3 3.0 2.0 1.0
2.2.7. Axis Indexes with Duplicate Labels
s = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])
# a 0
# a 1
# b 2
# b 3
# c 4
# dtype: int64
s.index.is_unique
# False
s["a"] # return a Series
# a 0
# a 1
# dtype: int64
s["c"] # return a scalar value
# np.int64(4)
data = pd.DataFrame(np.random.standard_normal((5, 3)), index=["a", "a", "b", "b", "c"])
# 0 1 2
# a -0.366146 0.329982 0.186980
# a 1.275523 0.615156 -3.403692
# b 0.488783 -0.448384 -2.626718
# b 0.610570 2.211333 1.131847
# c -0.157337 0.212340 0.175906
data.loc["a"] # return a DataFrame
# 0 1 2
# a -0.366146 0.329982 0.186980
# a 1.275523 0.615156 -3.403692
data.loc["c"] # return a Series
# 0 -0.157337
# 1 0.212340
# 2 0.175906
# Name: c, dtype: float64
2.2.8. Unique Values, Value Counts, and Membership
s = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
s.unique() # return unique values of Series object
# array(['c', 'a', 'd', 'b'], dtype=object)
s.value_counts() # return a Series containing counts of unique values
# c 3
# a 3
# b 2
# d 1
# Name: count, dtype: int64
mask = s.isin(["b", "c"]) # performs a vectorized set membership check
# 0 True
# 1 False
# 2 False
# 3 False
# 4 False
# 5 True
# 6 True
# 7 True
# 8 True
# dtype: bool
s[mask]
# 0 c
# 5 b
# 6 b
# 7 c
# 8 c
# dtype: object
pd.Index(pd.Series(s.unique()))
# Index(['c', 'a', 'd', 'b'], dtype='object')
pd.Index(pd.Series(s.unique())).get_indexer(s)
# array([0, 1, 2, 1, 1, 3, 3, 0, 0])
2.2.9. NumPy Arrays
-
In model development, pandas is typically used for data loading, cleaning, and (crucially) feature engineering; the hand-off to a modeling library usually means converting the prepared data to NumPy arrays.
data = pd.DataFrame(
    {
        "x0": [1, 2, 3, 4, 5],
        "x1": [0.01, -0.01, 0.25, -4.1, 0.0],
        "y": [-1.5, 0.0, 3.6, 1.3, -2.0],
    }
)
# x0 x1 y
# 0 1 0.01 -1.5
# 1 2 -0.01 0.0
# 2 3 0.25 3.6
# 3 4 -4.10 1.3
# 4 5 0.00 -2.0
data.columns
# Index(['x0', 'x1', 'y'], dtype='object')
# to a NumPy array from a DataFrame
data.to_numpy()
# array([[ 1. , 0.01, -1.5 ],
#        [ 2. , -0.01, 0. ],
#        [ 3. , 0.25, 3.6 ],
#        [ 4. , -4.1 , 1.3 ],
#        [ 5. , 0. , -2. ]])
# to a DataFrame from a NumPy array
pd.DataFrame(data.to_numpy(), columns=data.columns)
# x0 x1 y
# 0 1.0 0.01 -1.5
# 1 2.0 -0.01 0.0
# 2 3.0 0.25 3.6
# 3 4.0 -4.10 1.3
# 4 5.0 0.00 -2.0
-
The to_numpy method is best for homogeneous data, producing a numeric ndarray, whereas heterogeneous data results in an ndarray of Python objects.
data1 = data.copy()
data1["string"] = ["a", "b", "c", "d", "e"]
# x0 x1 y string
# 0 1 0.01 -1.5 a
# 1 2 -0.01 0.0 b
# 2 3 0.25 3.6 c
# 3 4 -4.10 1.3 d
# 4 5 0.00 -2.0 e
data1.to_numpy()
# array([[1, 0.01, -1.5, 'a'],
#        [2, -0.01, 0.0, 'b'],
#        [3, 0.25, 3.6, 'c'],
#        [4, -4.1, 1.3, 'd'],
#        [5, 0.0, -2.0, 'e']], dtype=object)
-
It’s recommended to use loc indexing to select a subset of columns before applying to_numpy.
data.loc[:, ["x0", "x1"]].to_numpy()
# array([[ 1. , 0.01],
#        [ 2. , -0.01],
#        [ 3. , 0.25],
#        [ 4. , -4.1 ],
#        [ 5. , 0. ]])
2.3. Reading and Writing
2.3.1. Reading and Writing CSV Text Files
csv_data = """
Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
1,John,1998-05-06,1234567890,25,New York,USA,894240000
2,Jane,1993-07-15,0987654321,30,London,UK,741484800
3,Doe,2001-12-20,1122334455,,Sydney,Australia,1008806400
4,NULL,1990-01-01,NULL,40,NULL,Canada,631152000
"""
import io
# load the CSV data into a DataFrame
data = pd.read_csv(
io.StringIO(csv_data), # use io.StringIO to simulate a file-like object
parse_dates=["Birth Day"], # parse the 'Birth Day' column as datetime
dtype={"Phone Number": str}, # treat 'Phone Number' as a string
na_values=["", "NULL"], # handle missing values
)
# Id Name Birth Day Phone Number Age City Country Timestamp
# 0 1 John 1998-05-06 1234567890 25.0 New York USA 894240000
# 1 2 Jane 1993-07-15 0987654321 30.0 London UK 741484800
# 2 3 Doe 2001-12-20 1122334455 NaN Sydney Australia 1008806400
# 3 4 NaN 1990-01-01 NaN 40.0 NaN Canada 631152000
# convert the 'Timestamp' column from Unix epoch time to datetime
data["Timestamp"] = pd.to_datetime(data["Timestamp"], unit="s")
# Id Name Birth Day Phone Number Age City Country Timestamp
# 0 1 John 1998-05-06 1234567890 25.0 New York USA 1998-05-04
# 1 2 Jane 1993-07-15 0987654321 30.0 London UK 1993-07-01
# 2 3 Doe 2001-12-20 1122334455 NaN Sydney Australia 2001-12-20
# 3 4 NaN 1990-01-01 NaN 40.0 NaN Canada 1990-01-01
data.dtypes
# Id int64
# Name object
# Birth Day datetime64[ns]
# Phone Number object
# Age float64
# City object
# Country object
# Timestamp datetime64[ns]
# dtype: object
# write the data out to a comma-separated file
data.to_csv(sys.stdout) # import sys
# ,Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
# 0,1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 1,2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 2,3,Doe,2001-12-20,1122334455,,Sydney,Australia,2001-12-20
# 3,4,,1990-01-01,,40.0,,Canada,1990-01-01
data.to_csv(sys.stdout, index=False)
# Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
# 1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 3,Doe,2001-12-20,1122334455,,Sydney,Australia,2001-12-20
# 4,,1990-01-01,,40.0,,Canada,1990-01-01
data.to_csv(sys.stdout, index=False, na_rep='NIL')
# Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
# 1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 3,Doe,2001-12-20,1122334455,NIL,Sydney,Australia,2001-12-20
# 4,NIL,1990-01-01,NIL,40.0,NIL,Canada,1990-01-01
data.to_csv(sys.stdout, index=False, na_rep='NIL', header=False)
# 1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 3,Doe,2001-12-20,1122334455,NIL,Sydney,Australia,2001-12-20
# 4,NIL,1990-01-01,NIL,40.0,NIL,Canada,1990-01-01
2.3.2. Reading and Writing Microsoft Excel Files
# add-on packages xlrd and openpyxl to read old-style XLS and newer XLSX files
conda install openpyxl xlrd
-
pandas.ExcelFile
: read sheets and parse as DataFrame
xlsx = pd.ExcelFile("book1.xlsx")
xlsx.sheet_names
# ['Sheet1']
xlsx.parse(sheet_name="Sheet1")
# 0 a b c d message
# 0 0 1 2 3 4 hello
# 1 1 5 6 7 8 world
# 2 2 9 10 11 12 foo
df = xlsx.parse(sheet_name="Sheet1", index_col=0)
# a b c d message
# 0
# 0 1 2 3 4 hello
# 1 5 6 7 8 world
# 2 9 10 11 12 foo
df.columns
# Index(['a', 'b', 'c', 'd', 'message'], dtype='object')
df.index
# Index([0, 1, 2], dtype='int64', name=0)
xlsx.parse(sheet_name="Sheet1", index_col="message")
# 0 a b c d
# message
# hello 0 1 2 3 4
# world 1 5 6 7 8
# foo 2 9 10 11 12
-
pandas.read_excel
: read a sheet as pandas data directly
pd.read_excel("book1.xlsx") # pd.read_excel("book1.xlsx", sheet_name=0)
# 0 a b c d message
# 0 0 1 2 3 4 hello
# 1 1 5 6 7 8 world
# 2 2 9 10 11 12 foo
pd.read_excel("book1.xlsx", sheet_name="Sheet1")
# 0 a b c d message
# 0 0 1 2 3 4 hello
# 1 1 5 6 7 8 world
# 2 2 9 10 11 12 foo
-
pandas.to_excel
: write pandas data to Excel files
writer = pd.ExcelWriter("sample1.xlsx")
df.to_excel(writer, sheet_name="foo") # write to sheet 'foo'
df.to_excel(writer, sheet_name="bar") # write to sheet 'bar'
df.to_excel(writer, sheet_name="buz") # write to sheet 'buz'
writer.close() # finalize and write the file to disk
pd.ExcelFile("sample1.xlsx").sheet_names
# ['foo', 'bar', 'buz']
2.3.3. Reading SQL-based Databases
import sqlite3
query = "CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER );"
con = sqlite3.connect("mydata.sqlite")
con.execute(query)
# <sqlite3.Cursor at 0x7f56df665840>
con.commit()
data = [
("Atlanta", "Georgia", 1.25, 6),
("Tallahassee", "Florida", 2.6, 3),
("Sacramento", "California", 1.7, 5),
]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
# <sqlite3.Cursor at 0x7f56e074fac0>
con.commit()
cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()
# [('Atlanta', 'Georgia', 1.25, 6),
# ('Tallahassee', 'Florida', 2.6, 3),
# ('Sacramento', 'California', 1.7, 5)]
cursor.description
# (('a', None, None, None, None, None, None),
# ('b', None, None, None, None, None, None),
# ('c', None, None, None, None, None, None),
# ('d', None, None, None, None, None, None))
columns = [x[0] for x in cursor.description]
pd.DataFrame(rows, columns=columns)
# a b c d
# 0 Atlanta Georgia 1.25 6
# 1 Tallahassee Florida 2.60 3
# 2 Sacramento California 1.70 5
-
pandas.read_sql
: read data easily from a general SQLAlchemy connection
import sqlalchemy as sqla # conda install sqlalchemy -y
db = sqla.create_engine("sqlite:///mydata.sqlite")
pd.read_sql("SELECT * FROM test", db)
# a b c d
# 0 Atlanta Georgia 1.25 6
# 1 Tallahassee Florida 2.60 3
# 2 Sacramento California 1.70 5
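Writing works the other way with DataFrame.to_sql; a minimal sketch (the table name test_copy is illustrative):
frame = pd.read_sql("SELECT * FROM test", db)
frame.to_sql("test_copy", db, index=False) # create a new table and insert the rows
pd.read_sql("SELECT * FROM test_copy", db).shape
# (3, 4)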
3. Data Cleaning and Preparation
3.1. Handling Missing Data
s = pd.Series([1, np.nan, 3.5, None, 7])
s.dropna() # remove missing values
# 0 1.0
# 2 3.5
# 4 7.0
# dtype: float64
s.fillna(0) # fill NA/NaN values (NA: not available)
# 0 1.0
# 1 0.0
# 2 3.5
# 3 0.0
# 4 7.0
# dtype: float64
s.isna()
# 0 False
# 1 True
# 2 False
# 3 True
# 4 False
# dtype: bool
s.notna()
# 0 True
# 1 False
# 2 True
# 3 False
# 4 True
# dtype: bool
data = pd.DataFrame(
[
[1.0, 6.5, 3.0],
[1.0, np.nan, np.nan],
[np.nan, np.nan, np.nan],
[np.nan, 6.5, 3.0],
]
)
# 0 1 2
# 0 1.0 6.5 3.0
# 1 1.0 NaN NaN
# 2 NaN NaN NaN
# 3 NaN 6.5 3.0
data.dropna()
# 0 1 2
# 0 1.0 6.5 3.0
data.dropna(how="all")
# 0 1 2
# 0 1.0 6.5 3.0
# 1 1.0 NaN NaN
# 3 NaN 6.5 3.0
data[4] = np.nan
# 0 1 2 4
# 0 1.0 6.5 3.0 NaN
# 1 1.0 NaN NaN NaN
# 2 NaN NaN NaN NaN
# 3 NaN 6.5 3.0 NaN
data.dropna(axis="columns", how="all")
# 0 1 2
# 0 1.0 6.5 3.0
# 1 1.0 NaN NaN
# 2 NaN NaN NaN
# 3 NaN 6.5 3.0
data = pd.DataFrame(np.random.standard_normal((7, 3)))
data.iloc[:4, 1] = np.nan
data.iloc[:2, 2] = np.nan
# 0 1 2
# 0 -0.327178 NaN NaN
# 1 1.345916 NaN NaN
# 2 -0.502837 NaN 0.894144
# 3 -0.372355 NaN -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.dropna()
# 0 1 2
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.dropna(thresh=2)
# 0 1 2
# 2 -0.502837 NaN 0.894144
# 3 -0.372355 NaN -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.ffill() # fill NA/NaN values by using the last (row) valid observation to fill the gap
# 0 1 2
# 0 -0.327178 NaN NaN
# 1 1.345916 NaN NaN
# 2 -0.502837 NaN 0.894144
# 3 -0.372355 NaN -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.ffill(axis="columns")
# 0 1 2
# 0 -0.327178 -0.327178 -0.327178
# 1 1.345916 1.345916 1.345916
# 2 -0.502837 -0.502837 0.894144
# 3 -0.372355 -0.372355 -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.bfill() # fill NA/NaN values by using the next (row) valid observation to fill the gap
# 0 1 2
# 0 -0.327178 0.484139 0.894144
# 1 1.345916 0.484139 0.894144
# 2 -0.502837 0.484139 0.894144
# 3 -0.372355 0.484139 -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.bfill(axis="columns")
# 0 1 2
# 0 -0.327178 NaN NaN
# 1 1.345916 NaN NaN
# 2 -0.502837 0.894144 0.894144
# 3 -0.372355 -1.719540 -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
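fillna also accepts a per-column dictionary (or a Series) of fill values; a short sketch against the frame above:
data.fillna({1: 0.5, 2: 0.0}) # fill column 1 with 0.5 and column 2 with 0.0
data.fillna(data.mean())      # fill each column with its own mean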
3.2. Removing Duplicates
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"], "k2": [1, 1, 2, 3, 3, 4, 4]})
# k1 k2
# 0 one 1
# 1 two 1
# 2 one 2
# 3 two 3
# 4 one 3
# 5 two 4
# 6 two 4
data.duplicated()
# 0 False
# 1 False
# 2 False
# 3 False
# 4 False
# 5 False
# 6 True
# dtype: bool
data.drop_duplicates()
# k1 k2
# 0 one 1
# 1 two 1
# 2 one 2
# 3 two 3
# 4 one 3
# 5 two 4
data["v1"] = range(7)
# k1 k2 v1
# 0 one 1 0
# 1 two 1 1
# 2 one 2 2
# 3 two 3 3
# 4 one 3 4
# 5 two 4 5
# 6 two 4 6
data.drop_duplicates(subset=["k1"])
# k1 k2 v1
# 0 one 1 0
# 1 two 1 1
data.drop_duplicates(subset=["k1"], keep="last")
# k1 k2 v1
# 4 one 3 4
# 6 two 4 6
3.3. Transforming Data Using a Function or Mapping
data = pd.DataFrame(
{
"food": [
"bacon",
"pulled pork",
"bacon",
"pastrami",
"corned beef",
"bacon",
"pastrami",
"honey ham",
"nova lox",
],
"ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
}
)
# food ounces
# 0 bacon 4.0
# 1 pulled pork 3.0
# 2 bacon 12.0
# 3 pastrami 6.0
# 4 corned beef 7.5
# 5 bacon 8.0
# 6 pastrami 3.0
# 7 honey ham 5.0
# 8 nova lox 6.0
meat_to_animal = {
"bacon": "pig",
"pulled pork": "pig",
"pastrami": "cow",
"corned beef": "cow",
"honey ham": "pig",
"nova lox": "salmon",
}
data["animal"] = data["food"].map(meat_to_animal) # data["food"].map(lambda x: meat_to_animal.get(x, None))
# food ounces animal
# 0 bacon 4.0 pig
# 1 pulled pork 3.0 pig
# 2 bacon 12.0 pig
# 3 pastrami 6.0 cow
# 4 corned beef 7.5 cow
# 5 bacon 8.0 pig
# 6 pastrami 3.0 cow
# 7 honey ham 5.0 pig
# 8 nova lox 6.0 salmon
3.4. Replacing Values
data = pd.Series([1., -999., 2., -999., -1000., 3.])
# 0 1.0
# 1 -999.0
# 2 2.0
# 3 -999.0
# 4 -1000.0
# 5 3.0
# dtype: float64
data.replace(-999, np.nan)
# 0 1.0
# 1 NaN
# 2 2.0
# 3 NaN
# 4 -1000.0
# 5 3.0
# dtype: float64
data.replace([-999, -1000], np.nan) # replace multiple values at once
# 0 1.0
# 1 NaN
# 2 2.0
# 3 NaN
# 4 NaN
# 5 3.0
# dtype: float64
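replace also accepts a dictionary (or two parallel lists) mapping each old value to its own replacement:
data.replace({-999: np.nan, -1000: 0})
# 0 1.0
# 1 NaN
# 2 2.0
# 3 NaN
# 4 0.0
# 5 3.0
# dtype: float64
data.replace([-999, -1000], [np.nan, 0]) # equivalent list-to-list form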
3.5. Renaming Axis Indexes
data = pd.DataFrame(
np.arange(12).reshape((3, 4)),
index=["Ohio", "Colorado", "New York"],
columns=["one", "two", "three", "four"],
)
data.index.map(lambda x: x[:4].upper())
# Index(['OHIO', 'COLO', 'NEW '], dtype='object')
data.index = data.index.map(lambda x: x[:4].upper())
# one two three four
# OHIO 0 1 2 3
# COLO 4 5 6 7
# NEW 8 9 10 11
data.rename(index=str.title, columns=str.upper)
# ONE TWO THREE FOUR
# Ohio 0 1 2 3
# Colo 4 5 6 7
# New 8 9 10 11
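rename can likewise take dictionaries that relabel only a subset of the index or column labels (output shown for the frame above):
data.rename(index={"OHIO": "INDIANA"}, columns={"three": "peekaboo"})
# one two peekaboo four
# INDIANA 0 1 2 3
# COLO 4 5 6 7
# NEW 8 9 10 11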
3.6. Discretization and Binning
-
pandas.cut()
: discretizes data points into bins based on values
# continuous data points (i.e. observations) are often discretized or otherwise separated into “bins” for analysis
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
age_categories = pd.cut(ages, bins) # pandas.core.arrays.categorical.Categorical
# [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
# Length: 12
# Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
age_categories.codes
# array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
age_categories.categories
# IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
age_categories.categories[0]
# Interval(18, 25, closed='right')
age_categories.value_counts()
# (18, 25] 5
# (25, 35] 3
# (35, 60] 3
# (60, 100] 1
# Name: count, dtype: int64
pd.cut(ages, bins, right=False).categories
# IntervalIndex([[18, 25), [25, 35), [35, 60), [60, 100)], dtype='interval[int64, left]')
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages, bins, right=False, labels=group_names).categories
# Index(['Youth', 'YoungAdult', 'MiddleAged', 'Senior'], dtype='object')
-
pandas.qcut()
: discretizes data points into bins based on sample quantiles
data = np.random.uniform(size=20)
# array([0.34500722, 0.51390559, 0.45442101, 0.24779283, 0.11860691,
#        0.64541406, 0.95682711, 0.69983468, 0.661519  , 0.09869171,
#        0.12846634, 0.10438803, 0.6110211 , 0.28152758, 0.40054146,
#        0.36215715, 0.29260695, 0.0516599 , 0.7291701 , 0.84767151])
pd.cut(data, 4, precision=2).categories # equal-length bins based on the minimum and maximum values
# IntervalIndex([(0.051, 0.28], (0.28, 0.5], (0.5, 0.73], (0.73, 0.96]], dtype='interval[float64, right]')
pd.qcut(data, 4, precision=2).value_counts()
# (0.041999999999999996, 0.22] 5
# (0.22, 0.38] 5
# (0.38, 0.65] 5
# (0.65, 0.96] 5
# Name: count, dtype: int64
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()
# (0.0507, 0.104] 2
# (0.104, 0.381] 8
# (0.381, 0.741] 8
# (0.741, 0.957] 2
# Name: count, dtype: int64
3.7. Detecting and Filtering Outliers
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()
# 0 1 2 3
# count 1000.000000 1000.000000 1000.000000 1000.000000
# mean -0.002266 -0.036654 -0.017418 -0.029068
# std 1.018625 0.950048 0.994702 1.041082
# min -3.421405 -3.833754 -3.642241 -3.125454
# 25% -0.663947 -0.694460 -0.659706 -0.762197
# 50% -0.024047 -0.005600 -0.025911 -0.024954
# 75% 0.700707 0.603613 0.663005 0.701335
# max 3.134904 2.526298 2.994854 3.704972
data[(data.abs() > 3).any(axis="columns")] # select all rows having a value exceeding 3 or –3
# 0 1 2 3
# 64 0.769583 -3.833754 -0.284691 0.283270
# 176 -3.198594 0.745868 -0.430357 -0.374628
# 207 0.823846 -0.171761 1.137797 3.704972
# 416 -3.024055 0.560612 1.320891 -0.691014
# 790 -3.421405 0.516339 0.049209 1.016584
# 915 -0.543984 0.844945 -3.010801 -0.165290
# 916 3.134904 0.091651 0.908098 1.685942
# 945 3.073425 -0.943425 -1.711061 0.573415
# 955 -1.375006 -0.344609 -3.642241 -0.823830
# 981 0.507220 0.136884 0.880373 -3.125454
data[data.abs() > 3] = np.sign(data) * 3 # cap values outside the interval [–3, 3]
data.describe()
# 0 1 2 3
# count 1000.000000 1000.000000 1000.000000 1000.000000
# mean -0.001830 -0.035821 -0.016765 -0.029648
# std 1.015991 0.947074 0.992531 1.038420
# min -3.000000 -3.000000 -3.000000 -3.000000
# 25% -0.663947 -0.694460 -0.659706 -0.762197
# 50% -0.024047 -0.005600 -0.025911 -0.024954
# 75% 0.700707 0.603613 0.663005 0.701335
# max 3.000000 2.526298 2.994854 3.000000
np.sign(data).head() # produces 1 and –1 values based on positive or negative
# 0 1 2 3
# 0 -1.0 1.0 -1.0 1.0
# 1 -1.0 -1.0 -1.0 -1.0
# 2 1.0 -1.0 -1.0 1.0
# 3 -1.0 -1.0 1.0 -1.0
# 4 1.0 1.0 1.0 -1.0
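A more compact way to express the same capping is clip; a brief, equivalent sketch:
data.clip(lower=-3, upper=3) # equivalent to the np.sign-based capping above (returns a new frame)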
3.8. Permutation and Random Sampling
data = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
# 0 1 2 3 4 5 6
# 0 0 1 2 3 4 5 6
# 1 7 8 9 10 11 12 13
# 2 14 15 16 17 18 19 20
# 3 21 22 23 24 25 26 27
# 4 28 29 30 31 32 33 34
sampler = np.random.permutation(5)
# array([2, 4, 3, 0, 1])
data.iloc[sampler] # equivalent: data.take(sampler)
# 0 1 2 3 4 5 6
# 2 14 15 16 17 18 19 20
# 4 28 29 30 31 32 33 34
# 3 21 22 23 24 25 26 27
# 0 0 1 2 3 4 5 6
# 1 7 8 9 10 11 12 13
column_sampler = np.random.permutation(7)
# array([3, 5, 2, 4, 6, 0, 1])
data.take(column_sampler, axis="columns")
# 3 5 2 4 6 0 1
# 0 3 5 2 4 6 0 1
# 1 10 12 9 11 13 7 8
# 2 17 19 16 18 20 14 15
# 3 24 26 23 25 27 21 22
# 4 31 33 30 32 34 28 29
data.sample(n=3) # select a random subset without replacement
# 0 1 2 3 4 5 6
# 0 0 1 2 3 4 5 6
# 2 14 15 16 17 18 19 20
# 4 28 29 30 31 32 33 34
data.sample(n=7, replace=True) # select a random subset with replacement (to allow repeat choices)
# 0 1 2 3 4 5 6
# 3 21 22 23 24 25 26 27
# 0 0 1 2 3 4 5 6
# 1 7 8 9 10 11 12 13
# 0 0 1 2 3 4 5 6
# 4 28 29 30 31 32 33 34
# 4 28 29 30 31 32 33 34
# 3 21 22 23 24 25 26 27
3.9. Computing Indicator/Dummy Variables: One-Hot Encoding
data = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"], "data1": range(6)})
# key data1
# 0 b 0
# 1 b 1
# 2 a 2
# 3 c 3
# 4 a 4
# 5 b 5
pd.get_dummies(data["key"], dtype=int)
# a b c
# 0 0 1 0
# 1 0 1 0
# 2 1 0 0
# 3 0 0 1
# 4 1 0 0
# 5 0 1 0
data.join(pd.get_dummies(data["key"], dtype=int))
# key data1 a b c
# 0 b 0 0 1 0
# 1 b 1 0 1 0
# 2 a 2 1 0 0
# 3 c 3 0 0 1
# 4 a 4 1 0 0
# 5 b 5 0 1 0
movies = pd.read_table(
"https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/datasets/movielens/movies.dat",
sep="::",
header=None,
names=["movie_id", "title", "genres"],
engine="python",
)
movies.head()
# movie_id title genres
# 0 1 Toy Story (1995) Animation|Children's|Comedy
# 1 2 Jumanji (1995) Adventure|Children's|Fantasy
# 2 3 Grumpier Old Men (1995) Comedy|Romance
# 3 4 Waiting to Exhale (1995) Comedy|Drama
# 4 5 Father of the Bride Part II (1995) Comedy
dummies = movies["genres"].str.get_dummies("|") # string functions in pandas
dummies.iloc[:5,:6]
# Action Adventure Animation Children's Comedy Crime
# 0 0 0 1 1 1 0
# 1 0 1 0 1 0 0
# 2 0 0 0 0 1 0
# 3 0 0 0 0 1 0
# 4 0 0 0 0 1 0
movies_windic = movies.join(dummies.add_prefix("Genre_"))
movies_windic.iloc[0][:10]
# movie_id 1
# title Toy Story (1995)
# genres Animation|Children's|Comedy
# Genre_Action 0
# Genre_Adventure 0
# Genre_Animation 1
# Genre_Children's 1
# Genre_Comedy 1
# Genre_Crime 0
# Genre_Documentary 0
# Name: 0, dtype: object
np.random.seed(12345) # to make the example repeatable
values = np.random.uniform(size=10)
# array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
# 0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins), dtype=int)
# (0.0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] (0.8, 1.0]
# 0 0 0 0 0 1
# 1 0 1 0 0 0
# 2 1 0 0 0 0
# 3 0 1 0 0 0
# 4 0 0 1 0 0
# 5 0 0 1 0 0
# 6 0 0 0 0 1
# 7 0 0 0 1 0
# 8 0 0 0 1 0
# 9 0 0 0 1 0
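Not shown above, but useful in practice: get_dummies also accepts prefix, drop_first (drop one level to avoid collinearity), and dummy_na (an indicator column for missing values); a small sketch on the same data:
pd.get_dummies(data["key"], prefix="key", drop_first=True, dtype=int) # drops the first category "a"
#    key_b  key_c
# 0      1      0
# 1      1      0
# 2      0      0
# 3      0      1
# 4      0      0
# 5      1      0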
3.10. String Functions in pandas
data = {
"Dave": "dave@google.com",
"Steve": "steve@gmail.com",
"Rob": "rob@gmail.com",
"Wes": np.nan,
}
data = pd.Series(data)
# Dave dave@google.com
# Steve steve@gmail.com
# Rob rob@gmail.com
# Wes NaN
# dtype: object
data.isna()
# Dave False
# Steve False
# Rob False
# Wes True
# dtype: bool
data.astype("string") # extension types
# Dave dave@google.com
# Steve steve@gmail.com
# Rob rob@gmail.com
# Wes <NA>
# dtype: string
data.str.contains("gmail") # .str attribute
# Dave False
# Steve True
# Rob True
# Wes NaN
# dtype: object
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
data.str.findall(pattern, flags=re.IGNORECASE) # regular expressions
# Dave [(dave, google, com)]
# Steve [(steve, gmail, com)]
# Rob [(rob, gmail, com)]
# Wes NaN
# dtype: object
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0] # vectorized element retrieval
# Dave (dave, google, com)
# Steve (steve, gmail, com)
# Rob (rob, gmail, com)
# Wes NaN
# dtype: object
matches.str.get(1)
# Dave google
# Steve gmail
# Rob gmail
# Wes NaN
# dtype: object
data.str[:5] # slice strings
# Dave dave@
# Steve steve
# Rob rob@g
# Wes NaN
# dtype: object
data.str.extract(pattern, flags=re.IGNORECASE) # extract capture groups
# 0 1 2
# Dave dave google com
# Steve steve gmail com
# Rob rob gmail com
# Wes NaN NaN NaN
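A few more vectorized string methods on the same Series (an illustrative, non-exhaustive sketch):
data.str.len() # string length per element (NaN propagates)
data.str.upper() # vectorized upper-casing
data.str.split("@") # split each string into a list of parts
data.str.replace("gmail", "example", regex=False) # literal substring replacement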
3.11. Categorical Encoding
categories = pd.Series(["apple", "orange"]) # categories, dictionary, or levels
# 0 apple
# 1 orange
# dtype: object
codes = pd.Series([0, 1, 0, 0] * 2) # category codes or codes
# 0 0
# 1 1
# 2 0
# 3 0
# 4 0
# 5 1
# 6 0
# 7 0
# dtype: int64
values = categories.take(codes)
# 0 apple
# 1 orange
# 0 apple
# 0 apple
# 0 apple
# 1 orange
# 0 apple
# 0 apple
# dtype: object
# pandas has a Categorical extension type for this integer-based categorical (dictionary-encoded) representation
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
rng = np.random.default_rng(seed=12345)
data = pd.DataFrame(
{
"fruit": fruits,
"basket_id": np.arange(N),
"count": rng.integers(3, 15, size=N),
"weight": rng.uniform(0, 4, size=N),
},
columns=["basket_id", "fruit", "count", "weight"],
)
# basket_id fruit count weight
# 0 0 apple 8 0.366659
# 1 1 orange 6 2.394272
# 2 2 apple 9 3.418968
# 3 3 apple 12 2.406485
# 4 4 apple 8 3.727953
# 5 5 orange 5 2.899125
# 6 6 apple 3 3.442205
# 7 7 apple 4 3.717351
fruit_cat = data["fruit"].astype("category")
# 0 apple
# 1 orange
# 2 apple
# 3 apple
# 4 apple
# 5 orange
# 6 apple
# 7 apple
# Name: fruit, dtype: category
# Categories (2, object): ['apple', 'orange']
c = fruit_cat.array
type(c) # pandas.core.arrays.categorical.Categorical
c.categories # Index(['apple', 'orange'], dtype='object')
c.codes # array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)
dict(enumerate(c.categories)) # get a mapping between codes and categories
# {0: 'apple', 1: 'orange'}
pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])
# ['foo', 'bar', 'baz', 'foo', 'bar']
# Categories (3, object): ['bar', 'baz', 'foo']
pd.Categorical.from_codes(codes=[0, 1, 2, 0, 0, 1], categories=["foo", "bar", "baz"])
# ['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
# Categories (3, object): ['foo', 'bar', 'baz']
pd.Categorical.from_codes(codes=[0, 1, 2, 0, 0, 1], categories=["foo", "bar", "baz"], ordered=True)
# ['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
# Categories (3, object): ['foo' < 'bar' < 'baz']
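A hedged side note: the dictionary-encoded representation can cut memory use substantially for low-cardinality string data; the Series size below is illustrative.
labels = pd.Series(["foo", "bar", "baz", "qux"] * 250_000) # 1,000,000 strings, 4 distinct values
labels_cat = labels.astype("category")
labels.memory_usage(deep=True) # tens of megabytes of Python string objects
labels_cat.memory_usage(deep=True) # roughly one byte per value (int8 codes) plus a small categories table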
# `.cat` accessor attribute
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2).astype("category")
cat_s.cat.categories
# Index(['a', 'b', 'c', 'd'], dtype='object')
cat_s.cat.codes
# 0 0
# 1 1
# 2 2
# 3 3
# 4 0
# 5 1
# 6 2
# 7 3
# dtype: int8
cat_s.value_counts()
# a 2
# b 2
# c 2
# d 2
# Name: count, dtype: int64
cat_s2 = cat_s.cat.set_categories(["a", "b", "c", "d", "e"])
cat_s2.value_counts()
# a 2
# b 2
# c 2
# d 2
# e 0
# Name: count, dtype: int64
cat_s3 = cat_s[cat_s.isin(["a", "b"])]
# 0 a
# 1 b
# 4 a
# 5 b
# dtype: category
# Categories (4, object): ['a', 'b', 'c', 'd']
cat_s3.cat.remove_unused_categories()
# 0 a
# 1 b
# 4 a
# 5 b
# dtype: category
# Categories (2, object): ['a', 'b']
draws = np.random.default_rng(seed=12345).standard_normal(1_000)
# array([-1.42382504, 1.26372846, -0.87066174, -0.25917323, -0.07534331,
bins = pd.qcut(draws, 4) # compute a quartile binning
# [(-3.121, -0.675], (0.687, 3.211], (-3.121, -0.675], (-0.675, 0.0134], (-0.675, 0.0134], ..., (0.0134, 0.687], (0.0134, 0.687], (-0.675, 0.0134], (0.0134, 0.687], (-0.675, 0.0134]]
# Length: 1000
# Categories (4, interval[float64, right]): [(-3.121, -0.675] < (-0.675, 0.0134] < (0.0134, 0.687] < (0.687, 3.211]]
bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4']) # quartile names
# ['Q1', 'Q4', 'Q1', 'Q2', 'Q2', ..., 'Q3', 'Q3', 'Q2', 'Q3', 'Q2']
# Length: 1000
# Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']
bins = pd.Series(bins, name='quartile')
# 0 Q1
# 1 Q4
# 2 Q1
# ..
# 998 Q3
# 999 Q2
# Name: quartile, Length: 1000, dtype: category
# Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']
results = pd.Series(draws).groupby(bins, observed=True).agg(["count", "min", "max"]).reset_index() # statistics
# quartile count min max
# 0 Q1 250 -3.119609 -0.678494
# 1 Q2 250 -0.673305 0.008009
# 2 Q3 250 0.018753 0.686183
# 3 Q4 250 0.688282 3.211418
# dummy variables or one-hot encoding
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')
pd.get_dummies(cat_s, dtype=int)
# a b c d
# 0 1 0 0 0
# 1 0 1 0 0
# 2 0 0 1 0
# 3 0 0 0 1
# 4 1 0 0 0
# 5 0 1 0 0
# 6 0 0 1 0
# 7 0 0 0 1
4. Data Wrangling: Join, Combine, and Reshape
4.1. Hierarchical Indexing
data = pd.Series(
np.random.uniform(size=5),
index=[
["a", "a", "b", "b", "b"],
[ 2, 3, 1, 3, 1]
],
)
data # a prettified view
# a 2 0.911650
# 3 0.316521
# b 1 0.049235
# 3 0.995199
# 1 0.602722
# dtype: float64
data.index
# MultiIndex([('a', 2),
# ('a', 3),
# ('b', 1),
# ('b', 3),
# ('b', 1)],
# )
data["b"] # partial indexing
# 1 0.049235
# 3 0.995199
# 1 0.602722
# dtype: float64
data.loc[:, 3] # level indexing
# a 0.316521
# b 0.995199
# dtype: float64
data = pd.DataFrame(
np.arange(12).reshape((4, 3)),
index=[
["a", "a", "b", "b"],
[ 1, 2, 1, 2]
],
columns=[
[ "Ohio", "Ohio", "Colorado"],
["Green", "Red", "Green"]
],
)
# Ohio Colorado
# Green Red Green
# a 1 0 1 2
# 2 3 4 5
# b 1 6 7 8
# 2 9 10 11
data.index.names = ["key1", "key2"]
data.columns.names = ["state", "color"]
# state Ohio Colorado
# color Green Red Green
# key1 key2
# a 1 0 1 2
# 2 3 4 5
# b 1 6 7 8
# 2 9 10 11
data.index.nlevels
# 2
data["Ohio"]
# color Green Red
# key1 key2
# a 1 0 1
# 2 3 4
# b 1 6 7
# 2 9 10
pd.MultiIndex.from_tuples(
[
("2023-01-01", "North"),
("2023-01-01", "South"),
("2023-01-02", "North"),
("2023-01-02", "South"),
],
names=["Date", "Region"],
)
# MultiIndex([('2023-01-01', 'North'),
# ('2023-01-01', 'South'),
# ('2023-01-02', 'North'),
# ('2023-01-02', 'South')],
# names=['Date', 'Region'])
pd.MultiIndex.from_arrays(
[
[ "Ohio", "Ohio", "Colorado"],
["Green", "Red", "Green"],
],
names=["state", "color"],
)
# MultiIndex([( 'Ohio', 'Green'),
# ( 'Ohio', 'Red'),
# ('Colorado', 'Green')],
# names=['state', 'color'])
4.1.1. Pivoting, Stacking, and Unstacking
data = {
"Date": ["2023-01-01", "2023-01-01", "2023-01-02", "2023-01-02"],
"City": ["New York", "Los Angeles", "New York", "Los Angeles"],
"Temperature": [30, 75, 28, 77],
"Humidity": [65, 30, 70, 35],
}
data = pd.DataFrame(data)
# Date City Temperature Humidity
# 0 2023-01-01 New York 30 65
# 1 2023-01-01 Los Angeles 75 30
# 2 2023-01-02 New York 28 70
# 3 2023-01-02 Los Angeles 77 35
# pandas.pivot(...)
# - reshapes data by turning unique values from one column into columns
# - creates a table where rows and columns represent specific categories, and the values are aggregated
pivoted = data.pivot(index="Date", columns="City", values=["Temperature", "Humidity"])
# Temperature Humidity
# City Los Angeles New York Los Angeles New York
# Date
# 2023-01-01 75 30 30 65
# 2023-01-02 77 28 35 70
pivoted.columns
# MultiIndex([('Temperature', 'Los Angeles'),
# ('Temperature', 'New York'),
# ( 'Humidity', 'Los Angeles'),
# ( 'Humidity', 'New York')],
# names=[None, 'City'])
pivoted_t = data.pivot(index="Date", columns="City", values="Temperature")
# City Los Angeles New York
# Date
# 2023-01-01 75 30
# 2023-01-02 77 28
# pandas.stack(...)
# - converts columns into rows, creating a hierarchical index
# - compacts a DataFrame by stacking columns into a single column
stacked = pivoted_t.stack()
# Date City
# 2023-01-01 Los Angeles 75
# New York 30
# 2023-01-02 Los Angeles 77
# New York 28
# dtype: int64
stacked.index
# MultiIndex([('2023-01-01', 'Los Angeles'),
# ('2023-01-01', 'New York'),
# ('2023-01-02', 'Los Angeles'),
# ('2023-01-02', 'New York')],
# names=['Date', 'City'])
# pandas.unstack(...)
# - converts rows into columns, the reverse of stack
# - expands a DataFrame by unstacking a hierarchical index into columns
stacked.unstack()
# City Los Angeles New York
# Date
# 2023-01-01 75 30
# 2023-01-02 77 28
4.1.2. Reordering and Sorting Levels
df.swaplevel("key1", "key2")
# state Ohio Colorado
# color Green Red Green
# key2 key1
# 1 a 0 1 2
# 2 a 3 4 5
# 1 b 6 7 8
# 2 b 9 10 11
data.sort_index(level=1)
# state Ohio Colorado
# color Green Red Green
# key1 key2
# a 1 0 1 2
# b 1 6 7 8
# a 2 3 4 5
# b 2 9 10 11
data.swaplevel(0, 1)
# state Ohio Colorado
# color Green Red Green
# key2 key1
# 1 a 0 1 2
# 2 a 3 4 5
# 1 b 6 7 8
# 2 b 9 10 11
data.swaplevel(0, 1).sort_index(level=0)
# state Ohio Colorado
# color Green Red Green
# key2 key1
# 1 a 0 1 2
# b 6 7 8
# 2 a 3 4 5
# b 9 10 11
4.1.3. Summary Statistics by Level
data.groupby(level="key2").sum()
# state Ohio Colorado
# color Green Red Green
# key2
# 1 6 8 10
# 2 12 14 16
data.T.groupby(level="color").sum().T
# color Green Red
# key1 key2
# a 1 2 1
# 2 8 4
# b 1 14 7
# 2 20 10
4.1.4. Indexing with a DataFrame’s columns
data = pd.DataFrame(
{
"a": range(7),
"b": range(7, 0, -1),
"c": ["one", "one", "one", "two", "two", "two", "two"],
"d": [0, 1, 2, 0, 1, 2, 3],
}
)
# a b c d
# 0 0 7 one 0
# 1 1 6 one 1
# 2 2 5 one 2
# 3 3 4 two 0
# 4 4 3 two 1
# 5 5 2 two 2
# 6 6 1 two 3
data2 = data.set_index(["c", "d"]) # set a DataFrame index using existing columns
# a b
# c d
# one 0 0 7
# 1 1 6
# 2 2 5
# two 0 3 4
# 1 4 3
# 2 5 2
# 3 6 1
data2.reset_index() # reset the hierarchical index levels into the columns
#      c  d  a  b
# 0  one  0  0  7
# 1  one  1  1  6
# 2  one  2  2  5
# 3  two  0  3  4
# 4  two  1  4  3
# 5  two  2  5  2
# 6  two  3  6  1
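By default set_index drops the chosen columns; drop=False keeps them as regular columns too (a one-line sketch on the same frame):
data.set_index(["c", "d"], drop=False) # "c" and "d" become the index and remain as columns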
4.2. Combining and Merging Datasets
4.2.1. Database-Style DataFrame Joins
left = pd.DataFrame(
{
"key1": ["foo", "foo", "bar"],
"key2": ["one", "two", "one"],
"lval": pd.Series([1, 2, 3], dtype="Int64"),
}
)
# key1 key2 lval
# 0 foo one 1
# 1 foo two 2
# 2 bar one 3
right = pd.DataFrame(
{
"key1": ["foo", "foo", "bar", "bar"],
"key2": ["one", "one", "one", "two"],
"rval": pd.Series([4, 5, 6, 7], dtype="Int64"),
}
)
# key1 key2 rval
# 0 foo one 4
# 1 foo one 5
# 2 bar one 6
# 3 bar two 7
pd.merge(left, right) # joins on the overlapping columns by default; same as pd.merge(left, right, on=["key1", "key2"])
# key1 key2 lval rval
# 0 foo one 1 4
# 1 foo one 1 5
# 2 bar one 3 6
pd.merge(left, right, left_on=["key1", "key2"], right_on=["key1", "key2"]) # specify the left and right keys separately
#   key1 key2  lval  rval
# 0  foo  one     1     4
# 1  foo  one     1     5
# 2  bar  one     3     6
pd.merge(left, right, on=["key1", "key2"], how="inner") # inner join (by default)
# key1 key2 lval rval
# 0 foo one 1 4
# 1 foo one 1 5
# 2 bar one 3 6
pd.merge(left, right, on=["key1", "key2"], how="left") # left join
# key1 key2 lval rval
# 0 foo one 1 4
# 1 foo one 1 5
# 2 foo two 2 <NA>
# 3 bar one 3 6
pd.merge(left, right, on=["key1", "key2"], how="right") # right join
# key1 key2 lval rval
# 0 foo one 1 4
# 1 foo one 1 5
# 2 bar one 3 6
# 3 bar two <NA> 7
pd.merge(left, right, on=["key1", "key2"], how="outer") # outer join
# key1 key2 lval rval
# 0 bar one 3 6
# 1 bar two <NA> 7
# 2 foo one 1 4
# 3 foo one 1 5
# 4 foo two 2 <NA>
pd.merge(left, right, on=["key1", "key2"], how="outer", indicator=True) # source indicator
# key1 key2 lval rval _merge
# 0 bar one 3 6 both
# 1 bar two <NA> 7 right_only
# 2 foo one 1 4 both
# 3 foo one 1 5 both
# 4 foo two 2 <NA> left_only
pd.merge(left, right, on="key1") # specify a single join key
# key1 key2_x lval key2_y rval
# 0 foo one 1 one 4
# 1 foo one 1 one 5
# 2 foo two 2 one 4
# 3 foo two 2 one 5
# 4 bar one 3 one 6
# 5 bar one 3 two 7
pd.merge(left, right, on="key1", suffixes=("_left", "_right")) # sepcify the suffixes for the overlap columns
# key1 key2_left lval key2_right rval
# 0 foo one 1 one 4
# 1 foo one 1 one 5
# 2 foo two 2 one 4
# 3 foo two 2 one 5
# 4 bar one 3 one 6
# 5 bar one 3 two 7
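Two further merge options (a hedged aside using the same left/right frames): how="cross" builds the Cartesian product of the rows, and validate= asserts the expected key cardinality.
pd.merge(left, right, how="cross") # every row of left paired with every row of right
pd.merge(left, right, on=["key1", "key2"], validate="one_to_many") # raises MergeError if the left keys are not unique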
4.2.2. Merging on Index
left1 = pd.DataFrame(
{"key": ["a", "b", "a", "a", "b", "c"], "value": pd.Series(range(6), dtype="Int64")}
)
# key value
# 0 a 0
# 1 b 1
# 2 a 2
# 3 a 3
# 4 b 4
# 5 c 5
right1 = pd.DataFrame({"group_val": [3.5, 7]}, index=["a", "b"])
# group_val
# a 3.5
# b 7.0
pd.merge(left1, right1, left_on="key", right_index=True) # merge on index
# key value group_val
# 0 a 0 3.5
# 1 b 1 7.0
# 2 a 2 3.5
# 3 a 3 3.5
# 4 b 4 7.0
pd.merge(right1, left1, left_index=True, right_on="key", how="outer")
# group_val key value
# 0 3.5 a 0
# 2 3.5 a 2
# 3 3.5 a 3
# 1 7.0 b 1
# 4 7.0 b 4
# 5 NaN c 5
left2 = pd.DataFrame(
[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
index=["a", "c", "e"],
columns=["Ohio", "Nevada"],
).astype("Int64")
right2 = pd.DataFrame(
[[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [13, 14]],
index=["b", "c", "d", "e"],
columns=["Missouri", "Alabama"],
).astype("Int64")
pd.merge(left2, right2, left_index=True, right_index=True, how="outer") # use both sides indexes
# Ohio Nevada Missouri Alabama
# a 1 2 <NA> <NA>
# b <NA> <NA> 7 8
# c 3 4 9 10
# d <NA> <NA> 11 12
# e 5 6 13 14
left2.join(right2, how="outer") # join: a shortcut for merging by index
# Ohio Nevada Missouri Alabama
# a 1 2 <NA> <NA>
# b <NA> <NA> 7 8
# c 3 4 9 10
# d <NA> <NA> 11 12
# e 5 6 13 14
another = pd.DataFrame(
[[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [16.0, 17.0]],
index=["a", "c", "e", "f"],
columns=["New York", "Oregon"],
)
# New York Oregon
# a 7.0 8.0
# c 9.0 10.0
# e 11.0 12.0
# f 16.0 17.0
left2.join([right2, another])
# Ohio Nevada Missouri Alabama New York Oregon
# a 1 2 <NA> <NA> 7.0 8.0
# c 3 4 9 10 9.0 10.0
# e 5 6 13 14 11.0 12.0
lefth = pd.DataFrame(
{
"key1": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada"],
"key2": [2000, 2001, 2002, 2001, 2002],
"data": pd.Series(range(5), dtype="Int64"),
}
)
# key1 key2 data
# 0 Ohio 2000 0
# 1 Ohio 2001 1
# 2 Ohio 2002 2
# 3 Nevada 2001 3
# 4 Nevada 2002 4
righth_index = pd.MultiIndex.from_arrays(
[
["Nevada", "Nevada", "Ohio", "Ohio", "Ohio", "Ohio"],
[2001, 2000, 2000, 2000, 2001, 2002],
]
)
righth = pd.DataFrame(
{
"event1": pd.Series([0, 2, 4, 6, 8, 10], dtype="Int64", index=righth_index),
"event2": pd.Series([1, 3, 5, 7, 9, 11], dtype="Int64", index=righth_index),
}
)
# event1 event2
# Nevada 2001 0 1
# 2000 2 3
# Ohio 2000 4 5
# 2000 6 7
# 2001 8 9
# 2002 10 11
pd.merge(lefth, righth, left_on=["key1", "key2"], right_index=True) # multiple-key merge
#      key1  key2  data  event1  event2
# 0    Ohio  2000     0       4       5
# 0    Ohio  2000     0       6       7
# 1    Ohio  2001     1       8       9
# 2    Ohio  2002     2      10      11
# 3  Nevada  2001     3       0       1
4.2.3. Concatenating Along an Axis
# numpy.concatenate(...)
arr = np.arange(12).reshape((3, 4))
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11]])
np.concat([arr, arr], axis=1)
# array([[ 0, 1, 2, 3, 0, 1, 2, 3],
# [ 4, 5, 6, 7, 4, 5, 6, 7],
# [ 8, 9, 10, 11, 8, 9, 10, 11]])
# pandas.concat(...)
s1 = pd.Series([0, 1], index=["a", "b"], dtype="Int64")
# a 0
# b 1
# dtype: Int64
s2 = pd.Series([2, 3, 4], index=["c", "d", "e"], dtype="Int64")
# c 2
# d 3
# e 4
# dtype: Int64
s3 = pd.Series([5, 6], index=["f", "g"], dtype="Int64")
# f 5
# g 6
# dtype: Int64
pd.concat([s1, s2, s3]) # stacking (union all) on columns
# a 0
# b 1
# c 2
# d 3
# e 4
# f 5
# g 6
# dtype: Int64
pd.concat([s1, s2, s3], axis="columns") # union (outer join) of the indexes
# 0 1 2
# a 0 <NA> <NA>
# b 1 <NA> <NA>
# c <NA> 2 <NA>
# d <NA> 3 <NA>
# e <NA> 4 <NA>
# f <NA> <NA> 5
# g <NA> <NA> 6
s4 = pd.concat([s1, s3])
# a 0
# b 1
# f 5
# g 6
# dtype: Int64
pd.concat([s1, s4], axis="columns")
# 0 1
# a 0 0
# b 1 1
# f <NA> 5
# g <NA> 6
pd.concat([s1, s4], axis="columns", join="inner")
# 0 1
# a 0 0
# b 1 1
# use the keys to indicate the concatenated pieces
result = pd.concat([s1, s1, s3], keys=["one", "two", "three"])
# one a 0
# b 1
# two a 0
# b 1
# three f 5
# g 6
# dtype: Int64
result.unstack()
# a b f g
# one 0 1 <NA> <NA>
# two 0 1 <NA> <NA>
# three <NA> <NA> 5 6
pd.concat([s1, s2, s3], axis="columns", keys=["one", "two", "three"])
# one two three
# a 0 <NA> <NA>
# b 1 <NA> <NA>
# c <NA> 2 <NA>
# d <NA> 3 <NA>
# e <NA> 4 <NA>
# f <NA> <NA> 5
# g <NA> <NA> 6
df1 = pd.DataFrame(
np.arange(6).reshape(3, 2), index=["a", "b", "c"], columns=["one", "two"]
)
# one two
# a 0 1
# b 2 3
# c 4 5
df2 = pd.DataFrame(
5 + np.arange(4).reshape(2, 2), index=["a", "c"], columns=["three", "four"]
)
# three four
# a 5 6
# c 7 8
pd.concat([df1, df2])
# one two three four
# a 0.0 1.0 NaN NaN
# b 2.0 3.0 NaN NaN
# c 4.0 5.0 NaN NaN
# a NaN NaN 5.0 6.0
# c NaN NaN 7.0 8.0
pd.concat([df1, df2], axis="columns")
# one two three four
# a 0 1 5.0 6.0
# b 2 3 NaN NaN
# c 4 5 7.0 8.0
pd.concat([df1, df2], axis="columns", keys=["level1", "level2"])
# level1 level2
# one two three four
# a 0 1 5.0 6.0
# b 2 3 NaN NaN
# c 4 5 7.0 8.0
pd.concat({"level1": df1, "level2": df2}, axis="columns")
# level1 level2
# one two three four
# a 0 1 5.0 6.0
# b 2 3 NaN NaN
# c 4 5 7.0 8.0
pd.concat([df1, df2], axis="columns", keys=["level1", "level2"], names=["upper", "lower"])
# upper level1 level2
# lower one two three four
# a 0 1 5.0 6.0
# b 2 3 NaN NaN
# c 4 5 7.0 8.0
# do not preserve the indexes along the concatenation axis; instead produce a new range(total_length) index
df1 = pd.DataFrame(np.random.standard_normal((3, 4)), columns=["a", "b", "c", "d"])
# a b c d
# 0 2.410815 -0.872558 1.215403 1.321002
# 1 -0.046726 0.088367 0.255645 -1.442833
# 2 0.001280 0.561726 -0.992764 -0.830453
df2 = pd.DataFrame(np.random.standard_normal((2, 3)), columns=["b", "d", "a"])
# b d a
# 0 -1.281585 -1.052020 0.925319
# 1 -0.711210 -0.424939 -0.619031
pd.concat([df1, df2], ignore_index=True)
# a b c d
# 0 2.410815 -0.872558 1.215403 1.321002
# 1 -0.046726 0.088367 0.255645 -1.442833
# 2 0.001280 0.561726 -0.992764 -0.830453
# 3 0.925319 -1.281585 NaN -1.052020
# 4 -0.619031 -0.711210 NaN -0.424939
4.2.4. Combining Data with Overlap
a = pd.Series(
[np.nan, 2.5, 0.0, 3.5, 4.5, np.nan], index=["f", "e", "d", "c", "b", "a"]
)
# f NaN
# e 2.5
# d 0.0
# c 3.5
# b 4.5
# a NaN
# dtype: float64
b = pd.Series(
[0.0, np.nan, 2.0, np.nan, np.nan, 5.0], index=["a", "b", "c", "d", "e", "f"]
)
# a 0.0
# b NaN
# c 2.0
# d NaN
# e NaN
# f 5.0
# dtype: float64
np.where(pd.isna(a), b, a) # does not align on the index labels
# array([0. , 2.5, 0. , 3.5, 4.5, 5. ])
a.combine_first(b) # line up values by index
# a 0.0
# b 4.5
# c 3.5
# d 0.0
# e 2.5
# f 5.0
# dtype: float64
df1 = pd.DataFrame(
{
"a": [1.0, np.nan, 5.0, np.nan],
"b": [np.nan, 2.0, np.nan, 6.0],
"c": range(2, 18, 4),
}
)
# a b c
# 0 1.0 NaN 2
# 1 NaN 2.0 6
# 2 5.0 NaN 10
# 3 NaN 6.0 14
df2 = pd.DataFrame(
{
"a": [5.0, 4.0, np.nan, 3.0, 7.0],
"b": [np.nan, 3.0, 4.0, 6.0, 8.0],
}
)
# a b
# 0 5.0 NaN
# 1 4.0 3.0
# 2 NaN 4.0
# 3 3.0 6.0
# 4 7.0 8.0
df1.combine_first(df2) # union of all the column names
# a b c
# 0 1.0 NaN 2.0
# 1 4.0 2.0 6.0
# 2 5.0 4.0 10.0
# 3 3.0 6.0 14.0
# 4 7.0 8.0 NaN
4.3. Reshaping and Pivoting
4.3.1. Reshaping with Hierarchical Indexing
data = pd.DataFrame(
np.arange(6).reshape((2, 3)),
index=pd.Index(["Ohio", "Colorado"], name="state"),
columns=pd.Index(["one", "two", "three"], name="number"),
)
# number one two three
# state
# Ohio 0 1 2
# Colorado 3 4 5
data.stack() # pivot the columns into the rows (a new Series)
# state number
# Ohio one 0
# two 1
# three 2
# Colorado one 3
# two 4
# three 5
# dtype: int64
data.unstack() # pivot the rows into the columns (a new Series)
# number state
# one Ohio 0
# Colorado 3
# two Ohio 1
# Colorado 4
# three Ohio 2
# Colorado 5
# dtype: int64
data.stack().unstack() # stacking then unstacking restores the original layout
# number one two three
# state
# Ohio 0 1 2
# Colorado 3 4 5
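Both stack and unstack accept a level number or name; by default the innermost level is moved (a short sketch on the frame above):
data.stack().unstack(level="state") # move the "state" level into the columns instead of the innermost "number" level
data.stack().unstack(level=0) # equivalent, selecting the level by position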
4.3.2. Pivoting Long to Wide Format
A common way to store multiple time series in databases and CSV files is what is sometimes called long or stacked format, in which each row represents a single observation or value, with columns for identifiers (e.g., time, category) and the corresponding value.
raw_data = pd.read_csv("https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/examples/macrodata.csv")
raw_data.head()
# year quarter realgdp realcons realinv ... tbilrate unemp pop infl realint
# 0 1959 1 2710.349 1707.4 286.898 ... 2.82 5.8 177.146 0.00 0.00
# 1 1959 2 2778.801 1733.7 310.859 ... 3.08 5.1 177.830 2.34 0.74
# 2 1959 3 2775.488 1751.8 289.226 ... 3.82 5.3 178.657 2.74 1.09
# 3 1959 4 2785.204 1753.7 299.356 ... 4.33 5.6 179.386 0.27 4.06
# 4 1960 1 2847.699 1770.5 331.722 ... 3.50 5.2 180.007 2.31 1.19
#
# [5 rows x 14 columns]
data = raw_data.loc[:, ["year", "quarter", "realgdp", "infl", "unemp"]]
data.head()
# year quarter realgdp infl unemp
# 0 1959 1 2710.349 0.00 5.8
# 1 1959 2 2778.801 2.34 5.1
# 2 1959 3 2775.488 2.74 5.3
# 3 1959 4 2785.204 0.27 5.6
# 4 1960 1 2847.699 2.31 5.2
periods = pd.PeriodIndex.from_fields(
year=data.pop("year"),
quarter=data.pop("quarter")
)
periods.name = "date"
# PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
# '1960Q3', '1960Q4', '1961Q1', '1961Q2',
# ...
# '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
# '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
# dtype='period[Q-DEC]', name='date', length=203)
data.index = periods.to_timestamp("D")
data.head()
# realgdp infl unemp
# date
# 1959-01-01 2710.349 0.00 5.8
# 1959-04-01 2778.801 2.34 5.1
# 1959-07-01 2775.488 2.74 5.3
# 1959-10-01 2785.204 0.27 5.6
# 1960-01-01 2847.699 2.31 5.2
data.columns.name = "item"
data.head()
# item realgdp infl unemp
# date
# 1959-01-01 2710.349 0.00 5.8
# 1959-04-01 2778.801 2.34 5.1
# 1959-07-01 2775.488 2.74 5.3
# 1959-10-01 2785.204 0.27 5.6
# 1960-01-01 2847.699 2.31 5.2
ldata = data.stack().reset_index().rename(columns={0: "value"}) # long format
ldata.head()
# date item value
# 0 1959-01-01 realgdp 2710.349
# 1 1959-01-01 infl 0.000
# 2 1959-01-01 unemp 5.800
# 3 1959-04-01 realgdp 2778.801
# 4 1959-04-01 infl 2.340
pivoted = ldata.pivot(index="date", columns="item", values="value")
pivoted.head()
# item infl realgdp unemp
# date
# 1959-01-01 0.00 2710.349 5.8
# 1959-04-01 2.34 2778.801 5.1
# 1959-07-01 2.74 2775.488 5.3
# 1959-10-01 0.27 2785.204 5.6
# 1960-01-01 2.31 2847.699 5.2
ldata["value2"] = np.random.standard_normal(len(ldata))
pivoted = ldata.pivot(index="date", columns="item") # multiple value columns produce hierarchical columns
pivoted.head()
# value value2
# item infl realgdp unemp infl realgdp unemp
# date
# 1959-01-01 0.00 2710.349 5.8 -0.971766 -0.561886 0.007484
# 1959-04-01 2.34 2778.801 5.1 0.649848 0.664631 0.635405
# 1959-07-01 2.74 2775.488 5.3 -0.828290 0.528560 -1.424809
# 1959-10-01 0.27 2785.204 5.6 -1.433866 -0.062887 0.395601
# 1960-01-01 2.31 2847.699 5.2 0.074323 -0.682785 -0.587253
pivoted["value"].head()
# item infl realgdp unemp
# date
# 1959-01-01 0.00 2710.349 5.8
# 1959-04-01 2.34 2778.801 5.1
# 1959-07-01 2.74 2775.488 5.3
# 1959-10-01 0.27 2785.204 5.6
# 1960-01-01 2.31 2847.699 5.2
unstacked = ldata.set_index(["date", "item"]).unstack(level="item") # equivalent: set_index and unstack
unstacked.head()
# value value2
# item infl realgdp unemp infl realgdp unemp
# date
# 1959-01-01 0.00 2710.349 5.8 -0.971766 -0.561886 0.007484
# 1959-04-01 2.34 2778.801 5.1 0.649848 0.664631 0.635405
# 1959-07-01 2.74 2775.488 5.3 -0.828290 0.528560 -1.424809
# 1959-10-01 0.27 2785.204 5.6 -1.433866 -0.062887 0.395601
# 1960-01-01 2.31 2847.699 5.2 0.074323 -0.682785 -0.587253
4.3.3. Pivoting Wide to Long Format
df = pd.DataFrame(
{
"key": ["foo", "bar", "baz"],
"A": [1, 2, 3],
"B": [4, 5, 6],
"C": [7, 8, 9],
}
)
# key A B C
# 0 foo 1 4 7
# 1 bar 2 5 8
# 2 baz 3 6 9
melted = pd.melt(df, id_vars="key") # use "key" as the group indicator
# key variable value
# 0 foo A 1
# 1 bar A 2
# 2 baz A 3
# 3 foo B 4
# 4 bar B 5
# 5 baz B 6
# 6 foo C 7
# 7 bar C 8
# 8 baz C 9
melted.pivot(index="key", columns="variable", values="value").reset_index() # reshape back
# variable key A B C
# 0 bar 2 5 8
# 1 baz 3 6 9
# 2 foo 1 4 7
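melt can also restrict the melted columns and rename the outputs via value_vars, var_name, and value_name (a small sketch on the same df):
pd.melt(df, id_vars="key", value_vars=["A", "B"], var_name="metric", value_name="score")
#   key metric  score
# 0 foo      A      1
# 1 bar      A      2
# 2 baz      A      3
# 3 foo      B      4
# 4 bar      B      5
# 5 baz      B      6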
5. Plotting and Visualization
conda install -c conda-forge -y jupyterlab && jupyter lab
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = np.arange(10) # array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
plt.plot(data);
plt.show()
matplotlib.pyplot maintains the concept of an active figure and axes.
- plt.figure() creates a new figure and makes it the active figure; an axes is created lazily (e.g., by plt.gca() or plt.plot()) and then becomes active.
- plt.gcf() returns the current active figure, or creates one if no figure exists.
- plt.gca() returns the current active axes, or creates one if no axes exists in the current figure.
- plt.plot(), plt.scatter(), etc., operate on the current active axes.
- plt.sca(ax) sets the current axes to the axes object ax; plt.axes(ax) could also be used, but it is deprecated for this purpose.
- plt.subplot() creates a new subplot and makes it the active axes.
- In interactive mode, plt.show() is not necessary to display plots: plt.ion() enables interactive mode, plt.ioff() disables it, and plt.isinteractive() returns True if interactive mode is enabled and False otherwise.
plt.get_fignums()   # []
fig, ax = plt.subplots()
fig.number          # 1
plt.get_fignums()   # [1]
fig == plt.gcf()    # True
ax is plt.gca()     # True
fig2 = plt.figure()
fig2.number         # 2
plt.get_fignums()   # [1, 2]
plt.gcf().number    # 2
fig2.axes           # []
ax2 = plt.gca()
ax2 == ax           # False
fig2.axes           # [<Axes: >]
fig3 = plt.figure(1)
fig3.number         # 1
fig3 == fig         # True
fig3 == plt.gcf()   # True
5.1. Figures and Subplots
# plots in matplotlib reside within a figure object
fig = plt.figure() # create a new figure
ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)
plt.get_fignums() # [1]
plt.figure(1) == fig # True
# get the current figure or create a new figure
cf = plt.gcf()
cf.number, cf == fig # (1, True)
# saving plots to file
fig.savefig("figpath.svg")
fig.savefig("figpath.png", dpi=400)
# show a specific figure
fig.show()
fig.show() # fine to call again
# show and erase the current figure
plt.show()
plt.show() # does nothing: the first call already displayed and released the figures
plt.get_fignums() # []
plt.gcf() # <Figure size 640x480 with 0 Axes>
ax3.plot(np.random.standard_normal(50).cumsum(), color="black", linestyle="dashed");
ax1.hist(np.random.standard_normal(100), bins=20, color="black", alpha=0.3);
ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.standard_normal(30));
# plt.subplots(...): a more convenient way to create a grid of subplots
fig, axes = plt.subplots(2, 3, sharex=True, sharey=True)
axes
# array([[<Axes: >, <Axes: >, <Axes: >],
# [<Axes: >, <Axes: >, <Axes: >]], dtype=object)
fig.show()
# adjusting the spacing around subplots
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
for i in range(2):
for j in range(2):
axes[i, j].hist(np.random.standard_normal(500), bins=50, color="black", alpha=0.5)
fig.subplots_adjust(wspace=0, hspace=0)
5.2. Colors, Markers, and Line Styles
fig = plt.figure()
ax = fig.add_subplot()
ax.plot(np.random.standard_normal(30).cumsum(), color="red", linestyle="--", marker="o") # markers highlight the data points
fig.show()
fig = plt.figure()
ax = fig.add_subplot()
data = np.random.standard_normal(30).cumsum()
ax.plot(data, color="#FF69B4", linestyle="dashed", label="Default") # pink
ax.plot(data, color="#33CCFF", linestyle="dotted", drawstyle="steps-post", label="steps-post") # sky blue
ax.legend()
fig.show()
5.3. Ticks, Labels, and Legends
fig, ax = plt.subplots()
ax.plot(np.random.default_rng(seed=12345).standard_normal(1_000).cumsum());
fig.show()
fig, ax = plt.subplots()
ax.plot(np.random.default_rng(seed=12345).standard_normal(1_000).cumsum());
ticks = ax.set_xticks([0, 250, 500, 750, 1000])
labels = ax.set_xticklabels(["one", "two", "three", "four", "five"], rotation=30, fontsize=8)
ax.set_xlabel("Stages");
ax.set_title("My first matplotlib plot")
# ax.set(title="My first matplotlib plot", xlabel="Stages")
fig, ax = plt.subplots()
ax.plot(np.random.randn(1000).cumsum(), color="black", label="one")
ax.plot(np.random.randn(1000).cumsum(), color="black", linestyle="dashed", label="two")
ax.plot(np.random.randn(1000).cumsum(), color="black", linestyle="dotted", label="three")
ax.legend()
5.4. Annotations and Drawing on a Subplot
fig, ax = plt.subplots()
data = pd.read_csv(
"https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/examples/spx.csv",
index_col=0,
parse_dates=True,
)
spx = data["SPX"]
spx.plot(ax=ax, color="black")
crisis_data = [
    (datetime(2007, 10, 11), "Peak of bull market"), # from datetime import datetime
(datetime(2008, 3, 12), "Bear Stearns Fails"),
(datetime(2008, 9, 15), "Lehman Bankruptcy"),
]
for date, label in crisis_data:
ax.annotate(
label,
xy=(date, spx.asof(date) + 75),
xytext=(date, spx.asof(date) + 225),
arrowprops=dict(facecolor="black", headwidth=4, width=2, headlength=4),
horizontalalignment="left",
verticalalignment="top",
)
# Zoom in on 2007-2010
ax.set_xlim(["1/1/2007", "1/1/2011"])
ax.set_ylim([600, 1800])
ax.set_title("Important dates in the 2008-2009 financial crisis")
fig, ax = plt.subplots()
rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color="black", alpha=0.3)
circ = plt.Circle((0.7, 0.2), 0.15, color="blue", alpha=0.3)
pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]], color="green", alpha=0.5)
ax.add_patch(rect)
ax.add_patch(circ)
ax.add_patch(pgon)
5.5. Plotting with pandas and seaborn
Series and DataFrame have a plot attribute for making some basic plot types.
5.5.1. Line and Bar Plots
- Simple Series plot
s = pd.Series(np.random.standard_normal(10).cumsum(), index=np.arange(0, 100, 10))
s.plot() # equivalent: s.plot.line() and s.plot(kind="line")
Figure 13. Simple Series plot
- Simple DataFrame plot
df = pd.DataFrame(
    np.random.standard_normal((10, 4)).cumsum(0),
    columns=["A", "B", "C", "D"],
    index=np.arange(0, 100, 10),
)
plt.style.use("grayscale")
df.plot()
Figure 14. Simple DataFrame plot
- Horizontal and vertical bar plot
fig, axes = plt.subplots(2, 1)
s = pd.Series(np.random.default_rng(seed=12345).uniform(size=16), index=list("abcdefghijklmnop"))
s.plot.bar(ax=axes[0], color="black", alpha=0.7)
s.plot.barh(ax=axes[1], color="black", alpha=0.7)
Figure 15. Horizontal and vertical bar plot
- DataFrame bar plot
df = pd.DataFrame(
    np.random.default_rng(seed=12345).uniform(size=(6, 4)),
    index=["one", "two", "three", "four", "five", "six"],
    columns=pd.Index(["A", "B", "C", "D"], name="Genus"),
)
df.plot.bar()
Figure 16. DataFrame bar plot
df.plot.barh(stacked=True, alpha=0.5) # create stacked bar plots
Figure 17. DataFrame stacked bar plot
tips = pd.read_csv("https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/examples/tips.csv")
tips.head()
#    total_bill   tip smoker  day    time  size
# 0       16.99  1.01     No  Sun  Dinner     2
# 1       10.34  1.66     No  Sun  Dinner     3
# 2       21.01  3.50     No  Sun  Dinner     3
# 3       23.68  3.31     No  Sun  Dinner     2
# 4       24.59  3.61     No  Sun  Dinner     4
party_counts = pd.crosstab(tips["day"], tips["size"]) # frequency table
# size  1   2   3   4  5  6
# day
# Fri   1  16   1   1  0  0
# Sat   2  53  18  13  1  0
# Sun   0  39  15  18  3  1
# Thur  1  48   4   5  1  3
party_counts = party_counts.reindex(index=["Thur", "Fri", "Sat", "Sun"])
# size  1   2   3   4  5  6
# day
# Thur  1  48   4   5  1  3
# Fri   1  16   1   1  0  0
# Sat   2  53  18  13  1  0
# Sun   0  39  15  18  3  1
party_counts = party_counts.loc[:, 2:5] # remove one- and six-person parties
# size   2   3   4  5
# day
# Thur  48   4   5  1
# Fri   16   1   1  0
# Sat   53  18  13  1
# Sun   39  15  18  3
party_pcts = party_counts.div(party_counts.sum(axis="columns"), axis="index") # normalize to sum to 1
# size         2         3         4         5
# day
# Thur  0.827586  0.068966  0.086207  0.017241
# Fri   0.888889  0.055556  0.055556  0.000000
# Sat   0.623529  0.211765  0.152941  0.011765
# Sun   0.520000  0.200000  0.240000  0.040000
party_pcts.plot.bar(stacked=True)
Figure 18. Fraction of parties by size within each day
import seaborn as sns # conda install seaborn
sns.set_style("whitegrid")
sns.set_palette("Greys_r") # set a greyscale color palette for a black-and-white print medium
tips["tip_pct"] = tips["tip"] / (tips["total_bill"] - tips["tip"])
tips.head()
#    total_bill   tip smoker  day    time  size   tip_pct
# 0       16.99  1.01     No  Sun  Dinner     2  0.063204
# 1       10.34  1.66     No  Sun  Dinner     3  0.191244
# 2       21.01  3.50     No  Sun  Dinner     3  0.199886
# 3       23.68  3.31     No  Sun  Dinner     2  0.162494
# 4       24.59  3.61     No  Sun  Dinner     4  0.172069
sns.barplot(x="tip_pct", y="day", data=tips, orient="h")
Figure 19. Tipping (average) percentage by day with error bars
5.5.2. Histograms and Density Plots
- A histogram is a kind of bar plot that gives a discretized display of value frequency: the data points are split into discrete, evenly spaced bins, and the number of data points in each bin is plotted.
tips["tip_pct"].plot.hist(bins=50)
Figure 20. Histogram of tip percentages
- A density plot is formed by approximating a continuous probability distribution that generated the observed data as a mixture of simpler distributions, often referred to as a kernel density estimate (KDE).
tips["tip_pct"].plot.density() # SciPy: conda install scipy
Figure 21. Density plot of tip percentages
5.5.3. Scatter or Point Plots
macro = pd.read_csv("https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/examples/macrodata.csv")
data = macro[["cpi", "m1", "tbilrate", "unemp"]]
trans_data = np.log(data).diff().dropna()
trans_data.tail()
# cpi m1 tbilrate unemp
# 198 -0.007904 0.045361 -0.396881 0.105361
# 199 -0.021979 0.066753 -2.277267 0.139762
# 200 0.002340 0.010286 0.606136 0.160343
# 201 0.008419 0.037461 -0.200671 0.127339
# 202 0.008894 0.012202 -0.405465 0.042560
- Make a scatter plot and fit a linear regression line
ax = sns.regplot(x="m1", y="unemp", data=trans_data)
ax.set_title("Changes in log(m1) versus log(unemp)")
Figure 22. A seaborn regression/scatter plot
- Make a pairs plot or scatter plot matrix
sns.pairplot(trans_data, diag_kind="kde", plot_kws={"alpha": 0.2})
Figure 23. Pair plot matrix of statsmodels macro data
5.5.4. Facet Grids and Categorical Data
- Tipping percentage by day/time/smoker
sns.catplot(x="day", y="tip_pct", hue="time", col="smoker", kind="bar", data=tips[tips.tip_pct < 1])
Figure 24. Tipping percentage by day/time/smoker
- Tipping percentage by day split by time/smoker
sns.catplot(x="day", y="tip_pct", row="time", col="smoker", kind="bar", data=tips[tips.tip_pct < 1])
- Box plot of tipping percentage by day
sns.catplot(x="tip_pct", y="day", kind="box", data=tips[tips.tip_pct < 0.5])
Figure 25. Box plot of tipping percentage by day
6. Data Aggregation and Group Operations
6.1. Group Operations
df = pd.DataFrame(
{
"key1": pd.Series(["a", "a", None, "b", "b", "a", None], dtype="string"),
"key2": pd.Series([1, 2, 1, 2, 1, None, 1], dtype="Int64"),
"data1": np.random.standard_normal(7),
"data2": np.random.standard_normal(7),
}
)
# key1 key2 data1 data2
# 0 a 1 -0.570747 0.965378
# 1 a 2 -0.442446 0.433062
# 2 <NA> 1 0.156601 0.045637
# 3 b 2 1.307246 -2.110711
# 4 b 1 -0.914420 -0.069982
# 5 a <NA> -1.878161 -0.512519
# 6 <NA> 1 -0.637074 -1.144718
# 1. split into groups based on one or more keys
# 2. apply a function to each group, producing a new value
# 3. combine the results of all those function applications into a result object
grouped = df["data1"].groupby(df["key1"])
# <pandas.core.groupby.generic.SeriesGroupBy object at 0x7f5e0a36ef90>
grouped.apply(lambda x: x.mean())
# key1
# a -0.963785
# b 0.196413
# Name: data1, dtype: float64
grouped = df[["data1"]].groupby(df["key1"])
# <pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f5dd552a210>
grouped.mean()
# data1
# key1
# a -0.963785
# b 0.196413
grouped = df.groupby("key1")["data1"]
grouped.keys # 'key1'
grouped.groups # {'a': [0, 1, 5], 'b': [3, 4]}
grouped.get_group("a")
# 0 -0.570747
# 1 -0.442446
# 5 -1.878161
# Name: data1, dtype: float64
grouped.get_group("a").mean() # np.float64(-0.9637846666666666)
- any missing values in a group key are excluded from the result by default
df.groupby("key1").size()
# key1
# a    3
# b    2
# dtype: int64
df.groupby("key1", dropna=False).size()
# key1
# a       3
# b       2
# <NA>    2
# dtype: int64
- compute the number of non-null values in each group
df.groupby("key1").count()
#       key2  data1  data2
# key1
# a        2      3      3
# b        2      2      2
6.1.1. group by column names
df.groupby("key1").mean() # the numeric columns key1, data1, and data2 are all aggregated
# key2 data1 data2
# key1
# a 1.5 -0.963784 0.295307
# b 1.5 0.196413 -1.090347
df.groupby("key2").mean(numeric_only=True) # key1 is a nuisance column
# data1 data2
# key2
# 1 -0.49141 -0.050921
# 2 0.43240 -0.838825
df.groupby(["key1", "key2"]).mean()
# data1 data2
# key1 key2
# a 1 -0.570747 0.965378
# 2 -0.442446 0.433062
# b 1 -0.914420 -0.069982
# 2 1.307246 -2.110711
6.1.2. iterate over groups
for name, chunk in df["data1"].groupby(df["key1"]):
print(name)
print(chunk)
# a
# 0 -0.570747
# 1 -0.442446
# 5 -1.878161
# Name: data1, dtype: float64
# b
# 3 1.307246
# 4 -0.914420
# Name: data1, dtype: float64
for name, chunk in df.groupby(["key1", "key2"]):
print(name)
print(chunk)
# ('a', np.int64(1))
# key1 key2 data1 data2
# 0 a 1 -0.570747 0.965378
# ('a', np.int64(2))
# key1 key2 data1 data2
# 1 a 2 -0.442446 0.433062
# ('b', np.int64(1))
# key1 key2 data1 data2
# 4 b 1 -0.91442 -0.069982
# ('b', np.int64(2))
# key1 key2 data1 data2
# 3 b 2 1.307246 -2.110711
6.1.3. select a column or subset of columns
df.groupby("key1")["data1"] # df["data1"].groupby(df["key1"])
# <pandas.core.groupby.generic.SeriesGroupBy object at 0x7f5dd9540cd0>
df.groupby("key1")[["data1"]] # df[["data1"]].groupby(df["key1"])
# <pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f5dd979a690>
df.groupby(["key1", "key2"])["data1"].mean()
# key1 key2
# a 1 -0.570747
# 2 -0.442446
# b 1 -0.914420
# 2 1.307246
# Name: data1, dtype: float64
df.groupby("key2")[["data1", "data2"]].mean()
# data1 data2
# key2
# 1 -0.49141 -0.050921
# 2 0.43240 -0.838825
6.1.4. group with arrays, dictionaries, series, and functions
people = pd.DataFrame(
np.random.standard_normal((5, 5)),
columns=["a", "b", "c", "d", "e"],
index=["Joe", "Steve", "Wanda", "Jill", "Trey"],
)
people.iloc[2:3, [1, 2]] = np.nan # add a few NA values
# a b c d e
# Joe -0.722383 0.074867 -2.167530 0.779033 0.401936
# Steve 1.154339 -0.256692 0.382976 0.138536 0.057351
# Wanda -2.275050 NaN NaN -1.089663 -0.355903
# Jill -0.188314 0.327582 1.138911 -0.509901 0.388599
# Trey 0.736051 -1.300729 -1.214873 0.171754 -0.053030
colors = ["red", "red", "blue", "blue", "red"]
states = np.array(["OH", "CA", "CA", "OH", "OH"])
people.T.groupby(colors).max().T
# blue red
# Joe 0.779033 0.401936
# Steve 0.382976 1.154339
# Wanda -1.089663 -0.355903
# Jill 1.138911 0.388599
# Trey 0.171754 0.736051
people.T.groupby([colors, states]).sum().T
# blue red
# CA OH CA OH
# Joe -2.167530 0.779033 0.074867 -0.320447
# Steve 0.382976 0.138536 -0.256692 1.211690
# Wanda 0.000000 -1.089663 0.000000 -2.630953
# Jill 1.138911 -0.509901 0.327582 0.200285
# Trey -1.214873 0.171754 -1.300729 0.683021
dictionary = {"a": "red", "b": "red", "c": "blue", "d": "blue", "e": "red", "f": "orange"} # unused keys (f) are OK
people.T.groupby(dictionary).sum().T
# blue red
# Joe -1.388497 -0.245580
# Steve 0.521512 0.954998
# Wanda -1.089663 -2.630953
# Jill 0.629010 0.527867
# Trey -1.043119 -0.617708
series = pd.Series(dictionary)
people.T.groupby(series).count().T
# blue red
# Joe 2 3
# Steve 2 3
# Wanda 1 2
# Jill 2 3
# Trey 2 3
people.groupby(lambda x: x[0]).sum()
# a b c d e
# J -0.910697 0.402449 -1.028619 0.269132 0.790535
# S 1.154339 -0.256692 0.382976 0.138536 0.057351
# T 0.736051 -1.300729 -1.214873 0.171754 -0.053030
# W -2.275050 0.000000 0.000000 -1.089663 -0.355903
people.groupby(people.index.array.map(lambda x: x[0])).sum()
# a b c d e
# J -0.910697 0.402449 -1.028619 0.269132 0.790535
# S 1.154339 -0.256692 0.382976 0.138536 0.057351
# T 0.736051 -1.300729 -1.214873 0.171754 -0.053030
# W -2.275050 0.000000 0.000000 -1.089663 -0.355903
people.groupby(len).sum()
# a b c d e
# 3 -0.722383 0.074867 -2.167530 0.779033 0.401936
# 4 0.547737 -0.973147 -0.075962 -0.338147 0.335569
# 5 -1.120712 -0.256692 0.382976 -0.951127 -0.298552
people.groupby(people.index.array.map(len)).sum()
# a b c d e
# 3 -0.722383 0.074867 -2.167530 0.779033 0.401936
# 4 0.547737 -0.973147 -0.075962 -0.338147 0.335569
# 5 -1.120712 -0.256692 0.382976 -0.951127 -0.298552
people.groupby([lambda x: len(x), states]).sum()
# a b c d e
# 3 OH -0.722383 0.074867 -2.167530 0.779033 0.401936
# 4 OH 0.547737 -0.973147 -0.075962 -0.338147 0.335569
# 5 CA -1.120712 -0.256692 0.382976 -0.951127 -0.298552
6.1.5. group by index levels
columns = pd.MultiIndex.from_arrays(
[
["US", "US", "US", "JP", "JP"],
[1, 3, 5, 1, 3],
],
names=["cty", "tenor"],
)
df = pd.DataFrame(np.random.standard_normal((4, 5)), columns=columns)
# cty US JP
# tenor 1 3 5 1 3
# 0 -1.604618 0.129037 0.322549 -1.217651 -0.818239
# 1 1.198944 -2.472948 -0.637405 -0.116704 -1.252229
# 2 1.839182 0.339592 -0.580455 1.108270 0.694266
# 3 -2.000956 1.706998 -2.092790 0.139301 0.074481
df.T.groupby(level="cty").count().T
# cty JP US
# 0 2 3
# 1 2 3
# 2 2 3
# 3 2 3
6.2. Data Aggregation
Aggregations refer to any data transformation that produces scalar values from arrays.
# common optimized aggregations on GroupBy objects
any, all # return True if any (one or more values) or all non-NA values are “truthy”
count # number of non-NA values
cummin, cummax # cumulative minimum and maximum of non-NA values
cumsum # cumulative sum of non-NA values
cumprod # cumulative product of non-NA values
first, last # first and last non-NA values
mean # mean of non-NA values
median # arithmetic median of non-NA values
min, max # minimum and maximum of non-NA values
nth # retrieve value that would appear at position n with the data in sorted order
ohlc # compute four “open-high-low-close” statistics for time series-like data
prod # product of non-NA values
quantile # compute sample quantile
rank # ordinal ranks of non-NA values, like calling Series.rank
size # compute group sizes, returning result as a Series
sum # sum of non-NA values
std, var # sample standard deviation and variance
# key1 key2 data1 data2
# 0 a 1 -0.570747 0.965378
# 1 a 2 -0.442446 0.433062
# 2 <NA> 1 0.156601 0.045637
# 3 b 2 1.307246 -2.110711
# 4 b 1 -0.914420 -0.069982
# 5 a <NA> -1.878161 -0.512519
# 6 <NA> 1 -0.637074 -1.144718
grouped = df.groupby("key1")
grouped["data1"].nsmallest(2)
# key1
# a 5 -1.878161
# 0 -0.570747
# b 4 -0.914420
# 3 1.307246
# Name: data1, dtype: float64
grouped.agg(lambda x: x.max() - x.min()) # aggregate (alias agg): custom, non-optimized aggregation
# key2 data1 data2
# key1
# a 1 1.435715 1.477897
# b 1 2.221666 2.040729
grouped["data1"].describe() # work, but not aggregations
# count mean std min 25% 50% 75% max
# key1
# a 3.0 -0.963785 0.794467 -1.878161 -1.224454 -0.570747 -0.506597 -0.442446
# b 2.0 0.196413 1.570955 -0.914420 -0.359004 0.196413 0.751829 1.307246
tips = pd.read_csv("https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/examples/tips.csv")
tips.head()
# total_bill tip smoker day time size
# 0 16.99 1.01 No Sun Dinner 2
# 1 10.34 1.66 No Sun Dinner 3
# 2 21.01 3.50 No Sun Dinner 3
# 3 23.68 3.31 No Sun Dinner 2
# 4 24.59 3.61 No Sun Dinner 4
tips["tip_pct"] = tips["tip"] / tips["total_bill"]
tips.head()
# total_bill tip smoker day time size tip_pct
# 0 16.99 1.01 No Sun Dinner 2 0.059447
# 1 10.34 1.66 No Sun Dinner 3 0.160542
# 2 21.01 3.50 No Sun Dinner 3 0.166587
# 3 23.68 3.31 No Sun Dinner 2 0.139780
# 4 24.59 3.61 No Sun Dinner 4 0.146808
6.2.1. apply a list of functions or function names
grouped["tip_pct"].agg("mean")
# day smoker
# Fri No 0.151650
# Yes 0.174783
# Sat No 0.158048
# Yes 0.147906
# Sun No 0.160113
# Yes 0.187250
# Thur No 0.160298
# Yes 0.163863
# Name: tip_pct, dtype: float64
grouped["tip_pct"].agg(["mean", "std", lambda x: x.max() - x.min()])
# mean std <lambda_0>
# day smoker
# Fri No 0.151650 0.028123 0.067349
# Yes 0.174783 0.051293 0.159925
# Sat No 0.158048 0.039767 0.235193
# Yes 0.147906 0.061375 0.290095
# Sun No 0.160113 0.042347 0.193226
# Yes 0.187250 0.154134 0.644685
# Thur No 0.160298 0.038774 0.193350
# Yes 0.163863 0.039389 0.151240
6.2.2. apply a list of (name, function) tuples as the column names
grouped["tip_pct"].agg([("average", "mean"), ("stdev", np.std), ("peak", lambda x: x.max() - x.min())])
# average stdev peak
# day smoker
# Fri No 0.151650 0.028123 0.067349
# Yes 0.174783 0.051293 0.159925
# Sat No 0.158048 0.039767 0.235193
# Yes 0.147906 0.061375 0.290095
# Sun No 0.160113 0.042347 0.193226
# Yes 0.187250 0.154134 0.644685
# Thur No 0.160298 0.038774 0.193350
# Yes 0.163863 0.039389 0.151240
6.2.3. apply a list of functions to all of the columns or different functions per column
grouped[["tip_pct", "total_bill"]].agg(["count", "min", "max"])
# tip_pct total_bill
# count min max count min max
# day smoker
# Fri No 4 0.120385 0.187735 4 12.46 22.75
# Yes 15 0.103555 0.263480 15 5.75 40.17
# Sat No 45 0.056797 0.291990 45 7.25 48.33
# Yes 42 0.035638 0.325733 42 3.07 50.81
# Sun No 57 0.059447 0.252672 57 8.77 48.17
# Yes 19 0.065660 0.710345 19 7.25 45.35
# Thur No 45 0.072961 0.266312 45 7.51 41.19
# Yes 17 0.090014 0.241255 17 10.34 43.11
grouped.agg({"tip": lambda x: x.max(), "size": "sum"})
# tip size
# day smoker
# Fri No 3.50 9
# Yes 4.73 31
# Sat No 9.00 115
# Yes 10.00 104
# Sun No 6.00 167
# Yes 6.50 49
# Thur No 6.70 112
# Yes 5.00 40
6.2.4. apply a sql-style aggregation without row indexes
tips.groupby(["day", "smoker"], as_index=False).mean(numeric_only=True) # SQL style: as_index=False
# day smoker total_bill tip size tip_pct
# 0 Fri No 18.420000 2.812500 2.250000 0.151650
# 1 Fri Yes 16.813333 2.714000 2.066667 0.174783
# 2 Sat No 19.661778 3.102889 2.555556 0.158048
# 3 Sat Yes 21.276667 2.875476 2.476190 0.147906
# 4 Sun No 20.506667 3.167895 2.929825 0.160113
# 5 Sun Yes 24.120000 3.516842 2.578947 0.187250
# 6 Thur No 17.113111 2.673778 2.488889 0.160298
# 7 Thur Yes 19.190588 3.030000 2.352941 0.163863
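An alternative worth noting (a small sketch, not from the example above): named aggregation with new_column=(column, function) keyword pairs avoids hierarchical result columns.
grouped.agg(avg_tip_pct=("tip_pct", "mean"), max_bill=("total_bill", "max"))
#              avg_tip_pct  max_bill
# day  smoker
# Fri  No         0.151650     22.75
# Fri  Yes        0.174783     40.17
# ...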
6.3. Apply: General split-apply-combine
- apply splits the object being manipulated into pieces, invokes the passed function on each piece, and then attempts to concatenate the pieces
top = lambda df, n=5, column="tip_pct": df.sort_values(column, ascending=False)[:n]
top(tips, n=6)
#      total_bill   tip smoker  day    time  size   tip_pct
# 172        7.25  5.15    Yes  Sun  Dinner     2  2.452381
# 178        9.60  4.00    Yes  Sun  Dinner     2  0.714286
# 67         3.07  1.00    Yes  Sat  Dinner     1  0.483092
# 232       11.61  3.39     No  Sat  Dinner     2  0.412409
# 183       23.17  6.50    Yes  Sun  Dinner     4  0.389922
# 109       14.31  4.00    Yes  Sat  Dinner     2  0.387973
# apply the `top` function on each group
# combine the results of each function call together using `pandas.concat`
# label the pieces with the group names with a hierarchical index
tips.groupby("smoker").apply(top, include_groups=False)
#             total_bill   tip   day    time  size   tip_pct
# smoker
# No     232       11.61  3.39   Sat  Dinner     2  0.412409
#        149        7.51  2.00  Thur   Lunch     2  0.362976
#        51        10.29  2.60   Sun  Dinner     2  0.338101
#        185       20.69  5.00   Sun  Dinner     5  0.318674
#        88        24.71  5.85  Thur   Lunch     2  0.310180
# Yes    172        7.25  5.15   Sun  Dinner     2  2.452381
#        178        9.60  4.00   Sun  Dinner     2  0.714286
#        67         3.07  1.00   Sat  Dinner     1  0.483092
#        183       23.17  6.50   Sun  Dinner     4  0.389922
#        109       14.31  4.00   Sat  Dinner     2  0.387973
# apply a function that takes other arguments or keywords
tips.groupby("smoker").apply(top, n=3, column="tip", include_groups=False)
#             total_bill    tip  day    time  size   tip_pct
# smoker
# No     212       48.33   9.00  Sat  Dinner     4  0.228833
#        23        39.42   7.58  Sat  Dinner     4  0.238065
#        59        48.27   6.73  Sat  Dinner     4  0.162013
# Yes    170       50.81  10.00  Sat  Dinner     3  0.245038
#        183       23.17   6.50  Sun  Dinner     4  0.389922
#        214       28.17   6.50  Sat  Dinner     3  0.299954
# apply a function like `describe`
tips.groupby(["smoker", "day"]).apply(lambda x: x.describe(), include_groups=False)
#                    total_bill       tip  size   tip_pct
# smoker day
# No     Fri  count    4.000000  4.000000  4.00  4.000000
#             mean    18.420000  2.812500  2.25  0.179740
#             std      5.059282  0.898494  0.50  0.039458
#             min     12.460000  1.500000  2.00  0.136861
#             25%     15.100000  2.625000  2.00  0.159215
# ...                       ...       ...   ...       ...
# Yes    Thur min     10.340000  2.000000  2.00  0.098918
#             25%     13.510000  2.000000  2.00  0.173762
#             50%     16.470000  2.560000  2.00  0.181818
#             75%     19.810000  4.000000  2.00  0.241984
#             max     43.110000  5.000000  4.00  0.317965
#
# [64 rows x 4 columns]
- remove the group keys for the results
tips.groupby("smoker").apply(top, include_groups=False) # group_keys=True
#             total_bill   tip   day    time  size   tip_pct
# smoker
# No     232       11.61  3.39   Sat  Dinner     2  0.291990
#        149        7.51  2.00  Thur   Lunch     2  0.266312
#        51        10.29  2.60   Sun  Dinner     2  0.252672
#        185       20.69  5.00   Sun  Dinner     5  0.241663
#        88        24.71  5.85  Thur   Lunch     2  0.236746
# Yes    172        7.25  5.15   Sun  Dinner     2  0.710345
#        178        9.60  4.00   Sun  Dinner     2  0.416667
#        67         3.07  1.00   Sat  Dinner     1  0.325733
#        183       23.17  6.50   Sun  Dinner     4  0.280535
#        109       14.31  4.00   Sat  Dinner     2  0.279525
tips.groupby("smoker", group_keys=False).apply(top, include_groups=False) # group_keys=False
#      total_bill   tip   day    time  size   tip_pct
# 232       11.61  3.39   Sat  Dinner     2  0.412409
# 149        7.51  2.00  Thur   Lunch     2  0.362976
# 51        10.29  2.60   Sun  Dinner     2  0.338101
# 185       20.69  5.00   Sun  Dinner     5  0.318674
# 88        24.71  5.85  Thur   Lunch     2  0.310180
# 172        7.25  5.15   Sun  Dinner     2  2.452381
# 178        9.60  4.00   Sun  Dinner     2  0.714286
# 67         3.07  1.00   Sat  Dinner     1  0.483092
# 183       23.17  6.50   Sun  Dinner     4  0.389922
# 109       14.31  4.00   Sat  Dinner     2  0.387973
- quantile and bucket analysis
df = pd.DataFrame(
    {"data1": np.random.standard_normal(1000), "data2": np.random.standard_normal(1000)}
)
stats = lambda x: pd.DataFrame({"min": x.min(), "max": x.max(), "count": x.count(), "mean": x.mean()})
quartiles = pd.cut(df["data1"], 4)
df.groupby(quartiles, observed=True).apply(stats)
#                               min       max  count      mean
# data1
# (-3.178, -1.488] data1 -3.171336 -1.496146     58 -1.875128
#                  data2 -2.040919  3.687331     58  0.225945
# (-1.488, 0.195]  data1 -1.488242  0.193295    516 -0.524040
#                  data2 -2.887477  2.946888    516 -0.045822
# (0.195, 1.878]   data1  0.208000  1.875157    393  0.811104
#                  data2 -3.066116  2.567213    393  0.119729
# (1.878, 3.561]   data1  1.888199  3.560975     33  2.283037
#                  data2 -2.094342  1.893865     33  0.009781
df.groupby(quartiles, observed=True).agg(["min", "max", "count", "mean"]) # equivalent
#                      data1                                data2
#                        min       max count      mean       min       max count      mean
# data1
# (-3.178, -1.488] -3.171336 -1.496146    58 -1.875128 -2.040919  3.687331    58  0.225945
# (-1.488, 0.195]  -1.488242  0.193295   516 -0.524040 -2.887477  2.946888   516 -0.045822
# (0.195, 1.878]    0.208000  1.875157   393  0.811104 -3.066116  2.567213   393  0.119729
# (1.878, 3.561]    1.888199  3.560975    33  2.283037 -2.094342  1.893865    33  0.009781
quartiles_samp = pd.qcut(df["data1"], 4)
df.groupby(quartiles_samp.cat.codes, observed=True).apply(stats)
#                min       max  count      mean
# 0 data1 -3.171336 -0.664742    250 -1.223014
#   data2 -2.887477  3.687331    250  0.016059
# 1 data1 -0.655842 -0.005734    250 -0.324734
#   data2 -2.667193  2.946888    250 -0.042407
# 2 data1  0.001768  0.683123    250  0.329009
#   data2 -2.674011  2.618537    250  0.083003
# 3 data1  0.683490  3.560975    250  1.278506
#   data2 -3.066116  2.279426    250  0.090694
quartiles_samp = pd.qcut(df["data1"], 4, labels=False)
df.groupby(quartiles_samp, observed=True).apply(stats)
#                      min       max  count      mean
# data1
# 0     data1 -3.171336 -0.664742    250 -1.223014
#       data2 -2.887477  3.687331    250  0.016059
# 1     data1 -0.655842 -0.005734    250 -0.324734
#       data2 -2.667193  2.946888    250 -0.042407
# 2     data1  0.001768  0.683123    250  0.329009
#       data2 -2.674011  2.618537    250  0.083003
# 3     data1  0.683490  3.560975    250  1.278506
#       data2 -3.066116  2.279426    250  0.090694
6.4. Group Transforms and Unwrapped GroupBys
-
transform
is another built-in method that produces either a scalar value (broadcast to the group’s shape) or an object matching the input group’s shape.df = pd.DataFrame({"key": ["a", "b", "c"] * 4, "value": np.arange(12.0)}) # key value # 0 a 0.0 # 1 b 1.0 # 2 c 2.0 # 3 a 3.0 # 4 b 4.0 # 5 c 5.0 # 6 a 6.0 # 7 b 7.0 # 8 c 8.0 # 9 a 9.0 # 10 b 10.0 # 11 c 11.0 g = df.groupby("key", group_keys=False) g.mean() # value # key # a 4.5 # b 5.5 # c 6.5 g.transform("mean") # produce a scalar value to broadcast to the group shape # value # 0 4.5 # 1 5.5 # 2 6.5 # 3 4.5 # 4 5.5 # 5 6.5 # 6 4.5 # 7 5.5 # 8 6.5 # 9 4.5 # 10 5.5 # 11 6.5 g.transform(lambda x: x.rank(ascending=False)) # produce an object matching the input group shape # value # 0 4.0 # 1 4.0 # 2 4.0 # 3 3.0 # 4 3.0 # 5 3.0 # 6 2.0 # 7 2.0 # 8 2.0 # 9 1.0 # 10 1.0 # 11 1.0 normalize = lambda x: (x - x.mean()) / x.std() g.transform(normalize) # value # 0 -1.161895 # 1 -1.161895 # 2 -1.161895 # 3 -0.387298 # 4 -0.387298 # 5 -0.387298 # 6 0.387298 # 7 0.387298 # 8 0.387298 # 9 1.161895 # 10 1.161895 # 11 1.161895 g.apply(normalize) # value # 0 -1.161895 # 1 -1.161895 # 2 -1.161895 # 3 -0.387298 # 4 -0.387298 # 5 -0.387298 # 6 0.387298 # 7 0.387298 # 8 0.387298 # 9 1.161895 # 10 1.161895 # 11 1.161895 # unwrapped vectorized group operation: high efficiency, reduced readability normalized = (df["value"] - g.transform("mean")["value"]) / g.transform("std")["value"] # 0 -1.161895 # 1 -1.161895 # 2 -1.161895 # 3 -0.387298 # 4 -0.387298 # 5 -0.387298 # 6 0.387298 # 7 0.387298 # 8 0.387298 # 9 1.161895 # 10 1.161895 # 11 1.161895 # Name: value, dtype: float64
6.5. Pivot Tables and Cross-Tabulation
tips = pd.read_csv("https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/examples/tips.csv")
tips["tip_pct"] = tips["tip"] / (tips["total_bill"] - tips["tip"])
tips.head()
# total_bill tip smoker day time size tip_pct
# 0 16.99 1.01 No Sun Dinner 2 0.063204
# 1 10.34 1.66 No Sun Dinner 3 0.191244
# 2 21.01 3.50 No Sun Dinner 3 0.199886
# 3 23.68 3.31 No Sun Dinner 2 0.162494
# 4 24.59 3.61 No Sun Dinner 4 0.172069
-
A pivot table is a data summarization tool, common in spreadsheet and analysis software, that aggregates data by keys into a rectangular grid with keys as rows and columns.
# compute a table of group means (by default) arranged by `day` and `smoker` on the rows tips.pivot_table(index=["day", "smoker"], values=["size", "tip", "tip_pct", "total_bill"]) # size tip tip_pct total_bill # day smoker # Fri No 2.250000 2.812500 0.179740 18.420000 # Yes 2.066667 2.714000 0.216293 16.813333 # Sat No 2.555556 3.102889 0.190412 19.661778 # Yes 2.476190 2.875476 0.179833 21.276667 # Sun No 2.929825 3.167895 0.193617 20.506667 # Yes 2.578947 3.516842 0.322021 24.120000 # Thur No 2.488889 2.673778 0.193424 17.113111 # Yes 2.352941 3.030000 0.198508 19.190588 tips.groupby(["day", "smoker"])[["size", "tip", "tip_pct", "total_bill"]].mean() # groupby # size tip tip_pct total_bill # day smoker # Fri No 2.250000 2.812500 0.179740 18.420000 # Yes 2.066667 2.714000 0.216293 16.813333 # Sat No 2.555556 3.102889 0.190412 19.661778 # Yes 2.476190 2.875476 0.179833 21.276667 # Sun No 2.929825 3.167895 0.193617 20.506667 # Yes 2.578947 3.516842 0.322021 24.120000 # Thur No 2.488889 2.673778 0.193424 17.113111 # Yes 2.352941 3.030000 0.198508 19.190588 # take the average of only `tip_pct` and `size`, additionally group by `time`, and # put `smoker` in the table columns and `time` and `day` in the rows tips.pivot_table(index=["time", "day"], columns="smoker", values=["tip_pct", "size"]) # size tip_pct # smoker No Yes No Yes # time day # Dinner Fri 2.000000 2.222222 0.162612 0.202545 # Sat 2.555556 2.476190 0.190412 0.179833 # Sun 2.929825 2.578947 0.193617 0.322021 # Thur 2.000000 NaN 0.190114 NaN # Lunch Fri 3.000000 1.833333 0.231125 0.236915 # Thur 2.500000 2.352941 0.193499 0.198508 # include partial totals by passing `margins=True` to add `All` row and column labels, with # corresponding values being the group statistics for all the data within a single tier tips.pivot_table(index=["time", "day"], columns="smoker", values=["tip_pct", "size"], margins=True) # size tip_pct # smoker No Yes All No Yes All # time day # Dinner Fri 2.000000 2.222222 2.166667 0.162612 0.202545 0.192562 # Sat 2.555556 2.476190 2.517241 0.190412 0.179833 0.185305 # Sun 2.929825 2.578947 2.842105 0.193617 0.322021 0.225718 # Thur 2.000000 NaN 2.000000 0.190114 NaN 0.190114 # Lunch Fri 3.000000 1.833333 2.000000 0.231125 0.236915 0.236088 # Thur 2.500000 2.352941 2.459016 0.193499 0.198508 0.194895 # All 2.668874 2.408602 2.569672 0.192237 0.218176 0.202123 # specify an aggregation function other than `mean` with the `aggfunc` keyword argument tips.pivot_table( index=["time", "day"], columns="smoker", values=["tip_pct", "size"], margins=True, aggfunc=len, ) # size tip_pct # smoker No Yes All No Yes All # time day # Dinner Fri 3.0 9.0 12 3.0 9.0 12 # Sat 45.0 42.0 87 45.0 42.0 87 # Sun 57.0 19.0 76 57.0 19.0 76 # Thur 1.0 NaN 1 1.0 NaN 1 # Lunch Fri 1.0 6.0 7 1.0 6.0 7 # Thur 44.0 17.0 61 44.0 17.0 61 # All 151.0 93.0 244 151.0 93.0 244 # fill NA/NaN combinations by `fill_value` tips.pivot_table(index=["time", "size", "smoker"], columns="day", values="tip_pct") # day Fri Sat Sun Thur # time size smoker # Dinner 1 No NaN 0.160000 NaN NaN # Yes NaN 0.483092 NaN NaN # 2 No 0.162612 0.198319 0.206535 0.190114 # Yes 0.211180 0.178877 0.400522 NaN # ... ... ... ... ... 
# Lunch 4 No NaN NaN NaN 0.161573 # Yes NaN NaN NaN 0.186592 # 5 No NaN NaN NaN 0.138160 # 6 No NaN NaN NaN 0.211191 # # [21 rows x 4 columns] tips.pivot_table(index=["time", "size", "smoker"], columns="day", values="tip_pct", fill_value=0) # day Fri Sat Sun Thur # time size smoker # Dinner 1 No 0.000000 0.160000 0.000000 0.000000 # Yes 0.000000 0.483092 0.000000 0.000000 # 2 No 0.162612 0.198319 0.206535 0.190114 # Yes 0.211180 0.178877 0.400522 0.000000 # ... ... ... ... ... # Lunch 4 No 0.000000 0.000000 0.000000 0.161573 # Yes 0.000000 0.000000 0.000000 0.186592 # 5 No 0.000000 0.000000 0.000000 0.138160 # 6 No 0.000000 0.000000 0.000000 0.211191 # # [21 rows x 4 columns]
-
A cross-tabulation, or crosstab, is a specific type of pivot table that calculates the frequency of data points within different groups.
pd.crosstab([tips["time"], tips["day"]], tips["smoker"], margins=True) # smoker No Yes All # time day # Dinner Fri 3 9 12 # Sat 45 42 87 # Sun 57 19 76 # Thur 1 0 1 # Lunch Fri 1 6 7 # Thur 44 17 61 # All 151 93 244
6.6. Window Operations
A window operation in pandas is a data analysis tool that performs statistical calculations over a sliding or adaptive window by looking back from each observation point in an ordered sequence, enabling data smoothing and trend analysis.
-
Pandas supports four types of windowing operations: rolling windows for sliding calculations, weighted windows for non-rectangular calculations, expanding windows for accumulating calculations, and exponentially weighted windows for weighted accumulating calculations.
s = pd.Series(range(5)) # 0 0 # 1 1 # 2 2 # 3 3 # 4 4 # dtype: int64 s.rolling(window=2).sum() # 0 NaN # 1 1.0 # 2 3.0 # 3 5.0 # 4 7.0 # dtype: float64 for window in s.rolling(window=2): print(window) # 0 0 # dtype: int64 # 0 0 # 1 1 # dtype: int64 # 1 1 # 2 2 # dtype: int64 # 2 2 # 3 3 # dtype: int64 # 3 3 # 4 4 # dtype: int64 for window in s.rolling(window=2, step=2): print(window) # 0 0 # dtype: int64 # 1 1 # 2 2 # dtype: int64 # 3 3 # 4 4 # dtype: int64
-
All windowing operations require a minimum number of non-null values (specified by min_periods) to produce a result, defaulting to 1 for time-based windows and the window size for fixed windows, otherwise returning null (i.e. np.nan).
s.rolling(window=2, min_periods=1).sum() # 0 0.0 # 1 1.0 # 2 3.0 # 3 5.0 # 4 7.0 # dtype: float64
-
All windowing operations support the aggregate (alias agg) method to return multiple aggregation results.
s.rolling(window=2).agg(["sum", "mean", "std"]) # sum mean std # 0 NaN NaN NaN # 1 1.0 0.5 0.707107 # 2 3.0 1.5 0.707107 # 3 5.0 2.5 0.707107 # 4 7.0 3.5 0.707107
-
All windowing operations support the apply method to return custom aggregation results using user-defined functions.
s = pd.Series(range(5)) s.rolling(window=4).apply(lambda x: np.fabs(x - x.mean()).mean(), raw=True) # 0 NaN # 1 NaN # 2 NaN # 3 1.0 # 4 1.0 # dtype: float64
6.6.1. Rolling Window
-
A rolling window is a fixed or variable-sized window that can be specified by either a number of observations or a time-based offset, requiring a monotonic time index when using time-based offsets.
times = ['2020-01-01', '2020-01-03', '2020-01-04', '2020-01-05', '2020-01-29'] s = pd.Series(range(5), index=pd.DatetimeIndex(times)) # 2020-01-01 0 # 2020-01-03 1 # 2020-01-04 2 # 2020-01-05 3 # 2020-01-29 4 # dtype: int64 s.rolling(window=2).sum() # window with 2 observations # 2020-01-01 NaN # 2020-01-03 1.0 # 2020-01-04 3.0 # 2020-01-05 5.0 # 2020-01-29 7.0 # dtype: float64 s.rolling(window="2D").sum() # window with 2 days worth of observations # 2020-01-01 0.0 # 2020-01-03 1.0 # 2020-01-04 3.0 # 2020-01-05 5.0 # 2020-01-29 4.0 # dtype: float64 s.rolling(window="2D", min_periods=2).sum() # 2020-01-01 NaN # 2020-01-03 NaN # 2020-01-04 3.0 # 2020-01-05 5.0 # 2020-01-29 NaN # dtype: float64
-
By default the labels (index or column labels) are set to the right edge of the window, but a center keyword is available so the labels can be set at the center.
s = pd.Series(range(5)) # 0 0 # 1 1 # 2 2 # 3 3 # 4 4 # dtype: int64 # the window ends at the current observation, and for each position, it looks back 3 positions. for window in s.rolling(window=3): print(list(window)) # [0] # [0, 1] # [0, 1, 2] # [1, 2, 3] # [2, 3, 4] # the window is centered on the current observation, and for each position, it looks 1 position back and 1 position forward. for window in s.rolling(window=3, center=True): print(list(window)) # [0, 1] (can't look back, so only forward) # [0, 1, 2] # [1, 2, 3] # [2, 3, 4] # [3, 4] (can't look forward, so only back)
6.6.2. Expanding Window
-
An expanding window is a special case of rolling window where the window size equals the length of available data, yielding the value of an aggregation statistic with all the data up to that point in time.
data = pd.DataFrame(range(5)) # 0 # 0 0 # 1 1 # 2 2 # 3 3 # 4 4 data.rolling(window=len(data), min_periods=1).mean() # 0 # 0 0.0 # 1 0.5 # 2 1.0 # 3 1.5 # 4 2.0 data.expanding(min_periods=1).mean() # 0 # 0 0.0 # 1 0.5 # 2 1.0 # 3 1.5 # 4 2.0
6.6.3. Weighted Window
-
A weighted window is a rolling window that assigns different weights to observations within the window, where the weights are determined by a specific mathematical function (like Gaussian, Hann, or Hamming) to emphasize or de-emphasize certain data points in the calculation.
-
In pandas, weighted windows can be created in multiple ways: by specifying a win_type argument in .rolling that corresponds to a scipy.signal window function (commonly used in filtering and spectral estimation), or by using custom weights, exponential weights, or other mathematical functions.
s = pd.Series(range(5)) # gaussian window s.rolling(window=3, win_type="gaussian").mean(std=2) # conda install scipy # 0 NaN # 1 NaN # 2 1.0 # 3 2.0 # 4 3.0 # dtype: float64 # custom weights weights = np.array([0.1, 0.3, 0.6]) s.rolling(window=3).apply(lambda x: np.sum(x * weights)) # 0 NaN # 1 NaN # 2 1.5 # 3 2.5 # 4 3.5 # dtype: float64 # linear weights (e.g., [0.1, 0.2, 0.3, 0.4]) s.rolling(window=4).apply(lambda x: np.sum(x * np.linspace(0.1, 0.4, len(x)))) # ewm (exponential weighted moving) s.ewm(span=3).mean() s.ewm(alpha=0.3).mean()
6.6.4. Binary Window Functions
A binary window function is a specialized method within the rolling(), expanding(), or ewm() frameworks to compute relationship-based statistics, such as covariance (cov) and correlation (corr), over a defined or adaptive window.
-
Covariance measures the direction of the linear relationship between two variables—positive for co-movement, negative for opposing movement—but its non-standardized value depends on the data’s scale.
-
Correlation quantifies both the strength and direction of a linear relationship between two variables with values from -1 to 1, providing a standardized and more easily interpretable measure than covariance (see the sketch below).
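-
A minimal sketch (not from the original text; the series x and y, the dates, and the window sizes are made up for illustration) showing cov and corr used as binary methods on rolling, expanding, and exponentially weighted windows.
import numpy as np
import pandas as pd
rng = pd.date_range("2024-01-01", periods=8, freq="D")
x = pd.Series(np.arange(8.0), index=rng)
y = x * 2 + pd.Series(np.random.standard_normal(8), index=rng) # roughly linear in x, plus noise
x.rolling(window=4).cov(y) # rolling covariance: direction of co-movement, scale-dependent
x.rolling(window=4).corr(y) # rolling correlation: standardized to the range [-1, 1]
x.expanding().corr(y) # expanding correlation uses all observations up to each point
x.ewm(span=4).corr(y) # exponentially weighted correlation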
7. Time Series
Time series data consists of sequentially recorded data points over time, which can be fixed or irregular in frequency, and is used to represent timestamps, fixed periods, intervals, or elapsed time across various fields.
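A minimal illustration (dates and values are made up) of a fixed-frequency series versus an irregularly spaced one:
import numpy as np
import pandas as pd
fixed = pd.Series(np.arange(4.0), index=pd.date_range("2024-01-01", periods=4, freq="D"))
fixed.index.freq # <Day>: fixed daily frequency
irregular = pd.Series([1.0, 2.0, 3.0], index=pd.to_datetime(["2024-01-01", "2024-01-04", "2024-01-20"]))
irregular.index.freq # None: no fixed frequency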
7.1. Date and Time Types
-
The Python standard library includes data types for date and time data, as well as calendar-related functionality, in modules such as datetime, time, and calendar.
from datetime import datetime, timedelta, timezone import pytz now = datetime.now() # datetime.datetime(2022, 8, 12, 14, 9, 11, 337033) now.year, now.month, now.day # (2022, 8, 12)
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15) # datetime.timedelta(days=926, seconds=56700) delta.days, delta.seconds # (926, 56700) datetime(2011, 1, 7) + timedelta(12) # datetime.datetime(2011, 1, 19, 0, 0) datetime(2011, 1, 7) + timedelta(days=12, hours=8, minutes=45) # datetime.datetime(2011, 1, 19, 8, 45)
# tz-naive datetime now = datetime.now() # datetime.datetime(2024, 5, 29, 14, 38, 23, 755723) now.tzname(), now.utcoffset() # (None, None) # local datetime local_now = now.astimezone() # datetime.now().astimezone() # datetime.datetime(2024, 5, 29, 14, 38, 23, 755723, tzinfo=datetime.timezone(datetime.timedelta(seconds=28800), 'CST')) local_now.tzname(), local_now.utcoffset() # ('CST', datetime.timedelta(seconds=28800)) # a UTC datetime utc_now = now.astimezone(timezone.utc) # datetime.now(timezone.utc) # datetime.datetime(2024, 5, 29, 6, 38, 23, 755723, tzinfo=datetime.timezone.utc) utc_now.tzname(), utc_now.utcoffset() # ('UTC', datetime.timedelta(0)) # a fixed offset from UTC tz = timezone(timedelta(hours=8)) now_tz = now.astimezone(tz) # datetime.now(tz) # datetime.datetime(2024, 5, 29, 14, 38, 23, 755723, tzinfo=datetime.timezone(datetime.timedelta(seconds=28800))) now_tz.tzname(), now_tz.utcoffset().total_seconds() / 3600 # ('UTC+08:00', 8.0) # a named time zone tz = pytz.timezone("Asia/Shanghai") # pip install pytz now_tz = now.astimezone(tz) # datetime.now(tz) # datetime.datetime(2024, 5, 29, 14, 38, 23, 755723, tzinfo=<DstTzInfo 'Asia/Shanghai' CST+8:00:00 STD>) now_tz.tzname(), now_tz.utcoffset() # ('CST', datetime.timedelta(seconds=28800))
stamp = datetime(2011, 1, 3) str(stamp) # '2011-01-03 00:00:00' stamp.strftime("%Y-%m-%d") # '2011-01-03' value = "2011-01-03" datetime.strptime(value, "%Y-%m-%d") # datetime.datetime(2011, 1, 3, 0, 0) datestrs = ["7/6/2011", "8/6/2011"] [datetime.strptime(x, "%m/%d/%Y") for x in datestrs] # [datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]
-
The pandas library offers a comprehensive suite of data types and functionalities specifically tailored for time series analysis, organized around four general concepts: date times (a specific date and time with timezone support), time deltas (an absolute time duration), time spans (a span of time defined by a point in time and its associated frequency), and date offsets (a relative time duration that respects calendar arithmetic).
from datetime import datetime import pandas as pd import numpy as np
# date times: a specific date and time with timezone support. pd.Timestamp("2025-05-07 08:51:12 +0800") # Timestamp('2025-05-07 08:51:12+0800', tz='UTC+08:00') pd.to_datetime("2010/11/12") # Timestamp('2010-11-12 00:00:00') now = datetime.now() # datetime.datetime(2024, 5, 20, 15, 32, 2, 656503) np.datetime64(now) # np.datetime64('2024-05-20T15:11:30.645265') pd.Timestamp(now) # Timestamp('2024-05-20 15:11:30.645265') pd.Timestamp(now.astimezone()) # Timestamp('2024-05-20 15:32:02.656503+0800', tz='CST')
ts = pd.Timestamp(datetime.now()) # Timestamp('2024-05-20 15:48:21.663001') ts.to_pydatetime() # datetime.datetime(2024, 5, 20, 15, 48, 21, 663001) ts.to_datetime64() # np.datetime64('2024-05-20T15:48:21.663001') ts.to_period("Y-JUN") # Period('2024', 'Y-JUN') ts_utc = ts.tz_localize("utc") # Timestamp('2024-05-20 15:48:21.663001+0000', tz='UTC') ts_utc.tz_convert("Europe/Moscow") # Timestamp('2024-05-20 18:48:21.663001+0300', tz='Europe/Moscow')
pd.Timestamp.min # Timestamp('1677-09-21 00:12:43.145224193') pd.Timestamp.max # Timestamp('2262-04-11 23:47:16.854775807')
# epoch timestamps: nanoseconds since the Unix epoch (January 1, 1970) ts = pd.Timestamp("2024-05-20 15:32:02.656503+0800") ns = ts.value # 1716190322656503000 pd.Timestamp(ns, tz="UTC+08:00") # Timestamp('2024-05-20 15:32:02.656503+0800', tz='UTC+08:00') pd.Timestamp(ns // 1_000, unit="us", tz="UTC+08:00") # Timestamp('2024-05-20 15:32:02.656503+0800', tz='UTC+08:00') pd.Timestamp(ns // 1_000_000, unit="ms", tz="UTC+08:00") # Timestamp('2024-05-20 15:32:02.656000+0800', tz='UTC+08:00') pd.Timestamp(ns // 1_000_000_000, unit="s", tz="UTC+08:00") # Timestamp('2024-05-20 15:32:02+0800', tz='UTC+08:00')
# time deltas: an absolute time duration. pd.Timedelta(timedelta(days=15)) # Timedelta('15 days 00:00:00') td = pd.Timedelta(1, "d") # Timedelta('1 days 00:00:00') pd.Timestamp("2024-05-20T15:11:30.645265") + td # Timestamp('2024-05-21 15:11:30.645265')
# date offsets: a relative time duration that respects calendar arithmetic. ts = pd.Timestamp("2016-10-30 00:00:00", tz="Europe/Helsinki") ts.dst() # datetime.timedelta(seconds=3600) # a Timedelta day adds 24 hours, while a DateOffset day adjusts for DST with the same time next day ts + pd.Timedelta(days=1) # absolute time # Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki') ts + pd.DateOffset(days=1) # calendar time # Timestamp('2016-10-31 00:00:00+0200', tz='Europe/Helsinki') friday = pd.Timestamp("2018-01-05") friday.day_name() # 'Friday' friday_2b = friday + 2 * pd.offsets.BDay() # plus two business days # Timestamp('2018-01-09 00:00:00') friday_2b.day_name() # 'Tuesday' mb = pd.offsets.MonthBegin() mb.rollforward("2025-01-15") # Timestamp('2025-02-01 00:00:00') mb.rollback("2025-01-15") # Timestamp('2025-01-01 00:00:00') # a number of string aliases are given to useful common time series frequencies pd.offsets.BusinessDay().name # 'B' pd.offsets.BDay().name # 'B' pd.offsets.MonthEnd().name # 'ME' pd.offsets.Hour().name # 'h'
# pandas represents null date times, time deltas, and time spans as NaT pd.Timestamp("NaT") # NaT pd.Period("NaT") # NaT pd.Timestamp(datetime.now()) - pd.Timestamp("NaT") # NaT
tdi = pd.to_datetime(["2025-05-07 08:51:12 +0800", None]) # DatetimeIndex(['2025-05-07 08:51:12+08:00', 'NaT'], dtype='datetime64[ns, UTC+08:00]', freq=None) tdi.dtype # pandas stores timestamps using NumPy’s `datetime64` data type at the nanosecond resolution # datetime64[ns, UTC+08:00] tdi[0] # scalar values from a DatetimeIndex are pandas Timestamp objects # Timestamp('2025-05-07 08:51:12+0800', tz='UTC+08:00') dates = pd.date_range("2012-03-09 09:30", periods=6) # DatetimeIndex(['2012-03-09 09:30:00', '2012-03-10 09:30:00', # '2012-03-11 09:30:00', '2012-03-12 09:30:00', # '2012-03-13 09:30:00', '2012-03-14 09:30:00'], # dtype='datetime64[ns]', freq='D') # localize a tz-naive date time index to a specific time zone local_dates = dates.tz_localize("Asia/Shanghai") # DatetimeIndex(['2012-03-09 09:30:00+08:00', '2012-03-10 09:30:00+08:00', # '2012-03-11 09:30:00+08:00', '2012-03-12 09:30:00+08:00', # '2012-03-13 09:30:00+08:00', '2012-03-14 09:30:00+08:00'], # dtype='datetime64[ns, Asia/Shanghai]', freq=None) # convert a date time index to another specific time zone local_dates.tz_convert("Europe/Berlin") # DatetimeIndex(['2012-03-09 02:30:00+01:00', '2012-03-10 02:30:00+01:00', # '2012-03-11 02:30:00+01:00', '2012-03-12 02:30:00+01:00', # '2012-03-13 02:30:00+01:00', '2012-03-14 02:30:00+01:00'], # dtype='datetime64[ns, Europe/Berlin]', freq=None)
ts = pd.Series(np.random.standard_normal(6), index=dates) # 2012-03-09 09:30:00 -1.206441 # 2012-03-10 09:30:00 1.604368 # 2012-03-11 09:30:00 -1.453797 # 2012-03-12 09:30:00 0.507613 # 2012-03-13 09:30:00 2.178526 # 2012-03-14 09:30:00 -0.950722 # Freq: D, dtype: float64 print(ts.index.tz) # None # localize a tz-naive time series to a specific time zone ts_utc = ts.tz_localize("UTC") ts_utc.index # DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00', # '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00', # '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00'], # dtype='datetime64[ns, UTC]', freq='D') print(ts_utc.index.tz) # UTC # convert a time series to another specific time zone ts_utc.tz_convert("America/Los_Angeles") # 2012-03-09 01:30:00-08:00 -1.206441 # 2012-03-10 01:30:00-08:00 1.604368 # 2012-03-11 01:30:00-08:00 -1.453797 # 2012-03-12 02:30:00-07:00 0.507613 # 2012-03-13 02:30:00-07:00 2.178526 # 2012-03-14 02:30:00-07:00 -0.950722 # Freq: D, dtype: float64 # two time series with different time zones are combined to be UTC ts1 = ts[:2].tz_localize("Europe/London") ts2 = ts[2:].tz_localize("Europe/Paris") ts1.index.tz, ts2.index.tz, (ts1 + ts2).index.tz # (<DstTzInfo 'Europe/London' LMT-1 day, 23:59:00 STD>, <DstTzInfo 'Europe/Paris' LMT+0:09:00 STD>, datetime.timezone.utc)
# time spans: a span of time defined by a point in time and its associated frequency. pd.Period("2011-01") # inferred from datetime string format # Period('2011-01', 'M') pd.Period("2011-01", "M") # specified explicitly # Period('2011-01', 'M') pd.Period("2011-01", freq="D") # specified explicitly # Period('2011-01-01', 'D') # a period represents a span of time (e.g., a day, a month, a quarter, etc) now = datetime.now() # datetime.datetime(2025, 5, 20, 8, 42, 2, 587839) pd.Period(now, "Y") # Period('2025', 'Y-DEC') pd.Period(now, "Q") # Period('2025Q2', 'Q-DEC') pd.Period(now, "M") # Period('2025-05', 'M') pd.Period(now, "D") # Period('2025-05-20', 'D') pd.Period(now, "min").to_timestamp() # Timestamp('2025-05-20 08:42:00') # adding and subtracting integers from periods shifts the period by its own frequency pd.Period("2012", freq="Y-DEC") + 1 # Period('2013', 'Y-DEC') pd.PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M") # PeriodIndex(['2011-01', '2011-02', '2011-03'], dtype='period[M]') pd.period_range("1/1/2011", "1/1/2012", freq="M") # PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05', '2011-06', # '2011-07', '2011-08', '2011-09', '2011-10', '2011-11', '2011-12', # '2012-01'], # dtype='period[M]') pd.period_range(start="2014-01", freq="3M", periods=4) # PeriodIndex(['2014-01', '2014-04', '2014-07', '2014-10'], dtype='period[3M]') pd.period_range(start="2014-01", freq="Q", periods=4) # PeriodIndex(['2014Q1', '2014Q2', '2014Q3', '2014Q4'], dtype='period[Q-DEC]') periods = [pd.Period("2012-01"), pd.Period("2012-02"), pd.Period("2012-03")] ts = pd.Series(np.random.randn(3), periods) # 2012-01 -0.544439 # 2012-02 -0.478298 # 2012-03 -0.781469 # Freq: M, dtype: float64
7.2. Indexing, Selection, Subsetting
dates = [
datetime(2011, 1, 2),
datetime(2011, 1, 5),
datetime(2011, 1, 7),
datetime(2011, 1, 8),
datetime(2011, 1, 10),
datetime(2011, 1, 12),
]
ts = pd.Series(np.random.standard_normal(6), index=dates)
# index using loc with a string date
ts.loc["2011-01-02"]
# np.float64(2.1940286611797526)
# index using iloc with an integer index
ts.iloc[0]
# np.float64(2.1940286611797526)
# index with timestamp
stamp = ts.index[0] # Timestamp('2011-01-02 00:00:00')
ts[stamp]
# np.float64(2.1940286611797526)
# slice with timestamp
t1 = ts.index[0]
t2 = ts.index[3]
ts[t1:t2]
# 2011-01-02 2.194029
# 2011-01-05 1.253080
# 2011-01-07 0.834581
# 2011-01-08 0.929077
# dtype: float64
# index with datetime object
ts[datetime(2011, 1, 2)]
# np.float64(2.1940286611797526)
# slice with datatime objects
ts[datetime(2011, 1, 7) : datetime(2011, 1, 10)]
# 2011-01-07 0.834581
# 2011-01-08 0.929077
# 2011-01-10 1.993171
# dtype: float64
# index with string date
ts["2011-01-02"]
# np.float64(2.1940286611797526)
# slice with string dates
ts["2011-01-07":"2011-01-10"]
# 2011-01-07 0.834581
# 2011-01-08 0.929077
# 2011-01-10 1.993171
# dtype: float64
# slice with string dates not contained in a time series
ts["2011-01-06":"2011-01-11"]
# 2011-01-07 0.834581
# 2011-01-08 0.929077
# 2011-01-10 1.993171
# dtype: float64
# slice with specific year
ts["2011"]
# 2011-01-02 2.194029
# 2011-01-05 1.253080
# 2011-01-07 0.834581
# 2011-01-08 0.929077
# 2011-01-10 1.993171
# 2011-01-12 0.847771
# dtype: float64
# slice with specific month
ts["2011-01"]
# 2011-01-02 2.194029
# 2011-01-05 1.253080
# 2011-01-07 0.834581
# 2011-01-08 0.929077
# 2011-01-10 1.993171
# 2011-01-12 0.847771
# dtype: float64
7.3. Date Ranges, Frequencies, and Shifting
-
By default, pandas.date_range generates daily timestamps.
pd.date_range("2025-05-01", "2025-05-15") # DatetimeIndex(['2025-05-01', '2025-05-02', '2025-05-03', '2025-05-04', # '2025-05-05', '2025-05-06', '2025-05-07', '2025-05-08', # '2025-05-09', '2025-05-10', '2025-05-11', '2025-05-12', # '2025-05-13', '2025-05-14', '2025-05-15'], # dtype='datetime64[ns]', freq='D') pd.date_range(start="2025-05-01", periods=15) # DatetimeIndex(['2025-05-01', '2025-05-02', '2025-05-03', '2025-05-04', # '2025-05-05', '2025-05-06', '2025-05-07', '2025-05-08', # '2025-05-09', '2025-05-10', '2025-05-11', '2025-05-12', # '2025-05-13', '2025-05-14', '2025-05-15'], # dtype='datetime64[ns]', freq='D') pd.date_range(end="2025-05-15", periods=15) # DatetimeIndex(['2025-05-01', '2025-05-02', '2025-05-03', '2025-05-04', # '2025-05-05', '2025-05-06', '2025-05-07', '2025-05-08', # '2025-05-09', '2025-05-10', '2025-05-11', '2025-05-12', # '2025-05-13', '2025-05-14', '2025-05-15'], # dtype='datetime64[ns]', freq='D')
-
By default, pandas.date_range preserves the time (if any) of the start or end timestamp.
pd.date_range("2012-05-02 12:56:31", periods=5) # DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31', # '2012-05-04 12:56:31', '2012-05-05 12:56:31', # '2012-05-06 12:56:31'], # dtype='datetime64[ns]', freq='D') # normalize timestamps to midnight pd.date_range("2012-05-02 12:56:31", periods=5, normalize=True) # DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05', # '2012-05-06'], # dtype='datetime64[ns]', freq='D')
-
Frequencies in pandas, such as daily, monthly, or every 15 minutes, are composed of a base frequency, typically referred to by a string frequency alias (like "D" for calendar day, "ME" for month end, or "h" for hourly), and a multiplier.
# for each base frequency, there is an object referred to as a date offset. from pandas.tseries.offsets import Hour, Minute Hour() # <Hour> Hour(4) # <4 * Hours> 4 * Hour() # <4 * Hours> 2 * Hour() + 30 * Minute() # <150 * Minutes> Hour(2) + Minute(30) # <150 * Minutes>
pd.date_range("2000-01-01", "2000-01-02 23:59", freq=4 * Hour()) # DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00', # '2000-01-01 08:00:00', '2000-01-01 12:00:00', # '2000-01-01 16:00:00', '2000-01-01 20:00:00', # '2000-01-02 00:00:00', '2000-01-02 04:00:00', # '2000-01-02 08:00:00', '2000-01-02 12:00:00', # '2000-01-02 16:00:00', '2000-01-02 20:00:00'], # dtype='datetime64[ns]', freq='4h') # pd.date_range("2000-01-01", "2000-01-02 23:59", freq="4h") # DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00', # '2000-01-01 08:00:00', '2000-01-01 12:00:00', # '2000-01-01 16:00:00', '2000-01-01 20:00:00', # '2000-01-02 00:00:00', '2000-01-02 04:00:00', # '2000-01-02 08:00:00', '2000-01-02 12:00:00', # '2000-01-02 16:00:00', '2000-01-02 20:00:00'], # dtype='datetime64[ns]', freq='4h') # pd.date_range("2000-01-01", "2000-01-02 23:59", freq="4h30min") # DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:30:00', # '2000-01-01 09:00:00', '2000-01-01 13:30:00', # '2000-01-01 18:00:00', '2000-01-01 22:30:00', # '2000-01-02 03:00:00', '2000-01-02 07:30:00', # '2000-01-02 12:00:00', '2000-01-02 16:30:00', # '2000-01-02 21:00:00'], # dtype='datetime64[ns]', freq='270min') pd.date_range("2000-01-01", "2000-12-31", freq="BME") # DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28', # '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31', # '2000-09-29', '2000-10-31', '2000-11-30', '2000-12-29'], # dtype='datetime64[ns]', freq='BME')
-
Shifting refers to moving data backward and forward through time.
ts = pd.Series( np.random.standard_normal(4), index=pd.date_range("2000-01-01", periods=4, freq="ME"), ) # 2000-01-31 0.162140 # 2000-02-29 1.690547 # 2000-03-31 -2.413846 # 2000-04-30 0.558444 # Freq: ME, dtype: float64
# naive shifts leave the index unmodified, and discard data ts.shift(2) # 2000-01-31 NaN # 2000-02-29 NaN # 2000-03-31 0.162140 # 2000-04-30 1.690547 # Freq: ME, dtype: float64 ts.shift(-2) # 2000-01-31 -2.413846 # 2000-02-29 0.558444 # 2000-03-31 NaN # 2000-04-30 NaN # Freq: ME, dtype: float64 ts / ts.shift(1) - 1 # 2000-01-31 NaN # 2000-02-29 9.426449 # 2000-03-31 -2.427850 # 2000-04-30 -1.231350 # Freq: ME, dtype: float64
# shift with freq to advance the timestamps ts.shift(2, freq="ME") # 2000-03-31 0.162140 # 2000-04-30 1.690547 # 2000-05-31 -2.413846 # 2000-06-30 0.558444 # Freq: ME, dtype: float64 ts.shift(3, freq="D") # 2000-02-03 0.162140 # 2000-03-03 1.690547 # 2000-04-03 -2.413846 # 2000-05-03 0.558444 # dtype: float64
# shifting dates with offsets from pandas.tseries.offsets import Day, MonthEnd now = datetime(2011, 11, 17) now + 3 * Day() # Timestamp('2011-11-20 00:00:00') 3 * Day() # <3 * Days> now + MonthEnd() # Timestamp('2011-11-30 00:00:00') now + MonthEnd(2) # Timestamp('2011-12-31 00:00:00') offset = MonthEnd() # <MonthEnd> offset.rollforward(now) # Timestamp('2011-11-30 00:00:00') offset.rollback(now) # Timestamp('2011-10-31 00:00:00') ts = pd.Series( np.random.standard_normal(20), index=pd.date_range("2000-01-15", periods=20, freq="4D"), ) ts.groupby(MonthEnd().rollforward).mean() # 2000-01-31 -0.729635 # 2000-02-29 0.582600 # 2000-03-31 -0.036413 # dtype: float64 ts.resample("ME").mean() # resample: an easier and faster way # 2000-01-31 -0.729635 # 2000-02-29 0.582600 # 2000-03-31 -0.036413 # Freq: ME, dtype: float64
7.4. Periods and Period Arithmetic
Periods represent time spans, like days, months, quarters, or years; they are modeled by the pandas.Period data type, which is constructed from a string or integer together with a supported frequency alias.
p = pd.Period("2011", freq="Y-DEC")
# Period('2011', 'Y-DEC')
p + 3
# Period('2014', 'Y-DEC')
p - 3
# Period('2008', 'Y-DEC')
pd.Period("2011", freq="Y-DEC") - pd.Period("2009", freq="Y-DEC") # pandas._libs.tslibs.offsets.YearEnd
# <2 * YearEnds: month=12>
pd.Period("2006", freq="Y-DEC").asfreq("M", how="start")
# Period('2006-01', 'M')
pd.Period("2006", freq="Y-DEC").asfreq("M", how="end")
# Period('2006-12', 'M')
pd.Period("2011", freq="Y-JUN").asfreq("M", how="start")
# Period('2010-07', 'M')
pd.Period("2011", freq="Y-JUN").asfreq("M", how="end")
# Period('2011-06', 'M')
tpi = pd.period_range("2006", periods=4, freq="Y-DEC")
# PeriodIndex(['2006', '2007', '2008', '2009'], dtype='period[Y-DEC]')
tpi.asfreq("M", how="start")
# PeriodIndex(['2006-01', '2007-01', '2008-01', '2009-01'], dtype='period[M]')
p = pd.Period("2012", "Y-JAN") # a fiscal year ending in January
# Period('2012', 'Y-JAN')
p.asfreq("D", how="start")
# Period('2011-02-01', 'D')
p.asfreq("D", how="end")
# Period('2012-01-31', 'D')
dates = pd.date_range("2000-01-01", periods=3, freq="ME")
ts = pd.Series(np.random.standard_normal(3), index=dates)
# 2000-01-31 1.635092
# 2000-02-29 0.050582
# 2000-03-31 -0.825711
# Freq: ME, dtype: float64
ts.to_period() # inferred as 'M'
# 2000-01 1.635092
# 2000-02 0.050582
# 2000-03 -0.825711
# Freq: M, dtype: float64
pts = ts.to_period("Q-DEC")
# 2000Q1 1.635092
# 2000Q1 0.050582
# 2000Q1 -0.825711
# Freq: Q-DEC, dtype: float64
pts.to_timestamp(freq="M")
# 2000-01-31 1.635092
# 2000-01-31 0.050582
# 2000-01-31 -0.825711
# dtype: float64
pts.to_timestamp(how="end")
# 2000-03-31 23:59:59.999999999 1.635092
# 2000-03-31 23:59:59.999999999 0.050582
# 2000-03-31 23:59:59.999999999 -0.825711
# dtype: float64
data = pd.read_csv("https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/examples/macrodata.csv")
data[["year", "quarter"]].head()
# year quarter
# 0 1959 1
# 1 1959 2
# 2 1959 3
# 3 1959 4
# 4 1960 1
pd.PeriodIndex.from_fields(year=data["year"], quarter=data["quarter"], freq="Q-DEC")
# PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
# '1960Q3', '1960Q4', '1961Q1', '1961Q2',
# ...
# '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
# '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
# dtype='period[Q-DEC]', length=203)
7.5. Resampling and Frequency Conversion
-
The resample method in pandas is the primary tool for performing resampling operations during frequency conversion, functioning as a time-based groupby followed by a reduction method on its groups.
dates = pd.date_range("2000-01-01", periods=100) ts = pd.Series(np.random.standard_normal(len(dates)), index=dates) # 2000-01-01 -2.360643 # 2000-01-02 -0.453989 # 2000-01-03 -0.030482 # 2000-01-04 0.331564 # 2000-01-05 0.899450 # ... # 2000-04-05 -0.658839 # 2000-04-06 -0.632787 # 2000-04-07 -1.112433 # 2000-04-08 -0.274477 # 2000-04-09 0.262441 # Freq: D, Length: 100, dtype: float64 ts.resample("ME").mean() # 2000-01-31 -0.218753 # 2000-02-29 0.293163 # 2000-03-31 0.148025 # 2000-04-30 -0.040589 # Freq: ME, dtype: float64 ts.resample("ME").mean().to_period("M") # 2000-01 -0.218753 # 2000-02 0.293163 # 2000-03 0.148025 # 2000-04 -0.040589 # Freq: M, dtype: float64 ts.to_period("M").groupby(level=0).mean() # 2000-01 -0.218753 # 2000-02 0.293163 # 2000-03 0.148025 # 2000-04 -0.040589 # Freq: M, dtype: float64
-
Downsampling aggregates data into lower frequency intervals (like daily to monthly), where each data point belongs to exactly one interval, and the process requires careful consideration of which interval boundary to include (closed) and whether to label the result with the interval’s start or end time.
dates = pd.date_range("2000-01-01", periods=12, freq="min") ts = pd.Series(np.arange(len(dates)), index=dates) # 2000-01-01 00:00:00 0 # 2000-01-01 00:01:00 1 # 2000-01-01 00:02:00 2 # 2000-01-01 00:03:00 3 # 2000-01-01 00:04:00 4 # 2000-01-01 00:05:00 5 # 2000-01-01 00:06:00 6 # 2000-01-01 00:07:00 7 # 2000-01-01 00:08:00 8 # 2000-01-01 00:09:00 9 # 2000-01-01 00:10:00 10 # 2000-01-01 00:11:00 11 # Freq: min, dtype: int64
# by default, each bin includes its left edge and is labeled with its start time rs = ts.resample("5min") rs.closed, rs.label # ('left', 'left') rs.binner # DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:05:00', # '2000-01-01 00:10:00', '2000-01-01 00:15:00'], # dtype='datetime64[ns]', freq='5min') rs.indices # defaultdict(list, # {Timestamp('2000-01-01 00:00:00'): [0, 1, 2, 3, 4], # Timestamp('2000-01-01 00:05:00'): [5, 6, 7, 8, 9], # Timestamp('2000-01-01 00:10:00'): [10, 11]}) rs.groups # {Timestamp('2000-01-01 00:00:00'): np.int64(5), # Timestamp('2000-01-01 00:05:00'): np.int64(10), # Timestamp('2000-01-01 00:10:00'): np.int64(12)} rs.sum() # 2000-01-01 00:00:00 10 # 2000-01-01 00:05:00 35 # 2000-01-01 00:10:00 21 # Freq: 5min, dtype: int64
rs = ts.resample("5min", closed="right", label="right") rs.closed, rs.label # ('right', 'right') rs.binner # DatetimeIndex(['1999-12-31 23:55:00', '2000-01-01 00:00:00', # '2000-01-01 00:05:00', '2000-01-01 00:10:00', # '2000-01-01 00:15:00'], # dtype='datetime64[ns]', freq='5min') rs.indices # defaultdict(list, # {Timestamp('2000-01-01 00:00:00'): [0], # Timestamp('2000-01-01 00:05:00'): [1, 2, 3, 4, 5], # Timestamp('2000-01-01 00:10:00'): [6, 7, 8, 9, 10], # Timestamp('2000-01-01 00:15:00'): [11]}) rs.groups # {Timestamp('2000-01-01 00:00:00'): np.int64(1), # Timestamp('2000-01-01 00:05:00'): np.int64(6), # Timestamp('2000-01-01 00:10:00'): np.int64(11), # Timestamp('2000-01-01 00:15:00'): np.int64(12)} rs.sum() # 2000-01-01 00:00:00 0 # 2000-01-01 00:05:00 15 # 2000-01-01 00:10:00 40 # 2000-01-01 00:15:00 11 # Freq: 5min, dtype: int64
# open-high-low-close (OHLC) resampling ts.resample("5min").ohlc() # open high low close # 2000-01-01 00:00:00 0 4 0 4 # 2000-01-01 00:05:00 5 9 5 9 # 2000-01-01 00:10:00 10 11 10 11
-
Upsampling is converting from a lower frequency to a higher frequency (e.g., monthly to daily), where no aggregation is needed.
data = pd.DataFrame( np.random.standard_normal((2, 4)), index=pd.date_range("2000-01-01", periods=2, freq="W-WED"), columns=["Colorado", "Texas", "New York", "Ohio"], ) # Colorado Texas New York Ohio # 2000-01-05 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-12 -0.672536 -0.385764 -0.291828 -0.132960 data.resample("D").asfreq() # Colorado Texas New York Ohio # 2000-01-05 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-06 NaN NaN NaN NaN # 2000-01-07 NaN NaN NaN NaN # 2000-01-08 NaN NaN NaN NaN # 2000-01-09 NaN NaN NaN NaN # 2000-01-10 NaN NaN NaN NaN # 2000-01-11 NaN NaN NaN NaN # 2000-01-12 -0.672536 -0.385764 -0.291828 -0.132960 data.resample("D").ffill() # Colorado Texas New York Ohio # 2000-01-05 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-06 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-07 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-08 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-09 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-10 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-11 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-12 -0.672536 -0.385764 -0.291828 -0.132960 data.resample("D").ffill(limit=2) # Colorado Texas New York Ohio # 2000-01-05 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-06 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-07 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-08 NaN NaN NaN NaN # 2000-01-09 NaN NaN NaN NaN # 2000-01-10 NaN NaN NaN NaN # 2000-01-11 NaN NaN NaN NaN # 2000-01-12 -0.672536 -0.385764 -0.291828 -0.132960 data.resample("W-THU").asfreq() # Colorado Texas New York Ohio # 2000-01-06 NaN NaN NaN NaN # 2000-01-13 NaN NaN NaN NaN data.resample("W-THU").ffill() # Colorado Texas New York Ohio # 2000-01-06 -0.538076 -1.057339 0.181577 -0.716948 # 2000-01-13 -0.672536 -0.385764 -0.291828 -0.132960 data.resample("W-THU").bfill() # Colorado Texas New York Ohio # 2000-01-06 -0.672536 -0.385764 -0.291828 -0.13296 # 2000-01-13 NaN NaN NaN NaN
-
Resampling data indexed by periods.
data = pd.DataFrame( np.random.standard_normal((24, 4)), index=pd.period_range("1-2000", "12-2001", freq="M"), columns=["Colorado", "Texas", "New York", "Ohio"], ) data.head() # Colorado Texas New York Ohio # 2000-01 -1.318496 -1.731395 0.782337 0.310808 # 2000-02 1.384605 1.210093 0.265278 -0.936725 # 2000-03 -0.743702 1.604631 0.011729 0.039290 # 2000-04 1.461097 0.047567 -1.117273 0.268163 # 2000-05 1.572309 -0.068589 -1.911431 0.160430 # 1. converts the PeriodIndex to DatetimeIndex # 2. performs the resampling on the DatetimeIndex # 3. calculates the mean for each year # 4. converts back to PeriodIndex if needed adf = data.to_timestamp().resample("YE-DEC").mean().to_period("Y-DEC") # Colorado Texas New York Ohio # 2000 0.360182 -0.188067 -0.316574 0.151905 # 2001 0.059221 -0.199204 0.230402 -0.310169 # use `asfreq()` instead of `resample()` data.asfreq("Y-DEC").groupby(level=0).mean() # Colorado Texas New York Ohio # 2000 0.360182 -0.188067 -0.316574 0.151905 # 2001 0.059221 -0.199204 0.230402 -0.310169 # https://github.com/pandas-dev/pandas/issues/57033 # FutureWarning: Resampling with a PeriodIndex is deprecated. Cast index to DatetimeIndex before resampling instead. adf.resample("Q-DEC").ffill() # Colorado Texas New York Ohio # 2000Q1 0.360182 -0.188067 -0.316574 0.151905 # 2000Q2 0.360182 -0.188067 -0.316574 0.151905 # 2000Q3 0.360182 -0.188067 -0.316574 0.151905 # 2000Q4 0.360182 -0.188067 -0.316574 0.151905 # 2001Q1 0.059221 -0.199204 0.230402 -0.310169 # 2001Q2 0.059221 -0.199204 0.230402 -0.310169 # 2001Q3 0.059221 -0.199204 0.230402 -0.310169 # 2001Q4 0.059221 -0.199204 0.230402 -0.310169 adf.to_timestamp().reindex( index=pd.date_range("2000-01", "2001-12", freq="ME"), method="ffill" ).resample("QE-DEC").ffill().to_period("Q") # Colorado Texas New York Ohio # 2000Q1 0.360182 -0.188067 -0.316574 0.151905 # 2000Q2 0.360182 -0.188067 -0.316574 0.151905 # 2000Q3 0.360182 -0.188067 -0.316574 0.151905 # 2000Q4 0.360182 -0.188067 -0.316574 0.151905 # 2001Q1 0.059221 -0.199204 0.230402 -0.310169 # 2001Q2 0.059221 -0.199204 0.230402 -0.310169 # 2001Q3 0.059221 -0.199204 0.230402 -0.310169 # 2001Q4 0.059221 -0.199204 0.230402 -0.310169
-
For time series data, the
resample
method is semantically a group operation based on a time intervalization.N = 15 times = pd.date_range("2017-05-20 00:00", freq="1min", periods=N) data = pd.DataFrame( { "time": times.repeat(3), "key": np.tile(["a", "b", "c"], N), "value": np.arange(N * 3.0), } ) data.set_index("time").resample("5min").count() # key value # time # 2017-05-20 00:00:00 15 15 # 2017-05-20 00:05:00 15 15 # 2017-05-20 00:10:00 15 15 time_key = pd.Grouper(freq="5min") data.set_index("time").groupby(["key", time_key]).sum() # value # key time # a 2017-05-20 00:00:00 30.0 # 2017-05-20 00:05:00 105.0 # 2017-05-20 00:10:00 180.0 # b 2017-05-20 00:00:00 35.0 # 2017-05-20 00:05:00 110.0 # 2017-05-20 00:10:00 185.0 # c 2017-05-20 00:00:00 40.0 # 2017-05-20 00:05:00 115.0 # 2017-05-20 00:10:00 190.0
7.6. Moving Window Functions
close_px_all = pd.read_csv(
"https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/examples/stock_px.csv",
parse_dates=True,
index_col=0,
)
close_px = close_px_all[["AAPL", "MSFT", "XOM"]].resample("B").ffill()
close_px.head()
# AAPL MSFT XOM
# 2003-01-02 7.40 21.11 29.22
# 2003-01-03 7.45 21.14 29.24
# 2003-01-06 7.45 21.52 29.96
# 2003-01-07 7.43 21.93 28.95
# 2003-01-08 7.28 21.31 28.83
close_px["AAPL"].plot()
close_px["AAPL"].rolling(250).mean().plot()
std250 = close_px["AAPL"].pct_change().rolling(250, min_periods=10).std()
std250[5:12]
# 2003-01-09 NaN
# 2003-01-10 NaN
# 2003-01-13 NaN
# 2003-01-14 NaN
# 2003-01-15 NaN
# 2003-01-16 0.009628
# 2003-01-17 0.013818
# Freq: B, Name: AAPL, dtype: float64
std250.plot()
std250.expanding().mean().plot()
plt.style.use("grayscale")
close_px.rolling("60D").mean().plot(logy=True)
aapl_px = close_px["AAPL"]["2006":"2007"]
ma30 = aapl_px.rolling(30, min_periods=20).mean()
ewma30 = aapl_px.ewm(span=30).mean()
aapl_px.plot(style="k-", label="Price")
ma30.plot(style="k--", label="Simple Moving Avg")
ewma30.plot(style="k-", label="EW MA")
plt.legend()
spx_px = close_px_all["SPX"]
spx_rets = spx_px.pct_change()
returns = close_px.pct_change()
corr = returns["AAPL"].rolling(125, min_periods=100).corr(spx_rets)
corr.plot()
corr = returns.rolling(125, min_periods=100).corr(spx_rets)
corr.plot()