Python for Data Analysis
conda install -y numpy pandas jupyter matplotlib # /ˈnʌmpaɪ/ /ˈpændəz/ /ˈmæt.plɒt.lɪb/
# import conventions
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels as sm
# pd.options.display.max_columns = 20
# pd.options.display.max_colwidth = 80
# pd.options.display.max_rows = 20
# np.set_printoptions(precision=4, suppress=True)
# pd.reset_option('all')
# pd.reset_option('display.max_rows')
- 1. NumPy
- 2. Pandas
- 2.1. Series, DataFrame, and Index
- 2.2. Essential Functionality
- 2.3. Reading and Writing
- 2.4. Data Cleaning and Preparation
- 2.4.1. Handling Missing Data
- 2.4.2. Removing Duplicates
- 2.4.3. Transforming Data Using a Function or Mapping
- 2.4.4. Replacing Values
- 2.4.5. Renaming Axis Indexes
- 2.4.6. Discretization and Binning
- 2.4.7. Detecting and Filtering Outliers
- 2.4.8. Permutation and Random Sampling
- 2.4.9. Computing Indicator/Dummy Variables: One-Hot Encoding
- 2.4.10. String Functions in pandas
- 2.4.11. Categorical Encoding
- 2.5. Data Wrangling: Join, Combine, and Reshape
- 2.6. Plotting and Visualization
- References
1. NumPy
NumPy (Numerical Python, /ˈnʌmpaɪ/) is a library for the Python programming language that’s widely used in science and engineering. It contains multidimensional array data structures, such as the homogeneous, N-dimensional array (ndarray), and a large library of functions that operate efficiently on these data structures.
import numpy as np
a = np.array([[1, 2, 3], [4, 5, 6]])
a.shape # (2, 3)
# Integer types (signed and unsigned)
int8, uint8 # i1, u1: 8-bit (1 byte) integers
int16, uint16 # i2, u2: 16-bit integers
int32, uint32 # i4, u4: 32-bit integers
int64, uint64 # i8, u8: 64-bit integers
# Floating-point types
float16 # f2: Half-precision (16-bit) floating point
float32 # f4 or f: Single-precision (32-bit) floating point; compatible with C float
float64 # f8 or d: Double-precision (64-bit) floating point; compatible with C double and Python float
float128 # f16 or g: Extended-precision (128-bit) floating point
# Complex number types
complex64 # c8: Complex number with two 32-bit floats (real and imaginary parts)
complex128 # c16: Complex number with two 64-bit floats
complex256 # c32: Complex number with two 128-bit floats
# Boolean type
bool # ?: Boolean values (True or False)
# Object type
object # O: Python object type; can store any Python object
# String and byte types
bytes_ # S: Fixed-length raw byte arrays (e.g., ASCII strings, 1 byte per character).
# Use 'S10' for a byte string of length 10.
# Each string is padded or truncated to fit the specified length.
str_ # U: Fixed-length Unicode strings using UTF-32 encoding (4 bytes per character).
# Use 'U10' for a Unicode string of length 10.
# Each string is padded or truncated to fit the specified length.
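A quick sketch of how these dtypes and their type-code strings are used in practice (values here are illustrative; astype always creates a new array):
a = np.array([1.5, 2.7, 3.9], dtype=np.float64) # explicit dtype object
b = np.array([1, 2, 3], dtype="i4")             # type-code string for int32
a.dtype, b.dtype
# (dtype('float64'), dtype('int32'))
a.astype(np.int32) # casting floats to ints truncates the fractional part
# array([1, 2, 3], dtype=int32)
np.array(["ab", "cdefgh"], dtype="U3") # fixed-length strings are truncated to fit
# array(['ab', 'cde'], dtype='<U3')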
1.1. arrays
NumPy’s main object is the homogeneous multidimensional array, known as ndarray: a table of same-type elements (usually numbers) indexed by a tuple of non-negative integers. In NumPy, dimensions are called axes.
Don’t think of NumPy’s N-d arrays solely as spatial points or coordinates; instead, picture them as structured data containers, like boxes within boxes.
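For example, as a small sketch, a (2, 3, 4) array reads as 2 boxes, each holding 3 boxes of 4 numbers:
boxes = np.arange(24).reshape(2, 3, 4) # 2 outer boxes, 3 inner boxes, 4 numbers each
boxes[0]       # the first outer box: a (3, 4) array
boxes[0][1]    # one inner box: array([4, 5, 6, 7])
boxes[0][1][2] # a single number: np.int64(6)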
# create a matrix with 2 rows and 3 columns
a = np.arange(6).reshape(2, 3)
# array([[0, 1, 2],
# [3, 4, 5]])
# the number of axes (dimensions) of the array
a.ndim # 2
# the dimensions of the array, a tuple of integers indicating the size of the array in each dimension
a.shape # (2, 3)
a.ndim == len(a.shape) # True
# the total number of elements of the array
a.size # 6 = 2 x 3
import math
a.size == math.prod(a.shape) # True
# an object describing the type of the elements in the array.
a.dtype # dtype('int64')
a.dtype.name # 'int64'
# the size in bytes of each element of the array.
a.itemsize # 8
# the buffer containing the actual elements of the array.
a.data # <memory at 0x10424e190>
# 1-dimensional array
# a 1-dimensional array has a single axis (axis 0), which runs along its elements
a = np.arange(10)
# array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
a.ndim # 1
a.shape # (10,)
# 2-dimensional array
# axis 0 runs vertically downwards across the rows.
# axis 1 runs horizontally across the columns.
a = np.arange(6).reshape(2, 3)
# array([[0, 1, 2],
# [3, 4, 5]])
a.ndim # 2
a.shape # (2, 3)
# 3-dimensional array
# axis 0 runs along the depth (the first index).
# axis 1 runs along the rows (the second index).
# axis 2 runs along the columns (the third index).
a = np.arange(8).reshape(2, 2, 2)
# array([[[0, 1],
# [2, 3]],
#
# [[4, 5],
# [6, 7]]])
a.ndim # 3
a.shape # (2, 2, 2)
# creating arrays
a = np.array([1, 2, 3, 4, 5, 6])
# array([1, 2, 3, 4, 5, 6])
b = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
# array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
c = np.array([[1, 2], [3, 4]], dtype=complex)
# array([[1.+0.j, 2.+0.j],
# [3.+0.j, 4.+0.j]])
np.zeros((3, 4))
# array([[0., 0., 0., 0.],
# [0., 0., 0., 0.],
# [0., 0., 0., 0.]])
np.zeros_like(np.arange(12).reshape(3, 4))
# array([[0, 0, 0, 0],
# [0, 0, 0, 0],
# [0, 0, 0, 0]])
np.ones((3, 4), dtype=np.int16)
# array([[1, 1, 1, 1],
# [1, 1, 1, 1],
# [1, 1, 1, 1]], dtype=int16)
np.empty((3, 5)) # returns uninitialized memory!
# array([[0.0e+000, 4.9e-324, 9.9e-324, 1.5e-323, 2.0e-323],
# [2.5e-323, 3.0e-323, 3.5e-323, 4.0e-323, 4.4e-323],
# [4.9e-323, 5.4e-323, 5.9e-323, 6.4e-323, 6.9e-323]])
np.arange(10, 30, 5)
# array([10, 15, 20, 25])
np.arange(0, 2, 0.3)
# array([0. , 0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
from numpy import pi
np.linspace(0, 2, 9) # 9 numbers from 0 to 2
# array([0. , 0.25, 0.5 , 0.75, 1. , 1.25, 1.5 , 1.75, 2. ])
x = np.linspace(0, 2 * pi, 100)
f = np.sin(x)
# array([ 0.00000000e+00, 6.34239197e-02, 1.26592454e-01, 1.89251244e-01,
# 2.51147987e-01, 3.12033446e-01, 3.71662456e-01, 4.29794912e-01,
# . . .
# structured arrays
x = np.array(
[('Rex', 9, 81.0), ('Fido', 3, 27.0)],
dtype=[('name', 'U10'), ('age', 'i4'), ('weight', 'f4')],
)
# array([('Rex', 9, 81.), ('Fido', 3, 27.)],
# dtype=[('name', '<U10'), ('age', '<i4'), ('weight', '<f4')])
1.2. arithmetic operators, and universal functions
# arithmetic operators, and universal functions
a = np.array([20, 30, 40, 50])
np.sin(a)
# array([ 0.91294525, -0.98803162, 0.74511316, -0.26237485])
10 * np.sin(a)
# array([ 9.12945251, -9.88031624, 7.4511316 , -2.62374854])
b = np.arange(4)
# array([0, 1, 2, 3])
c = a - b
# array([20, 29, 38, 47])
a + b
# array([20, 31, 42, 53])
b**2
# array([0, 1, 4, 9])
a < 35
# array([ True, True, False, False])
A = np.array([[1, 1], [0, 1]])
B = np.array([[2, 0], [3, 4]])
A * B # elementwise product
# array([[2, 0],
# [0, 4]])
A @ B # matrix product: A.dot(B)
# array([[5, 4],
# [3, 4]])
# universal functions (ufunc): operate elementwise on an array, producing an array as output
b = np.arange(3)
# array([0, 1, 2])
np.exp(b)
# array([1.        , 2.71828183, 7.3890561 ])
np.sqrt(b)
# array([0.        , 1.        , 1.41421356])
c = np.array([2.0, -1.0, 4.0])
np.add(b, c)
# array([2., 0., 6.])
x = np.random.standard_normal(8)
# array([-0.99796507, 1.82697075, 0.60924821, 0.27004016, -1.62010011,
# -0.43957991, -2.27936726, -1.87890946])
y = np.random.standard_normal(8)
# array([-0.16034636, 1.60381624, -0.44003017, -1.50187946, -0.66940637,
# -1.2649151 , -0.12716502, -0.15756179])
np.maximum(x, y)
# array([-0.16034636, 1.82697075, 0.60924821, 0.27004016, -0.66940637,
# -0.43957991, -0.12716502, -0.15756179])
# assign the results into an existing array rather than create a new one
out = np.zeros_like(x)
np.add(x, 1, out=out)
# array([ 2.03493046e-03, 2.82697075e+00, 1.60924821e+00, 1.27004016e+00,
# -6.20100112e-01, 5.60420090e-01, -1.27936726e+00, -8.78909462e-01])
out
# array([ 2.03493046e-03, 2.82697075e+00, 1.60924821e+00, 1.27004016e+00,
# -6.20100112e-01, 5.60420090e-01, -1.27936726e+00, -8.78909462e-01])
# mathematical and statistical methods
a = np.arange(12).reshape(3, 4)
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11]])
a.mean() # np.mean(a)
# np.float64(5.5)
a.mean(axis=1) # compute mean across the columns
# array([1.5, 5.5, 9.5])
a.sum()
# np.int64(66)
a.sum(axis=0) # compute sum down the rows
# array([12, 15, 18, 21])
a.cumsum() # computes the cumulative sum of the elements along a flattened version of the array
# array([ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66])
a.cumsum(axis=0) # computes the cumulative sum along the rows
# array([[ 0, 1, 2, 3],
# [ 4, 6, 8, 10],
# [12, 15, 18, 21]])
a.cumsum(axis=1) # computes the sums along the columns
# array([[ 0, 1, 3, 6],
# [ 4, 9, 15, 22],
# [ 8, 17, 27, 38]])
b = np.random.permutation(a.ravel()).reshape(a.shape)
# array([[ 6, 8, 11, 3],
# [ 7, 10, 4, 1],
# [ 9, 2, 0, 5]])
b.argmax() # return indices of the maximum values along the given axis
# np.int64(2)
b.argmax(axis=0)
# array([2, 1, 0, 2])
b.argmax(axis=1)
# array([2, 1, 0])
b.argmin() # return indices of the minimum values along the given axis
# np.int64(10)
b.argmin(axis=0)
# array([0, 2, 2, 1])
b.argmin(axis=1)
# array([3, 3, 2])
1.3. indexing, slicing and iterating
# indexing, slicing and iterating
# one-dimensional arrays can be indexed, sliced and iterated over, much
# like lists and other Python sequences.
a = np.arange(10)
# array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
a[2]
# np.int64(2)
a[2:5]
# array([2, 3, 4])
# from start to position 6, exclusive, take every 2nd element
a[:6:2]
# array([0, 2, 4])
a[::-1] # reversed
# array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])
# multidimensional arrays can have one index per axis, given in a tuple separated by commas (,)
b = np.fromfunction(lambda x, y: 10 * x + y, (5, 4), dtype=int)
# array([[ 0, 1, 2, 3],
# [10, 11, 12, 13],
# [20, 21, 22, 23],
# [30, 31, 32, 33],
# [40, 41, 42, 43]])
# a comma-separated list of indices to select individual elements
b[2, 3] # b[2][3]
# np.int64(23)
# the first two rows (`:2`), with every column from index 1 to the end (`1:`)
b[:2, 1:]
# array([[ 1, 2, 3],
# [11, 12, 13]])
# each row in the second column of `b`
b[0:5, 1]
# array([ 1, 11, 21, 31, 41])
# a colon (`:`) by itself to take the entire axis
b[:, 1]
# array([ 1, 11, 21, 31, 41])
b[:, :1]
# array([[ 0],
# [10],
# [20],
# [30],
# [40]])
# each column in the second and third row of `b`
b[1:3, :]
# array([[10, 11, 12, 13],
# [20, 21, 22, 23]])
# missing indices are considered complete slices `:`
b[-1] # b[-1, :]
# array([40, 41, 42, 43])
# the dots (`...`) represent as many colons as needed to produce a complete indexing tuple
b[-1, ...] # b[-1, :]
# array([40, 41, 42, 43])
b = np.arange(6).reshape(2, 3)
# array([[0, 1, 2],
# [3, 4, 5]])
# fancy indexing: indexing with arrays of indices
a = np.zeros((8, 4))
for i in range(8):
    a[i] = i
# array([[0., 0., 0., 0.],
# [1., 1., 1., 1.],
# [2., 2., 2., 2.],
# [3., 3., 3., 3.],
# [4., 4., 4., 4.],
# [5., 5., 5., 5.],
# [6., 6., 6., 6.],
# [7., 7., 7., 7.]])
i = [4, 3, 0, 6]
a[i]
# array([[4., 4., 4., 4.],
# [3., 3., 3., 3.],
# [0., 0., 0., 0.],
# [6., 6., 6., 6.]])
a = np.arange(32).reshape((8, 4))
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11],
# [12, 13, 14, 15],
# [16, 17, 18, 19],
# [20, 21, 22, 23],
# [24, 25, 26, 27],
# [28, 29, 30, 31]])
i = [1, 5, 7, 2], [0, 3, 1, 2] # a tuple of row indices paired with column indices
a[i]
# array([ 4, 23, 29, 10])
palette = np.array(
[
[0, 0, 0], # black
[255, 0, 0], # red
[0, 255, 0], # green
[0, 0, 255], # blue
[255, 255, 255], # white
]
)
image = np.array(
[
[0, 1, 2, 0], # each value corresponds to a color in the palette
[0, 3, 4, 0],
]
)
palette[image] # the (2, 4, 3) color image
# array([[[ 0, 0, 0],
# [255, 0, 0],
# [ 0, 255, 0],
# [ 0, 0, 0]],
#
# [[ 0, 0, 0],
# [ 0, 0, 255],
# [255, 255, 255],
# [ 0, 0, 0]]])
# boolean indexing
names = np.array(["Bob", "Joe", "Will", "Bob", "Will", "Joe", "Joe"])
data = np.array([[4, 7], [0, 2], [-5, 6], [0, 0], [1, 2], [-12, -4], [3, 4]])
names == 'Bob'
# array([ True, False, False, True, False, False, False])
data[names == "Bob"]
# array([[4, 7],
# [0, 0]])
data[~(names == "Bob")] # data[names != "Bob"]
# array([[ 0, 2],
# [ -5, 6],
# [ 1, 2],
# [-12, -4],
# [ 3, 4]])
# The Python keywords `and` and `or` do not work with Boolean arrays.
# Use `&` (and) and `|` (or) instead.
mask = (names == "Bob") | (names == "Will")
# array([ True, False, True, True, True, False, False])
data[mask]
# array([[ 4, 7],
# [-5, 6],
# [ 0, 0],
# [ 1, 2]])
# iterating
for r in b:
print(r)
# [0 1 2]
# [3 4 5]
# iterating on each element
for e in b.flat:
print(f'{e}', end='\t')
# 0 1 2 3 4 5
1.4. copies and views
# copies and views: array slices are views on the original array
a = np.array([[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]])
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11]])
b = a
b is a # True
def f(x: np.ndarray): # Python passes objects as references
return id(x)
id(a), f(a) # (4397168176, 4397168176)
c = a.view() # a new view of the array
c is a, c.base is a, c.flags.owndata # (False, True, False)
c = c.reshape((2, 6))
c.shape, a.shape # ((2, 6), (3, 4))
c[0, 4] = 1234 # a's data changes
a[1, 0] # np.int64(1234)
s = a[:, 1:3]
# array([[ 1, 2],
# [ 5, 6],
# [ 9, 10]])
s[:] = 10 # a view, not a copy
a
# array([[ 0, 10, 10, 3],
# [1234, 10, 10, 7],
# [ 8, 10, 10, 11]])
d = a.copy() # a copy of the array
d is a, d.base is a # (False, False)
1.5. probability distributions
# np.random: arrays of sample values from many kinds of probability distributions
samples = np.random.standard_normal(size=(4, 4))
# array([[ 0.56695748, 0.00955644, 0.08934151, 1.69336179],
# [-0.41463561, 0.81013138, 0.41954566, 0.11812052],
# [ 0.23305357, 0.26527434, 0.3752615 , 1.82930088],
# [ 0.36156569, 1.66151819, 0.78871978, -0.56163182]])
# from random import normalvariate
# N = 1_000_000
#
# %timeit samples = [normalvariate(0, 1) for _ in range(N)]
# 320 ms ± 885 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
#
# %timeit np.random.standard_normal(N)
# 15.8 ms ± 8.28 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# random numbers are pseudo-random with a configurable RNG initialized by the `seed` value, ensuring reproducibility.
rng = np.random.default_rng(seed=12345)
type(rng)
# numpy.random._generator.Generator
r1 = np.random.default_rng(seed=12345)
r2 = np.random.default_rng(seed=12345)
r1.standard_normal((2, 3))
# array([[-1.42382504, 1.26372846, -0.87066174],
# [-0.25917323, -0.07534331, -0.74088465]])
r2.standard_normal((2, 3)) # same seed as r1, so the same values
# array([[-1.42382504, 1.26372846, -0.87066174],
# [-0.25917323, -0.07534331, -0.74088465]])
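Beyond standard_normal, a Generator exposes the other common sampling methods; a brief sketch (these methods are part of the numpy.random.Generator API, and the drawn values depend on the generator state):
arr = np.arange(5)
rng.integers(0, 10, size=5)  # random integers drawn from [0, 10)
rng.uniform(0, 1, size=3)    # uniform floats in [0, 1)
rng.permutation(arr)         # a shuffled copy of arr
rng.shuffle(arr)             # shuffle arr in place (returns None)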
1.6. array-oriented programming
# array-oriented programming with arrays
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])
# traditional list comprehension
result = [(x if c else y) for x, y, c in zip(xarr, yarr, cond)]
np.array(result)
# array([1.1, 2.2, 1.3, 1.4, 2.5])
# numpy.where
result = np.where(cond, xarr, yarr)
# array([1.1, 2.2, 1.3, 1.4, 2.5])
a = np.random.standard_normal((4, 4))
# array([[-0.8529, -1.6429, 1.4967, -0.6435],
# [ 0.5086, 0.5242, 1.5083, 0.0487],
# [-1.2721, -0.8147, -0.3598, -1.2042],
# [ 1.1531, 1.5783, -0.9556, -0.6506]])
np.where(a > 0, 2, -2)
# array([[-2, -2, 2, -2],
# [ 2, 2, 2, 2],
# [-2, -2, -2, -2],
# [ 2, 2, -2, -2]])
# set only positive values to 2
np.where(a > 0, 2, a)
# array([[-0.8529, -1.6429, 2. , -0.6435],
# [ 2. , 2. , 2. , 2. ],
# [-1.2721, -0.8147, -0.3598, -1.2042],
# [ 2. , 2. , -0.9556, -0.6506]])
1.7. sorting
# sorting
a = np.random.permutation(np.arange(12)).reshape(3, 4)
# array([[ 2, 10, 11, 3],
# [ 0, 6, 9, 5],
# [ 7, 8, 1, 4]])
a.sort()
# array([[ 2, 3, 10, 11],
# [ 0, 5, 6, 9],
# [ 1, 4, 7, 8]])
a.sort(axis=0) # sort across each column
# array([[ 0, 3, 6, 8],
# [ 1, 4, 7, 9],
# [ 2, 5, 10, 11]])
a = np.random.permutation(np.arange(12)).reshape(3, 4)
# array([[ 5, 3, 11, 8],
# [ 7, 1, 0, 4],
# [10, 9, 2, 6]])
a.sort(axis=1) # sort across each row
# array([[ 3, 5, 8, 11],
# [ 0, 1, 4, 7],
# [ 2, 6, 9, 10]])
np.sort(a, axis=0) # a sorted copy of an array
# array([[ 0, 1, 4, 7],
# [ 2, 5, 8, 10],
# [ 3, 6, 9, 11]])
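Related, though not shown above: numpy.argsort returns the indices that would sort an array, which is handy for reordering one array by the values of another (a minimal sketch):
scores = np.array([30, 10, 20])
names = np.array(["c", "a", "b"])
order = scores.argsort() # indices that would sort `scores`
# array([1, 2, 0])
names[order]
# array(['a', 'b', 'c'], dtype='<U1')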
1.8. saving and loading
# save and load data: file input and output with arrays
np.save("a", np.arange(3))
np.load("a.npy")
# array([0, 1, 2])
a = np.arange(3)
b = np.arange(4, 7)
np.savez("x", a=a, b=b) # np.savez_compressed("x", a=a, b=b)
x = np.load("x.npz")
x["a"]
# array([0, 1, 2])
x["b"]
# array([4, 5, 6])
2. Pandas
Pandas (derived from panel data, /ˈpeɪ.dəns/) is an open-source, BSD-licensed Python library built on NumPy, offering high-performance, user-friendly tabular or heterogeneous data structures like Series
(1D labeled arrays of any data type) and DataFrames
(2D labeled tables) for powerful data analysis.
import numpy as np
import pandas as pd
BooleanDtype # Nullable Boolean data, use "boolean" when passing as string
CategoricalDtype # Categorical data type, use "category" when passing as string
DatetimeTZDtype # Datetime with time zone
Float32Dtype # 32-bit nullable floating point, use "Float32" when passing as string
Float64Dtype # 64-bit nullable floating point, use "Float64" when passing as string
Int8Dtype # 8-bit nullable signed integer, use "Int8" when passing as string
Int16Dtype # 16-bit nullable signed integer, use "Int16" when passing as string
Int32Dtype # 32-bit nullable signed integer, use "Int32" when passing as string
Int64Dtype # 64-bit nullable signed integer, use "Int64" when passing as string
UInt8Dtype # 8-bit nullable unsigned integer, use "UInt8" when passing as string
UInt16Dtype # 16-bit nullable unsigned integer, use "UInt16" when passing as string
UInt32Dtype # 32-bit nullable unsigned integer, use "UInt32" when passing as string
UInt64Dtype # 64-bit nullable unsigned integer, use "UInt64" when passing as string
s = pd.Series([1, 2, 3, None], dtype=pd.Float64Dtype()) # pd.Series([1, 2, 3, None], dtype="Float64")
# 0 1.0
# 1 2.0
# 2 3.0
# 3 <NA>
# dtype: Float64
s[3] is pd.NA
# True
s = pd.Series(['one', 'two', None, 'three'], dtype=pd.StringDtype()) # pd.Series(['one', 'two', None, 'three'], dtype="string")
# 0 one
# 1 two
# 2 <NA>
# 3 three
# dtype: string
s[2] is pd.NA
# True
2.1. Series, DataFrame, and Index
- A Series is a one-dimensional array-like object containing a sequence of values and an associated array of labels, called its index. It also behaves like a fixed-length, ordered dictionary, since it maps index values to data values.
s = pd.Series([4, 7, -5, 3]) # default integer index
# 0    4
# 1    7
# 2   -5
# 3    3
# dtype: int64
s.array
# <NumpyExtensionArray>
# [np.int64(4), np.int64(7), np.int64(-5), np.int64(3)]
# Length: 4, dtype: int64
s.index
# RangeIndex(start=0, stop=4, step=1)
s.dtype
# dtype('int64')
s.to_numpy() # NumPy array
# array([ 4,  7, -5,  3])
s.to_dict() # dictionary
# {0: 4, 1: 7, 2: -5, 3: 3}
s = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"]) # label index
# d    4
# b    7
# a   -5
# c    3
# dtype: int64
# select a single value or a set of values
s["a"]
# np.int64(-5)
s[["b", "c", "d"]]
# b    7
# c    3
# d    4
# dtype: int64
# purely integer-location based indexing for selection by position
s.iloc[0]
# np.int64(4)
s.iloc[:2]
# d    4
# b    7
# dtype: int64
s.iloc[[1, 2, 0]]
# b    7
# a   -5
# d    4
# dtype: int64
# Series is ndarray-like
s > 0
# d     True
# b     True
# a    False
# c     True
# dtype: bool
s[s > 0]
# d    4
# b    7
# c    3
# dtype: int64
s[s > s.median()]
# d    4
# b    7
# dtype: int64
np.exp(s) # import numpy as np
# d      54.598150
# b    1096.633158
# a       0.006738
# c      20.085537
# dtype: float64
# Series is dictionary-like
s["d"]
# np.int64(4)
s[["a", "b", "c"]]
# a   -5
# b    7
# c    3
# dtype: int64
"b" in s
# True
"e" in s
# False
s.get("e", np.nan)
# nan
data = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
states = ["California", "Ohio", "Oregon", "Texas"]
pd.Series(data, index=states) # override indices
# California        NaN
# Ohio          35000.0
# Oregon        16000.0
# Texas         71000.0
# dtype: float64
# name attribute
s.name = "fooooobar"
s.index.name = "buzzzzzz"
s
# buzzzzzz
# d    4
# b    7
# a   -5
# c    3
# Name: fooooobar, dtype: int64
# alter the index in place while preserving the original index name
s.index = pd.Index(["Bob", "Steve", "Jeff", "Ryan"], name=s.index.name)
# buzzzzzz
# Bob      4
# Steve    7
# Jeff    -5
# Ryan     3
# Name: fooooobar, dtype: int64
# alter the index in place without preserving the index name
s.index = ["Bob", "Steve", "Jeff", "Ryan"]
# Bob      4
# Steve    7
# Jeff    -5
# Ryan     3
# Name: fooooobar, dtype: int64
- A DataFrame is a two-dimensional data structure that holds data like a two-dimensional array or a table with rows and columns, a spreadsheet, or a dictionary of Series objects.
data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data) # from a dictionary of equal-length lists or NumPy arrays
#     state  year  pop
# 0    Ohio  2000  1.5
# 1    Ohio  2001  1.7
# 2    Ohio  2002  3.6
# 3  Nevada  2001  2.4
# 4  Nevada  2002  2.9
# 5  Nevada  2003  3.2
frame.dtypes
# state     object
# year       int64
# pop      float64
# dtype: object
frame.head() # select the first five (by default) rows
#     state  year  pop
# 0    Ohio  2000  1.5
# 1    Ohio  2001  1.7
# 2    Ohio  2002  3.6
# 3  Nevada  2001  2.4
# 4  Nevada  2002  2.9
frame.tail() # select the last five (by default) rows
#     state  year  pop
# 1    Ohio  2001  1.7
# 2    Ohio  2002  3.6
# 3  Nevada  2001  2.4
# 4  Nevada  2002  2.9
# 5  Nevada  2003  3.2
# specify a sequence of columns and order
pd.DataFrame(data, columns=["year", "state", "pop"])
#    year   state  pop
# 0  2000    Ohio  1.5
# 1  2001    Ohio  1.7
# 2  2002    Ohio  3.6
# 3  2001  Nevada  2.4
# 4  2002  Nevada  2.9
# 5  2003  Nevada  3.2
pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
#    year   state  pop debt
# 0  2000    Ohio  1.5  NaN
# 1  2001    Ohio  1.7  NaN
# 2  2002    Ohio  3.6  NaN
# 3  2001  Nevada  2.4  NaN
# 4  2002  Nevada  2.9  NaN
# 5  2003  Nevada  3.2  NaN
# retrieve columns as Series
frame["state"] # dictionary-like notation
# 0      Ohio
# 1      Ohio
# 2      Ohio
# 3    Nevada
# 4    Nevada
# 5    Nevada
# Name: state, dtype: object
frame.year # dot attribute notation
# 0    2000
# 1    2001
# 2    2002
# 3    2001
# 4    2002
# 5    2003
# Name: year, dtype: int64
# retrieve rows by position or name with the special `iloc` and `loc` attributes
frame.loc[1]
# state    Ohio
# year     2001
# pop       1.7
# Name: 1, dtype: object
frame.iloc[2]
# state    Ohio
# year     2002
# pop       3.6
# Name: 2, dtype: object
# columns can be modified by assignment
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2["debt"] = 16.5
#    year   state  pop  debt
# 0  2000    Ohio  1.5  16.5
# 1  2001    Ohio  1.7  16.5
# 2  2002    Ohio  3.6  16.5
# 3  2001  Nevada  2.4  16.5
# 4  2002  Nevada  2.9  16.5
# 5  2003  Nevada  3.2  16.5
frame2["debt"] = np.arange(6.)
#    year   state  pop  debt
# 0  2000    Ohio  1.5   0.0
# 1  2001    Ohio  1.7   1.0
# 2  2002    Ohio  3.6   2.0
# 3  2001  Nevada  2.4   3.0
# 4  2002  Nevada  2.9   4.0
# 5  2003  Nevada  3.2   5.0
# assigning a column that doesn’t exist will create a new column
frame2["eastern"] = frame2["state"] == "Ohio" # year state pop debt eastern # 0 2000 Ohio 1.5 0.0 True # 1 2001 Ohio 1.7 1.0 True # 2 2002 Ohio 3.6 2.0 True # 3 2001 Nevada 2.4 3.0 False # 4 2002 Nevada 2.9 4.0 False # 5 2003 Nevada 3.2 5.0 False # remove a column del frame2["eastern"] # year state pop debt # 0 2000 Ohio 1.5 0.0 # 1 2001 Ohio 1.7 1.0 # 2 2002 Ohio 3.6 2.0 # 3 2001 Nevada 2.4 3.0 # 4 2002 Nevada 2.9 4.0 # 5 2003 Nevada 3.2 5.0 # return and drop an item from frame frame2.pop("debt") # 0 0.0 # 1 1.0 # 2 2.0 # 3 3.0 # 4 4.0 # 5 5.0 # Name: debt, dtype: float64 frame2 # year state pop # 0 2000 Ohio 1.5 # 1 2001 Ohio 1.7 # 2 2002 Ohio 3.6 # 3 2001 Nevada 2.4 # 4 2002 Nevada 2.9 # 5 2003 Nevada 3.2 # from a nested dictionary of dictionaries # the outer dictionary keys as the columns # the inner keys as the row indices populations = { "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6}, "Nevada": {2001: 2.4, 2002: 2.9}, } frame3 = pd.DataFrame(populations) # Ohio Nevada # 2000 1.5 NaN # 2001 1.7 2.4 # 2002 3.6 2.9 frame3.T # transpose the DataFrame (swap rows and columns) # 2000 2001 2002 # Ohio 1.5 1.7 3.6 # Nevada NaN 2.4 2.9 frame2.to_numpy() # a two-dimensional ndarray # array([[2000, 'Ohio', 1.5, 0.0], # [2001, 'Ohio', 1.7, 1.0], # [2002, 'Ohio', 3.6, 2.0], # [2001, 'Nevada', 2.4, 3.0], # [2002, 'Nevada', 2.9, 4.0], # [2003, 'Nevada', 3.2, 5.0]], dtype=object)
- A pandas Index object is responsible for holding the axis labels (including a DataFrame’s column names) and other metadata (like the axis name or names).
frame3
#       Ohio  Nevada
# 2000   1.5     NaN
# 2001   1.7     2.4
# 2002   3.6     2.9
frame3.index
# Index([2000, 2001, 2002], dtype='int64')
frame3.columns
# Index(['Ohio', 'Nevada'], dtype='object')
"Ohio" in frame3.columns
# True
2003 in frame3.index
# False
# duplicate labels select all occurrences of that label
s = frame3["Ohio"]
# 2000    1.5
# 2001    1.7
# 2002    3.6
# Name: Ohio, dtype: float64
i = s.index.append(pd.Index([2000, 2002]))
# Index([2000, 2001, 2002, 2000, 2002], dtype='int64')
s[i]
# 2000    1.5
# 2001    1.7
# 2002    3.6
# 2000    1.5
# 2002    3.6
# Name: Ohio, dtype: float64
2.2. Essential Functionality
2.2.1. Reindexing
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
# d 4.5
# b 7.2
# a -5.3
# c 3.6
# dtype: float64
obj.reindex(["a", "b", "c", "d", "e"])
# a -5.3
# b 7.2
# c 3.6
# d 4.5
# e NaN
# dtype: float64
frame = pd.DataFrame(
np.arange(9).reshape((3, 3)),
index=["a", "c", "d"],
columns=["Ohio", "Texas", "California"],
)
# Ohio Texas California
# a 0 1 2
# c 3 4 5
# d 6 7 8
frame.reindex(index=["a", "b", "c", "d"])
# Ohio Texas California
# a 0.0 1.0 2.0
# b NaN NaN NaN
# c 3.0 4.0 5.0
# d 6.0 7.0 8.0
# reindex with the `axis` keyword
frame.reindex(["Texas", "Utah", "California"], axis="columns")
# Texas Utah California
# a 1 NaN 2
# c 4 NaN 5
# d 7 NaN 8
frame.reindex(["Texas", "Utah", "California"], axis=1)
# Texas Utah California
# a 1 NaN 2
# c 4 NaN 5
# d 7 NaN 8
# reindex with the `columns` keyword
frame.reindex(columns=["Texas", "Utah", "California"])
# Texas Utah California
# a 1 NaN 2
# c 4 NaN 5
# d 7 NaN 8
# reindex with the `loc` operator
frame.loc[["a", "d", "c"], ["California", "Texas"]]
# California Texas
# a 2 1
# d 8 7
# c 5 4
2.2.2. Dropping Entries from an Axis
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
# a 0.0
# b 1.0
# c 2.0
# d 3.0
# e 4.0
# dtype: float64
obj.drop("c")
# a 0.0
# b 1.0
# d 3.0
# e 4.0
# dtype: float64
obj.drop(["c", "d"])
# a 0.0
# b 1.0
# e 4.0
# dtype: float64
data = pd.DataFrame(
np.arange(16).reshape((4, 4)),
index=["Ohio", "Colorado", "Utah", "New York"],
columns=["one", "two", "three", "four"],
)
# one two three four
# Ohio 0 1 2 3
# Colorado 4 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
data.drop(index=["Colorado", "Ohio"])
# one two three four
# Utah 8 9 10 11
# New York 12 13 14 15
data.drop(columns=["two"])
# one three four
# Ohio 0 2 3
# Colorado 4 6 7
# Utah 8 10 11
# New York 12 14 15
data.drop("two", axis=1) # drop values from the columns by passing axis=1
# one three four
# Ohio 0 2 3
# Colorado 4 6 7
# Utah 8 10 11
# New York 12 14 15
data.drop("two", axis="columns") # drop values from the columns by passing axis="columns"
# one three four
# Ohio 0 2 3
# Colorado 4 6 7
# Utah 8 10 11
# New York 12 14 15
2.2.3. Indexing, Selection, and Filtering
s = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
# a 0.0
# b 1.0
# c 2.0
# d 3.0
# dtype: float64
s["b"]
# np.float64(1.0)
s[1] # deprecated: ambiguous between label-based and position-based indexing; prefer iloc
# np.float64(1.0)
s.iloc[1] # by position
# np.float64(1.0)
s.iloc[[1, 3]]
# b 1.0
# d 3.0
# dtype: float64
s[2:4] # slicing with integers is always integer oriented
# c 2.0
# d 3.0
# dtype: float64
s[["b", "a", "d"]]
# b 1.0
# a 0.0
# d 3.0
# dtype: float64
s.loc[["b", "a", "d"]] # the preferred way to select index values by label
# b 1.0
# a 0.0
# d 3.0
# dtype: float64
s[s < 2]
# a 0.0
# b 1.0
# dtype: float64
s.loc["b":"c"] = 5
# a 0.0
# b 5.0
# c 5.0
# d 3.0
# dtype: float64
# retrieve one or more columns from a DataFrame either with a single value or sequence
# df[column] select single column or sequence of columns from the DataFrame
# df.loc[rows] select single row or subset of rows from the DataFrame by label
# df.loc[:, cols] select single column or subset of columns by label
# df.loc[rows, cols] select both row(s) and column(s) by label
# df.iloc[rows] select single row or subset of rows from the DataFrame by integer position
# df.iloc[:, cols] select single column or subset of columns by integer position
# df.iloc[rows, cols] select both row(s) and column(s) by integer position
# df.at[row, col] select a single scalar value by row and column label
# df.iat[row, col] select a single scalar value by row and column position (integers)
# df.reindex(...) select either rows or columns by labels
data = pd.DataFrame(
np.arange(16).reshape((4, 4)),
index=["Ohio", "Colorado", "Utah", "New York"],
columns=["one", "two", "three", "four"],
)
# one two three four
# Ohio 0 1 2 3
# Colorado 4 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
data["two"]
# Ohio 1
# Colorado 5
# Utah 9
# New York 13
# Name: two, dtype: int64
data[["three", "one"]]
# three one
# Ohio 2 0
# Colorado 6 4
# Utah 10 8
# New York 14 12
data[:2] # slice (slice rows)
# one two three four
# Ohio 0 1 2 3
# Colorado 4 5 6 7
data[data["three"] > 5] # boolean array (filter rows)
# one two three four
# Colorado 4 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
data < 5
# one two three four
# Ohio True True True True
# Colorado True False False False
# Utah False False False False
# New York False False False False
data[data < 5] = 0 # boolean DataFrame (set values based on some criterion)
# one two three four
# Ohio 0 0 0 0
# Colorado 0 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
# select a subset of the rows and columns using either axis labels (loc) or integers (iloc)
data.loc["Colorado"] # select a single row by label
# one 0
# two 5
# three 6
# four 7
# Name: Colorado, dtype: int64
data.loc[["Colorado", "New York"]] # select multiple rows with a sequence of labels
# one two three four
# Colorado 0 5 6 7
# New York 12 13 14 15
data.loc["Colorado", ["two", "three"]] # data.loc["Colorado"][["two", "three"]]
# two 5
# three 6
# Name: Colorado, dtype: int64
data.iloc[2] # select a single row by integer
# one 8
# two 9
# three 10
# four 11
# Name: Utah, dtype: int64
data.iloc[[2, 1]]
# one two three four
# Utah 8 9 10 11
# Colorado 0 5 6 7
data.iloc[2, [3, 0, 1]]
# four 11
# one 8
# two 9
# Name: Utah, dtype: int64
data.iloc[[1, 2], [3, 0, 1]]
# four one two
# Colorado 7 0 5
# Utah 11 8 9
data.iloc[:, :3][data.three > 5]
# one two three
# Colorado 0 5 6
# Utah 8 9 10
# New York 12 13 14
# boolean arrays can be used with `loc` but not `iloc`
data.loc[data.three >= 2] # data[data.three >= 2]
# one two three four
# Colorado 0 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
2.2.4. Arithmetic and Data Alignment
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
# a 7.3
# c -2.5
# d 3.4
# e 1.5
# dtype: float64
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
# a -2.1
# c 3.6
# e -1.5
# f 4.0
# g 3.1
# dtype: float64
s1 + s2 # label-based data alignment yields NaN for non-overlapping entries in arithmetic
# a 5.2
# c 1.1
# d NaN
# e 0.0
# f NaN
# g NaN
# dtype: float64
df1 = pd.DataFrame(
np.arange(9.0).reshape((3, 3)),
columns=list("bcd"),
index=["Ohio", "Texas", "Colorado"],
)
# b c d
# Ohio 0.0 1.0 2.0
# Texas 3.0 4.0 5.0
# Colorado 6.0 7.0 8.0
df2 = pd.DataFrame(
np.arange(12.0).reshape((4, 3)),
columns=list("bde"),
index=["Utah", "Ohio", "Texas", "Oregon"],
)
# b d e
# Utah 0.0 1.0 2.0
# Ohio 3.0 4.0 5.0
# Texas 6.0 7.0 8.0
# Oregon 9.0 10.0 11.0
df1 + df2 # alignment is performed on both rows and columns on DataFrame
# b c d e
# Colorado NaN NaN NaN NaN
# Ohio 3.0 NaN 6.0 NaN
# Oregon NaN NaN NaN NaN
# Texas 9.0 NaN 12.0 NaN
# Utah NaN NaN NaN NaN
# arithmetic methods with fill values: substitute the passed value for any missing values
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list("abcd"))
# a b c d
# 0 0.0 1.0 2.0 3.0
# 1 4.0 5.0 6.0 7.0
# 2 8.0 9.0 10.0 11.0
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list("abcde"))
# a b c d e
# 0 0.0 1.0 2.0 3.0 4.0
# 1 5.0 6.0 7.0 8.0 9.0
# 2 10.0 11.0 12.0 13.0 14.0
# 3 15.0 16.0 17.0 18.0 19.0
df2.loc[1, "b"] = np.nan
# a b c d e
# 0 0.0 1.0 2.0 3.0 4.0
# 1 5.0 NaN 7.0 8.0 9.0
# 2 10.0 11.0 12.0 13.0 14.0
# 3 15.0 16.0 17.0 18.0 19.0
df1 + df2
# a b c d e
# 0 0.0 2.0 4.0 6.0 NaN
# 1 9.0 NaN 13.0 15.0 NaN
# 2 18.0 20.0 22.0 24.0 NaN
# 3 NaN NaN NaN NaN NaN
df1.add(df2, fill_value=0.)
# a b c d e
# 0 0.0 2.0 4.0 6.0 4.0
# 1 9.0 5.0 13.0 15.0 9.0
# 2 18.0 20.0 22.0 24.0 14.0
# 3 15.0 16.0 17.0 18.0 19.0
# broadcasting: operations between DataFrame and Series
arr = np.arange(12).reshape((3, 4))
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11]])
arr[0]
# array([0, 1, 2, 3])
arr - arr[0] # the subtraction is performed once for each row
# array([[0, 0, 0, 0],
# [4, 4, 4, 4],
# [8, 8, 8, 8]])
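Broadcasting works analogously between a DataFrame and a Series: by default, pandas matches the Series index on the DataFrame’s columns and broadcasts down the rows, while the arithmetic methods accept axis="index" to match on the rows instead. A short sketch (the frame here is illustrative):
frame = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns=list("bde"),
    index=["Utah", "Ohio", "Texas", "Oregon"],
)
series = frame.iloc[0]
frame - series # match on the columns, broadcast down the rows
#           b    d    e
# Utah    0.0  0.0  0.0
# Ohio    3.0  3.0  3.0
# Texas   6.0  6.0  6.0
# Oregon  9.0  9.0  9.0
frame.sub(frame["d"], axis="index") # match on the rows, broadcast across the columns
#           b    d    e
# Utah   -1.0  0.0  1.0
# Ohio   -1.0  0.0  1.0
# Texas  -1.0  0.0  1.0
# Oregon -1.0  0.0  1.0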
2.2.5. Function Application and Mapping
data = pd.DataFrame(
np.random.standard_normal((4, 3)),
columns=list("bde"),
index=["Utah", "Ohio", "Texas", "Oregon"],
)
# b d e
# Utah -0.079026 1.151062 -0.393464
# Ohio -0.731363 0.601451 -0.462457
# Texas -1.400861 0.892020 -0.010010
# Oregon 0.899501 -0.418370 -0.791160
np.abs(data) # NumPy ufuncs (element-wise array methods)
# b d e
# Utah 0.079026 1.151062 0.393464
# Ohio 0.731363 0.601451 0.462457
# Texas 1.400861 0.892020 0.010010
# Oregon 0.899501 0.418370 0.791160
# apply a function along an axis of the DataFrame.
data.apply(lambda x: x.max() - x.min())
# b 2.300362
# d 1.569433
# e 0.781149
# dtype: float64
data.apply(lambda x: x.max() - x.min(), axis="columns")
# Utah 1.544527
# Ohio 1.332813
# Texas 2.292882
# Oregon 1.690661
# dtype: float64
data.apply(lambda x: pd.Series([x.max(), x.min()], index=["max", "min"]))
# b d e
# max 0.899501 1.151062 -0.01001
# min -1.400861 -0.418370 -0.79116
# apply a function to a Dataframe elementwise
data.map(lambda x: f"{x:.2f}")
# b d e
# Utah -0.08 1.15 -0.39
# Ohio -0.73 0.60 -0.46
# Texas -1.40 0.89 -0.01
# Oregon 0.90 -0.42 -0.79
2.2.6. Sorting and Ranking
s = pd.Series([4, 7, -3, 2], index=["d", "a", "b", "c"])
# d 4
# a 7
# b -3
# c 2
# dtype: int64
# sort lexicographically by row or column label
s.sort_index()
# a 7
# b -3
# c 2
# d 4
# dtype: int64
s.sort_index(ascending=False) # descending
# d 4
# c 2
# b -3
# a 7
# dtype: int64
i = s.index.sort_values()
# Index(['a', 'b', 'c', 'd'], dtype='object')
s.reindex(i)
# a 7
# b -3
# c 2
# d 4
# dtype: int64
s.sort_values() # sort by value
# b -3
# c 2
# d 4
# a 7
# dtype: int64
# any missing values are sorted to the end of the Series by default
pd.Series([4, np.nan, 7, np.nan, -3, 2]).sort_values()
# 4 -3.0
# 5 2.0
# 0 4.0
# 2 7.0
# 1 NaN
# 3 NaN
# dtype: float64
pd.Series([4, np.nan, 7, np.nan, -3, 2]).sort_values(ascending=False)
# 2 7.0
# 0 4.0
# 5 2.0
# 4 -3.0
# 1 NaN
# 3 NaN
# dtype: float64
pd.Series([4, np.nan, 7, np.nan, -3, 2]).sort_values(na_position="first")
# 1 NaN
# 3 NaN
# 4 -3.0
# 5 2.0
# 0 4.0
# 2 7.0
# dtype: float64
data = pd.DataFrame(
[[4, 7, -3, 2], [0, 1, 0, 1]], index=["three", "one"], columns=["d", "a", "b", "c"]
)
# d a b c
# three 4 7 -3 2
# one 0 1 0 1
data.sort_index()
# d a b c
# one 0 1 0 1
# three 4 7 -3 2
data.sort_index(axis="columns")
# a b c d
# three 7 -3 2 4
# one 1 0 1 0
# compute numerical data ranks (1 through n) along axis.
s = pd.Series([7, -5, 7, 4, 2, 0, 4])
s.rank() # by default, average rank of the group
# 0 6.5
# 1 1.0
# 2 6.5
# 3 4.5
# 4 3.0
# 5 2.0
# 6 4.5
# dtype: float64
s.rank(method="first") # ranks assigned in order they appear in the array
# 0 6.0
# 1 1.0
# 2 7.0
# 3 4.0
# 4 3.0
# 5 2.0
# 6 5.0
# dtype: float64
s.rank(ascending=False) # rank in descending order
# 0 1.5
# 1 7.0
# 2 1.5
# 3 3.5
# 4 5.0
# 5 6.0
# 6 3.5
# dtype: float64
# DataFrame can compute ranks over the rows or the columns
data = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})
# b a c
# 0 4.3 0 -2.0
# 1 7.0 1 5.0
# 2 -3.0 0 8.0
# 3 2.0 1 -2.5
data.rank()
# b a c
# 0 3.0 1.5 2.0
# 1 4.0 3.5 3.0
# 2 1.0 1.5 4.0
# 3 2.0 3.5 1.0
data.rank(method="first", axis="columns")
# b a c
# 0 3.0 2.0 1.0
# 1 3.0 1.0 2.0
# 2 1.0 2.0 3.0
# 3 3.0 2.0 1.0
2.2.7. Axis Indexes with Duplicate Labels
s = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])
# a 0
# a 1
# b 2
# b 3
# c 4
# dtype: int64
s.index.is_unique
# False
s["a"] # return a Series
# a 0
# a 1
# dtype: int64
s["c"] # return a scalar value
# np.int64(4)
data = pd.DataFrame(np.random.standard_normal((5, 3)), index=["a", "a", "b", "b", "c"])
# 0 1 2
# a -0.366146 0.329982 0.186980
# a 1.275523 0.615156 -3.403692
# b 0.488783 -0.448384 -2.626718
# b 0.610570 2.211333 1.131847
# c -0.157337 0.212340 0.175906
data.loc["a"] # return a DataFrame
# 0 1 2
# a -0.366146 0.329982 0.186980
# a 1.275523 0.615156 -3.403692
data.loc["c"] # return a Series
# 0 -0.157337
# 1 0.212340
# 2 0.175906
# Name: c, dtype: float64
2.2.8. Unique Values, Value Counts, and Membership
s = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
s.unique() # return unique values of Series object
# array(['c', 'a', 'd', 'b'], dtype=object)
s.value_counts() # return a Series containing counts of unique values
# c 3
# a 3
# b 2
# d 1
# Name: count, dtype: int64
mask = s.isin(["b", "c"]) # performs a vectorized set membership check
# 0 True
# 1 False
# 2 False
# 3 False
# 4 False
# 5 True
# 6 True
# 7 True
# 8 True
# dtype: bool
s[mask]
# 0 c
# 5 b
# 6 b
# 7 c
# 8 c
# dtype: object
pd.Index(pd.Series(s.unique()))
# Index(['c', 'a', 'd', 'b'], dtype='object')
pd.Index(pd.Series(s.unique())).get_indexer(s) # the position of each element of s in the index of unique values
# array([0, 1, 2, 1, 1, 3, 3, 0, 0])
2.3. Reading and Writing
2.3.1. Reading and Writing CSV Text Files
csv_data = """
Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
1,John,1998-05-06,1234567890,25,New York,USA,894240000
2,Jane,1993-07-15,0987654321,30,London,UK,741484800
3,Doe,2001-12-20,1122334455,,Sydney,Australia,1008806400
4,NULL,1990-01-01,NULL,40,NULL,Canada,631152000
"""
# load the CSV data into a DataFrame
import io

data = pd.read_csv(
io.StringIO(csv_data), # use io.StringIO to simulate a file-like object
parse_dates=["Birth Day"], # parse the 'Birth Day' column as datetime
dtype={"Phone Number": str}, # treat 'Phone Number' as a string
na_values=["", "NULL"], # handle missing values
)
# Id Name Birth Day Phone Number Age City Country Timestamp
# 0 1 John 1998-05-06 1234567890 25.0 New York USA 894240000
# 1 2 Jane 1993-07-15 0987654321 30.0 London UK 741484800
# 2 3 Doe 2001-12-20 1122334455 NaN Sydney Australia 1008806400
# 3 4 NaN 1990-01-01 NaN 40.0 NaN Canada 631152000
# convert the 'Timestamp' column from Unix epoch time to datetime
data["Timestamp"] = pd.to_datetime(data["Timestamp"], unit="s")
#    Id  Name  Birth Day Phone Number   Age      City    Country  Timestamp
# 0   1  John 1998-05-06   1234567890  25.0  New York        USA 1998-05-04
# 1   2  Jane 1993-07-15   0987654321  30.0    London         UK 1993-07-01
# 2   3   Doe 2001-12-20   1122334455   NaN    Sydney  Australia 2001-12-20
# 3   4   NaN 1990-01-01          NaN  40.0       NaN     Canada 1990-01-01
data.dtypes
# Id int64
# Name object
# Birth Day datetime64[ns]
# Phone Number object
# Age float64
# City object
# Country object
# Timestamp datetime64[ns]
# dtype: object
# write the data out to a comma-separated file
data.to_csv(sys.stdout) # import sys
# ,Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
# 0,1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 1,2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 2,3,Doe,2001-12-20,1122334455,,Sydney,Australia,2001-12-20
# 3,4,,1990-01-01,,40.0,,Canada,1990-01-01
data.to_csv(sys.stdout, index=False)
# Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
# 1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 3,Doe,2001-12-20,1122334455,,Sydney,Australia,2001-12-20
# 4,,1990-01-01,,40.0,,Canada,1990-01-01
data.to_csv(sys.stdout, index=False, na_rep='NIL')
# Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
# 1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 3,Doe,2001-12-20,1122334455,NIL,Sydney,Australia,2001-12-20
# 4,NIL,1990-01-01,NIL,40.0,NIL,Canada,1990-01-01
data.to_csv(sys.stdout, index=False, na_rep='NIL', header=False)
# 1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 3,Doe,2001-12-20,1122334455,NIL,Sydney,Australia,2001-12-20
# 4,NIL,1990-01-01,NIL,40.0,NIL,Canada,1990-01-01
2.3.2. Reading and Writing Microsoft Excel Files
# add-on packages xlrd and openpyxl to read old-style XLS and newer XLSX files
conda install openpyxl xlrd
- pandas.ExcelFile: read sheets and parse as DataFrame
xlsx = pd.ExcelFile("book1.xlsx")
xlsx.sheet_names
# ['Sheet1']
xlsx.parse(sheet_name="Sheet1")
#    0  a   b   c   d message
# 0  0  1   2   3   4   hello
# 1  1  5   6   7   8   world
# 2  2  9  10  11  12     foo
df = xlsx.parse(sheet_name="Sheet1", index_col=0)
#    a   b   c   d message
# 0
# 0  1   2   3   4   hello
# 1  5   6   7   8   world
# 2  9  10  11  12     foo
df.columns
# Index(['a', 'b', 'c', 'd', 'message'], dtype='object')
df.index
# Index([0, 1, 2], dtype='int64', name=0)
xlsx.parse(sheet_name="Sheet1", index_col="message")
#          0  a  b   c   d
# message
# hello    0  1  2   3   4
# world    1  5  6   7   8
# foo      2  9 10  11  12
- pandas.read_excel: read a sheet as pandas data directly
pd.read_excel("book1.xlsx") # pd.read_excel("book1.xlsx", sheet_name=0)
#    0  a   b   c   d message
# 0  0  1   2   3   4   hello
# 1  1  5   6   7   8   world
# 2  2  9  10  11  12     foo
pd.read_excel("book1.xlsx", sheet_name="Sheet1")
#    0  a   b   c   d message
# 0  0  1   2   3   4   hello
# 1  1  5   6   7   8   world
# 2  2  9  10  11  12     foo
- pandas.to_excel: write pandas data to Excel files
writer = pd.ExcelWriter("sample1.xlsx")
df.to_excel(writer, sheet_name="foo") # write to sheet 'foo'
df.to_excel(writer, sheet_name="bar") # write to sheet 'bar'
df.to_excel(writer, sheet_name="buz") # write to sheet 'buz'
writer.close() # save the file before reading it back
pd.ExcelFile("sample1.xlsx").sheet_names
# ['foo', 'bar', 'buz']
2.3.3. Reading SQL-based Databases
import sqlite3
query = "CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER );"
con = sqlite3.connect("mydata.sqlite")
con.execute(query)
# <sqlite3.Cursor at 0x7f56df665840>
con.commit()
data = [
("Atlanta", "Georgia", 1.25, 6),
("Tallahassee", "Florida", 2.6, 3),
("Sacramento", "California", 1.7, 5),
]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
# <sqlite3.Cursor at 0x7f56e074fac0>
con.commit()
cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()
# [('Atlanta', 'Georgia', 1.25, 6),
# ('Tallahassee', 'Florida', 2.6, 3),
# ('Sacramento', 'California', 1.7, 5)]
cursor.description
# (('a', None, None, None, None, None, None),
# ('b', None, None, None, None, None, None),
# ('c', None, None, None, None, None, None),
# ('d', None, None, None, None, None, None))
columns = [x[0] for x in cursor.description]
pd.DataFrame(rows, columns=columns)
# a b c d
# 0 Atlanta Georgia 1.25 6
# 1 Tallahassee Florida 2.60 3
# 2 Sacramento California 1.70 5
- pandas.read_sql: read data easily from a general SQLAlchemy connection
import sqlalchemy as sqla # conda install sqlalchemy -y

db = sqla.create_engine("sqlite:///mydata.sqlite")
pd.read_sql("SELECT * FROM test", db)
#              a           b     c  d
# 0      Atlanta     Georgia  1.25  6
# 1  Tallahassee     Florida  2.60  3
# 2   Sacramento  California  1.70  5
2.4. Data Cleaning and Preparation
2.4.1. Handling Missing Data
s = pd.Series([1, np.nan, 3.5, None, 7])
s.dropna() # remove missing values
# 0 1.0
# 2 3.5
# 4 7.0
# dtype: float64
s.fillna(0) # fill NA/NaN values (NA: not available)
# 0 1.0
# 1 0.0
# 2 3.5
# 3 0.0
# 4 7.0
# dtype: float64
s.isna()
# 0 False
# 1 True
# 2 False
# 3 True
# 4 False
# dtype: bool
s.notna()
# 0 True
# 1 False
# 2 True
# 3 False
# 4 True
# dtype: bool
data = pd.DataFrame(
[
[1.0, 6.5, 3.0],
[1.0, np.nan, np.nan],
[np.nan, np.nan, np.nan],
[np.nan, 6.5, 3.0],
]
)
# 0 1 2
# 0 1.0 6.5 3.0
# 1 1.0 NaN NaN
# 2 NaN NaN NaN
# 3 NaN 6.5 3.0
data.dropna()
# 0 1 2
# 0 1.0 6.5 3.0
data.dropna(how="all")
# 0 1 2
# 0 1.0 6.5 3.0
# 1 1.0 NaN NaN
# 3 NaN 6.5 3.0
data[4] = np.nan
# 0 1 2 4
# 0 1.0 6.5 3.0 NaN
# 1 1.0 NaN NaN NaN
# 2 NaN NaN NaN NaN
# 3 NaN 6.5 3.0 NaN
data.dropna(axis="columns", how="all")
# 0 1 2
# 0 1.0 6.5 3.0
# 1 1.0 NaN NaN
# 2 NaN NaN NaN
# 3 NaN 6.5 3.0
data = pd.DataFrame(np.random.standard_normal((7, 3)))
data.iloc[:4, 1] = np.nan
data.iloc[:2, 2] = np.nan
# 0 1 2
# 0 -0.327178 NaN NaN
# 1 1.345916 NaN NaN
# 2 -0.502837 NaN 0.894144
# 3 -0.372355 NaN -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.dropna()
# 0 1 2
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.dropna(thresh=2)
# 0 1 2
# 2 -0.502837 NaN 0.894144
# 3 -0.372355 NaN -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.ffill() # fill NA/NaN values by propagating the last valid observation forward (down the rows)
# 0 1 2
# 0 -0.327178 NaN NaN
# 1 1.345916 NaN NaN
# 2 -0.502837 NaN 0.894144
# 3 -0.372355 NaN -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.ffill(axis="columns")
# 0 1 2
# 0 -0.327178 -0.327178 -0.327178
# 1 1.345916 1.345916 1.345916
# 2 -0.502837 -0.502837 0.894144
# 3 -0.372355 -0.372355 -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.bfill() # fill NA/NaN values by using the next valid observation (down the rows) to fill the gap
# 0 1 2
# 0 -0.327178 0.484139 0.894144
# 1 1.345916 0.484139 0.894144
# 2 -0.502837 0.484139 0.894144
# 3 -0.372355 0.484139 -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.bfill(axis="columns")
# 0 1 2
# 0 -0.327178 NaN NaN
# 1 1.345916 NaN NaN
# 2 -0.502837 0.894144 0.894144
# 3 -0.372355 -1.719540 -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
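fillna accepts more than a scalar; a brief sketch with the same data (a dict fills a different value per column, and filling with the column means is a simple imputation strategy):
data.fillna({1: 0.5, 2: 0.0}) # fill column 1 with 0.5 and column 2 with 0.0
data.fillna(data.mean())      # impute each column's mean (mean skips NaN by default)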
2.4.2. Removing Duplicates
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"], "k2": [1, 1, 2, 3, 3, 4, 4]})
# k1 k2
# 0 one 1
# 1 two 1
# 2 one 2
# 3 two 3
# 4 one 3
# 5 two 4
# 6 two 4
data.duplicated()
# 0 False
# 1 False
# 2 False
# 3 False
# 4 False
# 5 False
# 6 True
# dtype: bool
data.drop_duplicates()
#     k1  k2
# 0  one   1
# 1  two   1
# 2  one   2
# 3  two   3
# 4  one   3
# 5  two   4
data["v1"] = range(7)
# k1 k2 v1
# 0 one 1 0
# 1 two 1 1
# 2 one 2 2
# 3 two 3 3
# 4 one 3 4
# 5 two 4 5
# 6 two 4 6
data.drop_duplicates(subset=["k1"])
# k1 k2 v1
# 0 one 1 0
# 1 two 1 1
data.drop_duplicates(subset=["k1"], keep="last")
# k1 k2 v1
# 4 one 3 4
# 6 two 4 6
2.4.3. Transforming Data Using a Function or Mapping
data = pd.DataFrame(
{
"food": [
"bacon",
"pulled pork",
"bacon",
"pastrami",
"corned beef",
"bacon",
"pastrami",
"honey ham",
"nova lox",
],
"ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
}
)
# food ounces
# 0 bacon 4.0
# 1 pulled pork 3.0
# 2 bacon 12.0
# 3 pastrami 6.0
# 4 corned beef 7.5
# 5 bacon 8.0
# 6 pastrami 3.0
# 7 honey ham 5.0
# 8 nova lox 6.0
meat_to_animal = {
"bacon": "pig",
"pulled pork": "pig",
"pastrami": "cow",
"corned beef": "cow",
"honey ham": "pig",
"nova lox": "salmon",
}
data["animal"] = data["food"].map(meat_to_animal) # data["food"].map(lambda x: meat_to_animal.get(x, None))
# food ounces animal
# 0 bacon 4.0 pig
# 1 pulled pork 3.0 pig
# 2 bacon 12.0 pig
# 3 pastrami 6.0 cow
# 4 corned beef 7.5 cow
# 5 bacon 8.0 pig
# 6 pastrami 3.0 cow
# 7 honey ham 5.0 pig
# 8 nova lox 6.0 salmon
2.4.4. Replacing Values
data = pd.Series([1., -999., 2., -999., -1000., 3.])
# 0 1.0
# 1 -999.0
# 2 2.0
# 3 -999.0
# 4 -1000.0
# 5 3.0
# dtype: float64
data.replace(-999, np.nan)
# 0 1.0
# 1 NaN
# 2 2.0
# 3 NaN
# 4 -1000.0
# 5 3.0
# dtype: float64
data.replace([-999, -1000], np.nan) # replace multiple values at once
# 0 1.0
# 1 NaN
# 2 2.0
# 3 NaN
# 4 NaN
# 5 3.0
# dtype: float64
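The argument can also be a dictionary mapping each value to its replacement (same Series as above):
data.replace({-999: np.nan, -1000: 0})
# 0    1.0
# 1    NaN
# 2    2.0
# 3    NaN
# 4    0.0
# 5    3.0
# dtype: float64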
2.4.5. Renaming Axis Indexes
data = pd.DataFrame(
np.arange(12).reshape((3, 4)),
index=["Ohio", "Colorado", "New York"],
columns=["one", "two", "three", "four"],
)
data.index.map(lambda x: x[:4].upper())
# Index(['OHIO', 'COLO', 'NEW '], dtype='object')
data.index = data.index.map(lambda x: x[:4].upper())
# one two three four
# OHIO 0 1 2 3
# COLO 4 5 6 7
# NEW 8 9 10 11
data.rename(index=str.title, columns=str.upper)
# ONE TWO THREE FOUR
# Ohio 0 1 2 3
# Colo 4 5 6 7
# New 8 9 10 11
2.4.6. Discretization and Binning
# Continuous data is often discretized or otherwise separated into “bins” for analysis.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
age_categories = pd.cut(ages, bins) # pandas.core.arrays.categorical.Categorical
# [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
# Length: 12
# Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
age_categories.codes
# array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
age_categories.categories
# IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
age_categories.categories[0]
# Interval(18, 25, closed='right')
age_categories.value_counts()
# (18, 25] 5
# (25, 35] 3
# (35, 60] 3
# (60, 100] 1
# Name: count, dtype: int64
pd.cut(ages, bins, right=False).categories
# IntervalIndex([[18, 25), [25, 35), [35, 60), [60, 100)], dtype='interval[int64, left]')
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages, bins, right=False, labels=group_names).categories
# Index(['Youth', 'YoungAdult', 'MiddleAged', 'Senior'], dtype='object')
data = np.random.uniform(size=20)
# array([0.34500722, 0.51390559, 0.45442101, 0.24779283, 0.11860691,
# 0.64541406, 0.95682711, 0.69983468, 0.661519 , 0.09869171,
# 0.12846634, 0.10438803, 0.6110211 , 0.28152758, 0.40054146,
# 0.36215715, 0.29260695, 0.0516599 , 0.7291701 , 0.84767151])
pd.cut(data, 4, precision=2).categories # equal-length bins based on the minimum and maximum values
# IntervalIndex([(0.051, 0.28], (0.28, 0.5], (0.5, 0.73], (0.73, 0.96]], dtype='interval[float64, right]')
pd.qcut(data, 4, precision=2).value_counts()
# (0.041999999999999996, 0.22] 5
# (0.22, 0.38] 5
# (0.38, 0.65] 5
# (0.65, 0.96] 5
# Name: count, dtype: int64
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()
# (0.0507, 0.104] 2
# (0.104, 0.381] 8
# (0.381, 0.741] 8
# (0.741, 0.957] 2
# Name: count, dtype: int64
2.4.7. Detecting and Filtering Outliers
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()
# 0 1 2 3
# count 1000.000000 1000.000000 1000.000000 1000.000000
# mean -0.002266 -0.036654 -0.017418 -0.029068
# std 1.018625 0.950048 0.994702 1.041082
# min -3.421405 -3.833754 -3.642241 -3.125454
# 25% -0.663947 -0.694460 -0.659706 -0.762197
# 50% -0.024047 -0.005600 -0.025911 -0.024954
# 75% 0.700707 0.603613 0.663005 0.701335
# max 3.134904 2.526298 2.994854 3.704972
data[(data.abs() > 3).any(axis="columns")] # select all rows having a value whose absolute value exceeds 3
# 0 1 2 3
# 64 0.769583 -3.833754 -0.284691 0.283270
# 176 -3.198594 0.745868 -0.430357 -0.374628
# 207 0.823846 -0.171761 1.137797 3.704972
# 416 -3.024055 0.560612 1.320891 -0.691014
# 790 -3.421405 0.516339 0.049209 1.016584
# 915 -0.543984 0.844945 -3.010801 -0.165290
# 916 3.134904 0.091651 0.908098 1.685942
# 945 3.073425 -0.943425 -1.711061 0.573415
# 955 -1.375006 -0.344609 -3.642241 -0.823830
# 981 0.507220 0.136884 0.880373 -3.125454
data[data.abs() > 3] = np.sign(data) * 3 # cap values outside the interval [–3, 3]
data.describe()
# 0 1 2 3
# count 1000.000000 1000.000000 1000.000000 1000.000000
# mean -0.001830 -0.035821 -0.016765 -0.029648
# std 1.015991 0.947074 0.992531 1.038420
# min -3.000000 -3.000000 -3.000000 -3.000000
# 25% -0.663947 -0.694460 -0.659706 -0.762197
# 50% -0.024047 -0.005600 -0.025911 -0.024954
# 75% 0.700707 0.603613 0.663005 0.701335
# max 3.000000 2.526298 2.994854 3.000000
np.sign(data).head() # produces 1 and –1 values based on positive or negative
# 0 1 2 3
# 0 -1.0 1.0 -1.0 1.0
# 1 -1.0 -1.0 -1.0 -1.0
# 2 1.0 -1.0 -1.0 1.0
# 3 -1.0 -1.0 1.0 -1.0
# 4 1.0 1.0 1.0 -1.0
2.4.8. Permutation and Random Sampling
data = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
# 0 1 2 3 4 5 6
# 0 0 1 2 3 4 5 6
# 1 7 8 9 10 11 12 13
# 2 14 15 16 17 18 19 20
# 3 21 22 23 24 25 26 27
# 4 28 29 30 31 32 33 34
sampler = np.random.permutation(5)
# array([2, 4, 3, 0, 1])
data.iloc[sampler] # equivalent: data.take(sampler)
#     0   1   2   3   4   5   6
# 2  14  15  16  17  18  19  20
# 4  28  29  30  31  32  33  34
# 3  21  22  23  24  25  26  27
# 0   0   1   2   3   4   5   6
# 1   7   8   9  10  11  12  13
column_sampler = np.random.permutation(7)
# array([3, 5, 2, 4, 6, 0, 1])
data.take(column_sampler, axis="columns")
# 3 5 2 4 6 0 1
# 0 3 5 2 4 6 0 1
# 1 10 12 9 11 13 7 8
# 2 17 19 16 18 20 14 15
# 3 24 26 23 25 27 21 22
# 4 31 33 30 32 34 28 29
data.sample(n=3) # select a random subset without replacement
# 0 1 2 3 4 5 6
# 0 0 1 2 3 4 5 6
# 2 14 15 16 17 18 19 20
# 4 28 29 30 31 32 33 34
data.sample(n=7, replace=True) # select a random subset with replacement (to allow repeat choices)
# 0 1 2 3 4 5 6
# 3 21 22 23 24 25 26 27
# 0 0 1 2 3 4 5 6
# 1 7 8 9 10 11 12 13
# 0 0 1 2 3 4 5 6
# 4 28 29 30 31 32 33 34
# 4 28 29 30 31 32 33 34
# 3 21 22 23 24 25 26 27
2.4.9. Computing Indicator/Dummy Variables: One-Hot Encoding
data = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"], "data1": range(6)})
# key data1
# 0 b 0
# 1 b 1
# 2 a 2
# 3 c 3
# 4 a 4
# 5 b 5
pd.get_dummies(data["key"], dtype=int)
# a b c
# 0 0 1 0
# 1 0 1 0
# 2 1 0 0
# 3 0 0 1
# 4 1 0 0
# 5 0 1 0
data.join(pd.get_dummies(data["key"], dtype=int))
# key data1 a b c
# 0 b 0 0 1 0
# 1 b 1 0 1 0
# 2 a 2 1 0 0
# 3 c 3 0 0 1
# 4 a 4 1 0 0
# 5 b 5 0 1 0
movies = pd.read_table(
"https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/datasets/movielens/movies.dat",
sep="::",
header=None,
names=["movie_id", "title", "genres"],
engine="python",
)
movies.head()
# movie_id title genres
# 0 1 Toy Story (1995) Animation|Children's|Comedy
# 1 2 Jumanji (1995) Adventure|Children's|Fantasy
# 2 3 Grumpier Old Men (1995) Comedy|Romance
# 3 4 Waiting to Exhale (1995) Comedy|Drama
# 4 5 Father of the Bride Part II (1995) Comedy
dummies = movies["genres"].str.get_dummies("|") # string functions in pandas
dummies.iloc[:5,:6]
# Action Adventure Animation Children's Comedy Crime
# 0 0 0 1 1 1 0
# 1 0 1 0 1 0 0
# 2 0 0 0 0 1 0
# 3 0 0 0 0 1 0
# 4 0 0 0 0 1 0
movies_windic = movies.join(dummies.add_prefix("Genre_"))
movies_windic.iloc[0][:10]
# movie_id 1
# title Toy Story (1995)
# genres Animation|Children's|Comedy
# Genre_Action 0
# Genre_Adventure 0
# Genre_Animation 1
# Genre_Children's 1
# Genre_Comedy 1
# Genre_Crime 0
# Genre_Documentary 0
# Name: 0, dtype: object
np.random.seed(12345) # to make the example repeatable
values = np.random.uniform(size=10)
# array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
# 0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins), dtype=int)
# (0.0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] (0.8, 1.0]
# 0 0 0 0 0 1
# 1 0 1 0 0 0
# 2 1 0 0 0 0
# 3 0 1 0 0 0
# 4 0 0 1 0 0
# 5 0 0 1 0 0
# 6 0 0 0 0 1
# 7 0 0 0 1 0
# 8 0 0 0 1 0
# 9 0 0 0 1 0
2.4.10. String Functions in pandas
data = {
"Dave": "dave@google.com",
"Steve": "steve@gmail.com",
"Rob": "rob@gmail.com",
"Wes": np.nan,
}
data = pd.Series(data)
# Dave dave@google.com
# Steve steve@gmail.com
# Rob rob@gmail.com
# Wes NaN
# dtype: object
data.isna()
# Dave False
# Steve False
# Rob False
# Wes True
# dtype: bool
data.astype("string") # extension types
# Dave dave@google.com
# Steve steve@gmail.com
# Rob rob@gmail.com
# Wes <NA>
# dtype: string
data.str.contains("gmail") # .str attribute
# Dave False
# Steve True
# Rob True
# Wes NaN
# dtype: object
import re

pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
data.str.findall(pattern, flags=re.IGNORECASE) # regular expressions
# Dave [(dave, google, com)]
# Steve [(steve, gmail, com)]
# Rob [(rob, gmail, com)]
# Wes NaN
# dtype: object
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0] # vectorized element retrieval
# Dave (dave, google, com)
# Steve (steve, gmail, com)
# Rob (rob, gmail, com)
# Wes NaN
# dtype: object
matches.str.get(1)
# Dave google
# Steve gmail
# Rob gmail
# Wes NaN
# dtype: object
data.str[:5] # slice strings
# Dave dave@
# Steve steve
# Rob rob@g
# Wes NaN
# dtype: object
data.str.extract(pattern, flags=re.IGNORECASE) # extract capture groups
# 0 1 2
# Dave dave google com
# Steve steve gmail com
# Rob rob gmail com
# Wes NaN NaN NaN
2.4.11. Categorical Encoding
categories = pd.Series(["apple", "orange"]) # categories, dictionary, or levels
# 0 apple
# 1 orange
# dtype: object
codes = pd.Series([0, 1, 0, 0] * 2) # category codes, or simply codes
# 0 0
# 1 1
# 2 0
# 3 0
# 4 0
# 5 1
# 6 0
# 7 0
# dtype: int64
values = categories.take(codes)
# 0 apple
# 1 orange
# 0 apple
# 0 apple
# 0 apple
# 1 orange
# 0 apple
# 0 apple
# dtype: object
# pandas has a Categorical extension type for integer-based categorical data, also called dictionary-encoded data
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
rng = np.random.default_rng(seed=12345)
data = pd.DataFrame(
{
"fruit": fruits,
"basket_id": np.arange(N),
"count": rng.integers(3, 15, size=N),
"weight": rng.uniform(0, 4, size=N),
},
columns=["basket_id", "fruit", "count", "weight"],
)
# basket_id fruit count weight
# 0 0 apple 8 0.366659
# 1 1 orange 6 2.394272
# 2 2 apple 9 3.418968
# 3 3 apple 12 2.406485
# 4 4 apple 8 3.727953
# 5 5 orange 5 2.899125
# 6 6 apple 3 3.442205
# 7 7 apple 4 3.717351
fruit_cat = data["fruit"].astype("category")
# 0 apple
# 1 orange
# 2 apple
# 3 apple
# 4 apple
# 5 orange
# 6 apple
# 7 apple
# Name: fruit, dtype: category
# Categories (2, object): ['apple', 'orange']
c = fruit_cat.array
type(c) # pandas.core.arrays.categorical.Categorical
c.categories # Index(['apple', 'orange'], dtype='object')
c.codes # array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)
dict(enumerate(c.categories)) # get a mapping between codes and categories
# {0: 'apple', 1: 'orange'}
pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])
# ['foo', 'bar', 'baz', 'foo', 'bar']
# Categories (3, object): ['bar', 'baz', 'foo']
pd.Categorical.from_codes(codes=[0, 1, 2, 0, 0, 1], categories=["foo", "bar", "baz"])
# ['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
# Categories (3, object): ['foo', 'bar', 'baz']
pd.Categorical.from_codes(codes=[0, 1, 2, 0, 0, 1], categories=["foo", "bar", "baz"], ordered=True)
# ['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
# Categories (3, object): ['foo' < 'bar' < 'baz']
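# an unordered categorical can be given an ordering after the fact with as_ordered() (a short sketch):
pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar']).as_ordered()
# ['foo', 'bar', 'baz', 'foo', 'bar']
# Categories (3, object): ['bar' < 'baz' < 'foo']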
# `.cat` accessor attribute
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2).astype("category")
cat_s.cat.categories
# Index(['a', 'b', 'c', 'd'], dtype='object')
cat_s.cat.codes
# 0 0
# 1 1
# 2 2
# 3 3
# 4 0
# 5 1
# 6 2
# 7 3
# dtype: int8
cat_s.value_counts()
# a 2
# b 2
# c 2
# d 2
# Name: count, dtype: int64
cat_s2 = cat_s.cat.set_categories(["a", "b", "c", "d", "e"])
cat_s2.value_counts()
# a 2
# b 2
# c 2
# d 2
# e 0
# Name: count, dtype: int64
cat_s3 = cat_s[cat_s.isin(["a", "b"])]
# 0 a
# 1 b
# 4 a
# 5 b
# dtype: category
# Categories (4, object): ['a', 'b', 'c', 'd']
cat_s3.cat.remove_unused_categories()
# 0 a
# 1 b
# 4 a
# 5 b
# dtype: category
# Categories (2, object): ['a', 'b']
draws = np.random.default_rng(seed=12345).standard_normal(1_000)
# array([-1.42382504,  1.26372846, -0.87066174, -0.25917323, -0.07534331, ...])
bins = pd.qcut(draws, 4) # compute a quartile binning
# [(-3.121, -0.675], (0.687, 3.211], (-3.121, -0.675], (-0.675, 0.0134], (-0.675, 0.0134], ..., (0.0134, 0.687], (0.0134, 0.687], (-0.675, 0.0134], (0.0134, 0.687], (-0.675, 0.0134]]
# Length: 1000
# Categories (4, interval[float64, right]): [(-3.121, -0.675] < (-0.675, 0.0134] < (0.0134, 0.687] < (0.687, 3.211]]
bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4']) # quartile names
# ['Q1', 'Q4', 'Q1', 'Q2', 'Q2', ..., 'Q3', 'Q3', 'Q2', 'Q3', 'Q2']
# Length: 1000
# Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']
bins = pd.Series(bins, name='quartile')
# 0 Q1
# 1 Q4
# 2 Q1
# ..
# 998 Q3
# 999 Q2
# Name: quartile, Length: 1000, dtype: category
# Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']
results = pd.Series(draws).groupby(bins, observed=True).agg(["count", "min", "max"]).reset_index() # statistics
# quartile count min max
# 0 Q1 250 -3.119609 -0.678494
# 1 Q2 250 -0.673305 0.008009
# 2 Q3 250 0.018753 0.686183
# 3 Q4 250 0.688282 3.211418
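# Categoricals can also cut memory use substantially when a column has few
# distinct values; a rough sketch (exact byte counts vary by platform and
# pandas version):
n = 10_000_000
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (n // 4))
labels.memory_usage(deep=True)                     # roughly 600 MB as Python objects
labels.astype('category').memory_usage(deep=True)  # roughly 10 MB: int8 codes plus a small dictionary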
# dummy variables or one-hot encoding
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')
pd.get_dummies(cat_s, dtype=int)
# a b c d
# 0 1 0 0 0
# 1 0 1 0 0
# 2 0 0 1 0
# 3 0 0 0 1
# 4 1 0 0 0
# 5 0 1 0 0
# 6 0 0 1 0
# 7 0 0 0 1
2.5. Data Wrangling: Join, Combine, and Reshape
2.5.1. Hierarchical Indexing
data = pd.Series(
np.random.uniform(size=5),
index=[
["a", "a", "b", "b", "b"],
[ 2, 3, 1, 3, 1]
],
)
data # a prettified view
# a 2 0.911650
# 3 0.316521
# b 1 0.049235
# 3 0.995199
# 1 0.602722
# dtype: float64
data.index
# MultiIndex([('a', 2),
# ('a', 3),
# ('b', 1),
# ('b', 3),
# ('b', 1)],
# )
data["b"] # partial indexing
# 1 0.049235
# 3 0.995199
# 1 0.602722
# dtype: float64
data.loc[:, 3] # level indexing
# a 0.316521
# b 0.995199
# dtype: float64
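# A hierarchically indexed Series can be rearranged into a DataFrame with
# unstack(); note that unstack() requires unique index entries, and the series
# above contains ('b', 1) twice, so this sketch uses a deduplicated variant:
s = pd.Series(np.arange(4.0), index=[["a", "a", "b", "b"], [1, 2, 1, 2]])
s.unstack()
#      1    2
# a  0.0  1.0
# b  2.0  3.0
s.unstack().stack()  # stack() is the inverse operation
# a  1    0.0
#    2    1.0
# b  1    2.0
#    2    3.0
# dtype: float64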
df = pd.DataFrame(
np.arange(12).reshape((4, 3)),
index=[
["a", "a", "b", "b"],
[ 1, 2, 1, 2]
],
columns=[
[ "Ohio", "Ohio", "Colorado"],
["Green", "Red", "Green"]
],
)
# Ohio Colorado
# Green Red Green
# a 1 0 1 2
# 2 3 4 5
# b 1 6 7 8
# 2 9 10 11
df.index.names = ["key1", "key2"]
df.columns.names = ["state", "color"]
# state Ohio Colorado
# color Green Red Green
# key1 key2
# a 1 0 1 2
# 2 3 4 5
# b 1 6 7 8
# 2 9 10 11
df.index.nlevels
# 2
data["Ohio"]
# color Green Red
# key key2
# a 1 0 1
# 2 3 4
# b 1 6 7
# 2 9 10
pd.MultiIndex.from_tuples(
[
("2023-01-01", "North"),
("2023-01-01", "South"),
("2023-01-02", "North"),
("2023-01-02", "South"),
],
names=["Date", "Region"],
)
# MultiIndex([('2023-01-01', 'North'),
# ('2023-01-01', 'South'),
# ('2023-01-02', 'North'),
# ('2023-01-02', 'South')],
# names=['Date', 'Region'])
pd.MultiIndex.from_arrays(
[
[ "Ohio", "Ohio", "Colorado"],
["Green", "Red", "Green"],
],
names=["state", "color"],
)
# MultiIndex([( 'Ohio', 'Green'),
# ( 'Ohio', 'Red'),
# ('Colorado', 'Green')],
# names=['state', 'color'])
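# pd.MultiIndex.from_product builds the Cartesian product of several
# iterables, handy when every combination of levels should exist (a sketch):
pd.MultiIndex.from_product(
    [["Ohio", "Colorado"], ["Green", "Red"]],
    names=["state", "color"],
)
# MultiIndex([(    'Ohio', 'Green'),
#             (    'Ohio',   'Red'),
#             ('Colorado', 'Green'),
#             ('Colorado',   'Red')],
#            names=['state', 'color'])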
- Pivoting, Stacking, and Unstacking
data = { "Date": ["2023-01-01", "2023-01-01", "2023-01-02", "2023-01-02"], "City": ["New York", "Los Angeles", "New York", "Los Angeles"], "Temperature": [30, 75, 28, 77], "Humidity": [65, 30, 70, 35], } data = pd.DataFrame(data) # Date City Temperature Humidity # 0 2023-01-01 New York 30 65 # 1 2023-01-01 Los Angeles 75 30 # 2 2023-01-02 New York 28 70 # 3 2023-01-02 Los Angeles 77 35
# pandas.pivot(...)
# - reshapes data by turning unique values from one column into columns
# - creates a table where rows and columns represent specific categories, and the values fill the cells
#   (pivot itself does not aggregate; use pivot_table for aggregation)
pivoted = data.pivot(index="Date", columns="City", values=["Temperature", "Humidity"])
#            Temperature             Humidity
# City       Los Angeles New York Los Angeles New York
# Date
# 2023-01-01          75       30          30       65
# 2023-01-02          77       28          35       70
pivoted.columns
# MultiIndex([('Temperature', 'Los Angeles'),
#             ('Temperature',    'New York'),
#             (   'Humidity', 'Los Angeles'),
#             (   'Humidity',    'New York')],
#            names=[None, 'City'])
pivoted_t = data.pivot(index="Date", columns="City", values="Temperature")
# City        Los Angeles  New York
# Date
# 2023-01-01           75        30
# 2023-01-02           77        28
# pandas.stack(...)
# - converts columns into rows, creating a hierarchical index
# - compacts a DataFrame by stacking columns into a single column
stacked = pivoted_t.stack()
# Date        City
# 2023-01-01  Los Angeles    75
#             New York       30
# 2023-01-02  Los Angeles    77
#             New York       28
# dtype: int64
stacked.index
# MultiIndex([('2023-01-01', 'Los Angeles'),
#             ('2023-01-01',    'New York'),
#             ('2023-01-02', 'Los Angeles'),
#             ('2023-01-02',    'New York')],
#            names=['Date', 'City'])
# pandas.unstack(...)
# - converts rows into columns, the reverse of stack
# - expands a DataFrame by unstacking a hierarchical index into columns
stacked.unstack()
# City        Los Angeles  New York
# Date
# 2023-01-01           75        30
# 2023-01-02           77        28
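# unstack() pivots the innermost index level by default; passing level=
# selects a different one. A sketch that moves "Date" into the columns
# instead of "City":
stacked.unstack(level="Date")
# Date         2023-01-01  2023-01-02
# City
# Los Angeles          75          77
# New York             30          28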
- Reordering and Sorting Levels
df.swaplevel("key1", "key2") # state Ohio Colorado # color Green Red Green # key2 key1 # 1 a 0 1 2 # 2 a 3 4 5 # 1 b 6 7 8 # 2 b 9 10 11 df.sort_index(level=1) # state Ohio Colorado # color Green Red Green # key1 key2 # a 1 0 1 2 # b 1 6 7 8 # a 2 3 4 5 # b 2 9 10 11 df.swaplevel(0, 1) # state Ohio Colorado # color Green Red Green # key2 key1 # 1 a 0 1 2 # 2 a 3 4 5 # 1 b 6 7 8 # 2 b 9 10 11 df.swaplevel(0, 1).sort_index(level=0) # state Ohio Colorado # color Green Red Green # key2 key1 # 1 a 0 1 2 # b 6 7 8 # 2 a 3 4 5 # b 9 10 11
- Summary Statistics by Level
df.groupby(level="key2").sum() # state Ohio Colorado # color Green Red Green # key2 # 1 6 8 10 # 2 12 14 16 df.T.groupby(level="color").sum().T # color Green Red # key1 key2 # a 1 2 1 # 2 8 4 # b 1 14 7 # 2 20 10
- Indexing with a DataFrame’s columns
data = pd.DataFrame(
    {
        "a": range(7),
        "b": range(7, 0, -1),
        "c": ["one", "one", "one", "two", "two", "two", "two"],
        "d": [0, 1, 2, 0, 1, 2, 3],
    }
)
#    a  b    c  d
# 0  0  7  one  0
# 1  1  6  one  1
# 2  2  5  one  2
# 3  3  4  two  0
# 4  4  3  two  1
# 5  5  2  two  2
# 6  6  1  two  3
data2 = data.set_index(["c", "d"])  # set a DataFrame index using existing columns
#        a  b
# c   d
# one 0  0  7
#     1  1  6
#     2  2  5
# two 0  3  4
#     1  4  3
#     2  5  2
#     3  6  1
data2.reset_index()  # reset the hierarchical index levels into the columns
#      c  d  a  b
# 0  one  0  0  7
# 1  one  1  1  6
# 2  one  2  2  5
# 3  two  0  3  4
# 4  two  1  4  3
# 5  two  2  5  2
# 6  two  3  6  1
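# By default set_index removes the columns used as the index; drop=False
# keeps them in place (a sketch on the same data):
data.set_index(["c", "d"], drop=False)
#        a  b    c  d
# c   d
# one 0  0  7  one  0
#     1  1  6  one  1
#     2  2  5  one  2
# two 0  3  4  two  0
#     1  4  3  two  1
#     2  5  2  two  2
#     3  6  1  two  3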
2.5.2. Combining and Merging Datasets
- Database-Style DataFrame Joins
left = pd.DataFrame(
    {
        "key1": ["foo", "foo", "bar"],
        "key2": ["one", "two", "one"],
        "lval": pd.Series([1, 2, 3], dtype="Int64"),
    }
)
#   key1 key2  lval
# 0  foo  one     1
# 1  foo  two     2
# 2  bar  one     3
right = pd.DataFrame(
    {
        "key1": ["foo", "foo", "bar", "bar"],
        "key2": ["one", "one", "one", "two"],
        "rval": pd.Series([4, 5, 6, 7], dtype="Int64"),
    }
)
#   key1 key2  rval
# 0  foo  one     4
# 1  foo  one     5
# 2  bar  one     6
# 3  bar  two     7
pd.merge(left, right)  # same as pd.merge(left, right, on=["key1", "key2"]), which names the overlapping columns explicitly
#   key1 key2  lval  rval
# 0  foo  one     1     4
# 1  foo  one     1     5
# 2  bar  one     3     6
pd.merge(left, right, left_on=["key1", "key2"], right_on=["key1", "key2"])  # specify the left and right keys separately
#   key1 key2  lval  rval
# 0  foo  one     1     4
# 1  foo  one     1     5
# 2  bar  one     3     6
pd.merge(left, right, on=["key1", "key2"], how="inner")  # inner join (the default)
#   key1 key2  lval  rval
# 0  foo  one     1     4
# 1  foo  one     1     5
# 2  bar  one     3     6
pd.merge(left, right, on=["key1", "key2"], how="left")  # left join
#   key1 key2  lval  rval
# 0  foo  one     1     4
# 1  foo  one     1     5
# 2  foo  two     2  <NA>
# 3  bar  one     3     6
pd.merge(left, right, on=["key1", "key2"], how="right")  # right join
#   key1 key2  lval  rval
# 0  foo  one     1     4
# 1  foo  one     1     5
# 2  bar  one     3     6
# 3  bar  two  <NA>     7
pd.merge(left, right, on=["key1", "key2"], how="outer")  # outer join
#   key1 key2  lval  rval
# 0  bar  one     3     6
# 1  bar  two  <NA>     7
# 2  foo  one     1     4
# 3  foo  one     1     5
# 4  foo  two     2  <NA>
pd.merge(left, right, on=["key1", "key2"], how="outer", indicator=True)  # source indicator
#   key1 key2  lval  rval      _merge
# 0  bar  one     3     6        both
# 1  bar  two  <NA>     7  right_only
# 2  foo  one     1     4        both
# 3  foo  one     1     5        both
# 4  foo  two     2  <NA>   left_only
pd.merge(left, right, on="key1")  # specify a single join key
#   key1 key2_x  lval key2_y  rval
# 0  foo    one     1    one     4
# 1  foo    one     1    one     5
# 2  foo    two     2    one     4
# 3  foo    two     2    one     5
# 4  bar    one     3    one     6
# 5  bar    one     3    two     7
pd.merge(left, right, on="key1", suffixes=("_left", "_right"))  # specify the suffixes for the overlapping columns
#   key1 key2_left  lval key2_right  rval
# 0  foo       one     1        one     4
# 1  foo       one     1        one     5
# 2  foo       two     2        one     4
# 3  foo       two     2        one     5
# 4  bar       one     3        one     6
# 5  bar       one     3        two     7
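# pandas (1.2+) also supports a cross join, the Cartesian product of both
# tables, via how="cross"; no keys are used, and overlapping column names
# get the usual suffixes. A short sketch (first rows shown):
pd.merge(left, right, how="cross")  # 3 x 4 = 12 rows
#   key1_x key2_x  lval key1_y key2_y  rval
# 0    foo    one     1    foo    one     4
# 1    foo    one     1    foo    one     5
# 2    foo    one     1    bar    one     6
# ...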
- Merging on Index
left1 = pd.DataFrame(
    {"key": ["a", "b", "a", "a", "b", "c"], "value": pd.Series(range(6), dtype="Int64")}
)
#   key  value
# 0   a      0
# 1   b      1
# 2   a      2
# 3   a      3
# 4   b      4
# 5   c      5
right1 = pd.DataFrame({"group_val": [3.5, 7]}, index=["a", "b"])
#    group_val
# a        3.5
# b        7.0
pd.merge(left1, right1, left_on="key", right_index=True)  # merge on index
#   key  value  group_val
# 0   a      0        3.5
# 1   b      1        7.0
# 2   a      2        3.5
# 3   a      3        3.5
# 4   b      4        7.0
pd.merge(right1, left1, left_index=True, right_on="key", how="outer")
#    group_val key  value
# 0        3.5   a      0
# 2        3.5   a      2
# 3        3.5   a      3
# 1        7.0   b      1
# 4        7.0   b      4
# 5        NaN   c      5
left2 = pd.DataFrame(
    [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
    index=["a", "c", "e"],
    columns=["Ohio", "Nevada"],
).astype("Int64")
right2 = pd.DataFrame(
    [[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [13, 14]],
    index=["b", "c", "d", "e"],
    columns=["Missouri", "Alabama"],
).astype("Int64")
pd.merge(left2, right2, left_index=True, right_index=True, how="outer")  # use the indexes of both sides
#    Ohio  Nevada  Missouri  Alabama
# a     1       2      <NA>     <NA>
# b  <NA>    <NA>         7        8
# c     3       4         9       10
# d  <NA>    <NA>        11       12
# e     5       6        13       14
left2.join(right2, how="outer")  # join: a shortcut for merging by index
#    Ohio  Nevada  Missouri  Alabama
# a     1       2      <NA>     <NA>
# b  <NA>    <NA>         7        8
# c     3       4         9       10
# d  <NA>    <NA>        11       12
# e     5       6        13       14
another = pd.DataFrame(
    [[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [16.0, 17.0]],
    index=["a", "c", "e", "f"],
    columns=["New York", "Oregon"],
)
#    New York  Oregon
# a       7.0     8.0
# c       9.0    10.0
# e      11.0    12.0
# f      16.0    17.0
left2.join([right2, another])
#    Ohio  Nevada  Missouri  Alabama  New York  Oregon
# a     1       2      <NA>     <NA>       7.0     8.0
# c     3       4         9       10       9.0    10.0
# e     5       6        13       14      11.0    12.0
lefth = pd.DataFrame(
    {
        "key1": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada"],
        "key2": [2000, 2001, 2002, 2001, 2002],
        "data": pd.Series(range(5), dtype="Int64"),
    }
)
#      key1  key2  data
# 0    Ohio  2000     0
# 1    Ohio  2001     1
# 2    Ohio  2002     2
# 3  Nevada  2001     3
# 4  Nevada  2002     4
righth_index = pd.MultiIndex.from_arrays(
    [
        ["Nevada", "Nevada", "Ohio", "Ohio", "Ohio", "Ohio"],
        [2001, 2000, 2000, 2000, 2001, 2002],
    ]
)
righth = pd.DataFrame(
    {
        "event1": pd.Series([0, 2, 4, 6, 8, 10], dtype="Int64", index=righth_index),
        "event2": pd.Series([1, 3, 5, 7, 9, 11], dtype="Int64", index=righth_index),
    }
)
#              event1  event2
# Nevada 2001       0       1
#        2000       2       3
# Ohio   2000       4       5
#        2000       6       7
#        2001       8       9
#        2002      10      11
pd.merge(lefth, righth, left_on=["key1", "key2"], right_index=True)  # multiple-key merge
#      key1  key2  data  event1  event2
# 0    Ohio  2000     0       4       5
# 0    Ohio  2000     0       6       7
# 1    Ohio  2001     1       8       9
# 2    Ohio  2002     2      10      11
# 3  Nevada  2001     3       0       1
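# join can also align a key column of the caller against the other frame's
# index via on=, defaulting to a left join (a sketch with the frames above):
left1.join(right1, on="key")
#   key  value  group_val
# 0   a      0        3.5
# 1   b      1        7.0
# 2   a      2        3.5
# 3   a      3        3.5
# 4   b      4        7.0
# 5   c      5        NaN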
- Concatenating Along an Axis
# numpy.concatenate(...)
arr = np.arange(12).reshape((3, 4))
# array([[ 0,  1,  2,  3],
#        [ 4,  5,  6,  7],
#        [ 8,  9, 10, 11]])
np.concatenate([arr, arr], axis=1)
# array([[ 0,  1,  2,  3,  0,  1,  2,  3],
#        [ 4,  5,  6,  7,  4,  5,  6,  7],
#        [ 8,  9, 10, 11,  8,  9, 10, 11]])
# pandas.concat(...)
s1 = pd.Series([0, 1], index=["a", "b"], dtype="Int64")
# a    0
# b    1
# dtype: Int64
s2 = pd.Series([2, 3, 4], index=["c", "d", "e"], dtype="Int64")
# c    2
# d    3
# e    4
# dtype: Int64
s3 = pd.Series([5, 6], index=["f", "g"], dtype="Int64")
# f    5
# g    6
# dtype: Int64
pd.concat([s1, s2, s3])  # stack the values along the index (like a SQL UNION ALL)
# a    0
# b    1
# c    2
# d    3
# e    4
# f    5
# g    6
# dtype: Int64
pd.concat([s1, s2, s3], axis="columns")  # union (outer join) of the indexes
#       0     1     2
# a     0  <NA>  <NA>
# b     1  <NA>  <NA>
# c  <NA>     2  <NA>
# d  <NA>     3  <NA>
# e  <NA>     4  <NA>
# f  <NA>  <NA>     5
# g  <NA>  <NA>     6
s4 = pd.concat([s1, s3])
# a    0
# b    1
# f    5
# g    6
# dtype: Int64
pd.concat([s1, s4], axis="columns")
#       0  1
# a     0  0
# b     1  1
# f  <NA>  5
# g  <NA>  6
pd.concat([s1, s4], axis="columns", join="inner")
#    0  1
# a  0  0
# b  1  1
# use the keys to indicate the concatenated pieces
result = pd.concat([s1, s1, s3], keys=["one", "two", "three"])
# one    a    0
#        b    1
# two    a    0
#        b    1
# three  f    5
#        g    6
# dtype: Int64
result.unstack()
#           a     b     f     g
# one       0     1  <NA>  <NA>
# two       0     1  <NA>  <NA>
# three  <NA>  <NA>     5     6
pd.concat([s1, s2, s3], axis="columns", keys=["one", "two", "three"])
#     one   two  three
# a     0  <NA>   <NA>
# b     1  <NA>   <NA>
# c  <NA>     2   <NA>
# d  <NA>     3   <NA>
# e  <NA>     4   <NA>
# f  <NA>  <NA>      5
# g  <NA>  <NA>      6
df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2), index=["a", "b", "c"], columns=["one", "two"]
)
#    one  two
# a    0    1
# b    2    3
# c    4    5
df2 = pd.DataFrame(
    5 + np.arange(4).reshape(2, 2), index=["a", "c"], columns=["three", "four"]
)
#    three  four
# a      5     6
# c      7     8
pd.concat([df1, df2])
#    one  two  three  four
# a  0.0  1.0    NaN   NaN
# b  2.0  3.0    NaN   NaN
# c  4.0  5.0    NaN   NaN
# a  NaN  NaN    5.0   6.0
# c  NaN  NaN    7.0   8.0
pd.concat([df1, df2], axis="columns")
#    one  two  three  four
# a    0    1    5.0   6.0
# b    2    3    NaN   NaN
# c    4    5    7.0   8.0
pd.concat([df1, df2], axis="columns", keys=["level1", "level2"])
#   level1     level2
#      one two  three four
# a      0   1    5.0  6.0
# b      2   3    NaN  NaN
# c      4   5    7.0  8.0
pd.concat({"level1": df1, "level2": df2}, axis="columns")
#   level1     level2
#      one two  three four
# a      0   1    5.0  6.0
# b      2   3    NaN  NaN
# c      4   5    7.0  8.0
pd.concat([df1, df2], axis="columns", keys=["level1", "level2"], names=["upper", "lower"])
# upper level1     level2
# lower    one two  three four
# a          0   1    5.0  6.0
# b          2   3    NaN  NaN
# c          4   5    7.0  8.0
# do not preserve indexes along the concatenation axis; instead produce a new range(total_length) index
df1 = pd.DataFrame(np.random.standard_normal((3, 4)), columns=["a", "b", "c", "d"])
#           a         b         c         d
# 0  2.410815 -0.872558  1.215403  1.321002
# 1 -0.046726  0.088367  0.255645 -1.442833
# 2  0.001280  0.561726 -0.992764 -0.830453
df2 = pd.DataFrame(np.random.standard_normal((2, 3)), columns=["b", "d", "a"])
#           b         d         a
# 0 -1.281585 -1.052020  0.925319
# 1 -0.711210 -0.424939 -0.619031
pd.concat([df1, df2], ignore_index=True)
#           a         b         c         d
# 0  2.410815 -0.872558  1.215403  1.321002
# 1 -0.046726  0.088367  0.255645 -1.442833
# 2  0.001280  0.561726 -0.992764 -0.830453
# 3  0.925319 -1.281585       NaN -1.052020
# 4 -0.619031 -0.711210       NaN -0.424939
- Combining Data with Overlap
a = pd.Series(
    [np.nan, 2.5, 0.0, 3.5, 4.5, np.nan], index=["f", "e", "d", "c", "b", "a"]
)
# f    NaN
# e    2.5
# d    0.0
# c    3.5
# b    4.5
# a    NaN
# dtype: float64
b = pd.Series(
    [0.0, np.nan, 2.0, np.nan, np.nan, 5.0], index=["a", "b", "c", "d", "e", "f"]
)
# a    0.0
# b    NaN
# c    2.0
# d    NaN
# e    NaN
# f    5.0
# dtype: float64
np.where(pd.isna(a), b, a)  # does not align by index label
# array([0. , 2.5, 0. , 3.5, 4.5, 5. ])
a.combine_first(b)  # lines up values by index
# a    0.0
# b    4.5
# c    3.5
# d    0.0
# e    2.5
# f    5.0
# dtype: float64
df1 = pd.DataFrame(
    {
        "a": [1.0, np.nan, 5.0, np.nan],
        "b": [np.nan, 2.0, np.nan, 6.0],
        "c": range(2, 18, 4),
    }
)
#      a    b   c
# 0  1.0  NaN   2
# 1  NaN  2.0   6
# 2  5.0  NaN  10
# 3  NaN  6.0  14
df2 = pd.DataFrame(
    {
        "a": [5.0, 4.0, np.nan, 3.0, 7.0],
        "b": [np.nan, 3.0, 4.0, 6.0, 8.0],
    }
)
#      a    b
# 0  5.0  NaN
# 1  4.0  3.0
# 2  NaN  4.0
# 3  3.0  6.0
# 4  7.0  8.0
df1.combine_first(df2)  # union of all the column names
#      a    b     c
# 0  1.0  NaN   2.0
# 1  4.0  2.0   6.0
# 2  5.0  4.0  10.0
# 3  3.0  6.0  14.0
# 4  7.0  8.0   NaN
2.5.3. Reshaping and Pivoting
- Reshaping with Hierarchical Indexing
data = pd.DataFrame(
    np.arange(6).reshape((2, 3)),
    index=pd.Index(["Ohio", "Colorado"], name="state"),
    columns=pd.Index(["one", "two", "three"], name="number"),
)
# number    one  two  three
# state
# Ohio        0    1      2
# Colorado    3    4      5
data.stack()  # pivot the columns into the rows (a new Series)
# state     number
# Ohio      one       0
#           two       1
#           three     2
# Colorado  one       3
#           two       4
#           three     5
# dtype: int64
data.unstack()  # pivot the rows into the columns (a new Series)
# number  state
# one     Ohio        0
#         Colorado    3
# two     Ohio        1
#         Colorado    4
# three   Ohio        2
#         Colorado    5
# dtype: int64
data.stack().unstack()  # a round trip restores the original DataFrame
# number    one  two  three
# state
# Ohio        0    1      2
# Colorado    3    4      5
- Pivoting “Long” to “Wide” Format
A common way to store multiple time series in databases and CSV files is the so-called long or stacked format, in which each row represents a single observation or value, with columns for identifiers (e.g., time, category) and the corresponding value.
raw_data = pd.read_csv("https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/examples/macrodata.csv")
raw_data.head()
#    year  quarter   realgdp  realcons  realinv  ...  tbilrate  unemp      pop  infl  realint
# 0  1959        1  2710.349    1707.4  286.898  ...      2.82    5.8  177.146  0.00     0.00
# 1  1959        2  2778.801    1733.7  310.859  ...      3.08    5.1  177.830  2.34     0.74
# 2  1959        3  2775.488    1751.8  289.226  ...      3.82    5.3  178.657  2.74     1.09
# 3  1959        4  2785.204    1753.7  299.356  ...      4.33    5.6  179.386  0.27     4.06
# 4  1960        1  2847.699    1770.5  331.722  ...      3.50    5.2  180.007  2.31     1.19
#
# [5 rows x 14 columns]
data = raw_data.loc[:, ["year", "quarter", "realgdp", "infl", "unemp"]]
data.head()
#    year  quarter   realgdp  infl  unemp
# 0  1959        1  2710.349  0.00    5.8
# 1  1959        2  2778.801  2.34    5.1
# 2  1959        3  2775.488  2.74    5.3
# 3  1959        4  2785.204  0.27    5.6
# 4  1960        1  2847.699  2.31    5.2
periods = pd.PeriodIndex.from_fields(
    year=data.pop("year"), quarter=data.pop("quarter")
)
periods.name = "date"
# PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
#              '1960Q3', '1960Q4', '1961Q1', '1961Q2',
#              ...
#              '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
#              '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
#             dtype='period[Q-DEC]', name='date', length=203)
data.index = periods.to_timestamp("D")
data.head()
#              realgdp  infl  unemp
# date
# 1959-01-01  2710.349  0.00    5.8
# 1959-04-01  2778.801  2.34    5.1
# 1959-07-01  2775.488  2.74    5.3
# 1959-10-01  2785.204  0.27    5.6
# 1960-01-01  2847.699  2.31    5.2
data.columns.name = "item"
data.head()
# item         realgdp  infl  unemp
# date
# 1959-01-01  2710.349  0.00    5.8
# 1959-04-01  2778.801  2.34    5.1
# 1959-07-01  2775.488  2.74    5.3
# 1959-10-01  2785.204  0.27    5.6
# 1960-01-01  2847.699  2.31    5.2
ldata = data.stack().reset_index().rename(columns={0: "value"})  # long format
ldata.head()
#         date     item     value
# 0 1959-01-01  realgdp  2710.349
# 1 1959-01-01     infl     0.000
# 2 1959-01-01    unemp     5.800
# 3 1959-04-01  realgdp  2778.801
# 4 1959-04-01     infl     2.340
pivoted = ldata.pivot(index="date", columns="item", values="value")
pivoted.head()
# item        infl   realgdp  unemp
# date
# 1959-01-01  0.00  2710.349    5.8
# 1959-04-01  2.34  2778.801    5.1
# 1959-07-01  2.74  2775.488    5.3
# 1959-10-01  0.27  2785.204    5.6
# 1960-01-01  2.31  2847.699    5.2
ldata["value2"] = np.random.standard_normal(len(ldata))
pivoted = ldata.pivot(index="date", columns="item")  # multiple values, hierarchical columns
pivoted.head()
#            value                    value2
# item        infl   realgdp unemp      infl   realgdp     unemp
# date
# 1959-01-01  0.00  2710.349   5.8 -0.971766 -0.561886  0.007484
# 1959-04-01  2.34  2778.801   5.1  0.649848  0.664631  0.635405
# 1959-07-01  2.74  2775.488   5.3 -0.828290  0.528560 -1.424809
# 1959-10-01  0.27  2785.204   5.6 -1.433866 -0.062887  0.395601
# 1960-01-01  2.31  2847.699   5.2  0.074323 -0.682785 -0.587253
pivoted["value"].head()
# item        infl   realgdp  unemp
# date
# 1959-01-01  0.00  2710.349    5.8
# 1959-04-01  2.34  2778.801    5.1
# 1959-07-01  2.74  2775.488    5.3
# 1959-10-01  0.27  2785.204    5.6
# 1960-01-01  2.31  2847.699    5.2
unstacked = ldata.set_index(["date", "item"]).unstack(level="item")  # equivalent: set_index then unstack
unstacked.head()
#            value                    value2
# item        infl   realgdp unemp      infl   realgdp     unemp
# date
# 1959-01-01  0.00  2710.349   5.8 -0.971766 -0.561886  0.007484
# 1959-04-01  2.34  2778.801   5.1  0.649848  0.664631  0.635405
# 1959-07-01  2.74  2775.488   5.3 -0.828290  0.528560 -1.424809
# 1959-10-01  0.27  2785.204   5.6 -1.433866 -0.062887  0.395601
# 1960-01-01  2.31  2847.699   5.2  0.074323 -0.682785 -0.587253
- Pivoting “Wide” to “Long” Format
df = pd.DataFrame(
    {
        "key": ["foo", "bar", "baz"],
        "A": [1, 2, 3],
        "B": [4, 5, 6],
        "C": [7, 8, 9],
    }
)
#    key  A  B  C
# 0  foo  1  4  7
# 1  bar  2  5  8
# 2  baz  3  6  9
melted = pd.melt(df, id_vars="key")  # use "key" as the group indicator
#    key variable  value
# 0  foo        A      1
# 1  bar        A      2
# 2  baz        A      3
# 3  foo        B      4
# 4  bar        B      5
# 5  baz        B      6
# 6  foo        C      7
# 7  bar        C      8
# 8  baz        C      9
melted.pivot(index="key", columns="variable", values="value").reset_index()  # reshape back
# variable  key  A  B  C
# 0         bar  2  5  8
# 1         baz  3  6  9
# 2         foo  1  4  7
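# melt can restrict which columns are unpivoted with value_vars (a sketch on
# the same df):
pd.melt(df, id_vars="key", value_vars=["A", "B"])
#    key variable  value
# 0  foo        A      1
# 1  bar        A      2
# 2  baz        A      3
# 3  foo        B      4
# 4  bar        B      5
# 5  baz        B      6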
2.6. Plotting and Visualization
conda install -c conda-forge -y jupyterlab && jupyter lab
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = np.arange(10) # array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
plt.plot(data);
plt.show()
One nuance of using Jupyter notebooks is that plots are reset after each cell is evaluated, so you must put all of the plotting commands in a single notebook cell.
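For example, building and rendering a figure within one cell (a minimal sketch):
# all in a single notebook cell
fig, ax = plt.subplots()
ax.plot(np.random.standard_normal(100).cumsum(), color="black")
ax.set_title("random walk")
plt.show()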
2.6.1. Figures and Subplots
# plots in matplotlib reside within a figure object
fig = plt.figure() # create a new figure
ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)
plt.get_fignums() # [1]
plt.figure(1) == fig # True
# get the current figure or create a new figure
cf = plt.gcf()
cf.number, cf == fig # (1, True)
# save the figure as an SVG
fig.savefig("an-empty-figure-with-three-subplots.svg")
# show a specific figure
fig.show()
fig.show() # can be called repeatedly
# show and erase the current figure
plt.show()
plt.show() # does nothing: the current figure was already shown and released
plt.get_fignums() # []
plt.gcf() # <Figure size 640x480 with 0 Axes>
ax1.hist(np.random.standard_normal(100), bins=20, color="black", alpha=0.3);
ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.standard_normal(30));
ax3.plot(np.random.standard_normal(50).cumsum(), color="black", linestyle="dashed");
# plt.subplots(...): a more convenient way to create a grid of subplots
fig, axes = plt.subplots(2, 3, sharex=True, sharey=True)
axes
# array([[<Axes: >, <Axes: >, <Axes: >],
# [<Axes: >, <Axes: >, <Axes: >]], dtype=object)
fig.show()
# adjusting the spacing around subplots
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
for i in range(2):
for j in range(2):
axes[i, j].hist(np.random.standard_normal(500), bins=50, color="black", alpha=0.5)
fig.subplots_adjust(wspace=0, hspace=0)