Python for Data Analysis
conda install -y pandas jupyter matplotlib
# import conventions
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels as sm
# pd.options.display.max_columns = 20
# pd.options.display.max_colwidth = 80
# pd.options.display.max_rows = 20
# np.set_printoptions(precision=4, suppress=True)
# pd.reset_option('all')
# pd.reset_option('display.max_rows')
- 1. NumPy
- 2. Pandas
- 2.1. Series, DataFrame, and Index
- 2.2. Essential Functionality
- 2.3. Reading and Writing
- 2.4. Data Cleaning and Preparation
- 2.4.1. Handling Missing Data
- 2.4.2. Removing Duplicates
- 2.4.3. Transforming Data Using a Function or Mapping
- 2.4.4. Replacing Values
- 2.4.5. Renaming Axis Indexes
- 2.4.6. Discretization and Binning
- 2.4.7. Detecting and Filtering Outliers
- 2.4.8. Permutation and Random Sampling
- 2.4.9. Computing Indicator/Dummy Variables: One-Hot Encoding
- 2.4.10. String Functions in pandas
- 2.4.11. Categorical Encoding
- 2.5. Data Wrangling: Join, Combine, and Reshape
- References
1. NumPy
NumPy (Numerical Python, /ˈnʌmpaɪ/) is a library for the Python programming language that's widely used in science and engineering. It contains multidimensional array data structures, such as the homogeneous, N-dimensional array (ndarray), and a large library of functions that operate efficiently on these data structures.
import numpy as np
a = np.array([[1, 2, 3], [4, 5, 6]])
a.shape # (2, 3)
# Integer types (signed and unsigned)
int8, uint8 # i1, u1: 8-bit (1 byte) integers
int16, uint16 # i2, u2: 16-bit integers
int32, uint32 # i4, u4: 32-bit integers
int64, uint64 # i8, u8: 64-bit integers
# Floating-point types
float16 # f2: Half-precision (16-bit) floating point
float32 # f4 or f: Single-precision (32-bit) floating point; compatible with C float
float64 # f8 or d: Double-precision (64-bit) floating point; compatible with C double and Python float
float128 # f16 or g: Extended-precision (128-bit) floating point
# Complex number types
complex64 # c8: Complex number with two 32-bit floats (real and imaginary parts)
complex128 # c16: Complex number with two 64-bit floats
complex256 # c32: Complex number with two 128-bit floats
# Boolean type
bool # ?: Boolean values (True or False)
# Object type
object # O: Python object type; can store any Python object
# String and byte types
bytes_ # S: Fixed-length raw byte arrays (e.g., ASCII strings, 1 byte per character).
# Use 'S10' for a byte string of length 10.
# Each string is padded or truncated to fit the specified length.
str_ # U: Fixed-length Unicode strings using UTF-32 encoding (4 bytes per character).
# Use 'U10' for a Unicode string of length 10.
# Each string is padded or truncated to fit the specified length.
1.1. arrays
NumPy’s main object is the homogeneous multidimensional array (dimensions are called axes), known as ndarray, which is a table of same-type elements (usually numbers) indexed by a tuple of non-negative integers.
Don’t think of NumPy’s N-d arrays solely as spatial points or coordinates; instead, picture them as structured data containers, like boxes within boxes.
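To make the boxes-within-boxes picture concrete, here is a small sketch (the (2, 3, 4) shape is arbitrary):
a = np.arange(24).reshape(2, 3, 4) # 2 boxes, each holding 3 boxes of 4 numbers
a[0]       # the first outer box: a (3, 4) array
a[0][1]    # one inner box: a 1-D array of 4 numbers, same as a[0, 1]
a[0][1][2] # a single element, same as a[0, 1, 2]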
# create a matrix with 2 rows and 3 columns
a = np.arange(6).reshape(2, 3)
# array([[0, 1, 2],
# [3, 4, 5]])
# the number of axes (dimensions) of the array
a.ndim # 2
# the dimensions of the array, a tuple of integers indicating the size of the array in each dimension
a.shape # (2, 3)
a.ndim == len(a.shape) # True
# the total number of elements of the array
a.size # 6 = 2 x 3
import math
a.size == math.prod(a.shape) # True
# an object describing the type of the elements in the array.
a.dtype # dtype('int64')
a.dtype.name # 'int64'
# the size in bytes of each element of the array.
a.itemsize # 8
# the buffer containing the actual elements of the array.
a.data # <memory at 0x10424e190>
# 1-dimensional array
# axis 0, the only axis, runs along the elements
a = np.arange(10)
# array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
a.ndim # 1
a.shape # (10,)
# 2-dimensional array
# axis 0 runs vertically downwards across the rows.
# axis 1 runs horizontally across the columns.
a = np.arange(6).reshape(2, 3)
# array([[0, 1, 2],
# [3, 4, 5]])
a.ndim # 2
a.shape # (2, 3)
# 3-dimensional array
# axis 0 runs along the depth (the first index).
# axis 1 runs along the rows (the second index).
# axis 2 runs along the columns (the third index).
a = np.arange(8).reshape(2, 2, 2)
# array([[[0, 1],
# [2, 3]],
#
# [[4, 5],
# [6, 7]]])
a.ndim # 3
a.shape # (2, 2, 2)
# creating arrays
a = np.array([1, 2, 3, 4, 5, 6])
# array([1, 2, 3, 4, 5, 6])
b = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
# array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
c = np.array([[1, 2], [3, 4]], dtype=complex)
# array([[1.+0.j, 2.+0.j],
# [3.+0.j, 4.+0.j]])
np.zeros((3, 4))
# array([[0., 0., 0., 0.],
# [0., 0., 0., 0.],
# [0., 0., 0., 0.]])
np.zeros_like(np.arange(12).reshape(3, 4))
# array([[0, 0, 0, 0],
# [0, 0, 0, 0],
# [0, 0, 0, 0]])
np.ones((3, 4), dtype=np.int16)
# array([[1, 1, 1, 1],
# [1, 1, 1, 1],
# [1, 1, 1, 1]], dtype=int16)
np.empty((3, 5)) # returns uninitialized memory!
# array([[0.0e+000, 4.9e-324, 9.9e-324, 1.5e-323, 2.0e-323],
# [2.5e-323, 3.0e-323, 3.5e-323, 4.0e-323, 4.4e-323],
# [4.9e-323, 5.4e-323, 5.9e-323, 6.4e-323, 6.9e-323]])
np.arange(10, 30, 5)
# array([10, 15, 20, 25])
np.arange(0, 2, 0.3)
# array([0. , 0.3, 0.6, 0.9, 1.2, 1.5, 1.8])
from numpy import pi
np.linspace(0, 2, 9) # 9 numbers from 0 to 2
# array([0. , 0.25, 0.5 , 0.75, 1. , 1.25, 1.5 , 1.75, 2. ])
x = np.linspace(0, 2 * pi, 100)
f = np.sin(x)
# array([ 0.00000000e+00, 6.34239197e-02, 1.26592454e-01, 1.89251244e-01,
# 2.51147987e-01, 3.12033446e-01, 3.71662456e-01, 4.29794912e-01,
# . . .
# structured arrays
x = np.array(
[('Rex', 9, 81.0), ('Fido', 3, 27.0)],
dtype=[('name', 'U10'), ('age', 'i4'), ('weight', 'f4')],
)
# array([('Rex', 9, 81.), ('Fido', 3, 27.)],
# dtype=[('name', '<U10'), ('age', '<i4'), ('weight', '<f4')])
1.2. arithmetic operators, and universal functions
# arithmetic operators, and universal functions
a = np.array([20, 30, 40, 50])
np.sin(a)
# array([ 0.91294525, -0.98803162, 0.74511316, -0.26237485])
10 * np.sin(a)
# array([ 9.12945251, -9.88031624, 7.4511316 , -2.62374854])
b = np.arange(4)
# array([0, 1, 2, 3])
c = a - b
# array([20, 29, 38, 47])
a + b
# array([20, 31, 42, 53])
b**2
# array([0, 1, 4, 9])
a < 35
# array([ True, True, False, False])
A = np.array([[1, 1], [0, 1]])
B = np.array([[2, 0], [3, 4]])
A * B # elementwise product
# array([[2, 0],
# [0, 4]])
A @ B # matrix product: A.dot(B)
# array([[5, 4],
# [3, 4]])
# universal functions (ufunc): operate elementwise on an array, producing an array as output
b = np.arange(3)
# array([0, 1, 2])
np.exp(b)
# array([1. , 2.71828183, 7.3890561 ])
np.sqrt(b)
# array([0. , 1. , 1.41421356])
c = np.array([2.0, -1.0, 4.0])
np.add(b, c)
# array([2., 0., 6.])
x = np.random.standard_normal(8)
# array([-0.99796507, 1.82697075, 0.60924821, 0.27004016, -1.62010011,
# -0.43957991, -2.27936726, -1.87890946])
y = np.random.standard_normal(8)
# array([-0.16034636, 1.60381624, -0.44003017, -1.50187946, -0.66940637,
# -1.2649151 , -0.12716502, -0.15756179])
np.maximum(x, y)
# array([-0.16034636, 1.82697075, 0.60924821, 0.27004016, -0.66940637,
# -0.43957991, -0.12716502, -0.15756179])
# assign the results into an existing array rather than create a new one
out = np.zeros_like(x)
np.add(x, 1, out=out)
# array([ 2.03493046e-03, 2.82697075e+00, 1.60924821e+00, 1.27004016e+00,
# -6.20100112e-01, 5.60420090e-01, -1.27936726e+00, -8.78909462e-01])
out
# array([ 2.03493046e-03, 2.82697075e+00, 1.60924821e+00, 1.27004016e+00,
# -6.20100112e-01, 5.60420090e-01, -1.27936726e+00, -8.78909462e-01])
# mathematical and statistical methods
a = np.arange(12).reshape(3, 4)
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11]])
a.mean() # np.mean(a)
# np.float64(5.5)
a.mean(axis=1) # compute mean across the columns
# array([1.5, 5.5, 9.5])
a.sum()
# np.int64(66)
a.sum(axis=0) # compute sum down the rows
# array([12, 15, 18, 21])
a.cumsum() # computes the cumulative sum of the elements along a flattened version of the array
# array([ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66])
a.cumsum(axis=0) # computes the cumulative sum down each column (along axis 0)
# array([[ 0, 1, 2, 3],
# [ 4, 6, 8, 10],
# [12, 15, 18, 21]])
a.cumsum(axis=1) # computes the cumulative sum across each row (along axis 1)
# array([[ 0, 1, 3, 6],
# [ 4, 9, 15, 22],
# [ 8, 17, 27, 38]])
b = np.random.permutation(a.ravel()).reshape(a.shape)
# array([[ 6, 8, 11, 3],
# [ 7, 10, 4, 1],
# [ 9, 2, 0, 5]])
b.argmax() # return indices of the maximum values along the given axis
# np.int64(2)
b.argmax(axis=0)
# array([2, 1, 0, 2])
b.argmax(axis=1)
# array([2, 1, 0])
b.argmin() # return indices of the minimum values along the given axis
# np.int64(10)
b.argmin(axis=0)
# array([0, 2, 2, 1])
b.argmin(axis=1)
# array([3, 3, 2])
1.3. indexing, slicing and iterating
# indexing, slicing and iterating
# one-dimensional arrays can be indexed, sliced and iterated over, much
# like lists and other Python sequences.
a = np.arange(10)
# array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
a[2]
# np.int64(2)
a[2:5]
# array([2, 3, 4])
# every 2nd element from the start to position 6, exclusive
a[:6:2]
# array([0, 2, 4])
a[::-1] # reversed
# array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])
# multidimensional arrays can have one index per axis, given in a tuple separated by commas (,)
b = np.fromfunction(lambda x, y: 10 * x + y, (5, 4), dtype=int)
# array([[ 0, 1, 2, 3],
# [10, 11, 12, 13],
# [20, 21, 22, 23],
# [30, 31, 32, 33],
# [40, 41, 42, 43]])
# a comma-separated list of indices to select individual elements
b[2, 3] # b[2][3]
# np.int64(23)
# columns from index 1 (`1:`) to the end, for the first two rows (`:2`) of the array
b[:2, 1:]
# array([[ 1, 2, 3],
# [11, 12, 13]])
# each row in the second column of `b`
b[0:5, 1]
# array([ 1, 11, 21, 31, 41])
# a colon (`:`) by itself to take the entire axis
b[:, 1]
# array([ 1, 11, 21, 31, 41])
b[:, :1]
# array([[ 0],
# [10],
# [20],
# [30],
# [40]])
# each column in the second and third row of `b`
b[1:3, :]
# array([[10, 11, 12, 13],
# [20, 21, 22, 23]])
# missing indices are considered complete slices `:`
b[-1] # b[-1, :]
# array([40, 41, 42, 43])
# the dots (`...`) represent as many colons as needed to produce a complete indexing tuple
b[-1, ...] # b[-1, :]
# array([40, 41, 42, 43])
b = np.arange(6).reshape(2, 3)
# array([[0, 1, 2],
# [3, 4, 5]])
# fancy indexing: indexing with arrays of indices
a = np.zeros((8, 4))
for i in range(8):
a[i] = i
# array([[0., 0., 0., 0.],
# [1., 1., 1., 1.],
# [2., 2., 2., 2.],
# [3., 3., 3., 3.],
# [4., 4., 4., 4.],
# [5., 5., 5., 5.],
# [6., 6., 6., 6.],
# [7., 7., 7., 7.]])
i = [4, 3, 0, 6]
a[i]
# array([[4., 4., 4., 4.],
# [3., 3., 3., 3.],
# [0., 0., 0., 0.],
# [6., 6., 6., 6.]])
a = np.arange(32).reshape((8, 4))
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11],
# [12, 13, 14, 15],
# [16, 17, 18, 19],
# [20, 21, 22, 23],
# [24, 25, 26, 27],
# [28, 29, 30, 31]])
i = [1, 5, 7, 2], [0, 3, 1, 2]
a[i]
# array([ 4, 23, 29, 10])
palette = np.array(
[
[0, 0, 0], # black
[255, 0, 0], # red
[0, 255, 0], # green
[0, 0, 255], # blue
[255, 255, 255], # white
]
)
image = np.array(
[
[0, 1, 2, 0], # each value corresponds to a color in the palette
[0, 3, 4, 0],
]
)
palette[image] # the (2, 4, 3) color image
# array([[[ 0, 0, 0],
# [255, 0, 0],
# [ 0, 255, 0],
# [ 0, 0, 0]],
#
# [[ 0, 0, 0],
# [ 0, 0, 255],
# [255, 255, 255],
# [ 0, 0, 0]]])
# boolean indexing
names = np.array(["Bob", "Joe", "Will", "Bob", "Will", "Joe", "Joe"])
data = np.array([[4, 7], [0, 2], [-5, 6], [0, 0], [1, 2], [-12, -4], [3, 4]])
names == 'Bob'
# array([ True, False, False, True, False, False, False])
data[names == "Bob"]
# array([[4, 7],
# [0, 0]])
data[~(names == "Bob")] # data[names != "Bob"]
# array([[ 0, 2],
# [ -5, 6],
# [ 1, 2],
# [-12, -4],
# [ 3, 4]])
# The Python keywords `and` and `or` do not work with Boolean arrays.
# Use `&` (and) and `|` (or) instead.
mask = (names == "Bob") | (names == "Will")
# array([ True, False, True, True, True, False, False])
data[mask]
# array([[ 4, 7],
# [-5, 6],
# [ 0, 0],
# [ 1, 2]])
# iterating
for r in b:
print(r)
# [0 1 2]
# [3 4 5]
# iterating on each element
for e in b.flat:
print(f'{e}', end='\t')
# 0 1 2 3 4 5
1.4. copies and views
# copies and views: array slices are views on the original array
a = np.array([[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]])
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11]])
b = a
b is a # True
def f(x: np.ndarray): # objects are passed by reference
return id(x)
id(a), f(a) # (4397168176, 4397168176)
c = a.view() # a new view of the array
c is a, c.base is a, c.flags.owndata # (False, True, False)
c = c.reshape((2, 6))
c.shape, a.shape # ((2, 6), (3, 4))
c[0, 4] = 1234 # a's data changes
a[1, 0] # np.int64(1234)
s = a[:, 1:3]
# array([[ 1, 2],
# [ 5, 6],
# [ 9, 10]])
s[:] = 10 # a view, not a copy
a
# array([[ 0, 10, 10, 3],
# [1234, 10, 10, 7],
# [ 8, 10, 10, 11]])
d = a.copy() # a copy of the array
d is a, d.base is a # (False, False)
1.5. probability distributions
# np.random: arrays of sample values from many kinds of probability distributions
samples = np.random.standard_normal(size=(4, 4))
# array([[ 0.56695748, 0.00955644, 0.08934151, 1.69336179],
# [-0.41463561, 0.81013138, 0.41954566, 0.11812052],
# [ 0.23305357, 0.26527434, 0.3752615 , 1.82930088],
# [ 0.36156569, 1.66151819, 0.78871978, -0.56163182]])
# from random import normalvariate
# N = 1_000_000
#
# %timeit samples = [normalvariate(0, 1) for _ in range(N)]
# 320 ms ± 885 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
#
# %timeit np.random.standard_normal(N)
# 15.8 ms ± 8.28 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# random numbers are pseudo-random with a configurable RNG initialized by the `seed` value, ensuring reproducibility.
rng = np.random.default_rng(seed=12345)
type(rng)
# numpy.random._generator.Generator
r1 = np.random.default_rng(seed=12345)
r2 = np.random.default_rng(seed=12345)
r1.standard_normal((2, 3))
# array([[-1.42382504, 1.26372846, -0.87066174],
# [-0.25917323, -0.07534331, -0.74088465]])
r2.standard_normal((2, 3))
# array([[-1.42382504, 1.26372846, -0.87066174],
# [-0.25917323, -0.07534331, -0.74088465]])
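Beyond standard_normal, a Generator exposes many other distributions (integers, uniform, binomial, and more); a few illustrative calls, with outputs omitted since they depend on the seed:
rng = np.random.default_rng(seed=12345)
rng.integers(0, 10, size=5)    # uniform random integers drawn from [0, 10)
rng.uniform(size=(2, 2))       # uniform floats in [0, 1)
rng.binomial(10, 0.5, size=4)  # number of successes in 10 coin flips, repeated 4 times
rng.permutation(5)             # a random permutation of range(5)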
1.6. array-oriented programming
# array-oriented programming with arrays
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])
# traditional list comprehension
result = [(x if c else y) for x, y, c in zip(xarr, yarr, cond)]
np.array(result)
# array([1.1, 2.2, 1.3, 1.4, 2.5])
# numpy.where
result = np.where(cond, xarr, yarr)
# array([1.1, 2.2, 1.3, 1.4, 2.5])
a = np.random.standard_normal((4, 4))
# array([[-0.8529, -1.6429, 1.4967, -0.6435],
# [ 0.5086, 0.5242, 1.5083, 0.0487],
# [-1.2721, -0.8147, -0.3598, -1.2042],
# [ 1.1531, 1.5783, -0.9556, -0.6506]])
np.where(a > 0, 2, -2)
# array([[-2, -2, 2, -2],
# [ 2, 2, 2, 2],
# [-2, -2, -2, -2],
# [ 2, 2, -2, -2]])
# set only positive values to 2
np.where(a > 0, 2, a)
# array([[-0.8529, -1.6429, 2. , -0.6435],
# [ 2. , 2. , 2. , 2. ],
# [-1.2721, -0.8147, -0.3598, -1.2042],
# [ 2. , 2. , -0.9556, -0.6506]])
1.7. sorting
# sorting
a = np.random.permutation(np.arange(12)).reshape(3, 4)
# array([[ 2, 10, 11, 3],
# [ 0, 6, 9, 5],
# [ 7, 8, 1, 4]])
a.sort()
# array([[ 2, 3, 10, 11],
# [ 0, 5, 6, 9],
# [ 1, 4, 7, 8]])
a.sort(axis=0) # sort the values within each column
# array([[ 0, 3, 6, 8],
# [ 1, 4, 7, 9],
# [ 2, 5, 10, 11]])
a = np.random.permutation(np.arange(12)).reshape(3, 4)
# array([[ 5, 3, 11, 8],
# [ 7, 1, 0, 4],
# [10, 9, 2, 6]])
a.sort(axis=1) # sort across each row
# array([[ 3, 5, 8, 11],
# [ 0, 1, 4, 7],
# [ 2, 6, 9, 10]])
np.sort(a, axis=0) # a sorted copy of an array
# array([[ 0, 1, 4, 7],
# [ 2, 5, 8, 10],
# [ 3, 6, 9, 11]])
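np.argsort performs an indirect sort, returning the indices that would sort the array; combined with fancy indexing it recovers the sorted values:
a = np.array([5, 0, 1, 3, 2])
i = a.argsort() # indices that would sort `a`
# array([1, 2, 4, 3, 0])
a[i]
# array([0, 1, 2, 3, 5])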
1.8. saving and loading
# save and load data: file input and output with arrays
np.save("a", np.arange(3))
np.load("a.npy")
# array([0, 1, 2])
a = np.arange(3)
b = np.arange(4, 7)
np.savez("x", a=a, b=b) # np.savez_compressed("x", a=a, b=b)
x = np.load("x.npz")
x["a"]
# array([0, 1, 2])
x["b"]
# array([4, 5, 6])
2. Pandas
Pandas (derived from panel data, /ˈpændəs/) is an open-source, BSD-licensed Python library built on NumPy, offering high-performance, user-friendly data structures for tabular or heterogeneous data, such as Series (1-D labeled arrays holding any data type) and DataFrame (2-D labeled tables), for powerful data analysis.
import numpy as np
import pandas as pd
BooleanDtype # Nullable Boolean data, use "boolean" when passing as string
CategoricalDtype # Categorical data type, use "category" when passing as string
DatetimeTZDtype # Datetime with time zone
Float32Dtype # 32-bit nullable floating point, use "Float32" when passing as string
Float64Dtype # 64-bit nullable floating point, use "Float64" when passing as string
Int8Dtype # 8-bit nullable signed integer, use "Int8" when passing as string
Int16Dtype # 16-bit nullable signed integer, use "Int16" when passing as string
Int32Dtype # 32-bit nullable signed integer, use "Int32" when passing as string
Int64Dtype # 64-bit nullable signed integer, use "Int64" when passing as string
UInt8Dtype # 8-bit nullable unsigned integer, use "UInt8" when passing as string
UInt16Dtype # 16-bit nullable unsigned integer, use "UInt16" when passing as string
UInt32Dtype # 32-bit nullable unsigned integer, use "UInt32" when passing as string
UInt64Dtype # 64-bit nullable unsigned integer, use "UInt64" when passing as string
StringDtype # nullable string data, use "string" when passing as string
s = pd.Series([1, 2, 3, None], dtype=pd.Float64Dtype()) # pd.Series([1, 2, 3, None], dtype="Float64")
# 0 1.0
# 1 2.0
# 2 3.0
# 3 <NA>
# dtype: Float64
s[3] is pd.NA
# True
s = pd.Series(['one', 'two', None, 'three'], dtype=pd.StringDtype()) # pd.Series(['one', 'two', None, 'three'], dtype="string")
# 0 one
# 1 two
# 2 <NA>
# 3 three
# dtype: string
s[2] is pd.NA
# True
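Existing NumPy-backed data can be converted to the nullable extension types with astype; a minimal sketch:
s = pd.Series([1.0, 2.0, np.nan]) # NaN forces a NumPy float64 dtype
s.astype("Int64") # convert to the nullable integer extension type
# 0       1
# 1       2
# 2    <NA>
# dtype: Int64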
2.1. Series, DataFrame, and Index
- A Series is a one-dimensional array-like object containing a sequence of values of any type and an associated array of labels, called its index. It also behaves like a fixed-length, ordered dictionary, as it maps index values to data values.
s = pd.Series([4, 7, -5, 3]) # default integer index
# 0    4
# 1    7
# 2   -5
# 3    3
# dtype: int64
s.array
# <NumpyExtensionArray>
# [np.int64(4), np.int64(7), np.int64(-5), np.int64(3)]
# Length: 4, dtype: int64
s.index
# RangeIndex(start=0, stop=4, step=1)
s.dtype
# dtype('int64')
s.to_numpy() # NumPy array
# array([ 4,  7, -5,  3])
s.to_dict() # dictionary
# {0: 4, 1: 7, 2: -5, 3: 3}
s = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"]) # label index
# d    4
# b    7
# a   -5
# c    3
# dtype: int64
# select a single value or a set of values
s["a"]
# np.int64(-5)
s[["b", "c", "d"]]
# b    7
# c    3
# d    4
# dtype: int64
# purely integer-location based indexing for selection by position
s.iloc[0]
# np.int64(4)
s.iloc[:2]
# d    4
# b    7
# dtype: int64
s.iloc[[1, 2, 0]]
# b    7
# a   -5
# d    4
# dtype: int64
# Series is ndarray-like
s > 0
# d     True
# b     True
# a    False
# c     True
# dtype: bool
s[s > 0]
# d    4
# b    7
# c    3
# dtype: int64
s[s > s.median()]
# d    4
# b    7
# dtype: int64
np.exp(s) # import numpy as np
# d      54.598150
# b    1096.633158
# a       0.006738
# c      20.085537
# dtype: float64
# Series is dictionary-like
s["d"]
# np.int64(4)
s[["a", "b", "c"]]
# a   -5
# b    7
# c    3
# dtype: int64
"b" in s
# True
"e" in s
# False
s.get("e", np.nan)
# nan
data = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
states = ["California", "Ohio", "Oregon", "Texas"]
pd.Series(data, index=states) # override indices
# California        NaN
# Ohio          35000.0
# Oregon        16000.0
# Texas         71000.0
# dtype: float64
# name attribute
s.name = "fooooobar"
s.index.name = "buzzzzzz"
s
# buzzzzzz
# d    4
# b    7
# a   -5
# c    3
# Name: fooooobar, dtype: int64
# alter the index in place while preserving the original index name
s.index = pd.Index(["Bob", "Steve", "Jeff", "Ryan"], name=s.index.name)
# buzzzzzz
# Bob      4
# Steve    7
# Jeff    -5
# Ryan     3
# Name: fooooobar, dtype: int64
# alter the index in place without preserving the index name
s.index = ["Bob", "Steve", "Jeff", "Ryan"] # removes the index name
# Bob      4
# Steve    7
# Jeff    -5
# Ryan     3
# Name: fooooobar, dtype: int64
- A DataFrame is a two-dimensional data structure that holds data like a two-dimensional array or a table with rows and columns, a spreadsheet, or a dictionary of Series objects.
data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data) # from a dictionary of equal-length lists or NumPy arrays
#     state  year  pop
# 0    Ohio  2000  1.5
# 1    Ohio  2001  1.7
# 2    Ohio  2002  3.6
# 3  Nevada  2001  2.4
# 4  Nevada  2002  2.9
# 5  Nevada  2003  3.2
frame.dtypes
# state     object
# year       int64
# pop      float64
# dtype: object
frame.head() # select the first five (by default) rows
#     state  year  pop
# 0    Ohio  2000  1.5
# 1    Ohio  2001  1.7
# 2    Ohio  2002  3.6
# 3  Nevada  2001  2.4
# 4  Nevada  2002  2.9
frame.tail() # select the last five (by default) rows
#     state  year  pop
# 1    Ohio  2001  1.7
# 2    Ohio  2002  3.6
# 3  Nevada  2001  2.4
# 4  Nevada  2002  2.9
# 5  Nevada  2003  3.2
# specify a sequence of columns and order
pd.DataFrame(data, columns=["year", "state", "pop"])
#    year   state  pop
# 0  2000    Ohio  1.5
# 1  2001    Ohio  1.7
# 2  2002    Ohio  3.6
# 3  2001  Nevada  2.4
# 4  2002  Nevada  2.9
# 5  2003  Nevada  3.2
pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
#    year   state  pop debt
# 0  2000    Ohio  1.5  NaN
# 1  2001    Ohio  1.7  NaN
# 2  2002    Ohio  3.6  NaN
# 3  2001  Nevada  2.4  NaN
# 4  2002  Nevada  2.9  NaN
# 5  2003  Nevada  3.2  NaN
# retrieve columns as Series
frame["state"] # dictionary-like notation
# 0      Ohio
# 1      Ohio
# 2      Ohio
# 3    Nevada
# 4    Nevada
# 5    Nevada
# Name: state, dtype: object
frame.year # dot attribute notation
# 0    2000
# 1    2001
# 2    2002
# 3    2001
# 4    2002
# 5    2003
# Name: year, dtype: int64
# retrieve rows by position or name with the special `iloc` and `loc` attributes
frame.loc[1]
# state    Ohio
# year     2001
# pop       1.7
# Name: 1, dtype: object
frame.iloc[2]
# state    Ohio
# year     2002
# pop       3.6
# Name: 2, dtype: object
# columns can be modified by assignment
frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2["debt"] = 16.5
#    year   state  pop  debt
# 0  2000    Ohio  1.5  16.5
# 1  2001    Ohio  1.7  16.5
# 2  2002    Ohio  3.6  16.5
# 3  2001  Nevada  2.4  16.5
# 4  2002  Nevada  2.9  16.5
# 5  2003  Nevada  3.2  16.5
frame2["debt"] = np.arange(6.)
#    year   state  pop  debt
# 0  2000    Ohio  1.5   0.0
# 1  2001    Ohio  1.7   1.0
# 2  2002    Ohio  3.6   2.0
# 3  2001  Nevada  2.4   3.0
# 4  2002  Nevada  2.9   4.0
# 5  2003  Nevada  3.2   5.0
# assigning a column that doesn’t exist will create a new column
frame2["eastern"] = frame2["state"] == "Ohio"
#    year   state  pop  debt  eastern
# 0  2000    Ohio  1.5   0.0     True
# 1  2001    Ohio  1.7   1.0     True
# 2  2002    Ohio  3.6   2.0     True
# 3  2001  Nevada  2.4   3.0    False
# 4  2002  Nevada  2.9   4.0    False
# 5  2003  Nevada  3.2   5.0    False
# remove a column
del frame2["eastern"]
#    year   state  pop  debt
# 0  2000    Ohio  1.5   0.0
# 1  2001    Ohio  1.7   1.0
# 2  2002    Ohio  3.6   2.0
# 3  2001  Nevada  2.4   3.0
# 4  2002  Nevada  2.9   4.0
# 5  2003  Nevada  3.2   5.0
# return and drop an item from frame
frame2.pop("debt")
# 0    0.0
# 1    1.0
# 2    2.0
# 3    3.0
# 4    4.0
# 5    5.0
# Name: debt, dtype: float64
frame2
#    year   state  pop
# 0  2000    Ohio  1.5
# 1  2001    Ohio  1.7
# 2  2002    Ohio  3.6
# 3  2001  Nevada  2.4
# 4  2002  Nevada  2.9
# 5  2003  Nevada  3.2
# from a nested dictionary of dictionaries:
# the outer dictionary keys as the columns,
# the inner keys as the row indices
populations = {
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
    "Nevada": {2001: 2.4, 2002: 2.9},
}
frame3 = pd.DataFrame(populations)
#       Ohio  Nevada
# 2000   1.5     NaN
# 2001   1.7     2.4
# 2002   3.6     2.9
frame3.T # transpose the DataFrame (swap rows and columns)
#         2000  2001  2002
# Ohio     1.5   1.7   3.6
# Nevada   NaN   2.4   2.9
frame2.to_numpy() # a two-dimensional ndarray
# array([[2000, 'Ohio', 1.5, 0.0],
#        [2001, 'Ohio', 1.7, 1.0],
#        [2002, 'Ohio', 3.6, 2.0],
#        [2001, 'Nevada', 2.4, 3.0],
#        [2002, 'Nevada', 2.9, 4.0],
#        [2003, 'Nevada', 3.2, 5.0]], dtype=object)
- A pandas Index object is responsible for holding the axis labels (including a DataFrame’s column names) and other metadata (like the axis name or names).
frame3
#       Ohio  Nevada
# 2000   1.5     NaN
# 2001   1.7     2.4
# 2002   3.6     2.9
frame3.index
# Index([2000, 2001, 2002], dtype='int64')
frame3.columns
# Index(['Ohio', 'Nevada'], dtype='object')
"Ohio" in frame3.columns
# True
2003 in frame3.index
# False
# duplicate labels select all occurrences of that label
s = frame3["Ohio"]
# 2000    1.5
# 2001    1.7
# 2002    3.6
# Name: Ohio, dtype: float64
i = s.index.append(pd.Index([2000, 2002]))
# Index([2000, 2001, 2002, 2000, 2002], dtype='int64')
s[i]
# 2000    1.5
# 2001    1.7
# 2002    3.6
# 2000    1.5
# 2002    3.6
# Name: Ohio, dtype: float64
2.2. Essential Functionality
2.2.1. Reindexing
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
# d 4.5
# b 7.2
# a -5.3
# c 3.6
# dtype: float64
obj.reindex(["a", "b", "c", "d", "e"])
# a -5.3
# b 7.2
# c 3.6
# d 4.5
# e NaN
# dtype: float64
frame = pd.DataFrame(
np.arange(9).reshape((3, 3)),
index=["a", "c", "d"],
columns=["Ohio", "Texas", "California"],
)
# Ohio Texas California
# a 0 1 2
# c 3 4 5
# d 6 7 8
frame.reindex(index=["a", "b", "c", "d"])
# Ohio Texas California
# a 0.0 1.0 2.0
# b NaN NaN NaN
# c 3.0 4.0 5.0
# d 6.0 7.0 8.0
# reindex with the `axis` keyword
frame.reindex(["Texas", "Utah", "California"], axis="columns")
# Texas Utah California
# a 1 NaN 2
# c 4 NaN 5
# d 7 NaN 8
frame.reindex(["Texas", "Utah", "California"], axis=1)
# Texas Utah California
# a 1 NaN 2
# c 4 NaN 5
# d 7 NaN 8
# reindex with the `columns` keyword
frame.reindex(columns=["Texas", "Utah", "California"])
# Texas Utah California
# a 1 NaN 2
# c 4 NaN 5
# d 7 NaN 8
# reindex with the `loc` operator
frame.loc[["a", "d", "c"], ["California", "Texas"]]
# California Texas
# a 2 1
# d 8 7
# c 5 4
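For ordered data, reindex can also fill gaps while reindexing via the method option; for example, method="ffill" forward-fills missing labels:
obj = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
obj.reindex(np.arange(6), method="ffill")
# 0      blue
# 1      blue
# 2    purple
# 3    purple
# 4    yellow
# 5    yellow
# dtype: object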
2.2.2. Dropping Entries from an Axis
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
# a 0.0
# b 1.0
# c 2.0
# d 3.0
# e 4.0
# dtype: float64
obj.drop("c")
# a 0.0
# b 1.0
# d 3.0
# e 4.0
# dtype: float64
obj.drop(["c", "d"])
# a 0.0
# b 1.0
# e 4.0
# dtype: float64
data = pd.DataFrame(
np.arange(16).reshape((4, 4)),
index=["Ohio", "Colorado", "Utah", "New York"],
columns=["one", "two", "three", "four"],
)
# one two three four
# Ohio 0 1 2 3
# Colorado 4 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
data.drop(index=["Colorado", "Ohio"])
# one two three four
# Utah 8 9 10 11
# New York 12 13 14 15
data.drop(columns=["two"])
# one three four
# Ohio 0 2 3
# Colorado 4 6 7
# Utah 8 10 11
# New York 12 14 15
data.drop("two", axis=1) # drop values from the columns by passing axis=1
# one three four
# Ohio 0 2 3
# Colorado 4 6 7
# Utah 8 10 11
# New York 12 14 15
data.drop("two", axis="columns") # drop values from the columns by passing axis="columns"
# one three four
# Ohio 0 2 3
# Colorado 4 6 7
# Utah 8 10 11
# New York 12 14 15
2.2.3. Indexing, Selection, and Filtering
s = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
# a 0.0
# b 1.0
# c 2.0
# d 3.0
# dtype: float64
s["b"]
# np.float64(1.0)
s[1] # deprecated: ambiguous whether label-based or position-based; prefer s.iloc[1]
# np.float64(1.0)
s.iloc[1] # by position
# np.float64(1.0)
s.iloc[[1, 3]]
# b 1.0
# d 3.0
# dtype: float64
s[2:4] # slicing with integers is always integer oriented
# c 2.0
# d 3.0
# dtype: float64
s[["b", "a", "d"]]
# b 1.0
# a 0.0
# d 3.0
# dtype: float64
s.loc[["b", "a", "d"]] # the preferred way to select index values by label
# b 1.0
# a 0.0
# d 3.0
# dtype: float64
s[s < 2]
# a 0.0
# b 1.0
# dtype: float64
s.loc["b":"c"] = 5
# a 0.0
# b 5.0
# c 5.0
# d 3.0
# dtype: float64
# retrieve one or more columns from a DataFrame either with a single value or sequence
# df[column] select single column or sequence of columns from the DataFrame
# df.loc[rows] select single row or subset of rows from the DataFrame by label
# df.loc[:, cols] select single column or subset of columns by label
# df.loc[rows, cols] select both row(s) and column(s) by label
# df.iloc[rows] select single row or subset of rows from the DataFrame by integer position
# df.iloc[:, cols] select single column or subset of columns by integer position
# df.iloc[rows, cols] select both row(s) and column(s) by integer position
# df.at[row, col] select a single scalar value by row and column label
# df.iat[row, col] select a single scalar value by row and column position (integers)
# df.reindex(...) select either rows or columns by labels
data = pd.DataFrame(
np.arange(16).reshape((4, 4)),
index=["Ohio", "Colorado", "Utah", "New York"],
columns=["one", "two", "three", "four"],
)
# one two three four
# Ohio 0 1 2 3
# Colorado 4 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
data["two"]
# Ohio 1
# Colorado 5
# Utah 9
# New York 13
# Name: two, dtype: int64
data[["three", "one"]]
# three one
# Ohio 2 0
# Colorado 6 4
# Utah 10 8
# New York 14 12
data[:2] # slice (slice rows)
# one two three four
# Ohio 0 1 2 3
# Colorado 4 5 6 7
data[data["three"] > 5] # boolean array (filter rows)
# one two three four
# Colorado 4 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
data < 5
# one two three four
# Ohio True True True True
# Colorado True False False False
# Utah False False False False
# New York False False False False
data[data < 5] = 0 # boolean DataFrame (set values based on some criterion)
# one two three four
# Ohio 0 0 0 0
# Colorado 0 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
# select a subset of the rows and columns using either axis labels (loc) or integers (iloc)
data.loc["Colorado"] # select a single row by label
# one 0
# two 5
# three 6
# four 7
# Name: Colorado, dtype: int64
data.loc[["Colorado", "New York"]] # select multiple rows with a sequence of labels
# one two three four
# Colorado 0 5 6 7
# New York 12 13 14 15
data.loc["Colorado", ["two", "three"]] # data.loc["Colorado"][["two", "three"]]
# two 5
# three 6
# Name: Colorado, dtype: int64
data.iloc[2] # select a single row by integer
# one 8
# two 9
# three 10
# four 11
# Name: Utah, dtype: int64
data.iloc[[2, 1]]
# one two three four
# Utah 8 9 10 11
# Colorado 0 5 6 7
data.iloc[2, [3, 0, 1]]
# four 11
# one 8
# two 9
# Name: Utah, dtype: int64
data.iloc[[1, 2], [3, 0, 1]]
# four one two
# Colorado 7 0 5
# Utah 11 8 9
data.iloc[:, :3][data.three > 5]
# one two three
# Colorado 0 5 6
# Utah 8 9 10
# New York 12 13 14
# boolean arrays can be used with `loc` but not `iloc`
data.loc[data.three >= 2] # data[data.three >= 2]
# one two three four
# Colorado 0 5 6 7
# Utah 8 9 10 11
# New York 12 13 14 15
2.2.4. Arithmetic and Data Alignment
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
# a 7.3
# c -2.5
# d 3.4
# e 1.5
# dtype: float64
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
# a -2.1
# c 3.6
# e -1.5
# f 4.0
# g 3.1
# dtype: float64
s1 + s2 # label-based data alignment yields NaN for non-overlapping entries in arithmetic
# a 5.2
# c 1.1
# d NaN
# e 0.0
# f NaN
# g NaN
# dtype: float64
df1 = pd.DataFrame(
np.arange(9.0).reshape((3, 3)),
columns=list("bcd"),
index=["Ohio", "Texas", "Colorado"],
)
# b c d
# Ohio 0.0 1.0 2.0
# Texas 3.0 4.0 5.0
# Colorado 6.0 7.0 8.0
df2 = pd.DataFrame(
np.arange(12.0).reshape((4, 3)),
columns=list("bde"),
index=["Utah", "Ohio", "Texas", "Oregon"],
)
# b d e
# Utah 0.0 1.0 2.0
# Ohio 3.0 4.0 5.0
# Texas 6.0 7.0 8.0
# Oregon 9.0 10.0 11.0
df1 + df2 # alignment is performed on both rows and columns on DataFrame
# b c d e
# Colorado NaN NaN NaN NaN
# Ohio 3.0 NaN 6.0 NaN
# Oregon NaN NaN NaN NaN
# Texas 9.0 NaN 12.0 NaN
# Utah NaN NaN NaN NaN
# arithmetic methods with fill values: substitute the passed value for any missing values
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list("abcd"))
# a b c d
# 0 0.0 1.0 2.0 3.0
# 1 4.0 5.0 6.0 7.0
# 2 8.0 9.0 10.0 11.0
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list("abcde"))
# a b c d e
# 0 0.0 1.0 2.0 3.0 4.0
# 1 5.0 6.0 7.0 8.0 9.0
# 2 10.0 11.0 12.0 13.0 14.0
# 3 15.0 16.0 17.0 18.0 19.0
df2.loc[1, "b"] = np.nan
# a b c d e
# 0 0.0 1.0 2.0 3.0 4.0
# 1 5.0 NaN 7.0 8.0 9.0
# 2 10.0 11.0 12.0 13.0 14.0
# 3 15.0 16.0 17.0 18.0 19.0
df1 + df2
# a b c d e
# 0 0.0 2.0 4.0 6.0 NaN
# 1 9.0 NaN 13.0 15.0 NaN
# 2 18.0 20.0 22.0 24.0 NaN
# 3 NaN NaN NaN NaN NaN
df1.add(df2, fill_value=0.)
# a b c d e
# 0 0.0 2.0 4.0 6.0 4.0
# 1 9.0 5.0 13.0 15.0 9.0
# 2 18.0 20.0 22.0 24.0 14.0
# 3 15.0 16.0 17.0 18.0 19.0
# broadcasting: operations between DataFrame and Series
arr = np.arange(12).reshape((3, 4))
# array([[ 0, 1, 2, 3],
# [ 4, 5, 6, 7],
# [ 8, 9, 10, 11]])
arr[0]
# array([0, 1, 2, 3])
arr - arr[0] # the subtraction is performed once for each row
# array([[0, 0, 0, 0],
# [4, 4, 4, 4],
# [8, 8, 8, 8]])
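The same idea applies between a DataFrame and a Series: by default, arithmetic matches the Series index against the DataFrame columns and broadcasts down the rows; passing axis="index" to an arithmetic method matches on the rows instead. A minimal sketch:
frame = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns=list("bde"),
    index=["Utah", "Ohio", "Texas", "Oregon"],
)
series = frame.iloc[0] # the first row, a Series indexed by the columns
frame - series # match on columns, broadcast down the rows
#           b    d    e
# Utah    0.0  0.0  0.0
# Ohio    3.0  3.0  3.0
# Texas   6.0  6.0  6.0
# Oregon  9.0  9.0  9.0
frame.sub(frame["d"], axis="index") # match on rows, broadcast across the columns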
2.2.5. Function Application and Mapping
data = pd.DataFrame(
np.random.standard_normal((4, 3)),
columns=list("bde"),
index=["Utah", "Ohio", "Texas", "Oregon"],
)
# b d e
# Utah -0.079026 1.151062 -0.393464
# Ohio -0.731363 0.601451 -0.462457
# Texas -1.400861 0.892020 -0.010010
# Oregon 0.899501 -0.418370 -0.791160
np.abs(data) # NumPy ufuncs (element-wise array methods)
# b d e
# Utah 0.079026 1.151062 0.393464
# Ohio 0.731363 0.601451 0.462457
# Texas 1.400861 0.892020 0.010010
# Oregon 0.899501 0.418370 0.791160
# apply a function along an axis of the DataFrame.
data.apply(lambda x: x.max() - x.min())
# b 2.300362
# d 1.569433
# e 0.781149
# dtype: float64
data.apply(lambda x: x.max() - x.min(), axis="columns")
# Utah 1.544527
# Ohio 1.332813
# Texas 2.292882
# Oregon 1.690661
# dtype: float64
data.apply(lambda x: pd.Series([x.max(), x.min()], index=["max", "min"]))
# b d e
# max 0.899501 1.151062 -0.01001
# min -1.400861 -0.418370 -0.79116
# apply a function to a DataFrame elementwise
data.map(lambda x: f"{x:.2f}")
# b d e
# Utah -0.08 1.15 -0.39
# Ohio -0.73 0.60 -0.46
# Texas -1.40 0.89 -0.01
# Oregon 0.90 -0.42 -0.79
2.2.6. Sorting and Ranking
s = pd.Series([4, 7, -3, 2], index=["d", "a", "b", "c"])
# d 4
# a 7
# b -3
# c 2
# dtype: int64
# sort lexicographically by row or column label
s.sort_index()
# a 7
# b -3
# c 2
# d 4
# dtype: int64
s.sort_index(ascending=False) # descending
# d 4
# c 2
# b -3
# a 7
# dtype: int64
i = s.index.sort_values()
# Index(['a', 'b', 'c', 'd'], dtype='object')
s.reindex(i)
# a 7
# b -3
# c 2
# d 4
# dtype: int64
s.sort_values() # sort by value
# b -3
# c 2
# d 4
# a 7
# dtype: int64
# any missing values are sorted to the end of the Series by default
pd.Series([4, np.nan, 7, np.nan, -3, 2]).sort_values()
# 4 -3.0
# 5 2.0
# 0 4.0
# 2 7.0
# 1 NaN
# 3 NaN
# dtype: float64
pd.Series([4, np.nan, 7, np.nan, -3, 2]).sort_values(ascending=False)
# 2 7.0
# 0 4.0
# 5 2.0
# 4 -3.0
# 1 NaN
# 3 NaN
# dtype: float64
pd.Series([4, np.nan, 7, np.nan, -3, 2]).sort_values(na_position="first")
# 1 NaN
# 3 NaN
# 4 -3.0
# 5 2.0
# 0 4.0
# 2 7.0
# dtype: float64
data = pd.DataFrame(
[[4, 7, -3, 2], [0, 1, 0, 1]], index=["three", "one"], columns=["d", "a", "b", "c"]
)
# d a b c
# three 4 7 -3 2
# one 0 1 0 1
data.sort_index()
# d a b c
# one 0 1 0 1
# three 4 7 -3 2
data.sort_index(axis="columns")
# a b c d
# three 7 -3 2 4
# one 1 0 1 0
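To sort a DataFrame by the values in one or more columns instead of its labels, use sort_values:
data.sort_values("d") # sort by the values in column "d"
#        d  a  b  c
# one    0  1  0  1
# three  4  7 -3  2
data.sort_values(["a", "d"]) # sort by multiple columns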
# compute numerical data ranks (1 through n) along axis.
s = pd.Series([7, -5, 7, 4, 2, 0, 4])
s.rank() # by default, average rank of the group
# 0 6.5
# 1 1.0
# 2 6.5
# 3 4.5
# 4 3.0
# 5 2.0
# 6 4.5
# dtype: float64
s.rank(method="first") # ranks assigned in order they appear in the array
# 0 6.0
# 1 1.0
# 2 7.0
# 3 4.0
# 4 3.0
# 5 2.0
# 6 5.0
# dtype: float64
s.rank(ascending=False) # rank in descending order
# 0 1.5
# 1 7.0
# 2 1.5
# 3 3.5
# 4 5.0
# 5 6.0
# 6 3.5
# dtype: float64
# DataFrame can compute ranks over the rows or the columns
data = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})
# b a c
# 0 4.3 0 -2.0
# 1 7.0 1 5.0
# 2 -3.0 0 8.0
# 3 2.0 1 -2.5
data.rank()
# b a c
# 0 3.0 1.5 2.0
# 1 4.0 3.5 3.0
# 2 1.0 1.5 4.0
# 3 2.0 3.5 1.0
data.rank(method="first", axis="columns")
# b a c
# 0 3.0 2.0 1.0
# 1 3.0 1.0 2.0
# 2 1.0 2.0 3.0
# 3 3.0 2.0 1.0
2.2.7. Axis Indexes with Duplicate Labels
s = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])
# a 0
# a 1
# b 2
# b 3
# c 4
# dtype: int64
s.index.is_unique
# False
s["a"] # return a Series
# a 0
# a 1
# dtype: int64
s["c"] # return a scalar value
# np.int64(4)
data = pd.DataFrame(np.random.standard_normal((5, 3)), index=["a", "a", "b", "b", "c"])
# 0 1 2
# a -0.366146 0.329982 0.186980
# a 1.275523 0.615156 -3.403692
# b 0.488783 -0.448384 -2.626718
# b 0.610570 2.211333 1.131847
# c -0.157337 0.212340 0.175906
data.loc["a"] # return a DataFrame
# 0 1 2
# a -0.366146 0.329982 0.186980
# a 1.275523 0.615156 -3.403692
data.loc["c"] # return a Series
# 0 -0.157337
# 1 0.212340
# 2 0.175906
# Name: c, dtype: float64
2.2.8. Unique Values, Value Counts, and Membership
s = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
s.unique() # return unique values of Series object
# array(['c', 'a', 'd', 'b'], dtype=object)
s.value_counts() # return a Series containing counts of unique values
# c 3
# a 3
# b 2
# d 1
# Name: count, dtype: int64
mask = s.isin(["b", "c"]) # performs a vectorized set membership check
# 0 True
# 1 False
# 2 False
# 3 False
# 4 False
# 5 True
# 6 True
# 7 True
# 8 True
# dtype: bool
s[mask]
# 0 c
# 5 b
# 6 b
# 7 c
# 8 c
# dtype: object
pd.Index(pd.Series(s.unique()))
# Index(['c', 'a', 'd', 'b'], dtype='object')
pd.Index(pd.Series(s.unique())).get_indexer(s)
# array([0, 1, 2, 1, 1, 3, 3, 0, 0])
2.3. Reading and Writing
2.3.1. Reading and Writing CSV Text Files
csv_data = """
Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
1,John,1998-05-06,1234567890,25,New York,USA,894240000
2,Jane,1993-07-15,0987654321,30,London,UK,741484800
3,Doe,2001-12-20,1122334455,,Sydney,Australia,1008806400
4,NULL,1990-01-01,NULL,40,NULL,Canada,631152000
"""
# load the CSV data into a DataFrame
import io
data = pd.read_csv(
io.StringIO(csv_data), # use io.StringIO to simulate a file-like object
parse_dates=["Birth Day"], # parse the 'Birth Day' column as datetime
dtype={"Phone Number": str}, # treat 'Phone Number' as a string
na_values=["", "NULL"], # handle missing values
)
# Id Name Birth Day Phone Number Age City Country Timestamp
# 0 1 John 1998-05-06 1234567890 25.0 New York USA 894240000
# 1 2 Jane 1993-07-15 0987654321 30.0 London UK 741484800
# 2 3 Doe 2001-12-20 1122334455 NaN Sydney Australia 1008806400
# 3 4 NaN 1990-01-01 NaN 40.0 NaN Canada 631152000
# convert the 'Timestamp' column from Unix epoch time to datetime
data["Timestamp"] = pd.to_datetime(data["Timestamp"], unit="s")
#    Id  Name  Birth Day Phone Number   Age      City    Country  Timestamp
# 0   1  John 1998-05-06   1234567890  25.0  New York        USA 1998-05-04
# 1   2  Jane 1993-07-15   0987654321  30.0    London         UK 1993-07-01
# 2   3   Doe 2001-12-20   1122334455   NaN    Sydney  Australia 2001-12-20
# 3   4   NaN 1990-01-01          NaN  40.0       NaN     Canada 1990-01-01
data.dtypes
# Id int64
# Name object
# Birth Day datetime64[ns]
# Phone Number object
# Age float64
# City object
# Country object
# Timestamp datetime64[ns]
# dtype: object
# write the data out to a comma-separated file
import sys
data.to_csv(sys.stdout) # write to the console instead of a file
# ,Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
# 0,1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 1,2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 2,3,Doe,2001-12-20,1122334455,,Sydney,Australia,2001-12-20
# 3,4,,1990-01-01,,40.0,,Canada,1990-01-01
data.to_csv(sys.stdout, index=False)
# Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
# 1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 3,Doe,2001-12-20,1122334455,,Sydney,Australia,2001-12-20
# 4,,1990-01-01,,40.0,,Canada,1990-01-01
data.to_csv(sys.stdout, index=False, na_rep='NIL')
# Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
# 1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 3,Doe,2001-12-20,1122334455,NIL,Sydney,Australia,2001-12-20
# 4,NIL,1990-01-01,NIL,40.0,NIL,Canada,1990-01-01
data.to_csv(sys.stdout, index=False, na_rep='NIL', header=False)
# 1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 3,Doe,2001-12-20,1122334455,NIL,Sydney,Australia,2001-12-20
# 4,NIL,1990-01-01,NIL,40.0,NIL,Canada,1990-01-01
2.3.2. Reading and Writing Microsoft Excel Files
# add-on packages xlrd and openpyxl to read old-style XLS and newer XLSX files
conda install openpyxl xlrd
- pandas.ExcelFile: read sheets and parse as DataFrame
xlsx = pd.ExcelFile("book1.xlsx")
xlsx.sheet_names
# ['Sheet1']
xlsx.parse(sheet_name="Sheet1")
#    0  a   b   c   d message
# 0  0  1   2   3   4   hello
# 1  1  5   6   7   8   world
# 2  2  9  10  11  12     foo
df = xlsx.parse(sheet_name="Sheet1", index_col=0)
#    a   b   c   d message
# 0
# 0  1   2   3   4   hello
# 1  5   6   7   8   world
# 2  9  10  11  12     foo
df.columns
# Index(['a', 'b', 'c', 'd', 'message'], dtype='object')
df.index
# Index([0, 1, 2], dtype='int64', name=0)
xlsx.parse(sheet_name="Sheet1", index_col="message")
#          0  a  b   c   d
# message
# hello    0  1  2   3   4
# world    1  5  6   7   8
# foo      2  9 10  11  12
- pandas.read_excel: read a sheet as pandas data directly
pd.read_excel("book1.xlsx") # pd.read_excel("book1.xlsx", sheet_name=0)
#    0  a   b   c   d message
# 0  0  1   2   3   4   hello
# 1  1  5   6   7   8   world
# 2  2  9  10  11  12     foo
pd.read_excel("book1.xlsx", sheet_name="Sheet1")
#    0  a   b   c   d message
# 0  0  1   2   3   4   hello
# 1  1  5   6   7   8   world
# 2  2  9  10  11  12     foo
- pandas.DataFrame.to_excel: write pandas data to Excel files
writer = pd.ExcelWriter("sample1.xlsx")
df.to_excel(writer, sheet_name="foo") # write to sheet 'foo'
df.to_excel(writer, sheet_name="bar") # write to sheet 'bar'
df.to_excel(writer, sheet_name="buz") # write to sheet 'buz'
writer.close() # save and close the file
pd.ExcelFile("sample1.xlsx").sheet_names
# ['foo', 'bar', 'buz']
2.3.3. Reading SQL-based Databases
import sqlite3
query = "CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER );"
con = sqlite3.connect("mydata.sqlite")
con.execute(query)
# <sqlite3.Cursor at 0x7f56df665840>
con.commit()
data = [
("Atlanta", "Georgia", 1.25, 6),
("Tallahassee", "Florida", 2.6, 3),
("Sacramento", "California", 1.7, 5),
]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
# <sqlite3.Cursor at 0x7f56e074fac0>
con.commit()
cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()
# [('Atlanta', 'Georgia', 1.25, 6),
# ('Tallahassee', 'Florida', 2.6, 3),
# ('Sacramento', 'California', 1.7, 5)]
cursor.description
# (('a', None, None, None, None, None, None),
# ('b', None, None, None, None, None, None),
# ('c', None, None, None, None, None, None),
# ('d', None, None, None, None, None, None))
columns = [x[0] for x in cursor.description]
pd.DataFrame(rows, columns=columns)
# a b c d
# 0 Atlanta Georgia 1.25 6
# 1 Tallahassee Florida 2.60 3
# 2 Sacramento California 1.70 5
- pandas.read_sql: read data easily from a general SQLAlchemy connection
import sqlalchemy as sqla # conda install sqlalchemy -y
db = sqla.create_engine("sqlite:///mydata.sqlite")
pd.read_sql("SELECT * FROM test", db)
#              a           b     c  d
# 0      Atlanta     Georgia  1.25  6
# 1  Tallahassee     Florida  2.60  3
# 2   Sacramento  California  1.70  5
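Going the other way, DataFrame.to_sql writes a frame into a database table through the same connection (the test2 table name below is just for illustration):
df = pd.DataFrame({"a": ["New York", "Boston"], "b": [10, 20]})
df.to_sql("test2", db, index=False) # create the table and insert the rows
pd.read_sql("SELECT * FROM test2", db)
#           a   b
# 0  New York  10
# 1    Boston  20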
2.4. Data Cleaning and Preparation
2.4.1. Handling Missing Data
s = pd.Series([1, np.nan, 3.5, None, 7])
s.dropna() # remove missing values
# 0 1.0
# 2 3.5
# 4 7.0
# dtype: float64
s.fillna(0) # fill NA/NaN values (NA: not available)
# 0 1.0
# 1 0.0
# 2 3.5
# 3 0.0
# 4 7.0
# dtype: float64
s.isna()
# 0 False
# 1 True
# 2 False
# 3 True
# 4 False
# dtype: bool
s.notna()
# 0 True
# 1 False
# 2 True
# 3 False
# 4 True
# dtype: bool
data = pd.DataFrame(
[
[1.0, 6.5, 3.0],
[1.0, np.nan, np.nan],
[np.nan, np.nan, np.nan],
[np.nan, 6.5, 3.0],
]
)
# 0 1 2
# 0 1.0 6.5 3.0
# 1 1.0 NaN NaN
# 2 NaN NaN NaN
# 3 NaN 6.5 3.0
data.dropna()
# 0 1 2
# 0 1.0 6.5 3.0
data.dropna(how="all")
# 0 1 2
# 0 1.0 6.5 3.0
# 1 1.0 NaN NaN
# 3 NaN 6.5 3.0
data[4] = np.nan
# 0 1 2 4
# 0 1.0 6.5 3.0 NaN
# 1 1.0 NaN NaN NaN
# 2 NaN NaN NaN NaN
# 3 NaN 6.5 3.0 NaN
data.dropna(axis="columns", how="all")
# 0 1 2
# 0 1.0 6.5 3.0
# 1 1.0 NaN NaN
# 2 NaN NaN NaN
# 3 NaN 6.5 3.0
data = pd.DataFrame(np.random.standard_normal((7, 3)))
data.iloc[:4, 1] = np.nan
data.iloc[:2, 2] = np.nan
# 0 1 2
# 0 -0.327178 NaN NaN
# 1 1.345916 NaN NaN
# 2 -0.502837 NaN 0.894144
# 3 -0.372355 NaN -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.dropna()
# 0 1 2
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.dropna(thresh=2)
# 0 1 2
# 2 -0.502837 NaN 0.894144
# 3 -0.372355 NaN -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.ffill() # fill NA/NaN values by using the last (row) valid observation to fill the gap
# 0 1 2
# 0 -0.327178 NaN NaN
# 1 1.345916 NaN NaN
# 2 -0.502837 NaN 0.894144
# 3 -0.372355 NaN -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.ffill(axis="columns")
# 0 1 2
# 0 -0.327178 -0.327178 -0.327178
# 1 1.345916 1.345916 1.345916
# 2 -0.502837 -0.502837 0.894144
# 3 -0.372355 -0.372355 -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.bfill() # fill NA/NaN values by using the next (row) valid observation to fill the gap
# 0 1 2
# 0 -0.327178 0.484139 0.894144
# 1 1.345916 0.484139 0.894144
# 2 -0.502837 0.484139 0.894144
# 3 -0.372355 0.484139 -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
data.bfill(axis="columns")
# 0 1 2
# 0 -0.327178 NaN NaN
# 1 1.345916 NaN NaN
# 2 -0.502837 0.894144 0.894144
# 3 -0.372355 -1.719540 -1.719540
# 4 -0.210112 0.484139 0.904050
# 5 0.100575 -1.676686 -0.691126
# 6 0.581531 -0.615403 -1.176631
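fillna also accepts a dictionary keyed by column (or a Series aligned on the columns) to use a different fill value per column; sketches on the same data, outputs omitted:
data.fillna({1: 0.5, 2: 0.0}) # fill column 1 with 0.5 and column 2 with 0.0
data.fillna(data.mean())      # fill each column with that column's mean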
2.4.2. Removing Duplicates
data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"], "k2": [1, 1, 2, 3, 3, 4, 4]})
# k1 k2
# 0 one 1
# 1 two 1
# 2 one 2
# 3 two 3
# 4 one 3
# 5 two 4
# 6 two 4
data.duplicated()
# 0 False
# 1 False
# 2 False
# 3 False
# 4 False
# 5 False
# 6 True
# dtype: bool
data.drop_duplicates()
#     k1  k2
# 0  one   1
# 1  two   1
# 2  one   2
# 3  two   3
# 4  one   3
# 5  two   4
data["v1"] = range(7)
# k1 k2 v1
# 0 one 1 0
# 1 two 1 1
# 2 one 2 2
# 3 two 3 3
# 4 one 3 4
# 5 two 4 5
# 6 two 4 6
data.drop_duplicates(subset=["k1"])
# k1 k2 v1
# 0 one 1 0
# 1 two 1 1
data.drop_duplicates(subset=["k1"], keep="last")
# k1 k2 v1
# 4 one 3 4
# 6 two 4 6
2.4.3. Transforming Data Using a Function or Mapping
data = pd.DataFrame(
{
"food": [
"bacon",
"pulled pork",
"bacon",
"pastrami",
"corned beef",
"bacon",
"pastrami",
"honey ham",
"nova lox",
],
"ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
}
)
# food ounces
# 0 bacon 4.0
# 1 pulled pork 3.0
# 2 bacon 12.0
# 3 pastrami 6.0
# 4 corned beef 7.5
# 5 bacon 8.0
# 6 pastrami 3.0
# 7 honey ham 5.0
# 8 nova lox 6.0
meat_to_animal = {
"bacon": "pig",
"pulled pork": "pig",
"pastrami": "cow",
"corned beef": "cow",
"honey ham": "pig",
"nova lox": "salmon",
}
data["animal"] = data["food"].map(meat_to_animal) # data["food"].map(lambda x: meat_to_animal.get(x, None))
# food ounces animal
# 0 bacon 4.0 pig
# 1 pulled pork 3.0 pig
# 2 bacon 12.0 pig
# 3 pastrami 6.0 cow
# 4 corned beef 7.5 cow
# 5 bacon 8.0 pig
# 6 pastrami 3.0 cow
# 7 honey ham 5.0 pig
# 8 nova lox 6.0 salmon
2.4.4. Replacing Values
data = pd.Series([1., -999., 2., -999., -1000., 3.])
# 0 1.0
# 1 -999.0
# 2 2.0
# 3 -999.0
# 4 -1000.0
# 5 3.0
# dtype: float64
data.replace(-999, np.nan)
# 0 1.0
# 1 NaN
# 2 2.0
# 3 NaN
# 4 -1000.0
# 5 3.0
# dtype: float64
data.replace([-999, -1000], np.nan) # replace multiple values at once
# 0 1.0
# 1 NaN
# 2 2.0
# 3 NaN
# 4 NaN
# 5 3.0
# dtype: float64
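To use a different replacement for each value, pass two lists or a dictionary:
data.replace([-999, -1000], [np.nan, 0]) # a replacement per value
data.replace({-999: np.nan, -1000: 0})   # the same, written as a dictionary
# 0    1.0
# 1    NaN
# 2    2.0
# 3    NaN
# 4    0.0
# 5    3.0
# dtype: float64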
2.4.5. Renaming Axis Indexes
data = pd.DataFrame(
np.arange(12).reshape((3, 4)),
index=["Ohio", "Colorado", "New York"],
columns=["one", "two", "three", "four"],
)
data.index.map(lambda x: x[:4].upper())
# Index(['OHIO', 'COLO', 'NEW '], dtype='object')
data.index = data.index.map(lambda x: x[:4].upper())
# one two three four
# OHIO 0 1 2 3
# COLO 4 5 6 7
# NEW 8 9 10 11
data.rename(index=str.title, columns=str.upper)
# ONE TWO THREE FOUR
# Ohio 0 1 2 3
# Colo 4 5 6 7
# New 8 9 10 11
2.4.6. Discretization and Binning
# Continuous data is often discretized or otherwise separated into “bins” for analysis.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
age_categories = pd.cut(ages, bins) # pandas.core.arrays.categorical.Categorical
# [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
# Length: 12
# Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
age_categories.codes
# array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
age_categories.categories
# IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
age_categories.categories[0]
# Interval(18, 25, closed='right')
age_categories.value_counts()
# (18, 25] 5
# (25, 35] 3
# (35, 60] 3
# (60, 100] 1
# Name: count, dtype: int64
pd.cut(ages, bins, right=False).categories
# IntervalIndex([[18, 25), [25, 35), [35, 60), [60, 100)], dtype='interval[int64, left]')
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages, bins, right=False, labels=group_names).categories
# Index(['Youth', 'YoungAdult', 'MiddleAged', 'Senior'], dtype='object')
data = np.random.uniform(size=20)
# array([0.34500722, 0.51390559, 0.45442101, 0.24779283, 0.11860691,
# 0.64541406, 0.95682711, 0.69983468, 0.661519 , 0.09869171,
# 0.12846634, 0.10438803, 0.6110211 , 0.28152758, 0.40054146,
# 0.36215715, 0.29260695, 0.0516599 , 0.7291701 , 0.84767151])
pd.cut(data, 4, precision=2).categories # equal-length bins based on the minimum and maximum values
# IntervalIndex([(0.051, 0.28], (0.28, 0.5], (0.5, 0.73], (0.73, 0.96]], dtype='interval[float64, right]')
pd.qcut(data, 4, precision=2).value_counts()
# (0.041999999999999996, 0.22] 5
# (0.22, 0.38] 5
# (0.38, 0.65] 5
# (0.65, 0.96] 5
# Name: count, dtype: int64
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()
# (0.0507, 0.104] 2
# (0.104, 0.381] 8
# (0.381, 0.741] 8
# (0.741, 0.957] 2
# Name: count, dtype: int64
2.4.7. Detecting and Filtering Outliers
data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()
# 0 1 2 3
# count 1000.000000 1000.000000 1000.000000 1000.000000
# mean -0.002266 -0.036654 -0.017418 -0.029068
# std 1.018625 0.950048 0.994702 1.041082
# min -3.421405 -3.833754 -3.642241 -3.125454
# 25% -0.663947 -0.694460 -0.659706 -0.762197
# 50% -0.024047 -0.005600 -0.025911 -0.024954
# 75% 0.700707 0.603613 0.663005 0.701335
# max 3.134904 2.526298 2.994854 3.704972
data[(data.abs() > 3).any(axis="columns")] # select all rows having a value exceeding 3 or –3
# 0 1 2 3
# 64 0.769583 -3.833754 -0.284691 0.283270
# 176 -3.198594 0.745868 -0.430357 -0.374628
# 207 0.823846 -0.171761 1.137797 3.704972
# 416 -3.024055 0.560612 1.320891 -0.691014
# 790 -3.421405 0.516339 0.049209 1.016584
# 915 -0.543984 0.844945 -3.010801 -0.165290
# 916 3.134904 0.091651 0.908098 1.685942
# 945 3.073425 -0.943425 -1.711061 0.573415
# 955 -1.375006 -0.344609 -3.642241 -0.823830
# 981 0.507220 0.136884 0.880373 -3.125454
data[data.abs() > 3] = np.sign(data) * 3 # cap values outside the interval [–3, 3]
data.describe()
# 0 1 2 3
# count 1000.000000 1000.000000 1000.000000 1000.000000
# mean -0.001830 -0.035821 -0.016765 -0.029648
# std 1.015991 0.947074 0.992531 1.038420
# min -3.000000 -3.000000 -3.000000 -3.000000
# 25% -0.663947 -0.694460 -0.659706 -0.762197
# 50% -0.024047 -0.005600 -0.025911 -0.024954
# 75% 0.700707 0.603613 0.663005 0.701335
# max 3.000000 2.526298 2.994854 3.000000
np.sign(data).head() # produces 1 and –1 values based on positive or negative
# 0 1 2 3
# 0 -1.0 1.0 -1.0 1.0
# 1 -1.0 -1.0 -1.0 -1.0
# 2 1.0 -1.0 -1.0 1.0
# 3 -1.0 -1.0 1.0 -1.0
# 4 1.0 1.0 1.0 -1.0
2.4.8. Permutation and Random Sampling
data = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
# 0 1 2 3 4 5 6
# 0 0 1 2 3 4 5 6
# 1 7 8 9 10 11 12 13
# 2 14 15 16 17 18 19 20
# 3 21 22 23 24 25 26 27
# 4 28 29 30 31 32 33 34
sampler = np.random.permutation(5)
# array([2, 4, 3, 0, 1])
data.iloc[sampler] # equivalent: data.take(sampler)
#     0   1   2   3   4   5   6
# 2  14  15  16  17  18  19  20
# 4  28  29  30  31  32  33  34
# 3  21  22  23  24  25  26  27
# 0   0   1   2   3   4   5   6
# 1   7   8   9  10  11  12  13
column_sampler = np.random.permutation(7)
# array([3, 5, 2, 4, 6, 0, 1])
data.take(column_sampler, axis="columns")
# 3 5 2 4 6 0 1
# 0 3 5 2 4 6 0 1
# 1 10 12 9 11 13 7 8
# 2 17 19 16 18 20 14 15
# 3 24 26 23 25 27 21 22
# 4 31 33 30 32 34 28 29
data.sample(n=3) # select a random subset without replacement
# 0 1 2 3 4 5 6
# 0 0 1 2 3 4 5 6
# 2 14 15 16 17 18 19 20
# 4 28 29 30 31 32 33 34
data.sample(n=7, replace=True) # select a random subset with replacement (to allow repeat choices)
# 0 1 2 3 4 5 6
# 3 21 22 23 24 25 26 27
# 0 0 1 2 3 4 5 6
# 1 7 8 9 10 11 12 13
# 0 0 1 2 3 4 5 6
# 4 28 29 30 31 32 33 34
# 4 28 29 30 31 32 33 34
# 3 21 22 23 24 25 26 27
2.4.9. Computing Indicator/Dummy Variables: One-Hot Encoding
data = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"], "data1": range(6)})
# key data1
# 0 b 0
# 1 b 1
# 2 a 2
# 3 c 3
# 4 a 4
# 5 b 5
pd.get_dummies(data["key"], dtype=int)
# a b c
# 0 0 1 0
# 1 0 1 0
# 2 1 0 0
# 3 0 0 1
# 4 1 0 0
# 5 0 1 0
data.join(pd.get_dummies(data["key"], dtype=int))
# key data1 a b c
# 0 b 0 0 1 0
# 1 b 1 0 1 0
# 2 a 2 1 0 0
# 3 c 3 0 0 1
# 4 a 4 1 0 0
# 5 b 5 0 1 0
movies = pd.read_table(
"https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/datasets/movielens/movies.dat",
sep="::",
header=None,
names=["movie_id", "title", "genres"],
engine="python",
)
movies.head()
# movie_id title genres
# 0 1 Toy Story (1995) Animation|Children's|Comedy
# 1 2 Jumanji (1995) Adventure|Children's|Fantasy
# 2 3 Grumpier Old Men (1995) Comedy|Romance
# 3 4 Waiting to Exhale (1995) Comedy|Drama
# 4 5 Father of the Bride Part II (1995) Comedy
dummies = movies["genres"].str.get_dummies("|") # string functions in pandas
dummies.iloc[:5,:6]
# Action Adventure Animation Children's Comedy Crime
# 0 0 0 1 1 1 0
# 1 0 1 0 1 0 0
# 2 0 0 0 0 1 0
# 3 0 0 0 0 1 0
# 4 0 0 0 0 1 0
movies_windic = movies.join(dummies.add_prefix("Genre_"))
movies_windic.iloc[0][:10]
# movie_id 1
# title Toy Story (1995)
# genres Animation|Children's|Comedy
# Genre_Action 0
# Genre_Adventure 0
# Genre_Animation 1
# Genre_Children's 1
# Genre_Comedy 1
# Genre_Crime 0
# Genre_Documentary 0
# Name: 0, dtype: object
np.random.seed(12345) # to make the example repeatable
values = np.random.uniform(size=10)
# array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
# 0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins), dtype=int)
# (0.0, 0.2] (0.2, 0.4] (0.4, 0.6] (0.6, 0.8] (0.8, 1.0]
# 0 0 0 0 0 1
# 1 0 1 0 0 0
# 2 1 0 0 0 0
# 3 0 1 0 0 0
# 4 0 0 1 0 0
# 5 0 0 1 0 0
# 6 0 0 0 0 1
# 7 0 0 0 1 0
# 8 0 0 0 1 0
# 9 0 0 0 1 0
2.4.10. String Functions in pandas
data = {
"Dave": "dave@google.com",
"Steve": "steve@gmail.com",
"Rob": "rob@gmail.com",
"Wes": np.nan,
}
data = pd.Series(data)
# Dave dave@google.com
# Steve steve@gmail.com
# Rob rob@gmail.com
# Wes NaN
# dtype: object
data.isna()
# Dave False
# Steve False
# Rob False
# Wes True
# dtype: bool
data.astype("string") # extension types
# Dave dave@google.com
# Steve steve@gmail.com
# Rob rob@gmail.com
# Wes <NA>
# dtype: string
data.str.contains("gmail") # .str attribute
# Dave False
# Steve True
# Rob True
# Wes NaN
# dtype: object
import re
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
data.str.findall(pattern, flags=re.IGNORECASE) # regular expressions
# Dave [(dave, google, com)]
# Steve [(steve, gmail, com)]
# Rob [(rob, gmail, com)]
# Wes NaN
# dtype: object
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0] # vectorized element retrieval
# Dave (dave, google, com)
# Steve (steve, gmail, com)
# Rob (rob, gmail, com)
# Wes NaN
# dtype: object
matches.str.get(1)
# Dave google
# Steve gmail
# Rob gmail
# Wes NaN
# dtype: object
data.str[:5] # slice strings
# Dave dave@
# Steve steve
# Rob rob@g
# Wes NaN
# dtype: object
data.str.extract(pattern, flags=re.IGNORECASE) # extract capture groups
# 0 1 2
# Dave dave google com
# Steve steve gmail com
# Rob rob gmail com
# Wes NaN NaN NaN
2.4.11. Categorical Encoding
categories = pd.Series(["apple", "orange"]) # categories, dictionary, or levels
# 0 apple
# 1 orange
# dtype: object
codes = pd.Series([0, 1, 0, 0] * 2) # category codes or codes
# 0 0
# 1 1
# 2 0
# 3 0
# 4 0
# 5 1
# 6 0
# 7 0
# dtype: int64
values = categories.take(codes)
# 0 apple
# 1 orange
# 0 apple
# 0 apple
# 0 apple
# 1 orange
# 0 apple
# 0 apple
# dtype: object
# pandas has a Categorical extension type for integer-based categorical data (also called dictionary-encoded data)
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
rng = np.random.default_rng(seed=12345)
data = pd.DataFrame(
{
"fruit": fruits,
"basket_id": np.arange(N),
"count": rng.integers(3, 15, size=N),
"weight": rng.uniform(0, 4, size=N),
},
columns=["basket_id", "fruit", "count", "weight"],
)
# basket_id fruit count weight
# 0 0 apple 8 0.366659
# 1 1 orange 6 2.394272
# 2 2 apple 9 3.418968
# 3 3 apple 12 2.406485
# 4 4 apple 8 3.727953
# 5 5 orange 5 2.899125
# 6 6 apple 3 3.442205
# 7 7 apple 4 3.717351
fruit_cat = data["fruit"].astype("category")
# 0 apple
# 1 orange
# 2 apple
# 3 apple
# 4 apple
# 5 orange
# 6 apple
# 7 apple
# Name: fruit, dtype: category
# Categories (2, object): ['apple', 'orange']
c = fruit_cat.array
type(c) # pandas.core.arrays.categorical.Categorical
c.categories # Index(['apple', 'orange'], dtype='object')
c.codes # array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)
dict(enumerate(c.categories)) # get a mapping between codes and categories
# {0: 'apple', 1: 'orange'}
pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])
# ['foo', 'bar', 'baz', 'foo', 'bar']
# Categories (3, object): ['bar', 'baz', 'foo']
pd.Categorical.from_codes(codes=[0, 1, 2, 0, 0, 1], categories=["foo", "bar", "baz"])
# ['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
# Categories (3, object): ['foo', 'bar', 'baz']
pd.Categorical.from_codes(codes=[0, 1, 2, 0, 0, 1], categories=["foo", "bar", "baz"], ordered=True)
# ['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
# Categories (3, object): ['foo' < 'bar' < 'baz']
# `.cat` accessor attribute
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2).astype("category")
cat_s.cat.categories
# Index(['a', 'b', 'c', 'd'], dtype='object')
cat_s.cat.codes
# 0 0
# 1 1
# 2 2
# 3 3
# 4 0
# 5 1
# 6 2
# 7 3
# dtype: int8
cat_s.value_counts()
# a 2
# b 2
# c 2
# d 2
# Name: count, dtype: int64
cat_s2 = cat_s.cat.set_categories(["a", "b", "c", "d", "e"])
cat_s2.value_counts()
# a 2
# b 2
# c 2
# d 2
# e 0
# Name: count, dtype: int64
cat_s3 = cat_s[cat_s.isin(["a", "b"])]
# 0 a
# 1 b
# 4 a
# 5 b
# dtype: category
# Categories (4, object): ['a', 'b', 'c', 'd']
cat_s3.cat.remove_unused_categories()
# 0 a
# 1 b
# 4 a
# 5 b
# dtype: category
# Categories (2, object): ['a', 'b']
draws = np.random.default_rng(seed=12345).standard_normal(1_000)
# array([-1.42382504,  1.26372846, -0.87066174, -0.25917323, -0.07534331, ...
bins = pd.qcut(draws, 4) # compute a quartile binning
# [(-3.121, -0.675], (0.687, 3.211], (-3.121, -0.675], (-0.675, 0.0134], (-0.675, 0.0134], ..., (0.0134, 0.687], (0.0134, 0.687], (-0.675, 0.0134], (0.0134, 0.687], (-0.675, 0.0134]]
# Length: 1000
# Categories (4, interval[float64, right]): [(-3.121, -0.675] < (-0.675, 0.0134] < (0.0134, 0.687] < (0.687, 3.211]]
bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4']) # quartile names
# ['Q1', 'Q4', 'Q1', 'Q2', 'Q2', ..., 'Q3', 'Q3', 'Q2', 'Q3', 'Q2']
# Length: 1000
# Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']
bins = pd.Series(bins, name='quartile')
# 0 Q1
# 1 Q4
# 2 Q1
# ..
# 998 Q3
# 999 Q2
# Name: quartile, Length: 1000, dtype: category
# Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']
results = pd.Series(draws).groupby(bins, observed=True).agg(["count", "min", "max"]).reset_index() # statistics
# quartile count min max
# 0 Q1 250 -3.119609 -0.678494
# 1 Q2 250 -0.673305 0.008009
# 2 Q3 250 0.018753 0.686183
# 3 Q4 250 0.688282 3.211418
# dummy variables or one-hot encoding
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')
pd.get_dummies(cat_s, dtype=int)
# a b c d
# 0 1 0 0 0
# 1 0 1 0 0
# 2 0 0 1 0
# 3 0 0 0 1
# 4 1 0 0 0
# 5 0 1 0 0
# 6 0 0 1 0
# 7 0 0 0 1
2.5. Data Wrangling: Join, Combine, and Reshape
2.5.1. Hierarchical Indexing
# a Series with a list of lists (or arrays) as the index
data = pd.Series(
np.random.uniform(size=9),
index=[
["a", "a", "a", "b", "b", "c", "c", "d", "d"],
[ 1, 2, 3, 1, 3, 1, 2, 2, 3]
],
)
print(data) # a prettified view of a Series with a MultiIndex as its index
# a 1 0.141713
# 2 0.969081
# 3 0.850562
# b 1 0.882278
# 3 0.539535
# c 1 0.944913
# 2 0.371328
# d 2 0.585134
# 3 0.826680
# dtype: float64
data.index # MultiIndex
# MultiIndex([('a', 1),
# ('a', 2),
# ('a', 3),
# ('b', 1),
# ('b', 3),
# ('c', 1),
# ('c', 2),
# ('d', 2),
# ('d', 3)],
# )
data["b"] # partial indexing
# 1 0.882278
# 3 0.539535
# dtype: float64
data.loc[:, 2] # level indexing
# a 0.969081
# c 0.371328
# d 0.585134
# dtype: float64
data.unstack()
# 1 2 3
# a 0.141713 0.969081 0.850562
# b 0.882278 NaN 0.539535
# c 0.944913 0.371328 NaN
# d NaN 0.585134 0.826680
data.unstack().stack()
# a 1 0.141713
# 2 0.969081
# 3 0.850562
# b 1 0.882278
# 3 0.539535
# c 1 0.944913
# 2 0.371328
# d 2 0.585134
# 3 0.826680
# dtype: float64
df = pd.DataFrame(
np.arange(12).reshape((4, 3)),
index=[
["a", "a", "b", "b"],
[ 1, 2, 1, 2]
],
columns=[
[ "Ohio", "Ohio", "Colorado"],
["Green", "Red", "Green"]
],
)
# Ohio Colorado
# Green Red Green
# a 1 0 1 2
# 2 3 4 5
# b 1 6 7 8
# 2 9 10 11
df.index.names = ["key1", "key2"]
df.columns.names = ["state", "color"]
# state Ohio Colorado
# color Green Red Green
# key1 key2
# a 1 0 1 2
# 2 3 4 5
# b 1 6 7 8
# 2 9 10 11
df.index.nlevels
# 2
df["Ohio"]
# color Green Red
# key1 key2
# a 1 0 1
# 2 3 4
# b 1 6 7
# 2 9 10
# a MultiIndex can be created by itself and then reused
pd.MultiIndex.from_arrays(
[
[ "Ohio", "Ohio", "Colorado"],
["Green", "Red", "Green"],
],
names=["state", "color"],
)
# reordering and sorting levels
df.swaplevel("key1", "key2")
# state Ohio Colorado
# color Green Red Green
# key2 key1
# 1 a 0 1 2
# 2 a 3 4 5
# 1 b 6 7 8
# 2 b 9 10 11
df.sort_index(level=1)
# state Ohio Colorado
# color Green Red Green
# key1 key2
# a 1 0 1 2
# b 1 6 7 8
# a 2 3 4 5
# b 2 9 10 11
df.swaplevel(0, 1)
# state Ohio Colorado
# color Green Red Green
# key2 key1
# 1 a 0 1 2
# 2 a 3 4 5
# 1 b 6 7 8
# 2 b 9 10 11
df.swaplevel(0, 1).sort_index(level=0)
# state Ohio Colorado
# color Green Red Green
# key2 key1
# 1 a 0 1 2
# b 6 7 8
# 2 a 3 4 5
# b 9 10 11
# summary statistics by level
df.groupby(level="key2").sum()
# state Ohio Colorado
# color Green Red Green
# key2
# 1 6 8 10
# 2 12 14 16
df.T.groupby(level="color").sum().T
# color Green Red
# key1 key2
# a 1 2 1
# 2 8 4
# b 1 14 7
# 2 20 10
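Related to hierarchical indexing, DataFrame.set_index creates a MultiIndex from one or more columns, and reset_index moves index levels back into columns; a minimal sketch:
frame = pd.DataFrame({
    "a": range(7),
    "b": range(7, 0, -1),
    "c": ["one", "one", "one", "two", "two", "two", "two"],
    "d": [0, 1, 2, 0, 1, 2, 3],
})
frame2 = frame.set_index(["c", "d"]) # columns "c" and "d" become index levels
frame2.reset_index() # move the index levels back into columns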