conda install -y pandas jupyter matplotlib seaborn statsmodels
# import conventions
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
# pd.options.display.max_columns = 20
# pd.options.display.max_colwidth = 80
# pd.options.display.max_rows = 20
# np.set_printoptions(precision=4, suppress=True)
# pd.reset_option('all')
# pd.reset_option('display.max_rows')

1. NumPy

NumPy (Numerical Python, /ˈnʌmpaɪ/) is a library for the Python programming language that’s widely used in science and engineering. It provides multidimensional array data structures, such as the homogeneous, N-dimensional array (ndarray), and a large library of functions that operate efficiently on these data structures.

import numpy as np

a = np.array([[1, 2, 3], [4, 5, 6]])
a.shape  # (2, 3)
# Integer types (signed and unsigned)
int8, uint8     # i1, u1: 8-bit (1 byte) integers
int16, uint16   # i2, u2: 16-bit integers
int32, uint32   # i4, u4: 32-bit integers
int64, uint64   # i8, u8: 64-bit integers

# Floating-point types
float16         # f2: Half-precision (16-bit) floating point
float32         # f4 or f: Single-precision (32-bit) floating point; compatible with C float
float64         # f8 or d: Double-precision (64-bit) floating point; compatible with C double and Python float
float128        # f16 or g: Extended-precision (128-bit) floating point (platform dependent)

# Complex number types
complex64       # c8: Complex number with two 32-bit floats (real and imaginary parts)
complex128      # c16: Complex number with two 64-bit floats
complex256      # c32: Complex number with two 128-bit floats

# Boolean type
bool            # ?: Boolean values (True or False)

# Object type
object          # O: Python object type; can store any Python object

# String and byte types
bytes_          # S: Fixed-length raw byte arrays (e.g., ASCII strings, 1 byte per character).
                # Use 'S10' for a byte string of length 10.
                # Each string is padded or truncated to fit the specified length.
str_            # U: Fixed-length Unicode strings using UTF-32 encoding (4 bytes per character).
                # Use 'U10' for a Unicode string of length 10.
                # Each string is padded or truncated to fit the specified length.
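
A dtype can be specified by type object or by its shorthand code, and astype casts an existing array to another type. A minimal sketch:

a = np.array([1, 2, 3], dtype=np.float64)  # by type object
b = np.array([1, 2, 3], dtype='i4')        # by shorthand code (int32)
a.dtype, b.dtype
# (dtype('float64'), dtype('int32'))

b.astype(np.float64)  # astype always creates a new array (a copy of the data)
# array([1., 2., 3.])

np.array(['1.25', '-9.6'], dtype='S8').astype(np.float64)  # numeric byte strings to floats
# array([ 1.25, -9.6 ])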

1.1. arrays

NumPy’s main object is the homogeneous multidimensional array (dimensions are called axes), known as ndarray, which is a table of same-type elements (usually numbers) indexed by a tuple of non-negative integers.

Don’t think of NumPy’s N-d arrays solely as spatial points or coordinates; instead, picture them as structured data containers, like boxes within boxes.

  • 0D: a single box holding a single value, with no axes.

    scalar = np.array(5)
    print(scalar)
    # 5
    print(scalar.shape)
    # ()
  • 1D: A single row of boxes with one axis (axis 0).

    vector = np.array([10, 20, 30])
    print(vector)
    # [10 20 30]
    print(vector.shape)
    # (3,)
    vector[1]
    # 20
  • 2D: A grid of boxes (rows and columns) with two axes: axis 0 (rows) and axis 1 (columns).

    matrix = np.array([[1, 2, 3],
                       [4, 5, 6]])
    print(matrix)
    # [[1 2 3]
    #  [4 5 6]]
    print(matrix.shape)
    # (2, 3)
    print(matrix[1, 0])
    # 4
  • 3D: A stack of grids (like layers of boxes) with three axes: axis 0 (depth), axis 1 (rows), and axis 2 (columns).

    tensor = np.array([[[1, 2], [3, 4]],
                       [[5, 6], [7, 8]]])
    print(tensor)
    # [[[1 2]
    #   [3 4]]
    #
    #  [[5 6]
    #   [7 8]]]
    print(tensor.shape)
    # (2, 2, 2)
    print(tensor[0, 1, 0])
    # 3
# create a matrix with 2 rows and 3 columns
a = np.arange(6).reshape(2, 3)
# array([[0, 1, 2],
#        [3, 4, 5]])

# the number of axes (dimensions) of the array
a.ndim  # 2

# the dimensions of the array, a tuple of integers indicating the size of the array in each dimension
a.shape  # (2, 3)
a.ndim == len(a.shape)  # True

# the total number of elements of the array
a.size  # 6 = 2 x 3

import math
a.size == math.prod(a.shape)  # True

# an object describing the type of the elements in the array.
a.dtype # dtype('int64')
a.dtype.name  # 'int64'

# the size in bytes of each element of the array.
a.itemsize  # 8

# the buffer containing the actual elements of the array.
a.data  # <memory at 0x10424e190>
# 1-dimensional array
#   axis 0 runs along the elements (the only axis)
a = np.arange(10)
# array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
a.ndim   # 1
a.shape  # (10,)
# 2-dimensional array
#   axis 0 runs vertically downwards across the rows.
#   axis 1 runs horizontally across the columns.
a = np.arange(6).reshape(2, 3)
# array([[0, 1, 2],
#        [3, 4, 5]])
a.ndim   # 2
a.shape  # (2, 3)
# 3-dimensional array
#   axis 0 runs along the depth (the first index).
#   axis 1 runs along the rows (the second index).
#   axis 2 runs along the columns (the third index).
a = np.arange(8).reshape(2, 2, 2)
# array([[[0, 1],
#         [2, 3]],
#
#        [[4, 5],
#         [6, 7]]])
a.ndim   # 3
a.shape  # (2, 2, 2)
# creating arrays

a = np.array([1, 2, 3, 4, 5, 6])
# array([1, 2, 3, 4, 5, 6])

b = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
# array([[ 1,  2,  3,  4],
#        [ 5,  6,  7,  8],
#        [ 9, 10, 11, 12]])

c = np.array([[1, 2], [3, 4]], dtype=complex)
# array([[1.+0.j, 2.+0.j],
#        [3.+0.j, 4.+0.j]])

np.zeros((3, 4))
# array([[0., 0., 0., 0.],
#        [0., 0., 0., 0.],
#        [0., 0., 0., 0.]])
np.zeros_like(np.arange(12).reshape(3, 4))
# array([[0, 0, 0, 0],
#        [0, 0, 0, 0],
#        [0, 0, 0, 0]])

np.ones((3, 4), dtype=np.int16)
# array([[1, 1, 1, 1],
#        [1, 1, 1, 1],
#        [1, 1, 1, 1]], dtype=int16)

np.empty((3, 5))  # returns uninitialized memory!
# array([[0.0e+000, 4.9e-324, 9.9e-324, 1.5e-323, 2.0e-323],
#        [2.5e-323, 3.0e-323, 3.5e-323, 4.0e-323, 4.4e-323],
#        [4.9e-323, 5.4e-323, 5.9e-323, 6.4e-323, 6.9e-323]])

np.arange(10, 30, 5)
# array([10, 15, 20, 25])

np.arange(0, 2, 0.3)
# array([0. , 0.3, 0.6, 0.9, 1.2, 1.5, 1.8])

from numpy import pi

np.linspace(0, 2, 9)  # 9 numbers from 0 to 2
# array([0.  , 0.25, 0.5 , 0.75, 1.  , 1.25, 1.5 , 1.75, 2.  ])

x = np.linspace(0, 2 * pi, 100)
f = np.sin(x)
# array([ 0.00000000e+00,  6.34239197e-02,  1.26592454e-01,  1.89251244e-01,
#         2.51147987e-01,  3.12033446e-01,  3.71662456e-01,  4.29794912e-01,
#         .     .       .

# structured arrays
x = np.array(
    [('Rex', 9, 81.0), ('Fido', 3, 27.0)],
    dtype=[('name', 'U10'), ('age', 'i4'), ('weight', 'f4')],
)
# array([('Rex', 9, 81.), ('Fido', 3, 27.)],
#       dtype=[('name', '<U10'), ('age', '<i4'), ('weight', '<f4')])
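
Individual fields of a structured array can be accessed by name, and a single row comes back as a record; a short sketch using x from above:

x['name']  # field access by name
# array(['Rex', 'Fido'], dtype='<U10')
x['age']
# array([9, 3], dtype=int32)
x[0]       # one record (repr varies slightly across NumPy versions)
# ('Rex', 9, 81.)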

1.2. arithmetic operators and universal functions

# arithmetic operators and universal functions

a = np.array([20, 30, 40, 50])

np.sin(a)
# array([ 0.91294525, -0.98803162,  0.74511316, -0.26237485])

10 * np.sin(a)
# array([ 9.12945251, -9.88031624,  7.4511316 , -2.62374854])

b = np.arange(4)
# array([0, 1, 2, 3])

c = a - b
# array([20, 29, 38, 47])

a + b
# array([20, 31, 42, 53])

b**2
# array([0, 1, 4, 9])

a < 35
# array([ True,  True, False, False])

A = np.array([[1, 1], [0, 1]])
B = np.array([[2, 0], [3, 4]])

A * B  # elementwise product
# array([[2, 0],
#        [0, 4]])
A @ B  # matrix product: A.dot(B)
# array([[5, 4],
#        [3, 4]])
# universal functions (ufunc): operate elementwise on an array, producing an array as output
b = np.arange(3)
# array([0, 1, 2])

np.exp(b)
# array([1.        , 2.71828183, 7.3890561 ])

np.sqrt(b)
# array([0.        , 1.        , 1.41421356])

c = np.array([2.0, -1.0, 4.0])
np.add(b, c)
# array([2., 0., 6.])

x = np.random.standard_normal(8)
# array([-0.99796507,  1.82697075,  0.60924821,  0.27004016, -1.62010011,
#        -0.43957991, -2.27936726, -1.87890946])

y = np.random.standard_normal(8)
# array([-0.16034636,  1.60381624, -0.44003017, -1.50187946, -0.66940637,
#        -1.2649151 , -0.12716502, -0.15756179])

np.maximum(x, y)
# array([-0.16034636,  1.82697075,  0.60924821,  0.27004016, -0.66940637,
#        -0.43957991, -0.12716502, -0.15756179])

# assign the results into an existing array rather than create a new one
out = np.zeros_like(x)
np.add(x, 1, out=out)
# array([ 2.03493046e-03,  2.82697075e+00,  1.60924821e+00,  1.27004016e+00,
#        -6.20100112e-01,  5.60420090e-01, -1.27936726e+00, -8.78909462e-01])
out
# array([ 2.03493046e-03,  2.82697075e+00,  1.60924821e+00,  1.27004016e+00,
#        -6.20100112e-01,  5.60420090e-01, -1.27936726e+00, -8.78909462e-01])
# mathematical and statistical methods
a = np.arange(12).reshape(3, 4)
# array([[ 0,  1,  2,  3],
#        [ 4,  5,  6,  7],
#        [ 8,  9, 10, 11]])

a.mean()  # np.mean(a)
# np.float64(5.5)

a.mean(axis=1)  # compute mean across the columns
# array([1.5, 5.5, 9.5])

a.sum()
# np.int64(66)

a.sum(axis=0)  # compute sum down the rows
# array([12, 15, 18, 21])

a.cumsum()  # computes the cumulative sum of the elements along a flattened version of the array
# array([ 0,  1,  3,  6, 10, 15, 21, 28, 36, 45, 55, 66])

a.cumsum(axis=0)  # computes the cumulative sum along the rows
# array([[ 0,  1,  2,  3],
#        [ 4,  6,  8, 10],
#        [12, 15, 18, 21]])

a.cumsum(axis=1)  # computes the cumulative sum along the columns
# array([[ 0,  1,  3,  6],
#        [ 4,  9, 15, 22],
#        [ 8, 17, 27, 38]])

b = np.random.permutation(a.ravel()).reshape(a.shape)
# array([[ 6,  8, 11,  3],
#        [ 7, 10,  4,  1],
#        [ 9,  2,  0,  5]])

b.argmax()  # return indices of the maximum values along the given axis
# np.int64(2)

b.argmax(axis=0)
# array([2, 1, 0, 2])

b.argmax(axis=1)
# array([2, 1, 0])

b.argmin()  # return indices of the minimum values along the given axis
# np.int64(10)

b.argmin(axis=0)
# array([0, 2, 2, 1])

b.argmin(axis=1)
# array([3, 3, 2])

1.3. indexing, slicing and iterating

# indexing, slicing and iterating

# one-dimensional arrays can be indexed, sliced and iterated over, much
# like lists and other Python sequences.
a = np.arange(10)
# array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

a[2]
# np.int64(2)

a[2:5]
# array([2, 3, 4])

# from start to position 6, exclusive, take every 2nd element
a[:6:2]
# array([0, 2, 4])

a[::-1]  # reversed
# array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

# multidimensional arrays can have one index per axis, given in a tuple separated by commas (,)
b = np.fromfunction(lambda x, y: 10 * x + y, (5, 4), dtype=int)
# array([[ 0,  1,  2,  3],
#        [10, 11, 12, 13],
#        [20, 21, 22, 23],
#        [30, 31, 32, 33],
#        [40, 41, 42, 43]])

# a comma-separated list of indices to select individual elements
b[2, 3]  # b[2][3]
# np.int64(23)

# columns from index 1 onward (`1:`) of the first two rows (`:2`)
b[:2, 1:]
# array([[ 1,  2,  3],
#        [11, 12, 13]])

# each row in the second column of `b`
b[0:5, 1]
# array([ 1, 11, 21, 31, 41])

# a colon (`:`) by itself to take the entire axis
b[:, 1]
# array([ 1, 11, 21, 31, 41])
b[:, :1]
# array([[ 0],
#        [10],
#        [20],
#        [30],
#        [40]])

# each column in the second and third row of `b`
b[1:3, :]
# array([[10, 11, 12, 13],
#       [20, 21, 22, 23]])

# missing indices are considered complete slices `:`
b[-1]  # b[-1, :]
# array([40, 41, 42, 43])

# the dots (`...`) represent as many colons as needed to produce a complete indexing tuple
b[-1, ...]  # b[-1, :]
# array([40, 41, 42, 43])

b = np.arange(6).reshape(2, 3)
# array([[0, 1, 2],
#        [3, 4, 5]])

# fancy indexing: indexing with arrays of indices
a = np.zeros((8, 4))
for i in range(8):
    a[i] = i
# array([[0., 0., 0., 0.],
#        [1., 1., 1., 1.],
#        [2., 2., 2., 2.],
#        [3., 3., 3., 3.],
#        [4., 4., 4., 4.],
#        [5., 5., 5., 5.],
#        [6., 6., 6., 6.],
#        [7., 7., 7., 7.]])

i = [4, 3, 0, 6]
a[i]
# array([[4., 4., 4., 4.],
#        [3., 3., 3., 3.],
#        [0., 0., 0., 0.],
#        [6., 6., 6., 6.]])

a = np.arange(32).reshape((8, 4))
# array([[ 0,  1,  2,  3],
#        [ 4,  5,  6,  7],
#        [ 8,  9, 10, 11],
#        [12, 13, 14, 15],
#        [16, 17, 18, 19],
#        [20, 21, 22, 23],
#        [24, 25, 26, 27],
#        [28, 29, 30, 31]])
i = [1, 5, 7, 2], [0, 3, 1, 2]
a[i]
# array([ 4, 23, 29, 10])

palette = np.array(
    [
        [0, 0, 0],       # black
        [255, 0, 0],     # red
        [0, 255, 0],     # green
        [0, 0, 255],     # blue
        [255, 255, 255], # white
    ]
)

image = np.array(
    [
        [0, 1, 2, 0],  # each value corresponds to a color in the palette
        [0, 3, 4, 0],
    ]
)

palette[image]  # the (2, 4, 3) color image

# array([[[  0,   0,   0],
#         [255,   0,   0],
#         [  0, 255,   0],
#         [  0,   0,   0]],
#
#        [[  0,   0,   0],
#         [  0,   0, 255],
#         [255, 255, 255],
#         [  0,   0,   0]]])

# boolean indexing
names = np.array(["Bob", "Joe", "Will", "Bob", "Will", "Joe", "Joe"])
data = np.array([[4, 7], [0, 2], [-5, 6], [0, 0], [1, 2], [-12, -4], [3, 4]])

names == 'Bob'
# array([ True, False, False,  True, False, False, False])

data[names == "Bob"]
# array([[4, 7],
#        [0, 0]])

data[~(names == "Bob")]  # data[names != "Bob"]
# array([[  0,   2],
#        [ -5,   6],
#        [  1,   2],
#        [-12,  -4],
#        [  3,   4]])

# The Python keywords `and` and `or` do not work with Boolean arrays.
# Use `&` (and) and `|` (or) instead.
mask = (names == "Bob") | (names == "Will")
# array([ True, False,  True,  True,  True, False, False])

data[mask]
# array([[ 4,  7],
#        [-5,  6],
#        [ 0,  0],
#        [ 1,  2]])
# iterating
for r in b:
    print(r)
# [0 1 2]
# [3 4 5]

# iterating on each element
for e in b.flat:
    print(f'{e}', end='\t')
# 0	1	2	3	4	5

1.4. copies and views

# copies and views: array slices are views on the original array
a = np.array([[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]])
# array([[ 0,  1,  2,  3],
#        [ 4,  5,  6,  7],
#        [ 8,  9, 10, 11]])
b = a
b is a  # True

def f(x: np.ndarray):  # Python passes objects by reference; no copy is made
    return id(x)

id(a), f(a)  # (4397168176, 4397168176)

c = a.view()  # a new view of the array
c is a, c.base is a, c.flags.owndata  # (False, True, False)
c = c.reshape((2, 6))
c.shape, a.shape  # ((2, 6), (3, 4))
c[0, 4] = 1234  # a's data changes
a[1, 0]  # np.int64(1234)

s = a[:, 1:3]
# array([[ 1,  2],
#        [ 5,  6],
#        [ 9, 10]])
s[:] = 10  # a view, not a copy
a
# array([[   0,   10,   10,    3],
#        [1234,   10,   10,    7],
#        [   8,   10,   10,   11]])

d = a.copy()  # a copy of the array
d is a, d.base is a  # (False, False)
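
To verify which objects share the same underlying buffer, np.shares_memory can help; a quick sketch with the arrays above:

np.shares_memory(a, c)  # the view shares a's buffer
# True
np.shares_memory(a, s)  # so does the slice
# True
np.shares_memory(a, d)  # the copy does not
# False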

1.5. probability distributions

# np.random: arrays of sample values from many kinds of probability distributions
samples = np.random.standard_normal(size=(4, 4))
# array([[ 0.56695748,  0.00955644,  0.08934151,  1.69336179],
#        [-0.41463561,  0.81013138,  0.41954566,  0.11812052],
#        [ 0.23305357,  0.26527434,  0.3752615 ,  1.82930088],
#        [ 0.36156569,  1.66151819,  0.78871978, -0.56163182]])

# from random import normalvariate
# N = 1_000_000
#
# %timeit samples = [normalvariate(0, 1) for _ in range(N)]
# 320 ms ± 885 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)
#
# %timeit np.random.standard_normal(N)
# 15.8 ms ± 8.28 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

# random numbers are pseudo-random with a configurable RNG initialized by the `seed` value, ensuring reproducibility.
rng = np.random.default_rng(seed=12345)
type(rng)
# numpy.random._generator.Generator

r1 = np.random.default_rng(seed=12345)
r2 = np.random.default_rng(seed=12345)

r1.standard_normal((2, 3))
# array([[-1.42382504,  1.26372846, -0.87066174],
#        [-0.25917323, -0.07534331, -0.74088465]])

r2.standard_normal((2, 3))  # same seed, same sequence
# array([[-1.42382504,  1.26372846, -0.87066174],
#        [-0.25917323, -0.07534331, -0.74088465]])
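
A Generator exposes many sampling methods beyond standard_normal; a brief sketch (outputs omitted, since they depend on the generator state):

rng = np.random.default_rng(seed=12345)
rng.integers(0, 10, size=5)          # uniform random integers in [0, 10)
rng.uniform(size=3)                  # uniform floats in [0.0, 1.0)
rng.choice(["a", "b", "c"], size=4)  # sample with replacement (the default)
rng.permutation(5)                   # a shuffled copy of np.arange(5)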

1.6. array-oriented programming

# array-oriented programming with arrays
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])

# traditional list comprehension
result = [(x if c else y) for x, y, c in zip(xarr, yarr, cond)]
np.array(result)
# array([1.1, 2.2, 1.3, 1.4, 2.5])

# numpy.where
result = np.where(cond, xarr, yarr)
# array([1.1, 2.2, 1.3, 1.4, 2.5])

a = np.random.standard_normal((4, 4))
# array([[-0.8529, -1.6429,  1.4967, -0.6435],
#        [ 0.5086,  0.5242,  1.5083,  0.0487],
#        [-1.2721, -0.8147, -0.3598, -1.2042],
#        [ 1.1531,  1.5783, -0.9556, -0.6506]])

np.where(a > 0, 2, -2)
# array([[-2, -2,  2, -2],
#        [ 2,  2,  2,  2],
#        [-2, -2, -2, -2],
#        [ 2,  2, -2, -2]])

# set only positive values to 2
np.where(a > 0, 2, a)
# array([[-0.8529, -1.6429,  2.    , -0.6435],
#        [ 2.    ,  2.    ,  2.    ,  2.    ],
#        [-1.2721, -0.8147, -0.3598, -1.2042],
#        [ 2.    ,  2.    , -0.9556, -0.6506]])

1.7. sorting

# sorting
a = np.random.permutation(np.arange(12)).reshape(3, 4)
# array([[ 2, 10, 11,  3],
#        [ 0,  6,  9,  5],
#        [ 7,  8,  1,  4]])

a.sort()
# array([[ 2,  3, 10, 11],
#        [ 0,  5,  6,  9],
#        [ 1,  4,  7,  8]])

a.sort(axis=0)  # sort across each column
# array([[ 0,  3,  6,  8],
#        [ 1,  4,  7,  9],
#        [ 2,  5, 10, 11]])

a = np.random.permutation(np.arange(12)).reshape(3, 4)
# array([[ 5,  3, 11,  8],
#        [ 7,  1,  0,  4],
#        [10,  9,  2,  6]])

a.sort(axis=1)  # sort across each row
# array([[ 3,  5,  8, 11],
#        [ 0,  1,  4,  7],
#        [ 2,  6,  9, 10]])

np.sort(a, axis=0)  # a sorted copy of an array
# array([[ 0,  1,  4,  7],
#        [ 2,  5,  8, 10],
#        [ 3,  6,  9, 11]])
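
np.argsort performs an indirect sort, returning the indices that would sort the array; a small sketch:

v = np.array([5, 0, 1, 3, 2])
i = v.argsort()
# array([1, 2, 4, 3, 0])
v[i]
# array([0, 1, 2, 3, 5])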

1.8. saving and loading

# save and load data: file input and output with arrays
np.save("a", np.arange(3))

np.load("a.npy")
# array([0, 1, 2])

a = np.arange(3)
b = np.arange(4, 7)
np.savez("x", a=a, b=b)  # np.savez_compressed("x", a=a, b=b)

x = np.load("x.npz")
x["a"]
# array([0, 1, 2])
x["b"]
# array([4, 5, 6])

2. Pandas

Pandas (derived from panel data, /ˈpændəz/) is an open-source, BSD-licensed Python library built on NumPy. It offers high-performance, easy-to-use data structures for tabular or heterogeneous data, such as Series (1D labeled arrays holding any data type) and DataFrame (2D labeled tables), for powerful data analysis.

import numpy as np
import pandas as pd
# pandas extension (nullable) data types
BooleanDtype     # Nullable Boolean data, use "boolean" when passing as string
CategoricalDtype # Categorical data type, use "category" when passing as string
DatetimeTZDtype  # Datetime with time zone
Float32Dtype     # 32-bit nullable floating point, use "Float32" when passing as string
Float64Dtype     # 64-bit nullable floating point, use "Float64" when passing as string
Int8Dtype   # 8-bit nullable signed integer, use "Int8" when passing as string
Int16Dtype  # 16-bit nullable signed integer, use "Int16" when passing as string
Int32Dtype  # 32-bit nullable signed integer, use "Int32" when passing as string
Int64Dtype  # 64-bit nullable signed integer, use "Int64" when passing as string
UInt8Dtype  # 8-bit nullable unsigned integer, use "UInt8" when passing as string
UInt16Dtype # 16-bit nullable unsigned integer, use "UInt16" when passing as string
UInt32Dtype # 32-bit nullable unsigned integer, use "UInt32" when passing as string
UInt64Dtype # 64-bit nullable unsigned integer, use "UInt64" when passing as string
s = pd.Series([1, 2, 3, None], dtype=pd.Float64Dtype())  # pd.Series([1, 2, 3, None], dtype="Float64")
# 0     1.0
# 1     2.0
# 2     3.0
# 3    <NA>
# dtype: Float64
s[3] is pd.NA
# True

s = pd.Series(['one', 'two', None, 'three'], dtype=pd.StringDtype())  # pd.Series(['one', 'two', None, 'three'], dtype="string")
# 0      one
# 1      two
# 2     <NA>
# 3    three
# dtype: string
s[2] is pd.NA
# True

2.1. Series, DataFrame, and Index

  • A Series is a one-dimensional array-like object containing a sequence of values and an associated array of labels, called its index; it also behaves like a fixed-length, ordered dictionary, since it maps index values to data values.

    s = pd.Series([4, 7, -5, 3])  # default integer index
    # 0    4
    # 1    7
    # 2   -5
    # 3    3
    # dtype: int64
    
    s.array
    # <NumpyExtensionArray>
    # [np.int64(4), np.int64(7), np.int64(-5), np.int64(3)]
    # Length: 4, dtype: int64
    
    s.index
    # RangeIndex(start=0, stop=4, step=1)
    
    s.dtype
    # dtype('int64')
    
    s.to_numpy()  # NumPy array
    # array([ 4,  7, -5,  3])
    
    s.to_dict()  # dictionary
    # {0: 4, 1: 7, 2: -5, 3: 3}
    
    s = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])  # label index
    # d    4
    # b    7
    # a   -5
    # c    3
    # dtype: int64
    
    # select a single value or a set of values
    s["a"]
    # np.int64(-5)
    s[["b", "c", "d"]]
    # b    7
    # c    3
    # d    4
    # dtype: int64
    
    # purely integer-location based indexing for selection by position
    s.iloc[0]
    # np.int64(4)
    s.iloc[:2]
    # d    4
    # b    7
    # dtype: int64
    s.iloc[[1, 2, 0]]
    # b    7
    # a   -5
    # d    4
    # dtype: int64
    
    # Series is ndarray-like
    s > 0
    # d     True
    # b     True
    # a    False
    # c     True
    # dtype: bool
    s[s > 0]
    # d    4
    # b    7
    # c    3
    # dtype: int64
    s[s > s.median()]
    # d    4
    # b    7
    # dtype: int64
    np.exp(s)  # import numpy as np
    # d      54.598150
    # b    1096.633158
    # a       0.006738
    # c      20.085537
    # dtype: float64
    
    # Series is dictionary-like
    s["d"]
    # np.int64(4)
    s[["a", "b", "c"]]
    # a   -5
    # b    7
    # c    3
    # dtype: int64
    "b" in s
    # True
    "e" in s
    # False
    s.get("e", np.nan)
    # nan
    data = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
    states = ["California", "Ohio", "Oregon", "Texas"]
    pd.Series(data, index=states)  # override indices
    # California        NaN
    # Ohio          35000.0
    # Oregon        16000.0
    # Texas         71000.0
    # dtype: float64
    
    # name attribute
    s.name = "fooooobar"
    s.index.name = "buzzzzzz"
    s
    # buzzzzzz
    # d    4
    # b    7
    # a   -5
    # c    3
    # Name: fooooobar, dtype: int64
    
    # alter the index in place while preserving the original index name
    s.index = pd.Index(["Bob", "Steve", "Jeff", "Ryan"], name=s.index.name)  # Retains the current index name
    # Output:
    # buzzzzzz
    # Bob      4
    # Steve    7
    # Jeff    -5
    # Ryan     3
    # Name: fooooobar, dtype: int64
    
    # alter the index in place without preserving the index name
    s.index = ["Bob", "Steve", "Jeff", "Ryan"]  # Removes the index name
    # Output:
    # Bob      4
    # Steve    7
    # Jeff    -5
    # Ryan     3
    # Name: fooooobar, dtype: int64
  • A DataFrame is a two-dimensional data structure that holds data like a two-dimensional array, a table with rows and columns, a spreadsheet, or a dict of Series objects.

    data = {
        "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
    }
    frame = pd.DataFrame(data)  # from a dictionary of equal-length lists or NumPy arrays
    #     state  year  pop
    # 0    Ohio  2000  1.5
    # 1    Ohio  2001  1.7
    # 2    Ohio  2002  3.6
    # 3  Nevada  2001  2.4
    # 4  Nevada  2002  2.9
    # 5  Nevada  2003  3.2
    
    frame.dtypes
    # state     object
    # year       int64
    # pop      float64
    # dtype: object
    
    frame.head()  # select the first five (by default) rows
    #     state  year  pop
    # 0    Ohio  2000  1.5
    # 1    Ohio  2001  1.7
    # 2    Ohio  2002  3.6
    # 3  Nevada  2001  2.4
    # 4  Nevada  2002  2.9
    frame.tail()  # select the last five (by default) rows
    #     state  year  pop
    # 1    Ohio  2001  1.7
    # 2    Ohio  2002  3.6
    # 3  Nevada  2001  2.4
    # 4  Nevada  2002  2.9
    # 5  Nevada  2003  3.2
    
    # specify a sequence of columns and order
    pd.DataFrame(data, columns=["year", "state", "pop"])
    #    year   state  pop
    # 0  2000    Ohio  1.5
    # 1  2001    Ohio  1.7
    # 2  2002    Ohio  3.6
    # 3  2001  Nevada  2.4
    # 4  2002  Nevada  2.9
    # 5  2003  Nevada  3.2
    pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
    #    year   state  pop debt
    # 0  2000    Ohio  1.5  NaN
    # 1  2001    Ohio  1.7  NaN
    # 2  2002    Ohio  3.6  NaN
    # 3  2001  Nevada  2.4  NaN
    # 4  2002  Nevada  2.9  NaN
    # 5  2003  Nevada  3.2  NaN
    
    # retrieve columns as Series
    frame["state"]  # dictionary-like notation
    # 0      Ohio
    # 1      Ohio
    # 2      Ohio
    # 3    Nevada
    # 4    Nevada
    # 5    Nevada
    # Name: state, dtype: object
    frame.year  # dot attribute notation
    # 0    2000
    # 1    2001
    # 2    2002
    # 3    2001
    # 4    2002
    # 5    2003
    # Name: year, dtype: int64
    
    # retrieve rows by position or name with the special `iloc` and `loc` attributes
    frame.loc[1]
    # state    Ohio
    # year     2001
    # pop       1.7
    # Name: 1, dtype: object
    frame.iloc[2]
    # state    Ohio
    # year     2002
    # pop       3.6
    # Name: 2, dtype: object
    
    # columns can be modified by assignment
    frame2 = pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
    frame2["debt"] = 16.5
    #   year   state  pop  debt
    # 0  2000    Ohio  1.5  16.5
    # 1  2001    Ohio  1.7  16.5
    # 2  2002    Ohio  3.6  16.5
    # 3  2001  Nevada  2.4  16.5
    # 4  2002  Nevada  2.9  16.5
    # 5  2003  Nevada  3.2  16.5
    frame2["debt"] = np.arange(6.)
    #    year   state  pop  debt
    # 0  2000    Ohio  1.5   0.0
    # 1  2001    Ohio  1.7   1.0
    # 2  2002    Ohio  3.6   2.0
    # 3  2001  Nevada  2.4   3.0
    # 4  2002  Nevada  2.9   4.0
    # 5  2003  Nevada  3.2   5.0
    
    # assigning a column that doesn’t exist will create a new column.
    frame2["eastern"] = frame2["state"] == "Ohio"
    #    year   state  pop  debt  eastern
    # 0  2000    Ohio  1.5   0.0     True
    # 1  2001    Ohio  1.7   1.0     True
    # 2  2002    Ohio  3.6   2.0     True
    # 3  2001  Nevada  2.4   3.0    False
    # 4  2002  Nevada  2.9   4.0    False
    # 5  2003  Nevada  3.2   5.0    False
    
    # remove a column
    del frame2["eastern"]
    #    year   state  pop  debt
    # 0  2000    Ohio  1.5   0.0
    # 1  2001    Ohio  1.7   1.0
    # 2  2002    Ohio  3.6   2.0
    # 3  2001  Nevada  2.4   3.0
    # 4  2002  Nevada  2.9   4.0
    # 5  2003  Nevada  3.2   5.0
    
    # return and drop an item from frame
    frame2.pop("debt")
    # 0    0.0
    # 1    1.0
    # 2    2.0
    # 3    3.0
    # 4    4.0
    # 5    5.0
    # Name: debt, dtype: float64
    frame2
    #    year   state  pop
    # 0  2000    Ohio  1.5
    # 1  2001    Ohio  1.7
    # 2  2002    Ohio  3.6
    # 3  2001  Nevada  2.4
    # 4  2002  Nevada  2.9
    # 5  2003  Nevada  3.2
    
    # from a nested dictionary of dictionaries
    #   the outer dictionary keys as the columns
    #   the inner keys as the row indices
    populations = {
        "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
        "Nevada": {2001: 2.4, 2002: 2.9},
    }
    frame3 = pd.DataFrame(populations)
    #       Ohio  Nevada
    # 2000   1.5     NaN
    # 2001   1.7     2.4
    # 2002   3.6     2.9
    
    frame3.T  # transpose the DataFrame (swap rows and columns)
    #         2000  2001  2002
    # Ohio     1.5   1.7   3.6
    # Nevada   NaN   2.4   2.9
    
    frame2.to_numpy()  # a two-dimensional ndarray
    # array([[2000, 'Ohio', 1.5, 0.0],
    #        [2001, 'Ohio', 1.7, 1.0],
    #        [2002, 'Ohio', 3.6, 2.0],
    #        [2001, 'Nevada', 2.4, 3.0],
    #        [2002, 'Nevada', 2.9, 4.0],
    #        [2003, 'Nevada', 3.2, 5.0]], dtype=object)
  • A pandas Index object is responsible for holding the axis labels (including a DataFrame’s column names) and other metadata (like the axis name or names); Index objects are immutable and support set-like operations, as sketched at the end of this list.

    frame3
    #       Ohio  Nevada
    # 2000   1.5     NaN
    # 2001   1.7     2.4
    # 2002   3.6     2.9
    
    frame3.index
    # Index([2000, 2001, 2002], dtype='int64')
    
    frame3.columns
    # Index(['Ohio', 'Nevada'], dtype='object')
    
    "Ohio" in frame3.columns
    # True
    
    2003 in frame3.index
    # False
    
    # duplicate labels select all occurrences of that label
    s = frame3["Ohio"]
    # 2000    1.5
    # 2001    1.7
    # 2002    3.6
    # Name: Ohio, dtype: float64
    i = s.index.append(pd.Index([2000, 2002]))  # Index([2000, 2001, 2002, 2000, 2002], dtype='int64')
    s[i]
    # 2000    1.5
    # 2001    1.7
    # 2002    3.6
    # 2000    1.5
    # 2002    3.6
    # Name: Ohio, dtype: float64
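
    # Index objects are immutable and support set-like operations (a brief sketch)
    i = pd.Index(["a", "b", "c"])
    # i[1] = "d"  # TypeError: Index does not support mutable operations
    i.union(pd.Index(["b", "c", "d"]))
    # Index(['a', 'b', 'c', 'd'], dtype='object')
    i.intersection(pd.Index(["b", "c", "d"]))
    # Index(['b', 'c'], dtype='object')
    i.difference(pd.Index(["b"]))
    # Index(['a', 'c'], dtype='object')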

2.2. Essential Functionality

2.2.1. Reindexing

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
# d    4.5
# b    7.2
# a   -5.3
# c    3.6
# dtype: float64
obj.reindex(["a", "b", "c", "d", "e"])
# a   -5.3
# b    7.2
# c    3.6
# d    4.5
# e    NaN
# dtype: float64

frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=["a", "c", "d"],
    columns=["Ohio", "Texas", "California"],
)
#    Ohio  Texas  California
# a     0      1           2
# c     3      4           5
# d     6      7           8

frame.reindex(index=["a", "b", "c", "d"])
#    Ohio  Texas  California
# a   0.0    1.0         2.0
# b   NaN    NaN         NaN
# c   3.0    4.0         5.0
# d   6.0    7.0         8.0

# reindex with the `axis` keyword
frame.reindex(["Texas", "Utah", "California"], axis="columns")
#    Texas  Utah  California
# a      1   NaN           2
# c      4   NaN           5
# d      7   NaN           8
frame.reindex(["Texas", "Utah", "California"], axis=1)
#    Texas  Utah  California
# a      1   NaN           2
# c      4   NaN           5
# d      7   NaN           8

# reindex with the `columns` keyword
frame.reindex(columns=["Texas", "Utah", "California"])
#    Texas  Utah  California
# a      1   NaN           2
# c      4   NaN           5
# d      7   NaN           8

# reindex with the `loc` operator
frame.loc[["a", "d", "c"], ["California", "Texas"]]
#    California  Texas
# a           2      1
# d           8      7
# c           5      4
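
reindex can also fill values while reindexing; with a monotonic index, method="ffill" propagates the last valid observation forward (a small sketch):

obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
# 0      blue
# 2    purple
# 4    yellow
# dtype: object
obj3.reindex(np.arange(6), method="ffill")
# 0      blue
# 1      blue
# 2    purple
# 3    purple
# 4    yellow
# 5    yellow
# dtype: object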

2.2.2. Dropping Entries from an Axis

obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
# a    0.0
# b    1.0
# c    2.0
# d    3.0
# e    4.0
# dtype: float64

obj.drop("c")
# a    0.0
# b    1.0
# d    3.0
# e    4.0
# dtype: float64

obj.drop(["c", "d"])
# a    0.0
# b    1.0
# e    4.0
# dtype: float64

data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
#           one  two  three  four
# Ohio        0    1      2     3
# Colorado    4    5      6     7
# Utah        8    9     10    11
# New York   12   13     14    15
data.drop(index=["Colorado", "Ohio"])
#           one  two  three  four
# Utah        8    9     10    11
# New York   12   13     14    15
data.drop(columns=["two"])
#           one  three  four
# Ohio        0      2     3
# Colorado    4      6     7
# Utah        8     10    11
# New York   12     14    15
data.drop("two", axis=1)          # drop values from the columns by passing axis=1
#           one  three  four
# Ohio        0      2     3
# Colorado    4      6     7
# Utah        8     10    11
# New York   12     14    15
data.drop("two", axis="columns")  # drop values from the columns by passing axis="columns"
#           one  three  four
# Ohio        0      2     3
# Colorado    4      6     7
# Utah        8     10    11
# New York   12     14    15

2.2.3. Indexing, Selection, and Filtering

s = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
# a    0.0
# b    1.0
# c    2.0
# d    3.0
# dtype: float64

s["b"]
# np.float64(1.0)

s[1]  # deprecated: ambiguous whether 1 is treated as a label or a position
# np.float64(1.0)

s.iloc[1]  # by position
# np.float64(1.0)
s.iloc[[1, 3]]
# b    1.0
# d    3.0
# dtype: float64

s[2:4]  # slicing with integers is always position based
# c    2.0
# d    3.0
# dtype: float64

s[["b", "a", "d"]]
# b    1.0
# a    0.0
# d    3.0
# dtype: float64
s.loc[["b", "a", "d"]]  # the preferred way to select index values by label
# b    1.0
# a    0.0
# d    3.0
# dtype: float64

s[s < 2]
# a    0.0
# b    1.0
# dtype: float64

s.loc["b":"c"] = 5
# a    0.0
# b    5.0
# c    5.0
# d    3.0
# dtype: float64

# retrieve one or more columns from a DataFrame either with a single value or sequence
# df[column]          select single column or sequence of columns from the DataFrame
# df.loc[rows]        select single row or subset of rows from the DataFrame by label
# df.loc[:, cols]     select single column or subset of columns by label
# df.loc[rows, cols]  select both row(s) and column(s) by label
# df.iloc[rows]       select single row or subset of rows from the DataFrame by integer position
# df.iloc[:, cols]    select single column or subset of columns by integer position
# df.iloc[rows, cols] select both row(s) and column(s) by integer position
# df.at[row, col]     select a single scalar value by row and column label
# df.iat[row, col]    select a single scalar value by row and column position (integers)
# df.reindex(...)     select either rows or columns by labels

data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
#           one  two  three  four
# Ohio        0    1      2     3
# Colorado    4    5      6     7
# Utah        8    9     10    11
# New York   12   13     14    15

data["two"]
# Ohio         1
# Colorado     5
# Utah         9
# New York    13
# Name: two, dtype: int64

data[["three", "one"]]
#           three  one
# Ohio          2    0
# Colorado      6    4
# Utah         10    8
# New York     14   12

data[:2]  # slice (slice rows)
#           one  two  three  four
# Ohio        0    1      2     3
# Colorado    4    5      6     7

data[data["three"] > 5]  # boolean array (filter rows)
#           one  two  three  four
# Colorado    4    5      6     7
# Utah        8    9     10    11
# New York   12   13     14    15

data < 5
#             one    two  three   four
# Ohio       True   True   True   True
# Colorado   True  False  False  False
# Utah      False  False  False  False
# New York  False  False  False  False

data[data < 5] = 0  # boolean DataFrame (set values based on some criterion)
#           one  two  three  four
# Ohio        0    0      0     0
# Colorado    0    5      6     7
# Utah        8    9     10    11
# New York   12   13     14    15

# select a subset of the rows and columns using either axis labels (loc) or integers (iloc)
data.loc["Colorado"]  # select a single row by label
# one      0
# two      5
# three    6
# four     7
# Name: Colorado, dtype: int64

data.loc[["Colorado", "New York"]]  # select multiple rows with a sequence of labels
#           one  two  three  four
# Colorado    0    5      6     7
# New York   12   13     14    15

data.loc["Colorado", ["two", "three"]]  # data.loc["Colorado"][["two", "three"]]
# two      5
# three    6
# Name: Colorado, dtype: int64

data.iloc[2]  # select a single row by integer
# one       8
# two       9
# three    10
# four     11
# Name: Utah, dtype: int64

data.iloc[[2, 1]]
#           one  two  three  four
# Utah        8    9     10    11
# Colorado    0    5      6     7

data.iloc[2, [3, 0, 1]]
# four    11
# one      8
# two      9
# Name: Utah, dtype: int64

data.iloc[[1, 2], [3, 0, 1]]
#           four  one  two
# Colorado     7    0    5
# Utah        11    8    9

data.iloc[:, :3][data.three > 5]
#           one  two  three
# Colorado    0    5      6
# Utah        8    9     10
# New York   12   13     14

# boolean arrays can be used with `loc` but not `iloc`
data.loc[data.three >= 2]  # data[data.three >= 2]
#           one  two  three  four
# Colorado    0    5      6     7
# Utah        8    9     10    11
# New York   12   13     14    15

2.2.4. Arithmetic and Data Alignment

s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
# a    7.3
# c   -2.5
# d    3.4
# e    1.5
# dtype: float64
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
# a   -2.1
# c    3.6
# e   -1.5
# f    4.0
# g    3.1
# dtype: float64
s1 + s2  # label-based data alignment yields NaN for non-overlapping entries in arithmetic
# a    5.2
# c    1.1
# d    NaN
# e    0.0
# f    NaN
# g    NaN
# dtype: float64

df1 = pd.DataFrame(
    np.arange(9.0).reshape((3, 3)),
    columns=list("bcd"),
    index=["Ohio", "Texas", "Colorado"],
)
#             b    c    d
# Ohio      0.0  1.0  2.0
# Texas     3.0  4.0  5.0
# Colorado  6.0  7.0  8.0
df2 = pd.DataFrame(
    np.arange(12.0).reshape((4, 3)),
    columns=list("bde"),
    index=["Utah", "Ohio", "Texas", "Oregon"],
)
#           b     d     e
# Utah    0.0   1.0   2.0
# Ohio    3.0   4.0   5.0
# Texas   6.0   7.0   8.0
# Oregon  9.0  10.0  11.0
df1 + df2  # alignment is performed on both rows and columns on DataFrame
#             b   c     d   e
# Colorado  NaN NaN   NaN NaN
# Ohio      3.0 NaN   6.0 NaN
# Oregon    NaN NaN   NaN NaN
# Texas     9.0 NaN  12.0 NaN
# Utah      NaN NaN   NaN NaN

# arithmetic methods with fill values: substitute the passed value for any missing values
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list("abcd"))
#      a    b     c     d
# 0  0.0  1.0   2.0   3.0
# 1  4.0  5.0   6.0   7.0
# 2  8.0  9.0  10.0  11.0
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list("abcde"))
#       a     b     c     d     e
# 0   0.0   1.0   2.0   3.0   4.0
# 1   5.0   6.0   7.0   8.0   9.0
# 2  10.0  11.0  12.0  13.0  14.0
# 3  15.0  16.0  17.0  18.0  19.0

df2.loc[1, "b"] = np.nan
#       a     b     c     d     e
# 0   0.0   1.0   2.0   3.0   4.0
# 1   5.0   NaN   7.0   8.0   9.0
# 2  10.0  11.0  12.0  13.0  14.0
# 3  15.0  16.0  17.0  18.0  19.0

df1 + df2
#       a     b     c     d   e
# 0   0.0   2.0   4.0   6.0 NaN
# 1   9.0   NaN  13.0  15.0 NaN
# 2  18.0  20.0  22.0  24.0 NaN
# 3   NaN   NaN   NaN   NaN NaN

df1.add(df2, fill_value=0.)
#       a     b     c     d     e
# 0   0.0   2.0   4.0   6.0   4.0
# 1   9.0   5.0  13.0  15.0   9.0
# 2  18.0  20.0  22.0  24.0  14.0
# 3  15.0  16.0  17.0  18.0  19.0

# broadcasting: operations between DataFrame and Series
arr = np.arange(12).reshape((3, 4))
# array([[ 0,  1,  2,  3],
#        [ 4,  5,  6,  7],
#        [ 8,  9, 10, 11]])

arr[0]
# array([0, 1, 2, 3])

arr - arr[0]  # the subtraction is performed once for each row
# array([[0, 0, 0, 0],
#        [4, 4, 4, 4],
#        [8, 8, 8, 8]])
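
The same mechanics apply in pandas: by default, arithmetic between a DataFrame and a Series matches the Series index on the DataFrame’s columns and broadcasts down the rows; matching on the rows instead requires an arithmetic method with axis="index". A sketch:

frame = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns=list("bde"),
    index=["Utah", "Ohio", "Texas", "Oregon"],
)
series = frame.iloc[0]  # the first row
frame - series  # broadcast down the rows
#           b    d    e
# Utah    0.0  0.0  0.0
# Ohio    3.0  3.0  3.0
# Texas   6.0  6.0  6.0
# Oregon  9.0  9.0  9.0
frame.sub(frame["d"], axis="index")  # match on the rows, broadcast across the columns
#           b    d    e
# Utah   -1.0  0.0  1.0
# Ohio   -1.0  0.0  1.0
# Texas  -1.0  0.0  1.0
# Oregon -1.0  0.0  1.0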

2.2.5. Function Application and Mapping

data = pd.DataFrame(
    np.random.standard_normal((4, 3)),
    columns=list("bde"),
    index=["Utah", "Ohio", "Texas", "Oregon"],
)
#                b         d         e
# Utah   -0.079026  1.151062 -0.393464
# Ohio   -0.731363  0.601451 -0.462457
# Texas  -1.400861  0.892020 -0.010010
# Oregon  0.899501 -0.418370 -0.791160

np.abs(data)  # NumPy ufuncs (element-wise array methods)
#                b         d         e
# Utah    0.079026  1.151062  0.393464
# Ohio    0.731363  0.601451  0.462457
# Texas   1.400861  0.892020  0.010010
# Oregon  0.899501  0.418370  0.791160

# apply a function along an axis of the DataFrame.
data.apply(lambda x: x.max() - x.min())
# b    2.300362
# d    1.569433
# e    0.781149
# dtype: float64
data.apply(lambda x: x.max() - x.min(), axis="columns")
# Utah      1.544527
# Ohio      1.332813
# Texas     2.292882
# Oregon    1.690661
# dtype: float64
data.apply(lambda x: pd.Series([x.max(), x.min()], index=["max", "min"]))
#             b         d        e
# max  0.899501  1.151062 -0.01001
# min -1.400861 -0.418370 -0.79116

# apply a function to a DataFrame elementwise
data.map(lambda x: f"{x:.2f}")
#             b      d      e
# Utah    -0.08   1.15  -0.39
# Ohio    -0.73   0.60  -0.46
# Texas   -1.40   0.89  -0.01
# Oregon   0.90  -0.42  -0.79

2.2.6. Sorting and Ranking

s = pd.Series([4, 7, -3, 2], index=["d", "a", "b", "c"])
# d    4
# a    7
# b   -3
# c    2
# dtype: int64

# sort lexicographically by row or column label
s.sort_index()
# a    7
# b   -3
# c    2
# d    4
# dtype: int64
s.sort_index(ascending=False)  # descending
# d    4
# c    2
# b   -3
# a    7
# dtype: int64
i = s.index.sort_values()
# Index(['a', 'b', 'c', 'd'], dtype='object')
s.reindex(i)
# a    7
# b   -3
# c    2
# d    4
# dtype: int64

s.sort_values()  # sort by value
# b   -3
# c    2
# d    4
# a    7
# dtype: int64

# any missing values are sorted to the end of the Series by default
pd.Series([4, np.nan, 7, np.nan, -3, 2]).sort_values()
# 4   -3.0
# 5    2.0
# 0    4.0
# 2    7.0
# 1    NaN
# 3    NaN
# dtype: float64
pd.Series([4, np.nan, 7, np.nan, -3, 2]).sort_values(ascending=False)
# 2    7.0
# 0    4.0
# 5    2.0
# 4   -3.0
# 1    NaN
# 3    NaN
# dtype: float64
pd.Series([4, np.nan, 7, np.nan, -3, 2]).sort_values(na_position="first")
# 1    NaN
# 3    NaN
# 4   -3.0
# 5    2.0
# 0    4.0
# 2    7.0
# dtype: float64

data = pd.DataFrame(
    [[4, 7, -3, 2], [0, 1, 0, 1]], index=["three", "one"], columns=["d", "a", "b", "c"]
)
#        d  a  b  c
# three  4  7 -3  2
# one    0  1  0  1
data.sort_index()
#        d  a  b  c
# one    0  1  0  1
# three  4  7 -3  2
data.sort_index(axis="columns")
#        a  b  c  d
# three  7 -3  2  4
# one    1  0  1  0

# compute numerical data ranks (1 through n) along an axis
s = pd.Series([7, -5, 7, 4, 2, 0, 4])

s.rank()  # by default, average rank of the group
# 0    6.5
# 1    1.0
# 2    6.5
# 3    4.5
# 4    3.0
# 5    2.0
# 6    4.5
# dtype: float64
s.rank(method="first")  # ranks assigned in order they appear in the array
# 0    6.0
# 1    1.0
# 2    7.0
# 3    4.0
# 4    3.0
# 5    2.0
# 6    5.0
# dtype: float64
s.rank(ascending=False)  # rank in descending order
# 0    1.5
# 1    7.0
# 2    1.5
# 3    3.5
# 4    5.0
# 5    6.0
# 6    3.5
# dtype: float64

# DataFrame can compute ranks over the rows or the columns
data = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})
#      b  a    c
# 0  4.3  0 -2.0
# 1  7.0  1  5.0
# 2 -3.0  0  8.0
# 3  2.0  1 -2.5
data.rank()
#      b    a    c
# 0  3.0  1.5  2.0
# 1  4.0  3.5  3.0
# 2  1.0  1.5  4.0
# 3  2.0  3.5  1.0
data.rank(method="first", axis="columns")
#      b    a    c
# 0  3.0  2.0  1.0
# 1  3.0  1.0  2.0
# 2  1.0  2.0  3.0
# 3  3.0  2.0  1.0

2.2.7. Axis Indexes with Duplicate Labels

s = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])
# a    0
# a    1
# b    2
# b    3
# c    4
# dtype: int64
s.index.is_unique
# False
s["a"]  # return a Series
# a    0
# a    1
# dtype: int64
s["c"]  # return a scalar value
# np.int64(4)
data = pd.DataFrame(np.random.standard_normal((5, 3)), index=["a", "a", "b", "b", "c"])
#           0         1         2
# a -0.366146  0.329982  0.186980
# a  1.275523  0.615156 -3.403692
# b  0.488783 -0.448384 -2.626718
# b  0.610570  2.211333  1.131847
# c -0.157337  0.212340  0.175906
data.loc["a"]  # return a DataFrame
#           0         1         2
# a -0.366146  0.329982  0.186980
# a  1.275523  0.615156 -3.403692
data.loc["c"]  # return a Series
# 0   -0.157337
# 1    0.212340
# 2    0.175906
# Name: c, dtype: float64

2.2.8. Unique Values, Value Counts, and Membership

s = pd.Series(["c", "a", "d", "a", "a", "b", "b", "c", "c"])
s.unique()  # return unique values of Series object
# array(['c', 'a', 'd', 'b'], dtype=object)
s.value_counts()  # return a Series containing counts of unique values
# c    3
# a    3
# b    2
# d    1
# Name: count, dtype: int64
mask = s.isin(["b", "c"])  # performs a vectorized set membership check
# 0     True
# 1    False
# 2    False
# 3    False
# 4    False
# 5     True
# 6     True
# 7     True
# 8     True
# dtype: bool
s[mask]
# 0    c
# 5    b
# 6    b
# 7    c
# 8    c
# dtype: object

pd.Index(pd.Series(s.unique()))
# Index(['c', 'a', 'd', 'b'], dtype='object')
pd.Index(pd.Series(s.unique())).get_indexer(s)
# array([0, 1, 2, 1, 1, 3, 3, 0, 0])

2.3. Reading and Writing

2.3.1. Reading and Writing CSV Text Files

csv_data = """
Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
1,John,1998-05-06,1234567890,25,New York,USA,894240000
2,Jane,1993-07-15,0987654321,30,London,UK,741484800
3,Doe,2001-12-20,1122334455,,Sydney,Australia,1008806400
4,NULL,1990-01-01,NULL,40,NULL,Canada,631152000
"""
import io  # io.StringIO wraps a string in a file-like object

# load the CSV data into a DataFrame
data = pd.read_csv(
    io.StringIO(csv_data),        # use io.StringIO to simulate a file-like object
    parse_dates=["Birth Day"],    # parse the 'Birth Day' column as datetime
    dtype={"Phone Number": str},  # treat 'Phone Number' as a string
    na_values=["", "NULL"],       # handle missing values
)
#    Id  Name  Birth Day Phone Number   Age      City    Country   Timestamp
# 0   1  John 1998-05-06   1234567890  25.0  New York        USA   894240000
# 1   2  Jane 1993-07-15   0987654321  30.0    London         UK   741484800
# 2   3   Doe 2001-12-20   1122334455   NaN    Sydney  Australia  1008806400
# 3   4   NaN 1990-01-01          NaN  40.0       NaN     Canada   631152000
# convert the 'Timestamp' column from Unix epoch time to datetime
data["Timestamp"] = pd.to_datetime(data["Timestamp"], unit="s")
#    Id  Name  Birth Day Phone Number   Age      City    Country  Timestamp
# 0   1  John 1998-05-06   1234567890  25.0  New York        USA 1998-05-04
# 1   2  Jane 1993-07-15   0987654321  30.0    London         UK 1993-07-01
# 2   3   Doe 2001-12-20   1122334455   NaN    Sydney  Australia 2001-12-20
# 3   4   NaN 1990-01-01          NaN  40.0       NaN     Canada 1990-01-01
data.dtypes
# Id                       int64
# Name                    object
# Birth Day       datetime64[ns]
# Phone Number            object
# Age                    float64
# City                    object
# Country                 object
# Timestamp       datetime64[ns]
# dtype: object

# write the data out to a comma-separated file
data.to_csv(sys.stdout)  # import sys
# ,Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
# 0,1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 1,2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 2,3,Doe,2001-12-20,1122334455,,Sydney,Australia,2001-12-20
# 3,4,,1990-01-01,,40.0,,Canada,1990-01-01
data.to_csv(sys.stdout, index=False)
# Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
# 1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 3,Doe,2001-12-20,1122334455,,Sydney,Australia,2001-12-20
# 4,,1990-01-01,,40.0,,Canada,1990-01-01
data.to_csv(sys.stdout, index=False, na_rep='NIL')
# Id,Name,Birth Day,Phone Number,Age,City,Country,Timestamp
# 1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 3,Doe,2001-12-20,1122334455,NIL,Sydney,Australia,2001-12-20
# 4,NIL,1990-01-01,NIL,40.0,NIL,Canada,1990-01-01
data.to_csv(sys.stdout, index=False, na_rep='NIL', header=False)
# 1,John,1998-05-06,1234567890,25.0,New York,USA,1998-05-04
# 2,Jane,1993-07-15,0987654321,30.0,London,UK,1993-07-01
# 3,Doe,2001-12-20,1122334455,NIL,Sydney,Australia,2001-12-20
# 4,NIL,1990-01-01,NIL,40.0,NIL,Canada,1990-01-01
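
For large files, read_csv can read just a few rows or iterate over the file in chunks (a sketch using the same csv_data):

pd.read_csv(io.StringIO(csv_data), nrows=2).shape  # only the first two data rows
# (2, 8)

for chunk in pd.read_csv(io.StringIO(csv_data), chunksize=2):  # an iterator of DataFrames
    print(chunk.shape)
# (2, 8)
# (2, 8)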

2.3.2. Reading and Writing Microsoft Excel Files

# add-on packages xlrd and openpyxl to read old-style XLS and newer XLSX files
conda install openpyxl xlrd
  • pandas.ExcelFile: read sheets and parse as DataFrame

    xlsx = pd.ExcelFile("book1.xlsx")
    xlsx.sheet_names
    # ['Sheet1']
    xlsx.parse(sheet_name="Sheet1")
    #    0  a   b   c   d message
    # 0  0  1   2   3   4   hello
    # 1  1  5   6   7   8   world
    # 2  2  9  10  11  12     foo
    df = xlsx.parse(sheet_name="Sheet1", index_col=0)
    #    a   b   c   d message
    # 0
    # 0  1   2   3   4   hello
    # 1  5   6   7   8   world
    # 2  9  10  11  12     foo
    df.columns  # Index(['a', 'b', 'c', 'd', 'message'], dtype='object')
    df.index    # Index([0, 1, 2], dtype='int64', name=0)
    xlsx.parse(sheet_name="Sheet1", index_col="message")
    #          0  a   b   c   d
    # message
    # hello    0  1   2   3   4
    # world    1  5   6   7   8
    # foo      2  9  10  11  12
  • pandas.read_excel: read a sheet as pandas data directly

    pd.read_excel("book1.xlsx")  # pd.read_excel("book1.xlsx", sheet_name=0)
    #    0  a   b   c   d message
    # 0  0  1   2   3   4   hello
    # 1  1  5   6   7   8   world
    # 2  2  9  10  11  12     foo
    pd.read_excel("book1.xlsx", sheet_name="Sheet1")
    #    0  a   b   c   d message
    # 0  0  1   2   3   4   hello
    # 1  1  5   6   7   8   world
    # 2  2  9  10  11  12     foo
  • DataFrame.to_excel: write pandas data to Excel files

    writer = pd.ExcelWriter("sample1.xlsx")
    df.to_excel(writer, sheet_name="foo")  # write to sheet 'foo'
    df.to_excel(writer, sheet_name="bar")  # write to sheet 'bar'
    df.to_excel(writer, sheet_name="buz")  # write to sheet 'buz'
    writer.close()  # save and close the file before reading it back
    pd.ExcelFile("sample1.xlsx").sheet_names
    # ['foo', 'bar', 'buz']

2.3.3. Reading SQL-based Databases

import sqlite3

query = "CREATE TABLE test (a VARCHAR(20), b VARCHAR(20), c REAL, d INTEGER );"
con = sqlite3.connect("mydata.sqlite")
con.execute(query)
# <sqlite3.Cursor at 0x7f56df665840>
con.commit()

data = [
    ("Atlanta", "Georgia", 1.25, 6),
    ("Tallahassee", "Florida", 2.6, 3),
    ("Sacramento", "California", 1.7, 5),
]
stmt = "INSERT INTO test VALUES(?, ?, ?, ?)"
con.executemany(stmt, data)
# <sqlite3.Cursor at 0x7f56e074fac0>
con.commit()

cursor = con.execute("SELECT * FROM test")
rows = cursor.fetchall()
# [('Atlanta', 'Georgia', 1.25, 6),
#  ('Tallahassee', 'Florida', 2.6, 3),
#  ('Sacramento', 'California', 1.7, 5)]
cursor.description
# (('a', None, None, None, None, None, None),
#  ('b', None, None, None, None, None, None),
#  ('c', None, None, None, None, None, None),
#  ('d', None, None, None, None, None, None))
columns = [x[0] for x in cursor.description]
pd.DataFrame(rows, columns=columns)
#              a           b     c  d
# 0      Atlanta     Georgia  1.25  6
# 1  Tallahassee     Florida  2.60  3
# 2   Sacramento  California  1.70  5
  • pandas.read_sql: read data easily from a general SQLAlchemy connection

    import sqlalchemy as sqla  # conda install sqlalchemy -y
    db = sqla.create_engine("sqlite:///mydata.sqlite")
    pd.read_sql("SELECT * FROM test", db)
    #              a           b     c  d
    # 0      Atlanta     Georgia  1.25  6
    # 1  Tallahassee     Florida  2.60  3
    # 2   Sacramento  California  1.70  5

2.4. Data Cleaning and Preparation

2.4.1. Handling Missing Data

s = pd.Series([1, np.nan, 3.5, None, 7])
s.dropna()  # remove missing values
# 0    1.0
# 2    3.5
# 4    7.0
# dtype: float64
s.fillna(0)  # fill NA/NaN values (NA: not available)
# 0    1.0
# 1    0.0
# 2    3.5
# 3    0.0
# 4    7.0
# dtype: float64
s.isna()
# 0    False
# 1     True
# 2    False
# 3     True
# 4    False
# dtype: bool
s.notna()
# 0     True
# 1    False
# 2     True
# 3    False
# 4     True
# dtype: bool

data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3.0],
    ]
)
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 2  NaN  NaN  NaN
# 3  NaN  6.5  3.0
data.dropna()
#      0    1    2
# 0  1.0  6.5  3.0
data.dropna(how="all")
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 3  NaN  6.5  3.0
data[4] = np.nan
#      0    1    2   4
# 0  1.0  6.5  3.0 NaN
# 1  1.0  NaN  NaN NaN
# 2  NaN  NaN  NaN NaN
# 3  NaN  6.5  3.0 NaN
data.dropna(axis="columns", how="all")
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 2  NaN  NaN  NaN
# 3  NaN  6.5  3.0

data = pd.DataFrame(np.random.standard_normal((7, 3)))
data.iloc[:4, 1] = np.nan
data.iloc[:2, 2] = np.nan
#           0         1         2
# 0 -0.327178       NaN       NaN
# 1  1.345916       NaN       NaN
# 2 -0.502837       NaN  0.894144
# 3 -0.372355       NaN -1.719540
# 4 -0.210112  0.484139  0.904050
# 5  0.100575 -1.676686 -0.691126
# 6  0.581531 -0.615403 -1.176631
data.dropna()
#           0         1         2
# 4 -0.210112  0.484139  0.904050
# 5  0.100575 -1.676686 -0.691126
# 6  0.581531 -0.615403 -1.176631
data.dropna(thresh=2)
#           0         1         2
# 2 -0.502837       NaN  0.894144
# 3 -0.372355       NaN -1.719540
# 4 -0.210112  0.484139  0.904050
# 5  0.100575 -1.676686 -0.691126
# 6  0.581531 -0.615403 -1.176631
data.ffill()  # fill NA/NaN values by using the last (row) valid observation to fill the gap
#           0         1         2
# 0 -0.327178       NaN       NaN
# 1  1.345916       NaN       NaN
# 2 -0.502837       NaN  0.894144
# 3 -0.372355       NaN -1.719540
# 4 -0.210112  0.484139  0.904050
# 5  0.100575 -1.676686 -0.691126
# 6  0.581531 -0.615403 -1.176631
data.ffill(axis="columns")
#           0         1         2
# 0 -0.327178 -0.327178 -0.327178
# 1  1.345916  1.345916  1.345916
# 2 -0.502837 -0.502837  0.894144
# 3 -0.372355 -0.372355 -1.719540
# 4 -0.210112  0.484139  0.904050
# 5  0.100575 -1.676686 -0.691126
# 6  0.581531 -0.615403 -1.176631
data.bfill()  # fill NA/NaN values by using the next (row) valid observation to fill the gap
#           0         1         2
# 0 -0.327178  0.484139  0.894144
# 1  1.345916  0.484139  0.894144
# 2 -0.502837  0.484139  0.894144
# 3 -0.372355  0.484139 -1.719540
# 4 -0.210112  0.484139  0.904050
# 5  0.100575 -1.676686 -0.691126
# 6  0.581531 -0.615403 -1.176631
data.bfill(axis="columns")
#           0         1         2
# 0 -0.327178       NaN       NaN
# 1  1.345916       NaN       NaN
# 2 -0.502837  0.894144  0.894144
# 3 -0.372355 -1.719540 -1.719540
# 4 -0.210112  0.484139  0.904050
# 5  0.100575 -1.676686 -0.691126
# 6  0.581531 -0.615403 -1.176631
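
fillna also accepts a per-column dictionary or a computed statistic; a small sketch on a tiny frame:

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, 2.0, 4.0]})
df.fillna(df.mean())  # fill each column's NA with that column's mean
#      a    b
# 0  1.0  3.0
# 1  2.0  2.0
# 2  3.0  4.0
df.fillna({"a": 0.0, "b": -1.0})  # a different fill value per column
#      a    b
# 0  1.0 -1.0
# 1  0.0  2.0
# 2  3.0  4.0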

2.4.2. Removing Duplicates

data = pd.DataFrame({"k1": ["one", "two"] * 3 + ["two"], "k2": [1, 1, 2, 3, 3, 4, 4]})
#     k1  k2
# 0  one   1
# 1  two   1
# 2  one   2
# 3  two   3
# 4  one   3
# 5  two   4
# 6  two   4
data.duplicated()
# 0    False
# 1    False
# 2    False
# 3    False
# 4    False
# 5    False
# 6     True
# dtype: bool
data.drop_duplicates()
#     k1  k2
# 0  one   1
# 1  two   1
# 2  one   2
# 3  two   3
# 4  one   3
# 5  two   4

data["v1"] = range(7)
#     k1  k2  v1
# 0  one   1   0
# 1  two   1   1
# 2  one   2   2
# 3  two   3   3
# 4  one   3   4
# 5  two   4   5
# 6  two   4   6
data.drop_duplicates(subset=["k1"])
#     k1  k2  v1
# 0  one   1   0
# 1  two   1   1
data.drop_duplicates(subset=["k1"], keep="last")
#     k1  k2  v1
# 4  one   3   4
# 6  two   4   6

2.4.3. Transforming Data Using a Function or Mapping

data = pd.DataFrame(
    {
        "food": [
            "bacon",
            "pulled pork",
            "bacon",
            "pastrami",
            "corned beef",
            "bacon",
            "pastrami",
            "honey ham",
            "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)
#           food  ounces
# 0        bacon     4.0
# 1  pulled pork     3.0
# 2        bacon    12.0
# 3     pastrami     6.0
# 4  corned beef     7.5
# 5        bacon     8.0
# 6     pastrami     3.0
# 7    honey ham     5.0
# 8     nova lox     6.0
meat_to_animal = {
    "bacon": "pig",
    "pulled pork": "pig",
    "pastrami": "cow",
    "corned beef": "cow",
    "honey ham": "pig",
    "nova lox": "salmon",
}
data["animal"] = data["food"].map(meat_to_animal)  # data["food"].map(lambda x: meat_to_animal.get(x, None))
#           food  ounces  animal
# 0        bacon     4.0     pig
# 1  pulled pork     3.0     pig
# 2        bacon    12.0     pig
# 3     pastrami     6.0     cow
# 4  corned beef     7.5     cow
# 5        bacon     8.0     pig
# 6     pastrami     3.0     cow
# 7    honey ham     5.0     pig
# 8     nova lox     6.0  salmon

2.4.4. Replacing Values

data = pd.Series([1., -999., 2., -999., -1000., 3.])
# 0       1.0
# 1    -999.0
# 2       2.0
# 3    -999.0
# 4   -1000.0
# 5       3.0
# dtype: float64
data.replace(-999, np.nan)
# 0       1.0
# 1       NaN
# 2       2.0
# 3       NaN
# 4   -1000.0
# 5       3.0
# dtype: float64
data.replace([-999, -1000], np.nan)  # replace multiple values at once
# 0    1.0
# 1    NaN
# 2    2.0
# 3    NaN
# 4    NaN
# 5    3.0
# dtype: float64
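
replace also accepts a list of replacements or a dictionary mapping each value to its substitute:

data.replace([-999, -1000], [np.nan, 0])  # a different replacement for each value
# 0    1.0
# 1    NaN
# 2    2.0
# 3    NaN
# 4    0.0
# 5    3.0
# dtype: float64
data.replace({-999: np.nan, -1000: 0})  # the same thing, written as a dictionary
# 0    1.0
# 1    NaN
# 2    2.0
# 3    NaN
# 4    0.0
# 5    3.0
# dtype: float64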

2.4.5. Renaming Axis Indexes

data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)

data.index.map(lambda x: x[:4].upper())
# Index(['OHIO', 'COLO', 'NEW '], dtype='object')

data.index = data.index.map(lambda x: x[:4].upper())
#       one  two  three  four
# OHIO    0    1      2     3
# COLO    4    5      6     7
# NEW     8    9     10    11

data.rename(index=str.title, columns=str.upper)
#       ONE  TWO  THREE  FOUR
# Ohio    0    1      2     3
# Colo    4    5      6     7
# New     8    9     10    11
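
# a minimal sketch (not from the source): rename also takes dicts to relabel
# only a subset of the axis labels, returning a new object
data.rename(index={"OHIO": "INDIANA"}, columns={"three": "peekaboo"})
#          one  two  peekaboo  four
# INDIANA    0    1         2     3
# COLO       4    5         6     7
# NEW        8    9        10    11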

2.4.6. Discretization and Binning

# Continuous data is often discretized or otherwise separated into “bins” for analysis.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

age_categories = pd.cut(ages, bins)  # pandas.core.arrays.categorical.Categorical
# [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
# Length: 12
# Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

age_categories.codes
# array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
age_categories.categories
# IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')
age_categories.categories[0]
# Interval(18, 25, closed='right')

age_categories.value_counts()
# (18, 25]     5
# (25, 35]     3
# (35, 60]     3
# (60, 100]    1
# Name: count, dtype: int64

pd.cut(ages, bins, right=False).categories
# IntervalIndex([[18, 25), [25, 35), [35, 60), [60, 100)], dtype='interval[int64, left]')

group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages, bins, right=False, labels=group_names).categories
# Index(['Youth', 'YoungAdult', 'MiddleAged', 'Senior'], dtype='object')

data = np.random.uniform(size=20)
# array([0.34500722, 0.51390559, 0.45442101, 0.24779283, 0.11860691,
#        0.64541406, 0.95682711, 0.69983468, 0.661519  , 0.09869171,
#        0.12846634, 0.10438803, 0.6110211 , 0.28152758, 0.40054146,
#        0.36215715, 0.29260695, 0.0516599 , 0.7291701 , 0.84767151])
pd.cut(data, 4, precision=2).categories  # equal-length bins based on the minimum and maximum values
# IntervalIndex([(0.051, 0.28], (0.28, 0.5], (0.5, 0.73], (0.73, 0.96]], dtype='interval[float64, right]')

pd.qcut(data, 4, precision=2).value_counts()
# (0.041999999999999996, 0.22]    5
# (0.22, 0.38]                    5
# (0.38, 0.65]                    5
# (0.65, 0.96]                    5
# Name: count, dtype: int64

pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]).value_counts()
# (0.0507, 0.104]    2
# (0.104, 0.381]     8
# (0.381, 0.741]     8
# (0.741, 0.957]     2
# Name: count, dtype: int64
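
# a minimal sketch (not from the source): labels=False returns only the integer
# bin codes (matching age_categories.codes above)
pd.cut(ages, bins, labels=False)
# array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1])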

2.4.7. Detecting and Filtering Outliers

data = pd.DataFrame(np.random.standard_normal((1000, 4)))
data.describe()
#                  0            1            2            3
# count  1000.000000  1000.000000  1000.000000  1000.000000
# mean     -0.002266    -0.036654    -0.017418    -0.029068
# std       1.018625     0.950048     0.994702     1.041082
# min      -3.421405    -3.833754    -3.642241    -3.125454
# 25%      -0.663947    -0.694460    -0.659706    -0.762197
# 50%      -0.024047    -0.005600    -0.025911    -0.024954
# 75%       0.700707     0.603613     0.663005     0.701335
# max       3.134904     2.526298     2.994854     3.704972

data[(data.abs() > 3).any(axis="columns")]  # select all rows having a value exceeding 3 in absolute value
#             0         1         2         3
# 64   0.769583 -3.833754 -0.284691  0.283270
# 176 -3.198594  0.745868 -0.430357 -0.374628
# 207  0.823846 -0.171761  1.137797  3.704972
# 416 -3.024055  0.560612  1.320891 -0.691014
# 790 -3.421405  0.516339  0.049209  1.016584
# 915 -0.543984  0.844945 -3.010801 -0.165290
# 916  3.134904  0.091651  0.908098  1.685942
# 945  3.073425 -0.943425 -1.711061  0.573415
# 955 -1.375006 -0.344609 -3.642241 -0.823830
# 981  0.507220  0.136884  0.880373 -3.125454

data[data.abs() > 3] = np.sign(data) * 3  # cap values outside the interval [–3, 3]
data.describe()
#                  0            1            2            3
# count  1000.000000  1000.000000  1000.000000  1000.000000
# mean     -0.001830    -0.035821    -0.016765    -0.029648
# std       1.015991     0.947074     0.992531     1.038420
# min      -3.000000    -3.000000    -3.000000    -3.000000
# 25%      -0.663947    -0.694460    -0.659706    -0.762197
# 50%      -0.024047    -0.005600    -0.025911    -0.024954
# 75%       0.700707     0.603613     0.663005     0.701335
# max       3.000000     2.526298     2.994854     3.000000

np.sign(data).head()  # produces 1 and –1 values based on positive or negative
#      0    1    2    3
# 0 -1.0  1.0 -1.0  1.0
# 1 -1.0 -1.0 -1.0 -1.0
# 2  1.0 -1.0 -1.0  1.0
# 3 -1.0 -1.0  1.0 -1.0
# 4  1.0  1.0  1.0 -1.0
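
# a minimal sketch (not from the source): instead of capping, drop the outlier
# rows entirely by keeping only rows whose values all lie strictly inside (-3, 3)
clean = data[(data.abs() < 3).all(axis="columns")]
clean.shape  # (990, 4) for this draw, since 10 rows were capped above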

2.4.8. Permutation and Random Sampling

data = pd.DataFrame(np.arange(5 * 7).reshape((5, 7)))
#     0   1   2   3   4   5   6
# 0   0   1   2   3   4   5   6
# 1   7   8   9  10  11  12  13
# 2  14  15  16  17  18  19  20
# 3  21  22  23  24  25  26  27
# 4  28  29  30  31  32  33  34

sampler = np.random.permutation(5)
# array([2, 4, 3, 0, 1])
data.iloc[sampler]  # equivalent: data.take(sampler)
#     0   1   2   3   4   5   6
# 2  14  15  16  17  18  19  20
# 4  28  29  30  31  32  33  34
# 3  21  22  23  24  25  26  27
# 0   0   1   2   3   4   5   6
# 1   7   8   9  10  11  12  13
column_sampler = np.random.permutation(7)
# array([3, 5, 2, 4, 6, 0, 1])
data.take(column_sampler, axis="columns")
#     3   5   2   4   6   0   1
# 0   3   5   2   4   6   0   1
# 1  10  12   9  11  13   7   8
# 2  17  19  16  18  20  14  15
# 3  24  26  23  25  27  21  22
# 4  31  33  30  32  34  28  29

data.sample(n=3)  # select a random subset without replacement
#     0   1   2   3   4   5   6
# 0   0   1   2   3   4   5   6
# 2  14  15  16  17  18  19  20
# 4  28  29  30  31  32  33  34

data.sample(n=7, replace=True)  # select a random subset with replacement (to allow repeat choices)
#     0   1   2   3   4   5   6
# 3  21  22  23  24  25  26  27
# 0   0   1   2   3   4   5   6
# 1   7   8   9  10  11  12  13
# 0   0   1   2   3   4   5   6
# 4  28  29  30  31  32  33  34
# 4  28  29  30  31  32  33  34
# 3  21  22  23  24  25  26  27
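
# a minimal sketch (not from the source): sample can draw a fraction of the
# rows, or sample along the column axis (output omitted since it is random)
data.sample(frac=0.6)             # roughly 60% of the rows, without replacement
data.sample(n=2, axis="columns")  # two random columns instead of rows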

2.4.9. Computing Indicator/Dummy Variables: One-Hot Encoding

data = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"], "data1": range(6)})
#   key  data1
# 0   b      0
# 1   b      1
# 2   a      2
# 3   c      3
# 4   a      4
# 5   b      5
pd.get_dummies(data["key"], dtype=int)
#    a  b  c
# 0  0  1  0
# 1  0  1  0
# 2  1  0  0
# 3  0  0  1
# 4  1  0  0
# 5  0  1  0
data.join(pd.get_dummies(data["key"], dtype=int))
#   key  data1  a  b  c
# 0   b      0  0  1  0
# 1   b      1  0  1  0
# 2   a      2  1  0  0
# 3   c      3  0  0  1
# 4   a      4  1  0  0
# 5   b      5  0  1  0

movies = pd.read_table(
    "https://raw.githubusercontent.com/wesm/pydata-book/refs/heads/3rd-edition/datasets/movielens/movies.dat",
    sep="::",
    header=None,
    names=["movie_id", "title", "genres"],
    engine="python",
)
movies.head()
#    movie_id                               title                        genres
# 0         1                    Toy Story (1995)   Animation|Children's|Comedy
# 1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
# 2         3             Grumpier Old Men (1995)                Comedy|Romance
# 3         4            Waiting to Exhale (1995)                  Comedy|Drama
# 4         5  Father of the Bride Part II (1995)                        Comedy
dummies = movies["genres"].str.get_dummies("|")  # string functions in pandas
dummies.iloc[:5,:6]
#    Action  Adventure  Animation  Children's  Comedy  Crime
# 0       0          0          1           1       1      0
# 1       0          1          0           1       0      0
# 2       0          0          0           0       1      0
# 3       0          0          0           0       1      0
# 4       0          0          0           0       1      0
movies_windic = movies.join(dummies.add_prefix("Genre_"))
movies_windic.iloc[0][:10]
# movie_id                                       1
# title                           Toy Story (1995)
# genres               Animation|Children's|Comedy
# Genre_Action                                   0
# Genre_Adventure                                0
# Genre_Animation                                1
# Genre_Children's                               1
# Genre_Comedy                                   1
# Genre_Crime                                    0
# Genre_Documentary                              0
# Name: 0, dtype: object

np.random.seed(12345)  # to make the example repeatable
values = np.random.uniform(size=10)
# array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
#        0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins), dtype=int)
#    (0.0, 0.2]  (0.2, 0.4]  (0.4, 0.6]  (0.6, 0.8]  (0.8, 1.0]
# 0           0           0           0           0           1
# 1           0           1           0           0           0
# 2           1           0           0           0           0
# 3           0           1           0           0           0
# 4           0           0           1           0           0
# 5           0           0           1           0           0
# 6           0           0           0           0           1
# 7           0           0           0           1           0
# 8           0           0           0           1           0
# 9           0           0           0           1           0
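
# a minimal sketch (not from the source): get_dummies also accepts a whole
# DataFrame plus a columns list, replacing those columns with prefixed indicators
# (applied here to the key/data1 frame defined earlier in this section)
pd.get_dummies(data, columns=["key"], prefix="k", dtype=int)
#    data1  k_a  k_b  k_c
# 0      0    0    1    0
# 1      1    0    1    0
# 2      2    1    0    0
# 3      3    0    0    1
# 4      4    1    0    0
# 5      5    0    1    0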

2.4.10. String Functions in pandas

data = {
    "Dave": "dave@google.com",
    "Steve": "steve@gmail.com",
    "Rob": "rob@gmail.com",
    "Wes": np.nan,
}
data = pd.Series(data)
# Dave     dave@google.com
# Steve    steve@gmail.com
# Rob        rob@gmail.com
# Wes                  NaN
# dtype: object
data.isna()
# Dave     False
# Steve    False
# Rob      False
# Wes       True
# dtype: bool

data.astype("string")  # extension types
# Dave     dave@google.com
# Steve    steve@gmail.com
# Rob        rob@gmail.com
# Wes                 <NA>
# dtype: string

data.str.contains("gmail")  # .str attribute
# Dave     False
# Steve     True
# Rob       True
# Wes        NaN
# dtype: object

import re

pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
data.str.findall(pattern, flags=re.IGNORECASE)  # regular expressions
# Dave     [(dave, google, com)]
# Steve    [(steve, gmail, com)]
# Rob        [(rob, gmail, com)]
# Wes                        NaN
# dtype: object

matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]  # vectorized element retrieval
# Dave     (dave, google, com)
# Steve    (steve, gmail, com)
# Rob        (rob, gmail, com)
# Wes                      NaN
# dtype: object
matches.str.get(1)
# Dave     google
# Steve     gmail
# Rob       gmail
# Wes         NaN
# dtype: object

data.str[:5]  # slice strings
# Dave     dave@
# Steve    steve
# Rob      rob@g
# Wes        NaN
# dtype: object

data.str.extract(pattern, flags=re.IGNORECASE)  # extract capture groups
#            0       1    2
# Dave    dave  google  com
# Steve  steve   gmail  com
# Rob      rob   gmail  com
# Wes      NaN     NaN  NaN
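
# a minimal sketch (not from the source): with named groups in an equivalent
# pattern, extract uses the group names as column names
named = r"(?P<user>[A-Z0-9._%+-]+)@(?P<domain>[A-Z0-9.-]+)\.(?P<suffix>[A-Z]{2,4})"
data.str.extract(named, flags=re.IGNORECASE)
#         user  domain suffix
# Dave    dave  google    com
# Steve  steve   gmail    com
# Rob      rob   gmail    com
# Wes      NaN     NaN    NaN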

2.4.11. Categorical Encoding

categories = pd.Series(["apple", "orange"])  # categories, dictionary, or levels
# 0     apple
# 1    orange
# dtype: object
codes = pd.Series([0, 1, 0, 0] * 2)  # category codes (or simply codes)
# 0    0
# 1    1
# 2    0
# 3    0
# 4    0
# 5    1
# 6    0
# 7    0
# dtype: int64
values = categories.take(codes)
# 0     apple
# 1    orange
# 0     apple
# 0     apple
# 0     apple
# 1    orange
# 0     apple
# 0     apple
# dtype: object
# pandas has a Categorical extension type for this integer-based (dictionary-encoded) representation
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
rng = np.random.default_rng(seed=12345)
data = pd.DataFrame(
    {
        "fruit": fruits,
        "basket_id": np.arange(N),
        "count": rng.integers(3, 15, size=N),
        "weight": rng.uniform(0, 4, size=N),
    },
    columns=["basket_id", "fruit", "count", "weight"],
)
#    basket_id   fruit  count    weight
# 0          0   apple      8  0.366659
# 1          1  orange      6  2.394272
# 2          2   apple      9  3.418968
# 3          3   apple     12  2.406485
# 4          4   apple      8  3.727953
# 5          5  orange      5  2.899125
# 6          6   apple      3  3.442205
# 7          7   apple      4  3.717351

fruit_cat = data["fruit"].astype("category")
# 0     apple
# 1    orange
# 2     apple
# 3     apple
# 4     apple
# 5    orange
# 6     apple
# 7     apple
# Name: fruit, dtype: category
# Categories (2, object): ['apple', 'orange']
c = fruit_cat.array
type(c)       # pandas.core.arrays.categorical.Categorical
c.categories  # Index(['apple', 'orange'], dtype='object')
c.codes       # array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

dict(enumerate(c.categories))  # get a mapping between codes and categories
# {0: 'apple', 1: 'orange'}
pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])
# ['foo', 'bar', 'baz', 'foo', 'bar']
# Categories (3, object): ['bar', 'baz', 'foo']
pd.Categorical.from_codes(codes=[0, 1, 2, 0, 0, 1], categories=["foo", "bar", "baz"])
# ['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
# Categories (3, object): ['foo', 'bar', 'baz']
pd.Categorical.from_codes(codes=[0, 1, 2, 0, 0, 1], categories=["foo", "bar", "baz"], ordered=True)
# ['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
# Categories (3, object): ['foo' < 'bar' < 'baz']
# `.cat` accessor attribute
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2).astype("category")
cat_s.cat.categories
# Index(['a', 'b', 'c', 'd'], dtype='object')
cat_s.cat.codes
# 0    0
# 1    1
# 2    2
# 3    3
# 4    0
# 5    1
# 6    2
# 7    3
# dtype: int8
cat_s.value_counts()
# a    2
# b    2
# c    2
# d    2
# Name: count, dtype: int64
cat_s2 = cat_s.cat.set_categories(["a", "b", "c", "d", "e"])
cat_s2.value_counts()
# a    2
# b    2
# c    2
# d    2
# e    0
# Name: count, dtype: int64
cat_s3 = cat_s[cat_s.isin(["a", "b"])]
# 0    a
# 1    b
# 4    a
# 5    b
# dtype: category
# Categories (4, object): ['a', 'b', 'c', 'd']
cat_s3.cat.remove_unused_categories()
# 0    a
# 1    b
# 4    a
# 5    b
# dtype: category
# Categories (2, object): ['a', 'b']
draws = np.random.default_rng(seed=12345).standard_normal(1_000)
# array([-1.42382504,  1.26372846, -0.87066174, -0.25917323, -0.07534331, ...])

bins = pd.qcut(draws, 4)  # compute a quartile binning
# [(-3.121, -0.675], (0.687, 3.211], (-3.121, -0.675], (-0.675, 0.0134], (-0.675, 0.0134], ..., (0.0134, 0.687], (0.0134, 0.687], (-0.675, 0.0134], (0.0134, 0.687], (-0.675, 0.0134]]
# Length: 1000
# Categories (4, interval[float64, right]): [(-3.121, -0.675] < (-0.675, 0.0134] < (0.0134, 0.687] < (0.687, 3.211]]

bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])  # quartile names
# ['Q1', 'Q4', 'Q1', 'Q2', 'Q2', ..., 'Q3', 'Q3', 'Q2', 'Q3', 'Q2']
# Length: 1000
# Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

bins = pd.Series(bins, name='quartile')
# 0      Q1
# 1      Q4
# 2      Q1
#        ..
# 998    Q3
# 999    Q2
# Name: quartile, Length: 1000, dtype: category
# Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']
results = pd.Series(draws).groupby(bins, observed=True).agg(["count", "min", "max"]).reset_index()  # statistics
#   quartile  count       min       max
# 0       Q1    250 -3.119609 -0.678494
# 1       Q2    250 -0.673305  0.008009
# 2       Q3    250  0.018753  0.686183
# 3       Q4    250  0.688282  3.211418
# dummy variables or one-hot encoding
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')
pd.get_dummies(cat_s, dtype=int)
#    a  b  c  d
# 0  1  0  0  0
# 1  0  1  0  0
# 2  0  0  1  0
# 3  0  0  0  1
# 4  1  0  0  0
# 5  0  1  0  0
# 6  0  0  1  0
# 7  0  0  0  1
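
# a minimal sketch (not from the source): converting a low-cardinality string
# column to category can cut memory substantially (exact numbers vary by platform)
labels = pd.Series(["foo", "bar", "baz", "qux"] * 250_000)
labels.memory_usage(deep=True)                     # tens of MB as object dtype
labels.astype("category").memory_usage(deep=True)  # far smaller: int8 codes plus a tiny categories index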

2.5. Data Wrangling: Join, Combine, and Reshape

2.5.1. Hierarchical Indexing

# a Series with a list of lists (or arrays) as the index
data = pd.Series(
    np.random.uniform(size=9),
    index=[
        ["a", "a", "a", "b", "b", "c", "c", "d", "d"],
        [  1,   2,   3,   1,   3,   1,   2,   2,   3]
    ],
)
print(data)  # a prettified view of a Series with a MultiIndex as its index
# a  1    0.141713
#    2    0.969081
#    3    0.850562
# b  1    0.882278
#    3    0.539535
# c  1    0.944913
#    2    0.371328
# d  2    0.585134
#    3    0.826680
# dtype: float64
data.index  # MultiIndex
# MultiIndex([('a', 1),
#             ('a', 2),
#             ('a', 3),
#             ('b', 1),
#             ('b', 3),
#             ('c', 1),
#             ('c', 2),
#             ('d', 2),
#             ('d', 3)],
#            )

data["b"]       # partial indexing
# 1    0.882278
# 3    0.539535
# dtype: float64
data.loc[:, 2]  # level indexing
# a    0.969081
# c    0.371328
# d    0.585134
# dtype: float64

data.unstack()
#           1         2         3
# a  0.141713  0.969081  0.850562
# b  0.882278       NaN  0.539535
# c  0.944913  0.371328       NaN
# d       NaN  0.585134  0.826680
data.unstack().stack()
# a  1    0.141713
#    2    0.969081
#    3    0.850562
# b  1    0.882278
#    3    0.539535
# c  1    0.944913
#    2    0.371328
# d  2    0.585134
#    3    0.826680
# dtype: float64
df = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=[
        ["a", "a", "b", "b"],
        [  1,   2,   1,   2]
    ],
    columns=[
        [ "Ohio", "Ohio", "Colorado"],
        ["Green",  "Red", "Green"]
    ],
)
#      Ohio     Colorado
#     Green Red    Green
# a 1     0   1        2
#   2     3   4        5
# b 1     6   7        8
#   2     9  10       11
df.index.names = ["key1", "key2"]
df.columns.names = ["state", "color"]
# state      Ohio     Colorado
# color     Green Red    Green
# key1 key2
# a    1        0   1        2
#      2        3   4        5
# b    1        6   7        8
#      2        9  10       11
df.index.nlevels
# 2
df["Ohio"]
# color     Green  Red
# key1 key2
# a   1         0    1
#     2         3    4
# b   1         6    7
#     2         9   10
# a MultiIndex can be created by itself and then reused
pd.MultiIndex.from_arrays(
    [
        [ "Ohio", "Ohio", "Colorado"],
        ["Green",  "Red", "Green"],
    ],
    names=["state", "color"],
)
# reordering and sorting levels
df.swaplevel("key1", "key2")
# state      Ohio     Colorado
# color     Green Red    Green
# key2 key1
# 1    a        0   1        2
# 2    a        3   4        5
# 1    b        6   7        8
# 2    b        9  10       11
df.sort_index(level=1)
# state      Ohio     Colorado
# color     Green Red    Green
# key1 key2
# a    1        0   1        2
# b    1        6   7        8
# a    2        3   4        5
# b    2        9  10       11

df.swaplevel(0, 1)
# state      Ohio     Colorado
# color     Green Red    Green
# key2 key1
# 1    a        0   1        2
# 2    a        3   4        5
# 1    b        6   7        8
# 2    b        9  10       11
df.swaplevel(0, 1).sort_index(level=0)
# state      Ohio     Colorado
# color     Green Red    Green
# key2 key1
# 1    a        0   1        2
#      b        6   7        8
# 2    a        3   4        5
#      b        9  10       11
# summary statistics by level
df.groupby(level="key2").sum()
# state  Ohio     Colorado
# color Green Red    Green
# key2
# 1         6   8       10
# 2        12  14       16
df.T.groupby(level="color").sum().T
# color      Green  Red
# key1 key2
# a    1         2    1
#      2         8    4
# b    1        14    7
#      2        20   10
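
# a minimal sketch (not from the source), using a small throwaway frame:
# set_index moves columns into a (possibly hierarchical) row index, and
# reset_index moves the index levels back out into columns
frame = pd.DataFrame({"a": range(3), "b": ["x", "y", "z"], "c": [1, 1, 2]})
frame.set_index(["c", "b"])
#      a
# c b
# 1 x  0
#   y  1
# 2 z  2
frame.set_index(["c", "b"]).reset_index()
#    c  b  a
# 0  1  x  0
# 1  1  y  1
# 2  2  z  2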

References

  • [1] Wes McKinney. Python for Data Analysis, third edition. O’Reilly, August 2022.