1. USER instruction in a Dockerfile

The USER instruction sets the user name (or UID) and optionally the user group (or GID) to use when running the image and for any RUN, CMD and ENTRYPOINT instructions that follow it in the Dockerfile.

$ docker run --rm -it debian:11 id
uid=0(root) gid=0(root) groups=0(root)
$ cat Dockerfile
FROM debian:11
USER 2022:2022

$ docker build . -t debian:2022
[+] Building 0.0s (5/5) FINISHED
 => [internal] load build definition from Dockerfile                                                                                                               0.0s
 => => transferring dockerfile: 73B                                                                                                                                0.0s
 => [internal] load .dockerignore                                                                                                                                  0.0s
 => => transferring context: 2B                                                                                                                                    0.0s
 => [internal] load metadata for docker.io/library/debian:11                                                                                                       0.0s
 => [1/1] FROM docker.io/library/debian:11                                                                                                                         0.0s
 => exporting to image                                                                                                                                             0.0s
 => => exporting layers                                                                                                                                            0.0s
 => => writing image sha256:564a3419a320ccf2cb11a43bfe7025d5fc67cc72590af8bbff1b12f95c6f0229                                                                       0.0s
 => => naming to docker.io/library/debian:2022                                                                                                                     0.0s

$ docker run --rm -it debian:2022 id
uid=2022 gid=2022 groups=2022

$ echo philosophia > philosophy

$ chmod 400 philosophy

$ ls -ln philosophy
-r-------- 1 1000 1000 12 Dec 31 03:53 philosophy

$ docker run --rm -it -v $PWD:/tmp debian:2022 ls -ln /tmp/philosophy
-r-------- 1 1000 1000 12 Dec 30 19:53 /tmp/philosophy

$ docker run --rm -it -v $PWD:/tmp debian:2022 cat /tmp/philosophy
cat: /tmp/philosophy: Permission denied

$ docker run --user 1000:1000 --rm -it -v $PWD:/tmp debian:2022 id
uid=1000 gid=1000 groups=1000

$ docker run --user 1000:1000 --rm -it -v $PWD:/tmp debian:2022 cat /tmp/philosophy
philosophia

2. Kubernetes SecurityContext

$ kubectl explain po.spec.securityContext.runAsUser
KIND:     Pod
VERSION:  v1

FIELD:    runAsUser <integer>

DESCRIPTION:
     The UID to run the entrypoint of the container process. Defaults to user
     specified in image metadata if unspecified. May also be set in
     SecurityContext. If set in both SecurityContext and PodSecurityContext, the
     value specified in SecurityContext takes precedence for that container.
     Note that this field cannot be set when spec.os.name is windows.

$ kubectl explain po.spec.securityContext.runAsGroup
KIND:     Pod
VERSION:  v1

FIELD:    runAsGroup <integer>

DESCRIPTION:
     The GID to run the entrypoint of the container process. Uses runtime
     default if unset. May also be set in SecurityContext. If set in both
     SecurityContext and PodSecurityContext, the value specified in
     SecurityContext takes precedence for that container. Note that this field
     cannot be set when spec.os.name is windows.

$ kubectl explain po.spec.securityContext.fsGroup
KIND:     Pod
VERSION:  v1

FIELD:    fsGroup <integer>

DESCRIPTION:
     A special supplemental group that applies to all containers in a pod. Some
     volume types allow the Kubelet to change the ownership of that volume to be
     owned by the pod:

     1. The owning GID will be the FSGroup 2. The setgid bit is set (new files
     created in the volume will be owned by FSGroup) 3. The permission bits are
     OR'd with rw-rw----

     If unset, the Kubelet will not modify the ownership and permissions of any
     volume. Note that this field cannot be set when spec.os.name is windows.

$ kubectl explain po.spec.securityContext.runAsNonRoot
KIND:     Pod
VERSION:  v1

FIELD:    runAsNonRoot <boolean>

DESCRIPTION:
     Indicates that the container must run as a non-root user. If true, the
     Kubelet will validate the image at runtime to ensure that it does not run
     as UID 0 (root) and fail to start the container if it does. If unset or
     false, no such validation will be performed. May also be set in
     SecurityContext. If set in both SecurityContext and PodSecurityContext, the
     value specified in SecurityContext takes precedence.

$ kubectl explain po.spec.containers.securityContext.runAsUser
$ kubectl explain po.spec.containers.securityContext.runAsGroup
$ kubectl explain po.spec.containers.securityContext.runAsNonRoot

$ kubectl explain po.spec.containers.securityContext.privileged
KIND:     Pod
VERSION:  v1

FIELD:    privileged <boolean>

DESCRIPTION:
     Run container in privileged mode. Processes in privileged containers are
     essentially equivalent to root on the host. Defaults to false. Note that
     this field cannot be set when spec.os.name is windows.

$ kubectl explain po.spec.containers.securityContext.allowPrivilegeEscalation
KIND:     Pod
VERSION:  v1

FIELD:    allowPrivilegeEscalation <boolean>

DESCRIPTION:
     AllowPrivilegeEscalation controls whether a process can gain more
     privileges than its parent process. This bool directly controls if the
     no_new_privs flag will be set on the container process.
     AllowPrivilegeEscalation is true always when the container is: 1) run as
     Privileged 2) has CAP_SYS_ADMIN Note that this field cannot be set when
     spec.os.name is windows.

$ kubectl explain po.spec.containers.securityContext.capabilities
KIND:     Pod
VERSION:  v1

RESOURCE: capabilities <Object>

DESCRIPTION:
     The capabilities to add/drop when running containers. Defaults to the
     default set of capabilities granted by the container runtime. Note that
     this field cannot be set when spec.os.name is windows.

     Adds and removes POSIX capabilities from running containers.

FIELDS:
   add	<[]string>
     Added capabilities

   drop	<[]string>
     Removed capabilities
$ cat sec-01.yaml
apiVersion: v1
kind: Pod
metadata:
  name: sec-01
spec:
  containers:
    - image: debian:11
      name: debian
      stdin: true
      tty: true

$ kubectl create -f sec-01.yaml
pod/sec-01 created

$ kubectl exec -it sec-01 -- id
uid=0(root) gid=0(root) groups=0(root)
$ cat sec-02.yaml
apiVersion: v1
kind: Pod
metadata:
  name: sec-02
spec:
  containers:
    - image: debian:2022
      name: debian
      stdin: true
      tty: true

$ kubectl create -f sec-02.yaml
pod/sec-02 created

$ kubectl exec -it sec-02 -- id
uid=2022 gid=0(root) groups=0(root)
$ cat sec-03.yaml
apiVersion: v1
kind: Pod
metadata:
  name: sec-03
spec:
  securityContext:
    runAsUser: 2022
    runAsGroup: 2022
  containers:
    - image: debian:11
      name: debian
      stdin: true
      tty: true

$ kubectl create -f sec-03.yaml
pod/sec-03 created

$ kubectl exec -it sec-03 -- id
uid=2022 gid=2022 groups=2022
$ cat sec-04.yaml
apiVersion: v1
kind: PersistentVolume
metadata:
  name: sec-04
spec:
  accessModes:
    - ReadWriteOnce
  capacity:
    storage: 5Gi
  local:
    path: /testdata
  nodeAffinity:
    required:
      nodeSelectorTerms:
      - matchExpressions:
        - key: kubernetes.io/os
          operator: In
          values:
          - linux
  persistentVolumeReclaimPolicy: Retain
  storageClassName: local-storage
  volumeMode: Filesystem

---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: sec-04
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    limits:
      storage: 5Gi
    requests:
      storage: 5Gi
  storageClassName: local-storage
  volumeMode: Filesystem

---
apiVersion: v1
kind: Pod
metadata:
  name: sec-04
spec:
  securityContext:
    runAsUser: 2022
    runAsGroup: 2022
    fsGroup: 3300
  volumes:
    - name: testdata
      persistentVolumeClaim:
        claimName: sec-04
  containers:
    - image: debian:11
      name: debian
      stdin: true
      tty: true
      workingDir: /testdata
      volumeMounts:
        - mountPath: /testdata
          name: testdata

$ kubectl create -f sec-04.yaml
persistentvolume/sec-04 created
persistentvolumeclaim/sec-04 created
pod/sec-04 created

$ kubectl exec -it sec-04 -- id
uid=2022 gid=2022 groups=2022,3300

$ kubectl exec -it sec-04 -- ls -ld
drwxrwsr-x 2 root 3300 4096 Dec 30 21:15 .

$ kubectl exec -it sec-04 -- touch testfile

$ kubectl exec -it sec-04 -- ls -l
total 0
-rw-r--r-- 1 2022 3300 0 Dec 30 21:15 testfile
$ cat sec-05.yaml
apiVersion: v1
kind: Pod
metadata:
  name: sec-05
spec:
  securityContext:
    runAsNonRoot: true
  containers:
    - image: debian:11
      name: debian
      stdin: true
      tty: true

$ kubectl create -f sec-05.yaml
pod/sec-05 created

$ kubectl get po sec-05
NAME     READY   STATUS                       RESTARTS   AGE
sec-05   0/1     CreateContainerConfigError   0          6s

$ kubectl describe po sec-05
...
Events:
  Type     Reason          Age               From               Message
  ----     ------          ----              ----               -------
  Normal   Scheduled       16s               default-scheduler  Successfully assigned default/sec-05 to far-seer-01
  Normal   SandboxChanged  15s               kubelet            Pod sandbox changed, it will be killed and re-created.
  Normal   Pulled          2s (x5 over 16s)  kubelet            Container image "debian:11" already present on machine
  Warning  Failed          2s (x5 over 16s)  kubelet            Error: container has runAsNonRoot and image will run as root (pod: "sec-05_default(7cc40d5a-d4da-4a95-9ad4-bc787b803eb4)", container: debian)
$ cat sec-06.yaml
apiVersion: v1
kind: Pod
metadata:
  name: sec-06
spec:
  securityContext:
    runAsNonRoot: true
  containers:
    - image: debian:11
      name: debian
      stdin: true
      tty: true
      securityContext:
        runAsUser: 3000

$ kubectl create -f sec-06.yaml
pod/sec-06 created

$ kubectl get po sec-06
NAME     READY   STATUS    RESTARTS   AGE
sec-06   1/1     Running   0          16s

3. Pod Security Policy

$ kubectl explain psp.spec
KIND:     PodSecurityPolicy
VERSION:  policy/v1beta1

RESOURCE: spec <Object>

DESCRIPTION:
     spec defines the policy enforced.

     PodSecurityPolicySpec defines the policy enforced.

FIELDS:
   allowPrivilegeEscalation	<boolean>
     allowPrivilegeEscalation determines if a pod can request to allow privilege
     escalation. If unspecified, defaults to true.

   allowedCapabilities	<[]string>
     allowedCapabilities is a list of capabilities that can be requested to add
     to the container. Capabilities in this field may be added at the pod
     author's discretion. You must not list a capability in both
     allowedCapabilities and requiredDropCapabilities.

   privileged	<boolean>
     privileged determines if a pod can request to be run as privileged.

   requiredDropCapabilities	<[]string>
     requiredDropCapabilities are the capabilities that will be dropped from the
     container. These are required to be dropped and cannot be added.

   seLinux	<Object> -required-
     seLinux is the strategy that will dictate the allowable labels that may be
     set.

Pod security policy control is implemented as an optional admission controller, and policies are enforced by enabling the admission controller, but doing so without authorizing any policies will prevent any pods from being created in the cluster.

$ kubectl describe po -n kube-system kube-apiserver-node-01 | grep -- '--enable-admission-plugins'
      --enable-admission-plugins=NodeRestriction

The Kubernetes API server flag enable-admission-plugins takes a comma-delimited list of admission control plugins to invoke prior to modifying objects in the cluster.

Open the kube-apiserver manifest at /etc/kubernetes/manifests/kube-apiserver.yaml and edit the server flag --enable-admission-plugins to append PodSecurityPolicy.

$ sudo cat /etc/kubernetes/manifests/kube-apiserver.yaml | grep -- '--enable-admission-plugins'
    - --enable-admission-plugins=NodeRestriction,PodSecurityPolicy
      #- --enable-admission-plugins=NodeRestriction
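
With the PodSecurityPolicy admission controller enabled but no policy authorized yet, the kubelet fails to create the mirror pod for kube-apiserver and logs:

kubelet[316726]: E0104 12:50:20.738992  316726 kubelet.go:1711] "Failed creating a mirror pod for" err="pods \"kube-apiserver-node-01\" is forbidden: PodSecurityPolicy: unable to admit pod: []" pod="kube-system/kube-apiserver-node-01"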

To let the kubelet create and sync the kube-apiserver mirror pod once the PodSecurityPolicy admission controller is enabled, we need to create a privileged PodSecurityPolicy and authorize the group system:authenticated to use it in the kube-system namespace.

$ cat psp-privileged.yaml
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
  name: psp-privileged.local.io
  annotations:
    seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*'
spec:
  privileged: true
  allowPrivilegeEscalation: true
  allowedCapabilities:
    - '*'
  volumes:
    - '*'
  hostNetwork: true
  hostPorts:
    - min: 0
      max: 65535
  hostIPC: true
  hostPID: true
  runAsUser:
    rule: 'RunAsAny'
  seLinux:
    rule: 'RunAsAny'
  supplementalGroups:
    rule: 'RunAsAny'
  fsGroup:
    rule: 'RunAsAny'

$ kubectl apply -f psp-privileged.yaml
Warning: policy/v1beta1 PodSecurityPolicy is deprecated in v1.21+, unavailable in v1.25+
podsecuritypolicy.policy/psp-privileged.local.io created

$ kubectl create clusterrole psp-privileged.local.io \
    --resource podsecuritypolicy \
    --resource-name psp-privileged.local.io \
    --verb use
clusterrole.rbac.authorization.k8s.io/psp-privileged.local.io created

$ kubectl create rolebinding psp-privileged.local.io \
    --clusterrole psp-privileged.local.io \
    --group system:authenticated \
    -n kube-system
rolebinding.rbac.authorization.k8s.io/psp-privileged.local.io created

$ kubectl describe po \
    -n kube-system kube-apiserver-node-01 | grep -- '--enable-admission-plugins'
      --enable-admission-plugins=NodeRestriction,PodSecurityPolicy
$ cat psp-unprivileged.yaml
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
  name: psp-unprivileged.local.io
spec:
  privileged: false
  allowPrivilegeEscalation: false
    #allowedCapabilities:
    #  - '*'
  requiredDropCapabilities:
    - SYS_ADMIN
  volumes:
    - configMap
    - projected
    - secret
    - downwardAPI
  hostNetwork: false
  hostIPC: false
  hostPID: false
  runAsUser:
    rule: MustRunAsNonRoot
      #rule: MustRunAs
      #ranges:
      #  - min: 1
      #    max: 65535
  runAsGroup:
    rule: MustRunAs
    ranges:
      - min: 1
        max: 65535
  seLinux:
    rule: RunAsAny
  supplementalGroups:
    rule: MustRunAs
    ranges:
      - min: 1
        max: 65535
  fsGroup:
    rule: MustRunAs
    ranges:
      - min: 1
        max: 65535

$ kubectl apply -f psp-unprivileged.yaml
Warning: policy/v1beta1 PodSecurityPolicy is deprecated in v1.21+, unavailable in v1.25+
podsecuritypolicy.policy/psp-unprivileged.local.io created

$ kubectl create ns test
namespace/test created

$ kubectl create \
    -n test rolebinding edit:sa:default \
    --clusterrole edit \
    --serviceaccount test:default
rolebinding.rbac.authorization.k8s.io/edit:sa:default created

$ cat sec-07.yaml
apiVersion: v1
kind: Pod
metadata:
  name: sec-07
spec:
  containers:
    - image: debian:11
      name: debian
      stdin: true
      tty: true

$ kubectl --as system:serviceaccount:test:default create -f sec-07.yaml
Error from server (Forbidden): error when creating "sec-07.yaml": pods "sec-07" is forbidden: PodSecurityPolicy: unable to admit pod: []

$ kubectl create clusterrole psp-unprivileged.local.io \
    --resource podsecuritypolicy \
    --resource-name psp-unprivileged.local.io
error: at least one verb must be specified

$ kubectl create clusterrole psp-unprivileged.local.io \
    --resource podsecuritypolicy \
    --resource-name psp-unprivileged.local.io \
    --verb use
clusterrole.rbac.authorization.k8s.io/psp-unprivileged.local.io created

$ kubectl create rolebinding \
    -n test psp-unprivileged.local.io \
    --clusterrole psp-unprivileged.local.io \
    --group system:authenticated
rolebinding.rbac.authorization.k8s.io/psp-unprivileged.local.io created

$ kubectl --as system:serviceaccount:test:default create \
    -n test \
    -f sec-07.yaml
pod/sec-07 created

$ kubectl get pod
NAME     READY   STATUS                       RESTARTS   AGE
sec-07   0/1     CreateContainerConfigError   0          5s

$ kubectl describe po sec-07
...
Events:
  ----     ------     ----              ----               -------
  ...
  Warning  Failed     1s (x3 over 13s)  kubelet            Error: container has runAsNonRoot and image will run as root (pod: "sec-07_test(6aac2153-3869-407f-bad0-9ba5c0f084da)", container: debian)

$ cat sec-08.yaml
apiVersion: v1
kind: Pod
metadata:
  name: sec-08
spec:
  securityContext:
    runAsUser: 1000
  containers:
    - image: debian:11
      name: debian
      stdin: true
      tty: true

$ kubectl --as system:serviceaccount:test:default create \
    -n test \
    -f sec-08.yaml
pod/sec-08 created

$ kubectl get po
NAME     READY   STATUS                       RESTARTS   AGE
sec-07   0/1     CreateContainerConfigError   0          3m15s
sec-08   1/1     Running                      0          3s

$ kubectl exec -it sec-08 -- id
uid=1000 gid=1(daemon) groups=1(daemon)
$ ls
Dockerfile  go.mod  main.go

$ cat go.mod
module helloworld

go 1.17

$ cat main.go
package main

import (
	"flag"
	"fmt"
	"net/http"
)

var (
	port int
)

func main() {
	flag.IntVar(&port, "port", 8080, "listening port.")
	flag.Parse()

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		w.Write([]byte("Hello World.\n"))
	})

	addr := fmt.Sprintf(":%d", port)
	fmt.Printf("Starting at %s", addr)
	http.ListenAndServe(addr, nil)
}

$ cat Dockerfile
FROM golang:1.17-bullseye
WORKDIR /build
COPY go.* ./
RUN go mod download

COPY main.go main.go
RUN go build -o helloworld

FROM debian:bullseye-slim
COPY --from=0 /build/helloworld /helloworld
ENTRYPOINT ["/helloworld"]

$ docker build . -t helloworld:v0.1
[+] Building 2.5s (14/14) FINISHED
...

$ cat sec-09.yaml
apiVersion: v1
kind: Pod
metadata:
  name: sec-09
spec:
  securityContext:
    runAsUser: 1000
  containers:
    - image: helloworld:v0.1
      name: helloworld
      args:
        - --port=80

$ kubectl create -n test -f sec-09.yaml
pod/sec-09 created

$ kubectl get po sec-09
NAME     READY   STATUS    RESTARTS   AGE
sec-09   1/1     Running   0          6s

$ kubectl port-forward pod/sec-09 5000:80 2>&1 > /dev/null &
[1] 426388

$ curl localhost:5000
Hello World.