可以创建一个没有任何进程的pod/container/namespace吗
intro
从内核的代码可以看到,namespace是在进程创建的时候创建,没有进程引用namespace的时候销毁。反过来说:namespace的创建要依赖进程的创建,不创建进程就没办法创建namespace。
那么这个触发创建namespace的进程从哪里来?特别是Pod这种由k8s引入的“容器的容器”结构,触发它创建、并维持它一直存在的进程是什么?
Kubernetes provides us with multiple options for overriding the default Entrypoint and Cmd of an image:
When you override the default Entrypoint and Cmd in a Kubernetes .yaml file, these rules apply:
- If you do not supply command or args for a Container, the defaults defined in the Docker image are used.
- If you supply only args for a Container, the default Entrypoint defined in the Docker image is run with the args that you supplied.
- If you supply a command for a Container, only the supplied command is used. The default EntryPoint and the default Cmd defined in the Docker image are ignored. Your command is run with the args supplied (or no args if none supplied).
containerd
前面提到的规则由containerd来落实:优先使用CRI协议中配置的command和args(config.GetCommand()、config.GetArgs()),未配置时再使用镜像中配置的入口信息(image.Entrypoint和image.Cmd),最后把两者拼接起来交给oci.WithProcessArgs(),写入spec的Process.Args中。
///@file: containerd\internal\cri\opts\spec_opts.go
// WithProcessArgs sets the process args on the spec based on the image and runtime config.
//
// The command and args from the runtime container config take precedence; the
// image's Entrypoint and Cmd are used only as fallbacks:
//   - command empty and args empty: args are copied from image.Cmd.
//   - command nil: command is copied from image.Entrypoint, unless Entrypoint
//     is exactly one empty string.
//   - command non-empty: image.Entrypoint and image.Cmd are both ignored.
//
// The returned SpecOpts fails with "no command specified" when both command
// and args end up empty; otherwise it delegates to oci.WithProcessArgs with
// command followed by args.
func WithProcessArgs(config *runtime.ContainerConfig, image *imagespec.ImageConfig) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
command, args := config.GetCommand(), config.GetArgs()
// The following logic is migrated from https://github.com/moby/moby/blob/master/daemon/commit.go
// TODO(random-liu): Clearly define the commands overwrite behavior.
if len(command) == 0 {
// Copy the image's slices (rather than aliasing them) to avoid a data race.
if len(args) == 0 {
args = append([]string{}, image.Cmd...)
}
// Only a nil command falls back to the image entrypoint; a non-nil but
// empty command slice skips this branch.
if command == nil {
// An entrypoint of exactly [""] is not copied, leaving command empty.
if !(len(image.Entrypoint) == 1 && image.Entrypoint[0] == "") {
command = append([]string{}, image.Entrypoint...)
}
}
}
// Neither the config nor the image supplied anything to run.
if len(command) == 0 && len(args) == 0 {
return errors.New("no command specified")
}
// NOTE(review): when command came from config and has spare capacity, this
// append can write into its backing array — confirm that is acceptable.
return oci.WithProcessArgs(append(command, args...)...)(ctx, client, c, s)
}
}
// WithProcessArgs replaces the args on the generated spec.
//
// The returned SpecOpts ignores its context, client and container arguments;
// it stores args in s.Process.Args, resets s.Process.CommandLine to the empty
// string, and always returns nil.
//
// NOTE(review): this looks like the oci-package function that the CRI helper
// of the same name calls as oci.WithProcessArgs — confirm against upstream.
func WithProcessArgs(args ...string) SpecOpts {
return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error {
// setProcess is not shown here; presumably it makes sure s.Process is
// non-nil before the field writes below — TODO confirm.
setProcess(s)
s.Process.Args = args
// Reset CommandLine together with Args so no earlier value is left behind.
s.Process.CommandLine = ""
return nil
}
}
两者都不配置
按照containerd的文档创建一个没有args参数的config.json文件,通过runc启动会提示错误(runc run failed: args must not be empty):
tsecer@harry: mkdir redis
tsecer@harry: mkdir redis/rootfs
tsecer@harry: ctr images pull docker.io/library/redis:alpine
tsecer@harry: ctr image export redis_test docker.io/library/redis:alpine
tsecer@harry: ctr image import --platform linux/amd64 redis_test
unpacking docker.io/library/redis:alpine (sha256:48501c5ad00d5563bc30c075c7bcef41d7d98de3e9a1e6c752068c66f0a8463b)...done
tsecer@harry: cd redis/
tsecer@harry: runc spec
tsecer@harry: vim config.json # 删除文件中的args配置项
tsecer@harry: cat config.json
{
"ociVersion": "1.0.2-dev",
"process": {
"terminal": true,
"user": {
"uid": 0,
"gid": 0
},
"env": [
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"TERM=xterm"
],
"cwd": "/",
"capabilities": {
"bounding": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"effective": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"permitted": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
],
"ambient": [
"CAP_AUDIT_WRITE",
"CAP_KILL",
"CAP_NET_BIND_SERVICE"
]
},
"rlimits": [
{
"type": "RLIMIT_NOFILE",
"hard": 1024,
"soft": 1024
}
],
"noNewPrivileges": true
},
"root": {
"path": "rootfs",
"readonly": true
},
"hostname": "runc",
"mounts": [
{
"destination": "/proc",
"type": "proc",
"source": "proc"
},
{
"destination": "/dev",
"type": "tmpfs",
"source": "tmpfs",
"options": [
"nosuid",
"strictatime",
"mode=755",
"size=65536k"
]
},
{
"destination": "/dev/pts",
"type": "devpts",
"source": "devpts",
"options": [
"nosuid",
"noexec",
"newinstance",
"ptmxmode=0666",
"mode=0620",
"gid=5"
]
},
{
"destination": "/dev/shm",
"type": "tmpfs",
"source": "shm",
"options": [
"nosuid",
"noexec",
"nodev",
"mode=1777",
"size=65536k"
]
},
{
"destination": "/dev/mqueue",
"type": "mqueue",
"source": "mqueue",
"options": [
"nosuid",
"noexec",
"nodev"
]
},
{
"destination": "/sys",
"type": "sysfs",
"source": "sysfs",
"options": [
"nosuid",
"noexec",
"nodev",
"ro"
]
},
{
"destination": "/sys/fs/cgroup",
"type": "cgroup",
"source": "cgroup",
"options": [
"nosuid",
"noexec",
"nodev",
"relatime",
"ro"
]
}
],
"linux": {
"resources": {
"devices": [
{
"allow": false,
"access": "rwm"
}
]
},
"namespaces": [
{
"type": "pid"
},
{
"type": "network"
},
{
"type": "ipc"
},
{
"type": "uts"
},
{
"type": "mount"
}
],
"maskedPaths": [
"/proc/acpi",
"/proc/asound",
"/proc/kcore",
"/proc/keys",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/sys/firmware",
"/proc/scsi"
],
"readonlyPaths": [
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger"
]
}
}
tsecer@harry: runc run no_args_test_launch
ERRO[0000] runc run failed: args must not be empty
tsecer@harry:
根据错误提示可以看到对应的源代码
///@file: runc\utils_linux.go
// validateProcessSpec checks the process section of a runtime spec and returns
// an error describing the first problem found, or nil when every check passes.
//
// It rejects, in this order: a nil spec, an empty Cwd, a Cwd that is not an
// absolute path, an empty Args list, and a non-empty SelinuxLabel when
// selinux.GetEnabled() reports false.
func validateProcessSpec(spec *specs.Process) error {
if spec == nil {
return errors.New("process property must not be empty")
}
if spec.Cwd == "" {
return errors.New("Cwd property must not be empty")
}
if !filepath.IsAbs(spec.Cwd) {
return errors.New("Cwd must be an absolute path")
}
// A process without args has nothing to execute; this is the check that
// produces the "args must not be empty" failure.
if len(spec.Args) == 0 {
return errors.New("args must not be empty")
}
// A label is only accepted when SELinux is reported as enabled.
if spec.SelinuxLabel != "" && !selinux.GetEnabled() {
return errors.New("selinux label is specified in config, but selinux is disabled or not supported")
}
return nil
}
pod的进程
因为pod创建的时候会先创建一个namespace,根据前面的讨论,这个namespace也应该有一个进程,那么这个进程是什么呢?毕竟大家都没有指定。
在这篇文章(What the heck is a pod sandbox?)中,作者遇到一个很诡异的问题:创建pod时提示pause:3.5相关的失败。这正好解释了pod中进程的问题:创建pod的时候(如果没有另外指定),k8s(具体由containerd这样的容器运行时)会默认使用pause镜像创建一个pause进程,下面是containerd中默认的sandbox镜像配置。
///@file: containerd\internal\cri\config\config.go
// Sandbox controller modes, the default sandbox image, and container IO types.
const (
// ModePodSandbox means use Controller implementation from sbserver podsandbox package.
// We take this one as a default mode.
ModePodSandbox SandboxControllerMode = "podsandbox"
// ModeShim means use whatever Controller implementation provided by shim.
ModeShim SandboxControllerMode = "shim"
// DefaultSandboxImage is the default image to use for sandboxes when empty or
// for default configurations. The value names the pause image.
DefaultSandboxImage = "registry.k8s.io/pause:3.9"
// IOTypeFifo is container io implemented by creating named pipe
IOTypeFifo = "fifo"
// IOTypeStreaming is container io implemented by connecting the streaming api to sandbox endpoint
IOTypeStreaming = "streaming"
)
bonus:可以启动一个没有用户进程的kernel吗
内核最终毕竟要启动一个用户态进程,这个进程如何指定呢?从代码里看,如果没有通过内核启动命令行(init=)指定,会先尝试内核配置项CONFIG_DEFAULT_INIT(如果非空),再依次从约定的文件系统位置(/sbin/init、/etc/init、/bin/init、/bin/sh)查找init进程。如果这些位置都找不到可执行文件,内核同样会panic(No working init found. Try passing init= option to kernel.)。
///@file:init/main.c
/*
 * kernel_init - excerpt showing how the first userspace program is chosen.
 *
 * Order of attempts visible in this excerpt:
 *   1. execute_command, if set: on failure the kernel panics, with no fallback.
 *   2. CONFIG_DEFAULT_INIT, if non-empty: on failure an error is logged and
 *      the search continues.
 *   3. /sbin/init, /etc/init, /bin/init, /bin/sh, in that order.
 * Returns 0 as soon as one attempt succeeds; panics if none does.
 */
static int __ref kernel_init(void *unused)
{
/*
 * ... earlier part of the function omitted from this excerpt
 * (presumably including the declaration of ret) ...
 */
/*
* We try each of these until one succeeds.
*
* The Bourne shell can be used instead of init if we are
* trying to recover a really broken machine.
*/
/*
 * execute_command is assigned outside this excerpt; presumably it comes from
 * the init= boot option mentioned in the final panic message.
 */
if (execute_command) {
ret = run_init_process(execute_command);
if (!ret)
return 0;
/* An explicitly requested init that fails is fatal. */
panic("Requested init %s failed (error %d).",
execute_command, ret);
}
/* A configured default init is tried next; its failure is only logged. */
if (CONFIG_DEFAULT_INIT[0] != '\0') {
ret = run_init_process(CONFIG_DEFAULT_INIT);
if (ret)
pr_err("Default init %s failed (error %d)\n",
CONFIG_DEFAULT_INIT, ret);
else
return 0;
}
/* Fixed fallback paths; the || chain stops at the first one that returns 0. */
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;
/* Nothing could be started as init. */
panic("No working init found. Try passing init= option to kernel. "
"See Linux Documentation/admin-guide/init.rst for guidance.");
}
outro
容器化最开始可能是由docker主导,后来由k8s主导,可能属于“入门简单、精通很难”的领域;加之整个体系本身又非常复杂,所以涉及的机构、组件、协议、规范都比较多(尽管单个功能本身可能并不复杂)。
其中涉及到Docker Image Specification、OCI Runtime Config Spec、Container Runtime Interface (CRI)。
浙公网安备 33010602011771号