diff --git a/roles/container-engine/kata-containers/defaults/main.yml b/roles/container-engine/kata-containers/defaults/main.yml index fc909caf8..40bbc33d5 100644 --- a/roles/container-engine/kata-containers/defaults/main.yml +++ b/roles/container-engine/kata-containers/defaults/main.yml @@ -7,3 +7,4 @@ kata_containers_qemu_default_memory: "{{ ansible_memtotal_mb }}" kata_containers_qemu_debug: 'false' kata_containers_qemu_sandbox_cgroup_only: 'true' kata_containers_qemu_enable_mem_prealloc: 'false' +kata_containers_virtio_fs_cache: 'always' diff --git a/roles/container-engine/kata-containers/templates/configuration-qemu.toml.j2 b/roles/container-engine/kata-containers/templates/configuration-qemu.toml.j2 index 9e89deeda..15511442c 100644 --- a/roles/container-engine/kata-containers/templates/configuration-qemu.toml.j2 +++ b/roles/container-engine/kata-containers/templates/configuration-qemu.toml.j2 @@ -1,11 +1,12 @@ # Copyright (c) 2017-2019 Intel Corporation +# Copyright (c) 2021 Adobe Inc. # # SPDX-License-Identifier: Apache-2.0 # # XXX: WARNING: this file is auto-generated. # XXX: -# XXX: Source file: "cli/config/configuration-qemu.toml.in" +# XXX: Source file: "config/configuration-qemu.toml.in" # XXX: Project: # XXX: Name: Kata Containers # XXX: Type: kata @@ -18,20 +19,46 @@ kernel = "/opt/kata/share/kata-containers/vmlinux.container" kernel = "/opt/kata/share/kata-containers/vmlinuz.container" {% endif %} image = "/opt/kata/share/kata-containers/kata-containers.img" +# initrd = "/opt/kata/share/kata-containers/kata-containers-initrd.img" machine_type = "q35" +# rootfs filesystem type: +# - ext4 (default) +# - xfs +# - erofs +rootfs_type="ext4" + # Enable confidential guest support. # Toggling that setting may trigger different hardware features, ranging # from memory encryption to both memory and CPU-state encryption and integrity. # The Kata Containers runtime dynamically detects the available feature set and -# aims at enabling the largest possible one. 
+# aims at enabling the largest possible one, returning an error if none is +# available, or none is supported by the hypervisor. +# +# Known limitations: +# * Does not work by design: +# - CPU Hotplug +# - Memory Hotplug +# - NVDIMM devices +# # Default false # confidential_guest = true +# Choose AMD SEV-SNP confidential guests +# In case of using confidential guests on AMD hardware that supports both SEV +# and SEV-SNP, the following enables SEV-SNP guests. SEV guests are default. +# Default false +# sev_snp_guest = true + +# Enable running QEMU VMM as a non-root user. +# By default QEMU VMM run as root. When this is set to true, QEMU VMM process runs as +# a non-root random user. See documentation for the limitations of this mode. +# rootless = true + # List of valid annotation names for the hypervisor # Each member of the list is a regular expression, which is the base name # of the annotation, e.g. "path" for io.katacontainers.config.hypervisor.path" -enable_annotations = [] +enable_annotations = ["enable_iommu"] # List of valid annotations values for the hypervisor # Each member of the list is a path pattern as described by glob(3). @@ -55,11 +82,25 @@ kernel_params = "" # If you want that qemu uses the default firmware leave this option empty firmware = "" +# Path to the firmware volume. +# firmware TDVF or OVMF can be split into FIRMWARE_VARS.fd (UEFI variables +# as configuration) and FIRMWARE_CODE.fd (UEFI program image). UEFI variables +# can be customized per each user while UEFI code is kept same. +firmware_volume = "" + # Machine accelerators # comma-separated list of machine accelerators to pass to the hypervisor. # For example, `machine_accelerators = "nosmm,nosmbus,nosata,nopit,static-prt,nofw"` machine_accelerators="" +# Qemu seccomp sandbox feature +# comma-separated list of seccomp sandbox features to control the syscall access. 
+# For example, `seccompsandbox= "on,obsolete=deny,spawn=deny,resourcecontrol=deny"` +# Note: "elevateprivileges=deny" doesn't work with daemonize option, so it's removed from the seccomp sandbox +# Another note: enabling this feature may reduce performance, you may enable +# /proc/sys/net/core/bpf_jit_enable to reduce the impact. see https://man7.org/linux/man-pages/man8/bpfc.8.html +#seccompsandbox="on,obsolete=deny,spawn=deny,resourcecontrol=deny" + # CPU features # comma-separated list of cpu features to pass to the cpu # For example, `cpu_features = "pmu=off,vmx=off" @@ -110,6 +151,12 @@ default_memory = {{ kata_containers_qemu_default_memory }} # This is will determine the times that memory will be hotadded to sandbox/VM. #memory_slots = 10 +# Default maximum memory in MiB per SB / VM +# unspecified or == 0 --> will be set to the actual amount of physical RAM +# > 0 <= amount of physical RAM --> will be set to the specified number +# > amount of physical RAM --> will be set to the actual amount of physical RAM +default_maxmemory = 0 + # The size in MiB will be plused to max memory of hypervisor. # It is the memory address space for the NVDIMM devie. # If set block storage driver (block_device_driver) to "nvdimm", @@ -128,12 +175,13 @@ default_memory = {{ kata_containers_qemu_default_memory }} # root file system is backed by a block device, the block device is passed # directly to the hypervisor for performance reasons. # This flag prevents the block device from being passed to the hypervisor, -# 9pfs is used instead to pass the rootfs. +# virtio-fs is used instead to pass the rootfs. disable_block_device_use = false # Shared file system type: # - virtio-fs (default) # - virtio-9p +# - virtio-fs-nydus {% if kata_containers_version is version('2.2.0', '>=') %} shared_fs = "virtio-fs" {% else %} @@ -141,27 +189,39 @@ shared_fs = "virtio-9p" {% endif %} # Path to vhost-user-fs daemon. 
+{% if kata_containers_version is version('2.5.0', '>=') %} +virtio_fs_daemon = "/opt/kata/libexec/virtiofsd" +{% else %} virtio_fs_daemon = "/opt/kata/libexec/kata-qemu/virtiofsd" +{% endif %} # List of valid annotations values for the virtiofs daemon # The default if not set is empty (all annotations rejected.) -# Your distribution recommends: ["/opt/kata/libexec/kata-qemu/virtiofsd"] -valid_virtio_fs_daemon_paths = ["/opt/kata/libexec/kata-qemu/virtiofsd"] +# Your distribution recommends: ["/opt/kata/libexec/virtiofsd"] +valid_virtio_fs_daemon_paths = [ + "/opt/kata/libexec/virtiofsd", + "/opt/kata/libexec/kata-qemu/virtiofsd", +] # Default size of DAX cache in MiB virtio_fs_cache_size = 0 +# Default size of virtqueues +virtio_fs_queue_size = 1024 + # Extra args for virtiofsd daemon # # Format example: -# ["-o", "arg1=xxx,arg2", "-o", "hello world", "--arg3=yyy"] +# ["--arg1=xxx", "--arg2=yyy"] +# Examples: +# Set virtiofsd log level to debug : ["--log-level=debug"] # # see `virtiofsd -h` for possible options. -virtio_fs_extra_args = ["--thread-pool-size=1"] +virtio_fs_extra_args = ["--thread-pool-size=1", "--announce-submounts"] # Cache mode: # -# - none +# - never # Metadata, data, and pathname lookup are not cached in guest. They are # always fetched from host and any changes are immediately pushed to host. # @@ -172,13 +232,27 @@ virtio_fs_extra_args = ["--thread-pool-size=1"] # # - always # Metadata, data, and pathname lookup are cached in guest and never expire. -virtio_fs_cache = "always" +virtio_fs_cache = "{{ kata_containers_virtio_fs_cache }}" # Block storage driver to be used for the hypervisor in case the container # rootfs is backed by a block device. This is virtio-scsi, virtio-blk # or nvdimm. block_device_driver = "virtio-scsi" +# aio is the I/O mechanism used by qemu +# Options: +# +# - threads +# Pthread based disk I/O. +# +# - native +# Native Linux I/O. +# +# - io_uring +# Linux io_uring API. 
This provides the fastest I/O operations on Linux, requires kernel>5.1 and +# qemu >=5.0. +block_device_aio = "io_uring" + # Specifies cache-related options will be set to block devices or not. # Default false #block_device_cache_set = true @@ -242,6 +316,11 @@ vhost_user_store_path = "/var/run/kata-containers/vhost-user" # Your distribution recommends: ["/var/run/kata-containers/vhost-user"] valid_vhost_user_store_paths = ["/var/run/kata-containers/vhost-user"] +# The timeout for reconnecting on non-server spdk sockets when the remote end goes away. +# qemu will delay this many seconds and then attempt to reconnect. +# Zero disables reconnecting, and the default is zero. +vhost_user_reconnect_timeout_sec = 0 + # Enable file based guest memory support. The default is an empty string which # will disable this feature. In the case of virtio-fs, this is enabled # automatically and '/dev/shm' is used as the backing folder. @@ -253,17 +332,12 @@ valid_vhost_user_store_paths = ["/var/run/kata-containers/vhost-user"] # Your distribution recommends: [""] valid_file_mem_backends = [""] -# Enable swap of vm memory. Default false. -# The behaviour is undefined if mem_prealloc is also set to true -#enable_swap = true - # -pflash can add image file to VM. The arguments of it should be in format # of ["/path/to/flash0.img", "/path/to/flash1.img"] pflashes = [] # This option changes the default hypervisor and kernel parameters -# to enable debug output where available. This extra output is added -# to the proxy logs, but only when proxy debug is also enabled. +# to enable debug output where available. And Debug also enables the hmp socket. # # Default false enable_debug = {{ kata_containers_qemu_debug }} @@ -278,21 +352,18 @@ enable_debug = {{ kata_containers_qemu_debug }} # used for 9p packet payload. 
#msize_9p = 8192 -# If true and vsocks are supported, use vsocks to communicate directly -# with the agent and no proxy is started, otherwise use unix -# sockets and start a proxy to communicate with the agent. -# Default false -#use_vsock = true - # If false and nvdimm is supported, use nvdimm device to plug guest image. # Otherwise virtio-block device is used. +# +# nvdimm is not supported when `confidential_guest = true`. +# # Default is false #disable_image_nvdimm = true # VFIO devices are hotplugged on a bridge by default. # Enable hotplugging on root bus. This may be required for devices with # a large PCI bar, as this is a current limitation with hotplugging on -# a bridge. This value is valid for "pc" machine type. +# a bridge. # Default false #hotplug_vfio_on_root_bus = true @@ -329,15 +400,15 @@ valid_entropy_sources = ["/dev/urandom","/dev/random",""] # the OCI spec passed to the runtime. # # You can create a rootfs with hooks by customizing the osbuilder scripts: -# https://github.com/kata-containers/osbuilder +# https://github.com/kata-containers/kata-containers/tree/main/tools/osbuilder # # Hooks must be stored in a subdirectory of guest_hook_path according to their -# hook type, i.e. "guest_hook_path/{prestart,postart,poststop}". +# hook type, i.e. "guest_hook_path/{prestart,poststart,poststop}". # The agent will scan these directories for executable files and add them, in # lexicographical order, to the lifecycle of the guest container. # Hooks are executed in the runtime namespace of the guest. See the official documentation: # https://github.com/opencontainers/runtime-spec/blob/v1.0.1/config.md#posix-platform-hooks -# Warnings will be logged if any error is encountered will scanning for hooks, +# Warnings will be logged if any error is encountered while scanning for hooks, # but it will not abort container execution. 
#guest_hook_path = "/usr/share/oci/hooks" # @@ -382,6 +453,19 @@ valid_entropy_sources = ["/dev/urandom","/dev/random",""] # be default_memory. #enable_guest_swap = true +# use legacy serial for guest console if available and implemented for architecture. Default false +#use_legacy_serial = true + +# disable applying SELinux on the VMM process (default false) +disable_selinux=false + +# disable applying SELinux on the container process +# If set to false, the type `container_t` is applied to the container process by default. +# Note: To enable guest SELinux, the guest rootfs must be CentOS that is created and built +# with `SELINUX=yes`. +# (default: true) +disable_guest_selinux=true + [factory] # VM templating support. Once enabled, new VMs are created from template # using vm cloning. They will share the same initial kernel, initramfs and @@ -425,31 +509,6 @@ valid_entropy_sources = ["/dev/urandom","/dev/random",""] # Default /var/run/kata-containers/cache.sock #vm_cache_endpoint = "/var/run/kata-containers/cache.sock" -[proxy.kata] -path = "/opt/kata/libexec/kata-containers/kata-proxy" - -# If enabled, proxy messages will be sent to the system log -# (default: disabled) -enable_debug = {{ kata_containers_qemu_debug }} - -[shim.kata] -path = "/opt/kata/libexec/kata-containers/kata-shim" - -# If enabled, shim messages will be sent to the system log -# (default: disabled) -enable_debug = {{ kata_containers_qemu_debug }} - -# If enabled, the shim will create opentracing.io traces and spans. -# (See https://www.jaegertracing.io/docs/getting-started). -# -# Note: By default, the shim runs in a separate network namespace. Therefore, -# to allow it to send trace details to the Jaeger agent running on the host, -# it is necessary to set 'disable_new_netns=true' so that it runs in the host -# network namespace. -# -# (default: disabled) -#enable_tracing = true - [agent.kata] # If enabled, make the agent display debug-level messages. 
# (default: disabled) @@ -457,24 +516,17 @@ enable_debug = {{ kata_containers_qemu_debug }} # Enable agent tracing. # -# If enabled, the default trace mode is "dynamic" and the -# default trace type is "isolated". The trace mode and type are set -# explicitly with the `trace_type=` and `trace_mode=` options. +# If enabled, the agent will generate OpenTelemetry trace spans. # # Notes: # -# - Tracing is ONLY enabled when `enable_tracing` is set: explicitly -# setting `trace_mode=` and/or `trace_type=` without setting `enable_tracing` -# will NOT activate agent tracing. -# -# - See https://github.com/kata-containers/agent/blob/master/TRACING.md for -# full details. +# - If the runtime also has tracing enabled, the agent spans will be +# associated with the appropriate runtime parent span. +# - If enabled, the runtime will wait for the container to shutdown, +# increasing the container shutdown time slightly. # # (default: disabled) #enable_tracing = true -# -#trace_mode = "dynamic" -#trace_type = "isolated" # Comma separated list of kernel modules and their parameters. # These modules will be loaded in the guest kernel using modprobe(8). @@ -500,21 +552,6 @@ kernel_modules=[] # (default: 30) #dial_timeout = 30 -[netmon] -# If enabled, the network monitoring process gets started when the -# sandbox is created. This allows for the detection of some additional -# network being added to the existing network namespace, after the -# sandbox has been created. -# (default: disabled) -#enable_netmon = true - -# Specify the path to the netmon binary. 
-path = "/opt/kata/libexec/kata-containers/kata-netmon" - -# If enabled, netmon messages will be sent to the system log -# (default: disabled) -enable_debug = {{ kata_containers_qemu_debug }} - [runtime] # If enabled, the runtime will log additional debug messages to the # system log @@ -546,6 +583,19 @@ internetworking_model="tcfilter" # (default: true) disable_guest_seccomp=true +# vCPUs pinning settings +# if enabled, each vCPU thread will be scheduled to a fixed CPU +# qualified condition: num(vCPU threads) == num(CPUs in sandbox's CPUSet) +# enable_vcpus_pinning = false + +# Apply a custom SELinux security policy to the container process inside the VM. +# This is used when you want to apply a type other than the default `container_t`, +# so general users should not uncomment and apply it. +# (format: "user:role:type") +# Note: You cannot specify MCS policy with the label because the sensitivity levels and +# categories are determined automatically by high-level container runtimes such as containerd. +#guest_selinux_label="system_u:system_r:container_t" + # If enabled, the runtime will create opentracing.io traces and spans. # (See https://www.jaegertracing.io/docs/getting-started). # (default: disabled) @@ -563,11 +613,9 @@ disable_guest_seccomp=true # If enabled, the runtime will not create a network namespace for shim and hypervisor processes. # This option may have some potential impacts to your host. It should only be used when you know what you're doing. -# `disable_new_netns` conflicts with `enable_netmon` # `disable_new_netns` conflicts with `internetworking_model=tcfilter` and `internetworking_model=macvtap`. It works only # with `internetworking_model=none`. The tap device will be in the host network namespace and can connect to a bridge # (like OVS) directly. 
-# If you are using docker, `disable_new_netns` only works with `docker run --net=none` # (default: false) #disable_new_netns = true @@ -576,15 +624,49 @@ disable_guest_seccomp=true # The runtime caller is free to restrict or collect cgroup stats of the overall Kata sandbox. # The sandbox cgroup path is the parent cgroup of a container with the PodSandbox annotation. # The sandbox cgroup is constrained if there is no container type annotation. -# See: https://godoc.org/github.com/kata-containers/runtime/virtcontainers#ContainerType +# See: https://pkg.go.dev/github.com/kata-containers/kata-containers/src/runtime/virtcontainers#ContainerType sandbox_cgroup_only={{ kata_containers_qemu_sandbox_cgroup_only }} +# If enabled, the runtime will attempt to determine appropriate sandbox size (memory, CPU) before booting the virtual machine. In +# this case, the runtime will not dynamically update the amount of memory and CPU in the virtual machine. This is generally helpful +# when a hardware architecture or hypervisor solution is utilized which does not support CPU and/or memory hotplug. +# Compatibility for determining appropriate sandbox (VM) size: +# - When running with pods, sandbox sizing information will only be available if using Kubernetes >= 1.23 and containerd >= 1.6. CRI-O +# does not yet support sandbox sizing annotations. +# - When running single containers using a tool like ctr, container sizing information will be available. +static_sandbox_resource_mgmt=false + # If specified, sandbox_bind_mounts identifieds host paths to be mounted (ro) into the sandboxes shared path. # This is only valid if filesystem sharing is utilized. The provided path(s) will be bindmounted into the shared fs directory. # If defaults are utilized, these mounts should be available in the guest at `/run/kata-containers/shared/containers/sandbox-mounts` # These will not be exposed to the container workloads, and are only provided for potential guest services.
sandbox_bind_mounts=[] +# VFIO Mode +# Determines how VFIO devices should be presented to the container. +# Options: +# +# - vfio +# Matches behaviour of OCI runtimes (e.g. runc) as much as +# possible. VFIO devices will appear in the container as VFIO +# character devices under /dev/vfio. The exact names may differ +# from the host (they need to match the VM's IOMMU group numbers +# rather than the host's) +# +# - guest-kernel +# This is a Kata-specific behaviour that's useful in certain cases. +# The VFIO device is managed by whatever driver in the VM kernel +# claims it. This means it will appear as one or more device nodes +# or network interfaces depending on the nature of the device. +# Using this mode requires specially built workloads that know how +# to locate the relevant device interfaces within the VM. +# +vfio_mode="guest-kernel" + +# If enabled, the runtime will not create Kubernetes emptyDir mounts on the guest filesystem. Instead, emptyDir mounts will +# be created on the host and shared via virtio-fs. This is potentially slower, but allows sharing of files from host to guest. +disable_guest_empty_dir=false + # Enabled experimental feature list, format: ["a", "b"]. # Experimental features are features not stable enough for production, # they may break compatibility, and are prepared for a big version bump.