diff options
author | Peter Maydell <peter.maydell@linaro.org> | 2020-11-02 20:29:50 +0000 |
---|---|---|
committer | Peter Maydell <peter.maydell@linaro.org> | 2020-11-02 20:29:50 +0000 |
commit | 8545ae485b1e8e43cc0137310c4c68dbece59990 (patch) | |
tree | e5c7a35c156719ca8a7e342e2f94746c00c3f171 | |
parent | 8680d6e36468f1ca00e2fe749bef50585d632401 (diff) | |
parent | af1bb3fe7f146fafdaadb479975ca2b53b49df40 (diff) |
Merge remote-tracking branch 'remotes/dgilbert/tags/pull-migration-20201102a' into staging
Migration and virtiofs fixes 2020-11-02
Fixes for postcopy migration test hang
A fix for a seccomp crash in virtiofsd on some non-x86 architectures
A virtiofsd help message fix and a minor CID (1435958) fix
And another crack at Max's set.
Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
# gpg: Signature made Mon 02 Nov 2020 19:54:59 GMT
# gpg: using RSA key 45F5C71B4A0CB7FB977A9FA90516331EBC5BFDE7
# gpg: Good signature from "Dr. David Alan Gilbert (RH2) <dgilbert@redhat.com>" [full]
# Primary key fingerprint: 45F5 C71B 4A0C B7FB 977A 9FA9 0516 331E BC5B FDE7
* remotes/dgilbert/tags/pull-migration-20201102a:
tests/acceptance: Add virtiofs_submounts.py
tests/acceptance/boot_linux: Accept SSH pubkey
virtiofsd: Announce sub-mount points
virtiofsd: Add mount ID to the lo_inode key
meson.build: Check for statx()
virtiofsd: Add attr_flags to fuse_entry_param
virtiofsd: Check FUSE_SUBMOUNTS
virtiofsd: Fix the help message of posix lock
tools/virtiofsd: Check vu_init() return value (CID 1435958)
virtiofsd: Seccomp: Add 'send' for syslog
migration: Postpone the kick of the fault thread after recover
migration: Unify reset of last_rb on destination node when recover
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- | meson.build | 16 | ||||
-rw-r--r-- | migration/postcopy-ram.c | 2 | ||||
-rw-r--r-- | migration/savevm.c | 17 | ||||
-rw-r--r-- | tests/acceptance/boot_linux.py | 13 | ||||
-rw-r--r-- | tests/acceptance/virtiofs_submounts.py | 321 | ||||
-rw-r--r-- | tests/acceptance/virtiofs_submounts.py.data/cleanup.sh | 46 | ||||
-rw-r--r-- | tests/acceptance/virtiofs_submounts.py.data/guest-cleanup.sh | 30 | ||||
-rw-r--r-- | tests/acceptance/virtiofs_submounts.py.data/guest.sh | 138 | ||||
-rw-r--r-- | tests/acceptance/virtiofs_submounts.py.data/host.sh | 127 | ||||
-rw-r--r-- | tools/virtiofsd/fuse_common.h | 7 | ||||
-rw-r--r-- | tools/virtiofsd/fuse_lowlevel.c | 5 | ||||
-rw-r--r-- | tools/virtiofsd/fuse_lowlevel.h | 5 | ||||
-rw-r--r-- | tools/virtiofsd/fuse_virtio.c | 7 | ||||
-rw-r--r-- | tools/virtiofsd/helper.c | 3 | ||||
-rw-r--r-- | tools/virtiofsd/passthrough_ll.c | 117 | ||||
-rw-r--r-- | tools/virtiofsd/passthrough_seccomp.c | 2 |
16 files changed, 832 insertions, 24 deletions
diff --git a/meson.build b/meson.build index 47e32e1fcb..39ac5cf6d8 100644 --- a/meson.build +++ b/meson.build @@ -736,6 +736,21 @@ if not has_malloc_trim and get_option('malloc_trim').enabled() endif endif +# Check whether the glibc provides statx() + +statx_test = ''' + #ifndef _GNU_SOURCE + #define _GNU_SOURCE + #endif + #include <sys/stat.h> + int main(void) { + struct statx statxbuf; + statx(0, "", 0, STATX_BASIC_STATS, &statxbuf); + return 0; + }''' + +has_statx = cc.links(statx_test) + ################# # config-host.h # ################# @@ -768,6 +783,7 @@ config_host_data.set('CONFIG_XKBCOMMON', xkbcommon.found()) config_host_data.set('CONFIG_KEYUTILS', keyutils.found()) config_host_data.set('CONFIG_GETTID', has_gettid) config_host_data.set('CONFIG_MALLOC_TRIM', has_malloc_trim) +config_host_data.set('CONFIG_STATX', has_statx) config_host_data.set('QEMU_VERSION', '"@0@"'.format(meson.project_version())) config_host_data.set('QEMU_VERSION_MAJOR', meson.project_version().split('.')[0]) config_host_data.set('QEMU_VERSION_MINOR', meson.project_version().split('.')[1]) diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index d3bb3a744b..d99842eb1b 100644 --- a/migration/postcopy-ram.c +++ b/migration/postcopy-ram.c @@ -903,7 +903,6 @@ static void *postcopy_ram_fault_thread(void *opaque) * the channel is rebuilt. 
*/ if (postcopy_pause_fault_thread(mis)) { - mis->last_rb = NULL; /* Continue to read the userfaultfd */ } else { error_report("%s: paused but don't allow to continue", @@ -985,7 +984,6 @@ retry: /* May be network failure, try to wait for recovery */ if (ret == -EIO && postcopy_pause_fault_thread(mis)) { /* We got reconnected somehow, try to continue */ - mis->last_rb = NULL; goto retry; } else { /* This is a unavoidable fault */ diff --git a/migration/savevm.c b/migration/savevm.c index 21ccba9fb3..5f937a2762 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -2062,13 +2062,16 @@ static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis) } /* + * Reset the last_rb before we resend any page req to source again, since + * the source should have it reset already. + */ + mis->last_rb = NULL; + + /* * This means source VM is ready to resume the postcopy migration. - * It's time to switch state and release the fault thread to - * continue service page faults. */ migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER, MIGRATION_STATUS_POSTCOPY_ACTIVE); - qemu_sem_post(&mis->postcopy_pause_sem_fault); trace_loadvm_postcopy_handle_resume(); @@ -2089,6 +2092,14 @@ static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis) */ migrate_send_rp_req_pages_pending(mis); + /* + * It's time to switch state and release the fault thread to continue + * service page faults. Note that this should be explicitly after the + * above call to migrate_send_rp_req_pages_pending(). In short: + * migrate_send_rp_message_req_pages() is not thread safe, yet. 
+ */ + qemu_sem_post(&mis->postcopy_pause_sem_fault); + return 0; } diff --git a/tests/acceptance/boot_linux.py b/tests/acceptance/boot_linux.py index c743e231f4..1da4a53d6a 100644 --- a/tests/acceptance/boot_linux.py +++ b/tests/acceptance/boot_linux.py @@ -57,7 +57,7 @@ class BootLinuxBase(Test): self.cancel('Failed to download/prepare boot image') return boot.path - def download_cloudinit(self): + def download_cloudinit(self, ssh_pubkey=None): self.log.info('Preparing cloudinit image') try: cloudinit_iso = os.path.join(self.workdir, 'cloudinit.iso') @@ -67,7 +67,8 @@ class BootLinuxBase(Test): password='password', # QEMU's hard coded usermode router address phone_home_host='10.0.2.2', - phone_home_port=self.phone_home_port) + phone_home_port=self.phone_home_port, + authorized_key=ssh_pubkey) except Exception: self.cancel('Failed to prepared cloudinit image') return cloudinit_iso @@ -80,19 +81,19 @@ class BootLinux(BootLinuxBase): timeout = 900 chksum = None - def setUp(self): + def setUp(self, ssh_pubkey=None): super(BootLinux, self).setUp() self.vm.add_args('-smp', '2') self.vm.add_args('-m', '1024') self.prepare_boot() - self.prepare_cloudinit() + self.prepare_cloudinit(ssh_pubkey) def prepare_boot(self): path = self.download_boot() self.vm.add_args('-drive', 'file=%s' % path) - def prepare_cloudinit(self): - cloudinit_iso = self.download_cloudinit() + def prepare_cloudinit(self, ssh_pubkey=None): + cloudinit_iso = self.download_cloudinit(ssh_pubkey) self.vm.add_args('-drive', 'file=%s,format=raw' % cloudinit_iso) def launch_and_wait(self): diff --git a/tests/acceptance/virtiofs_submounts.py b/tests/acceptance/virtiofs_submounts.py new file mode 100644 index 0000000000..361e5990b6 --- /dev/null +++ b/tests/acceptance/virtiofs_submounts.py @@ -0,0 +1,321 @@ +import logging +import re +import os +import subprocess +import time + +from avocado import skipUnless +from avocado_qemu import Test, BUILD_DIR +from avocado_qemu import wait_for_console_pattern +from 
avocado.utils import ssh + +from qemu.accel import kvm_available + +from boot_linux import BootLinux + + +def run_cmd(args): + subp = subprocess.Popen(args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) + stdout, stderr = subp.communicate() + ret = subp.returncode + + return (stdout, stderr, ret) + +def has_cmd(name, args=None): + """ + This function is for use in a @avocado.skipUnless decorator, e.g.: + + @skipUnless(*has_cmd('sudo -n', ('sudo', '-n', 'true'))) + def test_something_that_needs_sudo(self): + ... + """ + + if args is None: + args = ('which', name) + + try: + _, stderr, exitcode = run_cmd(args) + except Exception as e: + exitcode = -1 + stderr = str(e) + + if exitcode != 0: + cmd_line = ' '.join(args) + err = f'{name} required, but "{cmd_line}" failed: {stderr.strip()}' + return (False, err) + else: + return (True, '') + +def has_cmds(*cmds): + """ + This function is for use in a @avocado.skipUnless decorator and + allows checking for the availability of multiple commands, e.g.: + + @skipUnless(*has_cmds(('cmd1', ('cmd1', '--some-parameter')), + 'cmd2', 'cmd3')) + def test_something_that_needs_cmd1_and_cmd2(self): + ... 
+ """ + + for cmd in cmds: + if isinstance(cmd, str): + cmd = (cmd,) + + ok, errstr = has_cmd(*cmd) + if not ok: + return (False, errstr) + + return (True, '') + + +class VirtiofsSubmountsTest(BootLinux): + """ + :avocado: tags=arch:x86_64 + """ + + def get_portfwd(self): + port = None + + res = self.vm.command('human-monitor-command', + command_line='info usernet') + for line in res.split('\r\n'): + match = \ + re.search(r'TCP.HOST_FORWARD.*127\.0\.0\.1\s*(\d+)\s+10\.', + line) + if match is not None: + port = match[1] + break + + self.assertIsNotNone(port) + self.log.debug('sshd listening on port: ' + port) + return port + + def ssh_connect(self, username, keyfile): + self.ssh_logger = logging.getLogger('ssh') + port = self.get_portfwd() + self.ssh_session = ssh.Session('127.0.0.1', port=int(port), + user=username, key=keyfile) + for i in range(10): + try: + self.ssh_session.connect() + return + except: + time.sleep(4) + pass + self.fail('sshd timeout') + + def ssh_command(self, command): + self.ssh_logger.info(command) + result = self.ssh_session.cmd(command) + stdout_lines = [line.rstrip() for line + in result.stdout_text.splitlines()] + for line in stdout_lines: + self.ssh_logger.info(line) + stderr_lines = [line.rstrip() for line + in result.stderr_text.splitlines()] + for line in stderr_lines: + self.ssh_logger.warning(line) + + self.assertEqual(result.exit_status, 0, + f'Guest command failed: {command}') + return stdout_lines, stderr_lines + + def run(self, args, ignore_error=False): + stdout, stderr, ret = run_cmd(args) + + if ret != 0: + cmdline = ' '.join(args) + if not ignore_error: + self.fail(f'{cmdline}: Returned {ret}: {stderr}') + else: + self.log.warn(f'{cmdline}: Returned {ret}: {stderr}') + + return (stdout, stderr, ret) + + def set_up_shared_dir(self): + atwd = os.getenv('AVOCADO_TEST_WORKDIR') + self.shared_dir = os.path.join(atwd, 'virtiofs-shared') + + os.mkdir(self.shared_dir) + + self.run(('cp', self.get_data('guest.sh'), + 
os.path.join(self.shared_dir, 'check.sh'))) + + self.run(('cp', self.get_data('guest-cleanup.sh'), + os.path.join(self.shared_dir, 'cleanup.sh'))) + + def set_up_virtiofs(self): + attmp = os.getenv('AVOCADO_TESTS_COMMON_TMPDIR') + self.vfsdsock = os.path.join(attmp, 'vfsdsock') + + self.run(('sudo', '-n', 'rm', '-f', self.vfsdsock), ignore_error=True) + + self.virtiofsd = \ + subprocess.Popen(('sudo', '-n', + 'tools/virtiofsd/virtiofsd', + f'--socket-path={self.vfsdsock}', + '-o', f'source={self.shared_dir}', + '-o', 'cache=always', + '-o', 'xattr', + '-o', 'announce_submounts', + '-f'), + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + universal_newlines=True) + + while not os.path.exists(self.vfsdsock): + if self.virtiofsd.poll() is not None: + self.fail('virtiofsd exited prematurely: ' + + self.virtiofsd.communicate()[1]) + time.sleep(0.1) + + self.run(('sudo', '-n', 'chmod', 'go+rw', self.vfsdsock)) + + self.vm.add_args('-chardev', + f'socket,id=vfsdsock,path={self.vfsdsock}', + '-device', + 'vhost-user-fs-pci,queue-size=1024,chardev=vfsdsock' \ + ',tag=host', + '-object', + 'memory-backend-file,id=mem,size=1G,' \ + 'mem-path=/dev/shm,share=on', + '-numa', + 'node,memdev=mem') + + def launch_vm(self): + self.launch_and_wait() + self.ssh_connect('root', self.ssh_key) + + def set_up_nested_mounts(self): + scratch_dir = os.path.join(self.shared_dir, 'scratch') + try: + os.mkdir(scratch_dir) + except FileExistsError: + pass + + args = ['bash', self.get_data('host.sh'), scratch_dir] + if self.seed: + args += [self.seed] + + out, _, _ = self.run(args) + seed = re.search(r'^Seed: \d+', out) + self.log.info(seed[0]) + + def mount_in_guest(self): + self.ssh_command('mkdir -p /mnt/host') + self.ssh_command('mount -t virtiofs host /mnt/host') + + def check_in_guest(self): + self.ssh_command('bash /mnt/host/check.sh /mnt/host/scratch/share') + + def live_cleanup(self): + self.ssh_command('bash /mnt/host/cleanup.sh /mnt/host/scratch') + + # It would be nice if the 
above was sufficient to make virtiofsd clear + # all references to the mounted directories (so they can be unmounted + # on the host), but unfortunately it is not. To do so, we have to + # resort to a remount. + self.ssh_command('mount -o remount /mnt/host') + + scratch_dir = os.path.join(self.shared_dir, 'scratch') + self.run(('bash', self.get_data('cleanup.sh'), scratch_dir)) + + @skipUnless(*has_cmds(('sudo -n', ('sudo', '-n', 'true')), + 'ssh-keygen', 'bash', 'losetup', 'mkfs.xfs', 'mount')) + def setUp(self): + vmlinuz = self.params.get('vmlinuz') + if vmlinuz is None: + self.cancel('vmlinuz parameter not set; you must point it to a ' + 'Linux kernel binary to test (to run this test with ' \ + 'the on-image kernel, set it to an empty string)') + + self.seed = self.params.get('seed') + + atwd = os.getenv('AVOCADO_TEST_WORKDIR') + self.ssh_key = os.path.join(atwd, 'id_ed25519') + + self.run(('ssh-keygen', '-t', 'ed25519', '-f', self.ssh_key)) + + pubkey = open(self.ssh_key + '.pub').read() + + super(VirtiofsSubmountsTest, self).setUp(pubkey) + + if len(vmlinuz) > 0: + self.vm.add_args('-kernel', vmlinuz, + '-append', 'console=ttyS0 root=/dev/sda1') + + # Allow us to connect to SSH + self.vm.add_args('-netdev', 'user,id=vnet,hostfwd=:127.0.0.1:0-:22', + '-device', 'e1000,netdev=vnet') + + if not kvm_available(self.arch, self.qemu_bin): + self.cancel(KVM_NOT_AVAILABLE) + self.vm.add_args('-accel', 'kvm') + + def tearDown(self): + try: + self.vm.shutdown() + except: + pass + + scratch_dir = os.path.join(self.shared_dir, 'scratch') + self.run(('bash', self.get_data('cleanup.sh'), scratch_dir), + ignore_error=True) + + def test_pre_virtiofsd_set_up(self): + self.set_up_shared_dir() + + self.set_up_nested_mounts() + + self.set_up_virtiofs() + self.launch_vm() + self.mount_in_guest() + self.check_in_guest() + + def test_pre_launch_set_up(self): + self.set_up_shared_dir() + self.set_up_virtiofs() + + self.set_up_nested_mounts() + + self.launch_vm() + 
self.mount_in_guest() + self.check_in_guest() + + def test_post_launch_set_up(self): + self.set_up_shared_dir() + self.set_up_virtiofs() + self.launch_vm() + + self.set_up_nested_mounts() + + self.mount_in_guest() + self.check_in_guest() + + def test_post_mount_set_up(self): + self.set_up_shared_dir() + self.set_up_virtiofs() + self.launch_vm() + self.mount_in_guest() + + self.set_up_nested_mounts() + + self.check_in_guest() + + def test_two_runs(self): + self.set_up_shared_dir() + + self.set_up_nested_mounts() + + self.set_up_virtiofs() + self.launch_vm() + self.mount_in_guest() + self.check_in_guest() + + self.live_cleanup() + self.set_up_nested_mounts() + + self.check_in_guest() diff --git a/tests/acceptance/virtiofs_submounts.py.data/cleanup.sh b/tests/acceptance/virtiofs_submounts.py.data/cleanup.sh new file mode 100644 index 0000000000..2a6579a0fe --- /dev/null +++ b/tests/acceptance/virtiofs_submounts.py.data/cleanup.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +function print_usage() +{ + if [ -n "$2" ]; then + echo "Error: $2" + echo + fi + echo "Usage: $1 <scratch dir>" +} + +scratch_dir=$1 +if [ -z "$scratch_dir" ]; then + print_usage "$0" 'Scratch dir not given' >&2 + exit 1 +fi + +cd "$scratch_dir/share" || exit 1 +mps=(mnt*) +mp_i=0 +for mp in "${mps[@]}"; do + mp_i=$((mp_i + 1)) + printf "Unmounting %i/%i...\r" "$mp_i" "${#mps[@]}" + + sudo umount -R "$mp" + rm -rf "$mp" +done +echo + +rm some-file +cd .. +rmdir share + +imgs=(fs*.img) +img_i=0 +for img in "${imgs[@]}"; do + img_i=$((img_i + 1)) + printf "Detaching and deleting %i/%i...\r" "$img_i" "${#imgs[@]}" + + dev=$(losetup -j "$img" | sed -e 's/:.*//') + sudo losetup -d "$dev" + rm -f "$img" +done +echo + +echo 'Done.' 
diff --git a/tests/acceptance/virtiofs_submounts.py.data/guest-cleanup.sh b/tests/acceptance/virtiofs_submounts.py.data/guest-cleanup.sh new file mode 100644 index 0000000000..729cb2d1a5 --- /dev/null +++ b/tests/acceptance/virtiofs_submounts.py.data/guest-cleanup.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +function print_usage() +{ + if [ -n "$2" ]; then + echo "Error: $2" + echo + fi + echo "Usage: $1 <scratch dir>" +} + +scratch_dir=$1 +if [ -z "$scratch_dir" ]; then + print_usage "$0" 'Scratch dir not given' >&2 + exit 1 +fi + +cd "$scratch_dir/share" || exit 1 + +mps=(mnt*) +mp_i=0 +for mp in "${mps[@]}"; do + mp_i=$((mp_i + 1)) + printf "Unmounting %i/%i...\r" "$mp_i" "${#mps[@]}" + + sudo umount -R "$mp" +done +echo + +echo 'Done.' diff --git a/tests/acceptance/virtiofs_submounts.py.data/guest.sh b/tests/acceptance/virtiofs_submounts.py.data/guest.sh new file mode 100644 index 0000000000..59ba40fde1 --- /dev/null +++ b/tests/acceptance/virtiofs_submounts.py.data/guest.sh @@ -0,0 +1,138 @@ +#!/bin/bash + +function print_usage() +{ + if [ -n "$2" ]; then + echo "Error: $2" + echo + fi + echo "Usage: $1 <shared dir>" + echo '(The shared directory is the "share" directory in the scratch' \ + 'directory)' +} + +shared_dir=$1 +if [ -z "$shared_dir" ]; then + print_usage "$0" 'Shared dir not given' >&2 + exit 1 +fi + +cd "$shared_dir" + +# FIXME: This should not be necessary, but it is. In order for all +# submounts to be proper mount points, we need to visit them. +# (Before we visit them, they will not be auto-mounted, and so just +# appear as normal directories, with the catch that their st_ino will +# be the st_ino of the filesystem they host, while the st_dev will +# still be the st_dev of the parent.) 
+# `find` does not work, because it will refuse to touch the mount +# points as long as they are not mounted; their st_dev being shared +# with the parent and st_ino just being the root node's inode ID +# will practically ensure that this node exists elsewhere on the +# filesystem, and `find` is required to recognize loops and not to +# follow them. +# Thus, we have to manually visit all nodes first. + +mnt_i=0 + +function recursively_visit() +{ + pushd "$1" >/dev/null + for entry in *; do + if [[ "$entry" == mnt* ]]; then + mnt_i=$((mnt_i + 1)) + printf "Triggering auto-mount $mnt_i...\r" + fi + + if [ -d "$entry" ]; then + recursively_visit "$entry" + fi + done + popd >/dev/null +} + +recursively_visit . +echo + + +if [ -n "$(find -name not-mounted)" ]; then + echo "Error: not-mounted files visible on mount points:" >&2 + find -name not-mounted >&2 + exit 1 +fi + +if [ ! -f some-file -o "$(cat some-file)" != 'root' ]; then + echo "Error: Bad file in the share root" >&2 + exit 1 +fi + +shopt -s nullglob + +function check_submounts() +{ + local base_path=$1 + + for mp in mnt*; do + printf "Checking submount %i...\r" "$((${#devs[@]} + 1))" + + mp_i=$(echo "$mp" | sed -e 's/mnt//') + dev=$(stat -c '%D' "$mp") + + if [ -n "${devs[mp_i]}" ]; then + echo "Error: $mp encountered twice" >&2 + exit 1 + fi + devs[mp_i]=$dev + + pushd "$mp" >/dev/null + path="$base_path$mp" + while true; do + expected_content="$(printf '%s\n%s\n' "$mp_i" "$path")" + if [ ! 
-f some-file ]; then + echo "Error: $PWD/some-file does not exist" >&2 + exit 1 + fi + + if [ "$(cat some-file)" != "$expected_content" ]; then + echo "Error: Bad content in $PWD/some-file:" >&2 + echo '--- found ---' + cat some-file + echo '--- expected ---' + echo "$expected_content" + exit 1 + fi + if [ "$(stat -c '%D' some-file)" != "$dev" ]; then + echo "Error: $PWD/some-file has the wrong device ID" >&2 + exit 1 + fi + + if [ -d sub ]; then + if [ "$(stat -c '%D' sub)" != "$dev" ]; then + echo "Error: $PWD/some-file has the wrong device ID" >&2 + exit 1 + fi + cd sub + path="$path/sub" + else + if [ -n "$(echo mnt*)" ]; then + check_submounts "$path/" + fi + break + fi + done + popd >/dev/null + done +} + +root_dev=$(stat -c '%D' some-file) +devs=() +check_submounts '' +echo + +reused_devs=$(echo "$root_dev ${devs[@]}" | tr ' ' '\n' | sort | uniq -d) +if [ -n "$reused_devs" ]; then + echo "Error: Reused device IDs: $reused_devs" >&2 + exit 1 +fi + +echo "Test passed for ${#devs[@]} submounts." diff --git a/tests/acceptance/virtiofs_submounts.py.data/host.sh b/tests/acceptance/virtiofs_submounts.py.data/host.sh new file mode 100644 index 0000000000..d8a9afebdb --- /dev/null +++ b/tests/acceptance/virtiofs_submounts.py.data/host.sh @@ -0,0 +1,127 @@ +#!/bin/bash + +mount_count=128 + +function print_usage() +{ + if [ -n "$2" ]; then + echo "Error: $2" + echo + fi + echo "Usage: $1 <scratch dir> [seed]" + echo "(If no seed is given, it will be randomly generated.)" +} + +scratch_dir=$1 +if [ -z "$scratch_dir" ]; then + print_usage "$0" 'No scratch dir given' >&2 + exit 1 +fi + +if [ ! 
-d "$scratch_dir" ]; then + print_usage "$0" "$scratch_dir is not a directory" >&2 + exit 1 +fi + +seed=$2 +if [ -z "$seed" ]; then + seed=$RANDOM +fi +RANDOM=$seed + +echo "Seed: $seed" + +set -e +shopt -s nullglob + +cd "$scratch_dir" +if [ -d share ]; then + echo 'Error: This directory seems to be in use already' >&2 + exit 1 +fi + +for ((i = 0; i < $mount_count; i++)); do + printf "Setting up fs %i/%i...\r" "$((i + 1))" "$mount_count" + + rm -f fs$i.img + truncate -s 512M fs$i.img + mkfs.xfs -q fs$i.img + devs[i]=$(sudo losetup -f --show fs$i.img) +done +echo + +top_level_mounts=$((RANDOM % mount_count + 1)) + +mkdir -p share +echo 'root' > share/some-file + +for ((i = 0; i < $top_level_mounts; i++)); do + printf "Mounting fs %i/%i...\r" "$((i + 1))" "$mount_count" + + mkdir -p share/mnt$i + touch share/mnt$i/not-mounted + sudo mount "${devs[i]}" share/mnt$i + sudo chown "$(id -u):$(id -g)" share/mnt$i + + pushd share/mnt$i >/dev/null + path=mnt$i + nesting=$((RANDOM % 4)) + for ((j = 0; j < $nesting; j++)); do + cat > some-file <<EOF +$i +$path +EOF + mkdir sub + cd sub + path="$path/sub" + done +cat > some-file <<EOF +$i +$path +EOF + popd >/dev/null +done + +for ((; i < $mount_count; i++)); do + printf "Mounting fs %i/%i...\r" "$((i + 1))" "$mount_count" + + mp_i=$((i % top_level_mounts)) + + pushd share/mnt$mp_i >/dev/null + path=mnt$mp_i + while true; do + sub_mp="$(echo mnt*)" + if cd sub 2>/dev/null; then + path="$path/sub" + elif [ -n "$sub_mp" ] && cd "$sub_mp" 2>/dev/null; then + path="$path/$sub_mp" + else + break + fi + done + mkdir mnt$i + touch mnt$i/not-mounted + sudo mount "${devs[i]}" mnt$i + sudo chown "$(id -u):$(id -g)" mnt$i + + cd mnt$i + path="$path/mnt$i" + nesting=$((RANDOM % 4)) + for ((j = 0; j < $nesting; j++)); do + cat > some-file <<EOF +$i +$path +EOF + mkdir sub + cd sub + path="$path/sub" + done + cat > some-file <<EOF +$i +$path +EOF + popd >/dev/null +done +echo + +echo 'Done.' 
diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h index 686c42c0a5..5aee5193eb 100644 --- a/tools/virtiofsd/fuse_common.h +++ b/tools/virtiofsd/fuse_common.h @@ -353,6 +353,13 @@ struct fuse_file_info { #define FUSE_CAP_NO_OPENDIR_SUPPORT (1 << 24) /** + * Indicates that the kernel supports the FUSE_ATTR_SUBMOUNT flag. + * + * Setting (or unsetting) this flag in the `want` field has *no effect*. + */ +#define FUSE_CAP_SUBMOUNTS (1 << 27) + +/** * Ioctl flags * * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c index 4d1ba2925d..c70fb16a9a 100644 --- a/tools/virtiofsd/fuse_lowlevel.c +++ b/tools/virtiofsd/fuse_lowlevel.c @@ -341,6 +341,8 @@ static void fill_entry(struct fuse_entry_out *arg, .attr_valid_nsec = calc_timeout_nsec(e->attr_timeout), }; convert_stat(&e->attr, &arg->attr); + + arg->attr.flags = e->attr_flags; } /* @@ -1988,6 +1990,9 @@ static void do_init(fuse_req_t req, fuse_ino_t nodeid, bufsize = max_bufsize; } } + if (arg->flags & FUSE_SUBMOUNTS) { + se->conn.capable |= FUSE_CAP_SUBMOUNTS; + } #ifdef HAVE_SPLICE #ifdef HAVE_VMSPLICE se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h index 562fd5241e..9c06240f9e 100644 --- a/tools/virtiofsd/fuse_lowlevel.h +++ b/tools/virtiofsd/fuse_lowlevel.h @@ -102,6 +102,11 @@ struct fuse_entry_param { * large value. */ double entry_timeout; + + /** + * Flags for fuse_attr.flags that do not fit into attr. 
+ */ + uint32_t attr_flags; }; /** diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c index 324936948d..83ba07c6cd 100644 --- a/tools/virtiofsd/fuse_virtio.c +++ b/tools/virtiofsd/fuse_virtio.c @@ -1013,8 +1013,11 @@ int virtio_session_mount(struct fuse_session *se) se->vu_socketfd = data_sock; se->virtio_dev->se = se; pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL); - vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, NULL, - fv_set_watch, fv_remove_watch, &fv_iface); + if (!vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, NULL, + fv_set_watch, fv_remove_watch, &fv_iface)) { + fuse_log(FUSE_LOG_ERR, "%s: vu_init failed\n", __func__); + return -1; + } return 0; } diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c index 2e181a49b5..75ac48dec2 100644 --- a/tools/virtiofsd/helper.c +++ b/tools/virtiofsd/helper.c @@ -161,7 +161,7 @@ void fuse_cmdline_help(void) " allowed (default: 10)\n" " -o posix_lock|no_posix_lock\n" " enable/disable remote posix lock\n" - " default: posix_lock\n" + " default: no_posix_lock\n" " -o readdirplus|no_readdirplus\n" " enable/disable readirplus\n" " default: readdirplus except with " @@ -190,6 +190,7 @@ void fuse_cmdline_help(void) " retain/discard O_DIRECT flags passed down\n" " to virtiofsd from guest applications.\n" " default: no_allow_direct_io\n" + " -o announce_submounts Announce sub-mount points to the guest\n" ); } diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c index a0beb986f3..ec1008bceb 100644 --- a/tools/virtiofsd/passthrough_ll.c +++ b/tools/virtiofsd/passthrough_ll.c @@ -40,6 +40,7 @@ #include "fuse_virtio.h" #include "fuse_log.h" #include "fuse_lowlevel.h" +#include "standard-headers/linux/fuse.h" #include <assert.h> #include <cap-ng.h> #include <dirent.h> @@ -94,6 +95,7 @@ struct lo_map { struct lo_key { ino_t ino; dev_t dev; + uint64_t mnt_id; }; struct lo_inode { @@ -166,6 +168,8 @@ struct lo_data { int 
readdirplus_set; int readdirplus_clear; int allow_direct_io; + int announce_submounts; + bool use_statx; struct lo_inode root; GHashTable *inodes; /* protected by lo->mutex */ struct lo_map ino_map; /* protected by lo->mutex */ @@ -205,6 +209,7 @@ static const struct fuse_opt lo_opts[] = { { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 }, { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 }, { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 }, + { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 }, FUSE_OPT_END }; static bool use_syslog = false; @@ -219,7 +224,8 @@ static struct { /* That we loaded cap-ng in the current thread from the saved */ static __thread bool cap_loaded = 0; -static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st); +static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st, + uint64_t mnt_id); static int is_dot_or_dotdot(const char *name) { @@ -598,6 +604,20 @@ static void lo_init(void *userdata, struct fuse_conn_info *conn) fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n"); conn->want &= ~FUSE_CAP_READDIRPLUS; } + + if (!(conn->capable & FUSE_CAP_SUBMOUNTS) && lo->announce_submounts) { + fuse_log(FUSE_LOG_WARNING, "lo_init: Cannot announce submounts, client " + "does not support it\n"); + lo->announce_submounts = false; + } + +#ifndef CONFIG_STATX + if (lo->announce_submounts) { + fuse_log(FUSE_LOG_WARNING, "lo_init: Cannot announce submounts, there " + "is no statx()\n"); + lo->announce_submounts = false; + } +#endif } static void lo_getattr(fuse_req_t req, fuse_ino_t ino, @@ -741,12 +761,14 @@ out_err: fuse_reply_err(req, saverr); } -static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st) +static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st, + uint64_t mnt_id) { struct lo_inode *p; struct lo_key key = { .ino = st->st_ino, .dev = st->st_dev, + .mnt_id = mnt_id, }; pthread_mutex_lock(&lo->mutex); @@ -774,6 
+796,60 @@ static void posix_locks_value_destroy(gpointer data) free(plock); } +static int do_statx(struct lo_data *lo, int dirfd, const char *pathname, + struct stat *statbuf, int flags, uint64_t *mnt_id) +{ + int res; + +#if defined(CONFIG_STATX) && defined(STATX_MNT_ID) + if (lo->use_statx) { + struct statx statxbuf; + + res = statx(dirfd, pathname, flags, STATX_BASIC_STATS | STATX_MNT_ID, + &statxbuf); + if (!res) { + memset(statbuf, 0, sizeof(*statbuf)); + statbuf->st_dev = makedev(statxbuf.stx_dev_major, + statxbuf.stx_dev_minor); + statbuf->st_ino = statxbuf.stx_ino; + statbuf->st_mode = statxbuf.stx_mode; + statbuf->st_nlink = statxbuf.stx_nlink; + statbuf->st_uid = statxbuf.stx_uid; + statbuf->st_gid = statxbuf.stx_gid; + statbuf->st_rdev = makedev(statxbuf.stx_rdev_major, + statxbuf.stx_rdev_minor); + statbuf->st_size = statxbuf.stx_size; + statbuf->st_blksize = statxbuf.stx_blksize; + statbuf->st_blocks = statxbuf.stx_blocks; + statbuf->st_atim.tv_sec = statxbuf.stx_atime.tv_sec; + statbuf->st_atim.tv_nsec = statxbuf.stx_atime.tv_nsec; + statbuf->st_mtim.tv_sec = statxbuf.stx_mtime.tv_sec; + statbuf->st_mtim.tv_nsec = statxbuf.stx_mtime.tv_nsec; + statbuf->st_ctim.tv_sec = statxbuf.stx_ctime.tv_sec; + statbuf->st_ctim.tv_nsec = statxbuf.stx_ctime.tv_nsec; + + if (statxbuf.stx_mask & STATX_MNT_ID) { + *mnt_id = statxbuf.stx_mnt_id; + } else { + *mnt_id = 0; + } + return 0; + } else if (errno != ENOSYS) { + return -1; + } + lo->use_statx = false; + /* fallback */ + } +#endif + res = fstatat(dirfd, pathname, statbuf, flags); + if (res == -1) { + return -1; + } + *mnt_id = 0; + + return 0; +} + /* * Increments nlookup and caller must release refcount using * lo_inode_put(&parent). 
@@ -784,6 +860,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, int newfd; int res; int saverr; + uint64_t mnt_id; struct lo_data *lo = lo_data(req); struct lo_inode *inode = NULL; struct lo_inode *dir = lo_inode(req, parent); @@ -811,12 +888,18 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, goto out_err; } - res = fstatat(newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); + res = do_statx(lo, newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW, + &mnt_id); if (res == -1) { goto out_err; } - inode = lo_find(lo, &e->attr); + if (S_ISDIR(e->attr.st_mode) && lo->announce_submounts && + (e->attr.st_dev != dir->key.dev || mnt_id != dir->key.mnt_id)) { + e->attr_flags |= FUSE_ATTR_SUBMOUNT; + } + + inode = lo_find(lo, &e->attr, mnt_id); if (inode) { close(newfd); } else { @@ -838,6 +921,7 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, inode->fd = newfd; inode->key.ino = e->attr.st_ino; inode->key.dev = e->attr.st_dev; + inode->key.mnt_id = mnt_id; pthread_mutex_init(&inode->plock_mutex, NULL); inode->posix_locks = g_hash_table_new_full( g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy); @@ -1090,15 +1174,23 @@ static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent, const char *name) { int res; + uint64_t mnt_id; struct stat attr; + struct lo_data *lo = lo_data(req); + struct lo_inode *dir = lo_inode(req, parent); - res = fstatat(lo_fd(req, parent), name, &attr, - AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); + if (!dir) { + return NULL; + } + + res = do_statx(lo, dir->fd, name, &attr, + AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW, &mnt_id); + lo_inode_put(lo, &dir); if (res == -1) { return NULL; } - return lo_find(lo_data(req), &attr); + return lo_find(lo, &attr, mnt_id); } static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) @@ -3266,6 +3358,7 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root) { int fd, res; 
struct stat stat; + uint64_t mnt_id; fd = open("/", O_PATH); if (fd == -1) { @@ -3273,7 +3366,8 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root) exit(1); } - res = fstatat(fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); + res = do_statx(lo, fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW, + &mnt_id); if (res == -1) { fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source); exit(1); @@ -3283,6 +3377,7 @@ static void setup_root(struct lo_data *lo, struct lo_inode *root) root->fd = fd; root->key.ino = stat.st_ino; root->key.dev = stat.st_dev; + root->key.mnt_id = mnt_id; root->nlookup = 2; g_atomic_int_set(&root->refcount, 2); } @@ -3291,7 +3386,7 @@ static guint lo_key_hash(gconstpointer key) { const struct lo_key *lkey = key; - return (guint)lkey->ino + (guint)lkey->dev; + return (guint)lkey->ino + (guint)lkey->dev + (guint)lkey->mnt_id; } static gboolean lo_key_equal(gconstpointer a, gconstpointer b) @@ -3299,7 +3394,7 @@ static gboolean lo_key_equal(gconstpointer a, gconstpointer b) const struct lo_key *la = a; const struct lo_key *lb = b; - return la->ino == lb->ino && la->dev == lb->dev; + return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id; } static void fuse_lo_data_cleanup(struct lo_data *lo) @@ -3445,6 +3540,8 @@ int main(int argc, char *argv[]) exit(1); } + lo.use_statx = true; + se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); if (se == NULL) { goto err_out1; diff --git a/tools/virtiofsd/passthrough_seccomp.c b/tools/virtiofsd/passthrough_seccomp.c index eb9af8265f..11623f56f2 100644 --- a/tools/virtiofsd/passthrough_seccomp.c +++ b/tools/virtiofsd/passthrough_seccomp.c @@ -76,6 +76,7 @@ static const int syscall_whitelist[] = { SCMP_SYS(mremap), SCMP_SYS(munmap), SCMP_SYS(newfstatat), + SCMP_SYS(statx), SCMP_SYS(open), SCMP_SYS(openat), SCMP_SYS(ppoll), @@ -118,6 +119,7 @@ static const int syscall_whitelist[] = { /* Syscalls used when --syslog is enabled */ static const int 
syscall_whitelist_syslog[] = { + SCMP_SYS(send), SCMP_SYS(sendto), }; |