#!/usr/bin/env python3
#
#  Copyright (c) 2017 Red Hat Inc
#
# Author:
#  Eduardo Habkost <ehabkost@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

"""
Run QEMU with all combinations of -machine and -device types,
check for crashes and unexpected errors.
"""

import os
import sys
import glob
import logging
import traceback
import re
import random
import argparse
from itertools import chain
from pathlib import Path

try:
    from qemu.machine import QEMUMachine
    from qemu.qmp import ConnectError
except ModuleNotFoundError as exc:
    path = Path(__file__).resolve()
    print(f"Module '{exc.name}' not found.")
    print("  Try 'make check-venv' from your build directory,")
    print("  and then one way to run this script is like so:")
    print(f'  > $builddir/tests/venv/bin/python3 "{path}"')
    sys.exit(1)

logger = logging.getLogger('device-crash-test')
dbg = logger.debug


# Purposes of the following rule list:
# * Avoiding verbose log messages when we find known non-fatal
#   (exitcode=1) errors
# * Avoiding fatal errors when we find known crashes
# * Skipping machines/devices that are known not to work out of
#   the box, when running in --quick mode
#
# Keeping the rule list updated is desirable, but not required,
# because unexpected cases where QEMU exits with exitcode=1 will
# just trigger a INFO message.

# Valid error rule keys:
# * accel: regexp, full match only
# * machine: regexp, full match only
# * device: regexp, full match only
# * log: regexp, partial match allowed
# * exitcode: if not present, defaults to 1. If None, matches any exitcode
# * warn: if True, matching failures will be logged as warnings
# * expected: if True, QEMU is expected to always fail every time
#   when testing the corresponding test case
# * loglevel: log level of log output when there's a match.
ERROR_RULE_LIST = [
    # Machines that won't work out of the box:
    #             MACHINE                         | ERROR MESSAGE
    {'machine':'niagara', 'expected':True},       # Unable to load a firmware for -M niagara
    {'machine':'boston', 'expected':True},        # Please provide either a -kernel or -bios argument
    {'machine':'leon3_generic', 'expected':True}, # Can't read bios image (null)

    # devices that don't work out of the box because they require extra options to "-device DEV":
    #            DEVICE                                    | ERROR MESSAGE
    {'device':'.*-(i386|x86_64)-cpu', 'expected':True},    # CPU socket-id is not set
    {'device':'icp', 'expected':True},                     # icp_realize: required link 'xics' not found: Property '.xics' not found
    {'device':'ics', 'expected':True},                     # ics_base_realize: required link 'xics' not found: Property '.xics' not found
    # "-device ide-cd" does work on more recent QEMU versions, so it doesn't have expected=True
    {'device':'ide-cd'},                                 # No drive specified
    {'device':'ide-hd', 'expected':True},                  # No drive specified
    {'device':'ipmi-bmc-extern', 'expected':True},         # IPMI external bmc requires chardev attribute
    {'device':'isa-debugcon', 'expected':True},            # Can't create serial device, empty char device
    {'device':'isa-ipmi-bt', 'expected':True},             # IPMI device requires a bmc attribute to be set
    {'device':'isa-ipmi-kcs', 'expected':True},            # IPMI device requires a bmc attribute to be set
    {'device':'isa-parallel', 'expected':True},            # Can't create serial device, empty char device
    {'device':'ivshmem-doorbell', 'expected':True},        # You must specify a 'chardev'
    {'device':'ivshmem-plain', 'expected':True},           # You must specify a 'memdev'
    {'device':'loader', 'expected':True},                  # please include valid arguments
    {'device':'nand', 'expected':True},                    # Unsupported NAND block size 0x1
    {'device':'nvdimm', 'expected':True},                  # 'memdev' property is not set
    {'device':'nvme', 'expected':True},                    # Device initialization failed
    {'device':'pc-dimm', 'expected':True},                 # 'memdev' property is not set
    {'device':'pci-bridge', 'expected':True},              # Bridge chassis not specified. Each bridge is required to be assigned a unique chassis id > 0.
    {'device':'pci-bridge-seat', 'expected':True},         # Bridge chassis not specified. Each bridge is required to be assigned a unique chassis id > 0.
    {'device':'pxb', 'expected':True},                     # Bridge chassis not specified. Each bridge is required to be assigned a unique chassis id > 0.
    {'device':'pxb-cxl', 'expected':True},                 # pxb-cxl devices cannot reside on a PCI bus.
    {'device':'scsi-block', 'expected':True},              # drive property not set
    {'device':'scsi-generic', 'expected':True},            # drive property not set
    {'device':'scsi-hd', 'expected':True},                 # drive property not set
    {'device':'spapr-pci-host-bridge', 'expected':True},   # BUID not specified for PHB
    {'device':'spapr-rng', 'expected':True},               # spapr-rng needs an RNG backend!
    {'device':'spapr-vty', 'expected':True},               # chardev property not set
    {'device':'tpm-tis', 'expected':True},                 # tpm_tis: backend driver with id (null) could not be found
    {'device':'unimplemented-device', 'expected':True},    # property 'size' not specified or zero
    {'device':'usb-braille', 'expected':True},             # Property chardev is required
    {'device':'usb-mtp', 'expected':True},                 # rootdir property must be configured
    {'device':'usb-redir', 'expected':True},               # Parameter 'chardev' is missing
    {'device':'usb-serial', 'expected':True},              # Property chardev is required
    {'device':'usb-storage', 'expected':True},             # drive property not set
    {'device':'vfio-amd-xgbe', 'expected':True},           # -device vfio-amd-xgbe: vfio error: wrong host device name
    {'device':'vfio-calxeda-xgmac', 'expected':True},      # -device vfio-calxeda-xgmac: vfio error: wrong host device name
    {'device':'vfio-pci', 'expected':True},                # No provided host device
    {'device':'vfio-pci-igd-lpc-bridge', 'expected':True}, # VFIO dummy ISA/LPC bridge must have address 1f.0
    {'device':'vhost-scsi.*', 'expected':True},            # vhost-scsi: missing wwpn
    {'device':'vhost-vsock-device', 'expected':True},      # guest-cid property must be greater than 2
    {'device':'vhost-vsock-pci', 'expected':True},         # guest-cid property must be greater than 2
    {'device':'virtio-9p-ccw', 'expected':True},           # 9pfs device couldn't find fsdev with the id = NULL
    {'device':'virtio-9p-device', 'expected':True},        # 9pfs device couldn't find fsdev with the id = NULL
    {'device':'virtio-9p-pci', 'expected':True},           # 9pfs device couldn't find fsdev with the id = NULL
    {'device':'virtio-blk-ccw', 'expected':True},          # drive property not set
    {'device':'virtio-blk-device', 'expected':True},       # drive property not set
    {'device':'virtio-blk-device', 'expected':True},       # drive property not set
    {'device':'virtio-blk-pci', 'expected':True},          # drive property not set
    {'device':'virtio-crypto-ccw', 'expected':True},       # 'cryptodev' parameter expects a valid object
    {'device':'virtio-crypto-device', 'expected':True},    # 'cryptodev' parameter expects a valid object
    {'device':'virtio-crypto-pci', 'expected':True},       # 'cryptodev' parameter expects a valid object
    {'device':'virtio-input-host-device', 'expected':True}, # evdev property is required
    {'device':'virtio-input-host-pci', 'expected':True},   # evdev property is required
    {'device':'xen-pvdevice', 'expected':True},            # Device ID invalid, it must always be supplied
    {'device':'vhost-vsock-ccw', 'expected':True},         # guest-cid property must be greater than 2
    {'device':'zpci', 'expected':True},                    # target must be defined
    {'device':'pnv-(occ|icp|lpc)', 'expected':True},       # required link 'xics' not found: Property '.xics' not found
    {'device':'powernv-cpu-.*', 'expected':True},          # pnv_core_realize: required link 'xics' not found: Property '.xics' not found

    # ioapic devices are already created by pc and will fail:
    {'machine':'q35|pc.*', 'device':'kvm-ioapic', 'expected':True}, # Only 1 ioapics allowed
    {'machine':'q35|pc.*', 'device':'ioapic', 'expected':True},     # Only 1 ioapics allowed

    # "spapr-cpu-core needs a pseries machine"
    {'machine':'(?!pseries).*', 'device':'.*-spapr-cpu-core', 'expected':True},

    # KVM-specific devices shouldn't be tried without accel=kvm:
    {'accel':'(?!kvm).*', 'device':'kvmclock', 'expected':True},

    # xen-specific machines and devices:
    {'accel':'(?!xen).*', 'machine':'xen.*', 'expected':True},
    {'accel':'(?!xen).*', 'device':'xen-.*', 'expected':True},

    # this fails on some machine-types, but not all, so they don't have expected=True:
    {'device':'vmgenid'}, # vmgenid requires DMA write support in fw_cfg, which this machine type does not provide

    # Silence INFO messages for errors that are common on multiple
    # devices/machines:
    {'log':r"No '[\w-]+' bus found for device '[\w-]+'"},
    {'log':r"images* must be given with the 'pflash' parameter"},
    {'log':r"(Guest|ROM|Flash|Kernel) image must be specified"},
    {'log':r"[cC]ould not load [\w ]+ (BIOS|bios) '[\w-]+\.bin'"},
    {'log':r"Couldn't find rom image '[\w-]+\.bin'"},
    {'log':r"speed mismatch trying to attach usb device"},
    {'log':r"Can't create a second ISA bus"},
    {'log':r"duplicate fw_cfg file name"},
    # sysbus-related error messages: most machines reject most dynamic sysbus devices:
    {'log':r"Option '-device [\w.,-]+' cannot be handled by this machine"},
    {'log':r"Device [\w.,-]+ is not supported by this machine yet"},
    {'log':r"Device [\w.,-]+ can not be dynamically instantiated"},
    {'log':r"Platform Bus: Can not fit MMIO region of size "},
    # other more specific errors we will ignore:
    {'device':'.*-spapr-cpu-core', 'log':r"CPU core type should be"},
    {'log':r"MSI(-X)? is not supported by interrupt controller"},
    {'log':r"pxb-pcie? devices cannot reside on a PCIe? bus"},
    {'log':r"Ignoring smp_cpus value"},
    {'log':r"sd_init failed: Drive 'sd0' is already in use because it has been automatically connected to another device"},
    {'log':r"This CPU requires a smaller page size than the system is using"},
    {'log':r"MSI-X support is mandatory in the S390 architecture"},
    {'log':r"rom check and register reset failed"},
    {'log':r"Unable to initialize GIC, CPUState for CPU#0 not valid"},
    {'log':r"Multiple VT220 operator consoles are not supported"},
    {'log':r"core 0 already populated"},
    {'log':r"could not find stage1 bootloader"},

    # other exitcode=1 failures not listed above will just generate INFO messages:
    {'exitcode':1, 'loglevel':logging.INFO},

    # everything else (including SIGABRT and SIGSEGV) will be a fatal error:
    {'exitcode':None, 'fatal':True, 'loglevel':logging.FATAL},
]


def errorRuleTestCaseMatch(rule, t):
    """Check if a test case specification can match a error rule

    This only checks if a error rule is a candidate match
    for a given test case, it won't check if the test case
    results/output match the rule.  See ruleListResultMatch().
    """
    return (('machine' not in rule or
             'machine' not in t or
             re.match(rule['machine'] + '$', t['machine'])) and
            ('accel' not in rule or
             'accel' not in t or
             re.match(rule['accel'] + '$', t['accel'])) and
            ('device' not in rule or
             'device' not in t or
             re.match(rule['device'] + '$', t['device'])))


def ruleListCandidates(t):
    """Generate the list of candidates that can match a test case"""
    for i, rule in enumerate(ERROR_RULE_LIST):
        if errorRuleTestCaseMatch(rule, t):
            yield (i, rule)


def findExpectedResult(t):
    """Check if there's an expected=True error rule for a test case

    Returns (i, rule) tuple, where i is the index in
    ERROR_RULE_LIST and rule is the error rule itself.
    """
    for i, rule in ruleListCandidates(t):
        if rule.get('expected'):
            return (i, rule)


def ruleListResultMatch(rule, r):
    """Check if test case results/output match a error rule

    It is valid to call this function only if
    errorRuleTestCaseMatch() is True for the rule (e.g. on
    rules returned by ruleListCandidates())
    """
    assert errorRuleTestCaseMatch(rule, r['testcase'])
    return ((rule.get('exitcode', 1) is None or
             r['exitcode'] == rule.get('exitcode', 1)) and
            ('log' not in rule or
             re.search(rule['log'], r['log'], re.MULTILINE)))


def checkResultRuleList(r):
    """Look up error rule for a given test case result

    Returns (i, rule) tuple, where i is the index in
    ERROR_RULE_LIST and rule is the error rule itself.
    """
    for i, rule in ruleListCandidates(r['testcase']):
        if ruleListResultMatch(rule, r):
            return i, rule

    raise Exception("this should never happen")


def qemuOptsEscape(s):
    """Escape option value QemuOpts"""
    return s.replace(",", ",,")


def formatTestCase(t):
    """Format test case info as "key=value key=value" for prettier logging output"""
    return ' '.join('%s=%s' % (k, v) for k, v in t.items())


def qomListTypeNames(vm, **kwargs):
    """Run qom-list-types QMP command, return type names"""
    types = vm.command('qom-list-types', **kwargs)
    return [t['name'] for t in types]


def infoQDM(vm):
    """Parse 'info qdm' output"""
    args = {'command-line': 'info qdm'}
    devhelp = vm.command('human-monitor-command', **args)
    for l in devhelp.split('\n'):
        l = l.strip()
        if l == '' or l.endswith(':'):
            continue
        d = {'name': re.search(r'name "([^"]+)"', l).group(1),
             'no-user': (re.search(', no-user', l) is not None)}
        yield d


class QemuBinaryInfo(object):
    def __init__(self, binary, devtype):
        if devtype is None:
            devtype = 'device'

        self.binary = binary
        self._machine_info = {}

        dbg("devtype: %r", devtype)
        args = ['-S', '-machine', 'none,accel=kvm:tcg']
        dbg("querying info for QEMU binary: %s", binary)
        vm = QEMUMachine(binary=binary, args=args)
        vm.launch()
        try:
            self.alldevs = set(qomListTypeNames(vm, implements=devtype, abstract=False))
            # there's no way to query DeviceClass::user_creatable using QMP,
            # so use 'info qdm':
            self.no_user_devs = set([d['name'] for d in infoQDM(vm, ) if d['no-user']])
            self.machines = list(m['name'] for m in vm.command('query-machines'))
            self.user_devs = self.alldevs.difference(self.no_user_devs)
            self.kvm_available = vm.command('query-kvm')['enabled']
        finally:
            vm.shutdown()

    def machineInfo(self, machine):
        """Query for information on a specific machine-type

        Results are cached internally, in case the same machine-
        type is queried multiple times.
        """
        if machine in self._machine_info:
            return self._machine_info[machine]

        mi = {}
        args = ['-S', '-machine', '%s' % (machine)]
        dbg("querying machine info for binary=%s machine=%s", self.binary, machine)
        vm = QEMUMachine(binary=self.binary, args=args)
        try:
            vm.launch()
            mi['runnable'] = True
        except Exception:
            dbg("exception trying to run binary=%s machine=%s", self.binary, machine, exc_info=sys.exc_info())
            dbg("log: %r", vm.get_log())
            mi['runnable'] = False

        vm.shutdown()
        self._machine_info[machine] = mi
        return mi


BINARY_INFO = {}


def getBinaryInfo(args, binary):
    if binary not in BINARY_INFO:
        BINARY_INFO[binary] = QemuBinaryInfo(binary, args.devtype)
    return BINARY_INFO[binary]


def checkOneCase(args, testcase):
    """Check one specific case

    Returns a dictionary containing failure information on error,
    or None on success
    """
    binary = testcase['binary']
    accel = testcase['accel']
    machine = testcase['machine']
    device = testcase['device']

    dbg("will test: %r", testcase)

    args = ['-S', '-machine', '%s,accel=%s' % (machine, accel),
            '-device', qemuOptsEscape(device)]
    cmdline = ' '.join([binary] + args)
    dbg("will launch QEMU: %s", cmdline)
    vm = QEMUMachine(binary=binary, args=args, qmp_timer=15)

    exc = None
    exc_traceback = None
    try:
        vm.launch()
    except Exception as this_exc:
        exc = this_exc
        exc_traceback = traceback.format_exc()
        dbg("Exception while running test case")
    finally:
        vm.shutdown()
        ec = vm.exitcode()
        log = vm.get_log()

    if exc is not None or ec != 0:
        return {'exc': exc,
                'exc_traceback':exc_traceback,
                'exitcode':ec,
                'log':log,
                'testcase':testcase,
                'cmdline':cmdline}


def binariesToTest(args, testcase):
    if args.qemu:
        r = args.qemu
    else:
        r = [f.path for f in os.scandir('.')
             if f.name.startswith('qemu-system-') and
                f.is_file() and os.access(f, os.X_OK)]
    return r


def accelsToTest(args, testcase):
    if getBinaryInfo(args, testcase['binary']).kvm_available:
        yield 'kvm'
    yield 'tcg'


def machinesToTest(args, testcase):
    return getBinaryInfo(args, testcase['binary']).machines


def devicesToTest(args, testcase):
    return getBinaryInfo(args, testcase['binary']).user_devs


TESTCASE_VARIABLES = [
    ('binary', binariesToTest),
    ('accel', accelsToTest),
    ('machine', machinesToTest),
    ('device', devicesToTest),
]


def genCases1(args, testcases, var, fn):
    """Generate new testcases for one variable

    If an existing item already has a variable set, don't
    generate new items and just return it directly. This
    allows the "-t" command-line option to be used to choose
    a specific test case.
    """
    for testcase in testcases:
        if var in testcase:
            yield testcase.copy()
        else:
            for i in fn(args, testcase):
                t = testcase.copy()
                t[var] = i
                yield t


def genCases(args, testcase):
    """Generate test cases for all variables
    """
    cases = [testcase.copy()]
    for var, fn in TESTCASE_VARIABLES:
        dbg("var: %r, fn: %r", var, fn)
        cases = genCases1(args, cases, var, fn)
    return cases


def casesToTest(args, testcase):
    cases = genCases(args, testcase)
    if args.random:
        cases = list(cases)
        cases = random.sample(cases, min(args.random, len(cases)))
    if args.debug:
        cases = list(cases)
        dbg("%d test cases to test", len(cases))
    if args.shuffle:
        cases = list(cases)
        random.shuffle(cases)
    return cases


def logFailure(f, level):
    t = f['testcase']
    logger.log(level, "failed: %s", formatTestCase(t))
    logger.log(level, "cmdline: %s", f['cmdline'])
    for l in f['log'].strip().split('\n'):
        logger.log(level, "log: %s", l)
    logger.log(level, "exit code: %r", f['exitcode'])

    # If the Exception is merely a QMP connect error,
    # reduce the logging level for its traceback to
    # improve visual clarity.
    if isinstance(f.get('exc'), ConnectError):
        logger.log(level, "%s.%s: %s",
                   type(f['exc']).__module__,
                   type(f['exc']).__qualname__,
                   str(f['exc']))
        level = logging.DEBUG

    if f['exc_traceback']:
        logger.log(level, "exception:")
        for l in f['exc_traceback'].split('\n'):
            logger.log(level, "  %s", l.rstrip('\n'))


def main():
    parser = argparse.ArgumentParser(description="QEMU -device crash test")
    parser.add_argument('-t', metavar='KEY=VALUE', nargs='*',
                        help="Limit test cases to KEY=VALUE",
                        action='append', dest='testcases', default=[])
    parser.add_argument('-d', '--debug', action='store_true',
                        help='debug output')
    parser.add_argument('-v', '--verbose', action='store_true', default=True,
                        help='verbose output')
    parser.add_argument('-q', '--quiet', dest='verbose', action='store_false',
                        help='non-verbose output')
    parser.add_argument('-r', '--random', type=int, metavar='COUNT',
                        help='run a random sample of COUNT test cases',
                        default=0)
    parser.add_argument('--shuffle', action='store_true',
                        help='Run test cases in random order')
    parser.add_argument('--dry-run', action='store_true',
                        help="Don't run any tests, just generate list")
    parser.add_argument('-D', '--devtype', metavar='TYPE',
                        help="Test only device types that implement TYPE")
    parser.add_argument('-Q', '--quick', action='store_true', default=True,
                        help="Quick mode: skip test cases that are expected to fail")
    parser.add_argument('-F', '--full', action='store_false', dest='quick',
                        help="Full mode: test cases that are expected to fail")
    parser.add_argument('--strict', action='store_true', dest='strict',
                        help="Treat all warnings as fatal")
    parser.add_argument('qemu', nargs='*', metavar='QEMU',
                        help='QEMU binary to run')
    args = parser.parse_args()

    if args.debug:
        lvl = logging.DEBUG
    elif args.verbose:
        lvl = logging.INFO
    else:
        lvl = logging.WARN
    logging.basicConfig(stream=sys.stdout, level=lvl, format='%(levelname)s: %(message)s')

    if not args.debug:
        # Async QMP, when in use, is chatty about connection failures.
        # This script knowingly generates a ton of connection errors.
        # Silence this logger.
        logging.getLogger('qemu.qmp.qmp_client').setLevel(logging.CRITICAL)

    fatal_failures = []
    wl_stats = {}
    skipped = 0
    total = 0

    tc = {}
    dbg("testcases: %r", args.testcases)
    if args.testcases:
        for t in chain(*args.testcases):
            for kv in t.split():
                k, v = kv.split('=', 1)
                tc[k] = v

    if len(binariesToTest(args, tc)) == 0:
        print("No QEMU binary found", file=sys.stderr)
        parser.print_usage(sys.stderr)
        return 1

    for t in casesToTest(args, tc):
        logger.info("running test case: %s", formatTestCase(t))
        total += 1

        expected_match = findExpectedResult(t)
        if (args.quick and
                (expected_match or
                 not getBinaryInfo(args, t['binary']).machineInfo(t['machine'])['runnable'])):
            dbg("skipped: %s", formatTestCase(t))
            skipped += 1
            continue

        if args.dry_run:
            continue

        try:
            f = checkOneCase(args, t)
        except KeyboardInterrupt:
            break

        if f:
            i, rule = checkResultRuleList(f)
            dbg("testcase: %r, rule list match: %r", t, rule)
            wl_stats.setdefault(i, []).append(f)
            level = rule.get('loglevel', logging.DEBUG)
            logFailure(f, level)
            if rule.get('fatal') or (args.strict and level >= logging.WARN):
                fatal_failures.append(f)
        else:
            dbg("success: %s", formatTestCase(t))
            if expected_match:
                logger.warn("Didn't fail as expected: %s", formatTestCase(t))

    logger.info("Total: %d test cases", total)
    if skipped:
        logger.info("Skipped %d test cases", skipped)

    if args.debug:
        stats = sorted([(len(wl_stats.get(i, [])), rule) for i, rule in
                         enumerate(ERROR_RULE_LIST)], key=lambda x: x[0])
        for count, rule in stats:
            dbg("error rule stats: %d: %r", count, rule)

    if fatal_failures:
        for f in fatal_failures:
            t = f['testcase']
            logger.error("Fatal failure: %s", formatTestCase(t))
        logger.error("Fatal failures on some machine/device combinations")
        return 1

if __name__ == '__main__':
    sys.exit(main())