/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2012-2014 Cisco Systems
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include <linux/ip.h>
#include <netdb.h>
#include "net/net.h"
#include "clients.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/option.h"
#include "qemu/sockets.h"
#include "qemu/iov.h"
#include "qemu/main-loop.h"


/* The buffer size needs to be investigated for optimum numbers and
 * optimum means of paging in on different systems. This size is
 * chosen to be sufficient to accommodate one packet with some headers
 */

#define BUFFER_ALIGN sysconf(_SC_PAGESIZE)
#define BUFFER_SIZE 2048
#define IOVSIZE 2
#define MAX_L2TPV3_MSGCNT 64
#define MAX_L2TPV3_IOVCNT (MAX_L2TPV3_MSGCNT * IOVSIZE)

/* Header set to 0x30000 signifies a data packet */

#define L2TPV3_DATA_PACKET 0x30000

/* IANA-assigned IP protocol ID for L2TPv3 */

#ifndef IPPROTO_L2TP
#define IPPROTO_L2TP 0x73
#endif

typedef struct NetL2TPV3State {
    NetClientState nc;
    int fd;

    /*
     * these are used for xmit - that happens packet a time
     * and for first sign of life packet (easier to parse that once)
     */

    uint8_t *header_buf;
    struct iovec *vec;

    /*
     * these are used for receive - try to "eat" up to 32 packets at a time
     */

    struct mmsghdr *msgvec;

    /*
     * peer address
     */

    struct sockaddr_storage *dgram_dst;
    uint32_t dst_size;

    /*
     * L2TPv3 parameters
     */

    uint64_t rx_cookie;
    uint64_t tx_cookie;
    uint32_t rx_session;
    uint32_t tx_session;
    uint32_t header_size;
    uint32_t counter;

    /*
    * DOS avoidance in error handling
    */

    bool header_mismatch;

    /*
     * Ring buffer handling
     */

    int queue_head;
    int queue_tail;
    int queue_depth;

    /*
     * Precomputed offsets
     */

    uint32_t offset;
    uint32_t cookie_offset;
    uint32_t counter_offset;
    uint32_t session_offset;

    /* Poll Control */

    bool read_poll;
    bool write_poll;

    /* Flags */

    bool ipv6;
    bool udp;
    bool has_counter;
    bool pin_counter;
    bool cookie;
    bool cookie_is_64;

} NetL2TPV3State;

static void net_l2tpv3_send(void *opaque);
static void l2tpv3_writable(void *opaque);

static void l2tpv3_update_fd_handler(NetL2TPV3State *s)
{
    qemu_set_fd_handler(s->fd,
                        s->read_poll ? net_l2tpv3_send : NULL,
                        s->write_poll ? l2tpv3_writable : NULL,
                        s);
}

static void l2tpv3_read_poll(NetL2TPV3State *s, bool enable)
{
    if (s->read_poll != enable) {
        s->read_poll = enable;
        l2tpv3_update_fd_handler(s);
    }
}

static void l2tpv3_write_poll(NetL2TPV3State *s, bool enable)
{
    if (s->write_poll != enable) {
        s->write_poll = enable;
        l2tpv3_update_fd_handler(s);
    }
}

static void l2tpv3_writable(void *opaque)
{
    NetL2TPV3State *s = opaque;
    l2tpv3_write_poll(s, false);
    qemu_flush_queued_packets(&s->nc);
}

static void l2tpv3_send_completed(NetClientState *nc, ssize_t len)
{
    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
    l2tpv3_read_poll(s, true);
}

static void l2tpv3_poll(NetClientState *nc, bool enable)
{
    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
    l2tpv3_write_poll(s, enable);
    l2tpv3_read_poll(s, enable);
}

static void l2tpv3_form_header(NetL2TPV3State *s)
{
    uint32_t *counter;

    if (s->udp) {
        stl_be_p((uint32_t *) s->header_buf, L2TPV3_DATA_PACKET);
    }
    stl_be_p(
            (uint32_t *) (s->header_buf + s->session_offset),
            s->tx_session
        );
    if (s->cookie) {
        if (s->cookie_is_64) {
            stq_be_p(
                (uint64_t *)(s->header_buf + s->cookie_offset),
                s->tx_cookie
            );
        } else {
            stl_be_p(
                (uint32_t *) (s->header_buf + s->cookie_offset),
                s->tx_cookie
            );
        }
    }
    if (s->has_counter) {
        counter = (uint32_t *)(s->header_buf + s->counter_offset);
        if (s->pin_counter) {
            *counter = 0;
        } else {
            stl_be_p(counter, ++s->counter);
        }
    }
}

static ssize_t net_l2tpv3_receive_dgram_iov(NetClientState *nc,
                    const struct iovec *iov,
                    int iovcnt)
{
    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);

    struct msghdr message;
    int ret;

    if (iovcnt > MAX_L2TPV3_IOVCNT - 1) {
        error_report(
            "iovec too long %d > %d, change l2tpv3.h",
            iovcnt, MAX_L2TPV3_IOVCNT
        );
        return -1;
    }
    l2tpv3_form_header(s);
    memcpy(s->vec + 1, iov, iovcnt * sizeof(struct iovec));
    s->vec->iov_base = s->header_buf;
    s->vec->iov_len = s->offset;
    message.msg_name = s->dgram_dst;
    message.msg_namelen = s->dst_size;
    message.msg_iov = s->vec;
    message.msg_iovlen = iovcnt + 1;
    message.msg_control = NULL;
    message.msg_controllen = 0;
    message.msg_flags = 0;
    do {
        ret = sendmsg(s->fd, &message, 0);
    } while ((ret == -1) && (errno == EINTR));
    if (ret > 0) {
        ret -= s->offset;
    } else if (ret == 0) {
        /* belt and braces - should not occur on DGRAM
        * we should get an error and never a 0 send
        */
        ret = iov_size(iov, iovcnt);
    } else {
        /* signal upper layer that socket buffer is full */
        ret = -errno;
        if (ret == -EAGAIN || ret == -ENOBUFS) {
            l2tpv3_write_poll(s, true);
            ret = 0;
        }
    }
    return ret;
}

static ssize_t net_l2tpv3_receive_dgram(NetClientState *nc,
                    const uint8_t *buf,
                    size_t size)
{
    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);

    struct iovec *vec;
    struct msghdr message;
    ssize_t ret = 0;

    l2tpv3_form_header(s);
    vec = s->vec;
    vec->iov_base = s->header_buf;
    vec->iov_len = s->offset;
    vec++;
    vec->iov_base = (void *) buf;
    vec->iov_len = size;
    message.msg_name = s->dgram_dst;
    message.msg_namelen = s->dst_size;
    message.msg_iov = s->vec;
    message.msg_iovlen = 2;
    message.msg_control = NULL;
    message.msg_controllen = 0;
    message.msg_flags = 0;
    do {
        ret = sendmsg(s->fd, &message, 0);
    } while ((ret == -1) && (errno == EINTR));
    if (ret > 0) {
        ret -= s->offset;
    } else if (ret == 0) {
        /* belt and braces - should not occur on DGRAM
        * we should get an error and never a 0 send
        */
        ret = size;
    } else {
        ret = -errno;
        if (ret == -EAGAIN || ret == -ENOBUFS) {
            /* signal upper layer that socket buffer is full */
            l2tpv3_write_poll(s, true);
            ret = 0;
        }
    }
    return ret;
}

static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf)
{

    uint32_t *session;
    uint64_t cookie;

    if ((!s->udp) && (!s->ipv6)) {
        buf += sizeof(struct iphdr) /* fix for ipv4 raw */;
    }

    /* we do not do a strict check for "data" packets as per
    * the RFC spec because the pure IP spec does not have
    * that anyway.
    */

    if (s->cookie) {
        if (s->cookie_is_64) {
            cookie = ldq_be_p(buf + s->cookie_offset);
        } else {
            cookie = ldl_be_p(buf + s->cookie_offset) & 0xffffffffULL;
        }
        if (cookie != s->rx_cookie) {
            if (!s->header_mismatch) {
                error_report("unknown cookie id");
            }
            return -1;
        }
    }
    session = (uint32_t *) (buf + s->session_offset);
    if (ldl_be_p(session) != s->rx_session) {
        if (!s->header_mismatch) {
            error_report("session mismatch");
        }
        return -1;
    }
    return 0;
}

static void net_l2tpv3_process_queue(NetL2TPV3State *s)
{
    int size = 0;
    struct iovec *vec;
    bool bad_read;
    int data_size;
    struct mmsghdr *msgvec;

    /* go into ring mode only if there is a "pending" tail */
    if (s->queue_depth > 0) {
        do {
            msgvec = s->msgvec + s->queue_tail;
            if (msgvec->msg_len > 0) {
                data_size = msgvec->msg_len - s->header_size;
                vec = msgvec->msg_hdr.msg_iov;
                if ((data_size > 0) &&
                    (l2tpv3_verify_header(s, vec->iov_base) == 0)) {
                    vec++;
                    /* Use the legacy delivery for now, we will
                     * switch to using our own ring as a queueing mechanism
                     * at a later date
                     */
                    size = qemu_send_packet_async(
                            &s->nc,
                            vec->iov_base,
                            data_size,
                            l2tpv3_send_completed
                        );
                    if (size == 0) {
                        l2tpv3_read_poll(s, false);
                    }
                    bad_read = false;
                } else {
                    bad_read = true;
                    if (!s->header_mismatch) {
                        /* report error only once */
                        error_report("l2tpv3 header verification failed");
                        s->header_mismatch = true;
                    }
                }
            } else {
                bad_read = true;
            }
            s->queue_tail = (s->queue_tail + 1) % MAX_L2TPV3_MSGCNT;
            s->queue_depth--;
        } while (
                (s->queue_depth > 0) &&
                 qemu_can_send_packet(&s->nc) &&
                ((size > 0) || bad_read)
            );
    }
}

static void net_l2tpv3_send(void *opaque)
{
    NetL2TPV3State *s = opaque;
    int target_count, count;
    struct mmsghdr *msgvec;

    /* go into ring mode only if there is a "pending" tail */

    if (s->queue_depth) {

        /* The ring buffer we use has variable intake
         * count of how much we can read varies - adjust accordingly
         */

        target_count = MAX_L2TPV3_MSGCNT - s->queue_depth;

        /* Ensure we do not overrun the ring when we have
         * a lot of enqueued packets
         */

        if (s->queue_head + target_count > MAX_L2TPV3_MSGCNT) {
            target_count = MAX_L2TPV3_MSGCNT - s->queue_head;
        }
    } else {

        /* we do not have any pending packets - we can use
        * the whole message vector linearly instead of using
        * it as a ring
        */

        s->queue_head = 0;
        s->queue_tail = 0;
        target_count = MAX_L2TPV3_MSGCNT;
    }

    msgvec = s->msgvec + s->queue_head;
    if (target_count > 0) {
        do {
            count = recvmmsg(
                s->fd,
                msgvec,
                target_count, MSG_DONTWAIT, NULL);
        } while ((count == -1) && (errno == EINTR));
        if (count < 0) {
            /* Recv error - we still need to flush packets here,
             * (re)set queue head to current position
             */
            count = 0;
        }
        s->queue_head = (s->queue_head + count) % MAX_L2TPV3_MSGCNT;
        s->queue_depth += count;
    }
    net_l2tpv3_process_queue(s);
}

static void destroy_vector(struct mmsghdr *msgvec, int count, int iovcount)
{
    int i, j;
    struct iovec *iov;
    struct mmsghdr *cleanup = msgvec;
    if (cleanup) {
        for (i = 0; i < count; i++) {
            if (cleanup->msg_hdr.msg_iov) {
                iov = cleanup->msg_hdr.msg_iov;
                for (j = 0; j < iovcount; j++) {
                    g_free(iov->iov_base);
                    iov++;
                }
                g_free(cleanup->msg_hdr.msg_iov);
            }
            cleanup++;
        }
        g_free(msgvec);
    }
}

static struct mmsghdr *build_l2tpv3_vector(NetL2TPV3State *s, int count)
{
    int i;
    struct iovec *iov;
    struct mmsghdr *msgvec, *result;

    msgvec = g_new(struct mmsghdr, count);
    result = msgvec;
    for (i = 0; i < count ; i++) {
        msgvec->msg_hdr.msg_name = NULL;
        msgvec->msg_hdr.msg_namelen = 0;
        iov =  g_new(struct iovec, IOVSIZE);
        msgvec->msg_hdr.msg_iov = iov;
        iov->iov_base = g_malloc(s->header_size);
        iov->iov_len = s->header_size;
        iov++ ;
        iov->iov_base = qemu_memalign(BUFFER_ALIGN, BUFFER_SIZE);
        iov->iov_len = BUFFER_SIZE;
        msgvec->msg_hdr.msg_iovlen = 2;
        msgvec->msg_hdr.msg_control = NULL;
        msgvec->msg_hdr.msg_controllen = 0;
        msgvec->msg_hdr.msg_flags = 0;
        msgvec++;
    }
    return result;
}

static void net_l2tpv3_cleanup(NetClientState *nc)
{
    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
    qemu_purge_queued_packets(nc);
    l2tpv3_read_poll(s, false);
    l2tpv3_write_poll(s, false);
    if (s->fd >= 0) {
        close(s->fd);
    }
    destroy_vector(s->msgvec, MAX_L2TPV3_MSGCNT, IOVSIZE);
    g_free(s->vec);
    g_free(s->header_buf);
    g_free(s->dgram_dst);
}

static NetClientInfo net_l2tpv3_info = {
    .type = NET_CLIENT_DRIVER_L2TPV3,
    .size = sizeof(NetL2TPV3State),
    .receive = net_l2tpv3_receive_dgram,
    .receive_iov = net_l2tpv3_receive_dgram_iov,
    .poll = l2tpv3_poll,
    .cleanup = net_l2tpv3_cleanup,
};

int net_init_l2tpv3(const Netdev *netdev,
                    const char *name,
                    NetClientState *peer, Error **errp)
{
    const NetdevL2TPv3Options *l2tpv3;
    NetL2TPV3State *s;
    NetClientState *nc;
    int fd = -1, gairet;
    struct addrinfo hints;
    struct addrinfo *result = NULL;
    char *srcport, *dstport;

    nc = qemu_new_net_client(&net_l2tpv3_info, peer, "l2tpv3", name);

    s = DO_UPCAST(NetL2TPV3State, nc, nc);

    s->queue_head = 0;
    s->queue_tail = 0;
    s->header_mismatch = false;

    assert(netdev->type == NET_CLIENT_DRIVER_L2TPV3);
    l2tpv3 = &netdev->u.l2tpv3;

    if (l2tpv3->has_ipv6 && l2tpv3->ipv6) {
        s->ipv6 = l2tpv3->ipv6;
    } else {
        s->ipv6 = false;
    }

    if ((l2tpv3->has_offset) && (l2tpv3->offset > 256)) {
        error_setg(errp, "offset must be less than 256 bytes");
        goto outerr;
    }

    if (l2tpv3->has_rxcookie || l2tpv3->has_txcookie) {
        if (l2tpv3->has_rxcookie && l2tpv3->has_txcookie) {
            s->cookie = true;
        } else {
            error_setg(errp,
                       "require both 'rxcookie' and 'txcookie' or neither");
            goto outerr;
        }
    } else {
        s->cookie = false;
    }

    if (l2tpv3->has_cookie64 || l2tpv3->cookie64) {
        s->cookie_is_64  = true;
    } else {
        s->cookie_is_64  = false;
    }

    if (l2tpv3->has_udp && l2tpv3->udp) {
        s->udp = true;
        if (!(l2tpv3->has_srcport && l2tpv3->has_dstport)) {
            error_setg(errp, "need both src and dst port for udp");
            goto outerr;
        } else {
            srcport = l2tpv3->srcport;
            dstport = l2tpv3->dstport;
        }
    } else {
        s->udp = false;
        srcport = NULL;
        dstport = NULL;
    }


    s->offset = 4;
    s->session_offset = 0;
    s->cookie_offset = 4;
    s->counter_offset = 4;

    s->tx_session = l2tpv3->txsession;
    if (l2tpv3->has_rxsession) {
        s->rx_session = l2tpv3->rxsession;
    } else {
        s->rx_session = s->tx_session;
    }

    if (s->cookie) {
        s->rx_cookie = l2tpv3->rxcookie;
        s->tx_cookie = l2tpv3->txcookie;
        if (s->cookie_is_64 == true) {
            /* 64 bit cookie */
            s->offset += 8;
            s->counter_offset += 8;
        } else {
            /* 32 bit cookie */
            s->offset += 4;
            s->counter_offset += 4;
        }
    }

    memset(&hints, 0, sizeof(hints));

    if (s->ipv6) {
        hints.ai_family = AF_INET6;
    } else {
        hints.ai_family = AF_INET;
    }
    if (s->udp) {
        hints.ai_socktype = SOCK_DGRAM;
        hints.ai_protocol = 0;
        s->offset += 4;
        s->counter_offset += 4;
        s->session_offset += 4;
        s->cookie_offset += 4;
    } else {
        hints.ai_socktype = SOCK_RAW;
        hints.ai_protocol = IPPROTO_L2TP;
    }

    gairet = getaddrinfo(l2tpv3->src, srcport, &hints, &result);

    if ((gairet != 0) || (result == NULL)) {
        error_setg(errp, "could not resolve src, errno = %s",
                   gai_strerror(gairet));
        goto outerr;
    }
    fd = socket(result->ai_family, result->ai_socktype, result->ai_protocol);
    if (fd == -1) {
        fd = -errno;
        error_setg(errp, "socket creation failed, errno = %d",
                   -fd);
        goto outerr;
    }
    if (bind(fd, (struct sockaddr *) result->ai_addr, result->ai_addrlen)) {
        error_setg(errp, "could not bind socket err=%i", errno);
        goto outerr;
    }
    if (result) {
        freeaddrinfo(result);
    }

    memset(&hints, 0, sizeof(hints));

    if (s->ipv6) {
        hints.ai_family = AF_INET6;
    } else {
        hints.ai_family = AF_INET;
    }
    if (s->udp) {
        hints.ai_socktype = SOCK_DGRAM;
        hints.ai_protocol = 0;
    } else {
        hints.ai_socktype = SOCK_RAW;
        hints.ai_protocol = IPPROTO_L2TP;
    }

    result = NULL;
    gairet = getaddrinfo(l2tpv3->dst, dstport, &hints, &result);
    if ((gairet != 0) || (result == NULL)) {
        error_setg(errp, "could not resolve dst, error = %s",
                   gai_strerror(gairet));
        goto outerr;
    }

    s->dgram_dst = g_new0(struct sockaddr_storage, 1);
    memcpy(s->dgram_dst, result->ai_addr, result->ai_addrlen);
    s->dst_size = result->ai_addrlen;

    if (result) {
        freeaddrinfo(result);
    }

    if (l2tpv3->has_counter && l2tpv3->counter) {
        s->has_counter = true;
        s->offset += 4;
    } else {
        s->has_counter = false;
    }

    if (l2tpv3->has_pincounter && l2tpv3->pincounter) {
        s->has_counter = true;  /* pin counter implies that there is counter */
        s->pin_counter = true;
    } else {
        s->pin_counter = false;
    }

    if (l2tpv3->has_offset) {
        /* extra offset */
        s->offset += l2tpv3->offset;
    }

    if ((s->ipv6) || (s->udp)) {
        s->header_size = s->offset;
    } else {
        s->header_size = s->offset + sizeof(struct iphdr);
    }

    s->msgvec = build_l2tpv3_vector(s, MAX_L2TPV3_MSGCNT);
    s->vec = g_new(struct iovec, MAX_L2TPV3_IOVCNT);
    s->header_buf = g_malloc(s->header_size);

    qemu_set_nonblock(fd);

    s->fd = fd;
    s->counter = 0;

    l2tpv3_read_poll(s, true);

    snprintf(s->nc.info_str, sizeof(s->nc.info_str),
             "l2tpv3: connected");
    return 0;
outerr:
    qemu_del_net_client(nc);
    if (fd >= 0) {
        close(fd);
    }
    if (result) {
        freeaddrinfo(result);
    }
    return -1;
}