src/driver/virtio.c

The virtio driver โ€” same vtable, virtqueue avail/used rings.

Diagram for driver/virtio.c
Diagram: The virtio driver โ€” same vtable, virtqueue avail/used rings.
filesrc/driver/virtio.c
#include <emmintrin.h>
#include <stdlib.h>
#include <string.h>
#include <sys/file.h>
#include <unistd.h>

#include "driver/device.h"
#include "log.h"
#include "memory.h"
#include "pci.h"
#include "virtio.h"
#include "virtio_type.h"

static const char* driver_name = "ixy-virtio";

static inline void virtio_legacy_notify_queue(struct virtio_device* dev, uint16_t idx) {
	write_io16(dev->fd, idx, VIRTIO_PCI_QUEUE_NOTIFY);
}

static uint8_t virtio_legacy_get_status(struct virtio_device* dev) {
	return read_io8(dev->fd, VIRTIO_PCI_STATUS);
}

static void virtio_legacy_check_status(struct virtio_device* dev) {
	if (read_io8(dev->fd, VIRTIO_PCI_STATUS) == VIRTIO_CONFIG_STATUS_FAILED) {
		error("Device signaled unrecoverable error");
	}
}

static inline size_t virtio_legacy_vring_size(unsigned int num, unsigned long align) {
	size_t size;

	size = num * sizeof(struct vring_desc);
	size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
	size = RTE_ALIGN_CEIL(size, align);
	size += sizeof(struct vring_used) + (num * sizeof(struct vring_used_elem));
	return size;
}

static inline void virtio_legacy_vring_init(struct vring* vr, unsigned int num, uint8_t* p, unsigned long align) {
	vr->num = num;
	vr->desc = (struct vring_desc*)p;
	vr->avail = (struct vring_avail*)(p + num * sizeof(struct vring_desc));
	vr->used = (void*)RTE_ALIGN_CEIL((uintptr_t)(&vr->avail->ring[num]), align);
}

static void virtio_legacy_setup_tx_queue(struct virtio_device* dev, uint16_t idx) {
	if (idx != 1 && idx != 2) {
		error("Can't setup queue %u as Tx queue", idx);
	}

	// Create virt queue itself - Section 4.1.5.1.3
	write_io16(dev->fd, idx, VIRTIO_PCI_QUEUE_SEL);
	uint32_t max_queue_size = read_io16(dev->fd, VIRTIO_PCI_QUEUE_NUM);
	debug("Max queue size of tx queue #%u: %u", idx, max_queue_size);
	if (max_queue_size == 0) {
		return;
	}
	size_t virt_queue_mem_size = virtio_legacy_vring_size(max_queue_size, 4096);
	struct dma_memory mem = memory_allocate_dma(virt_queue_mem_size, true);
	memset(mem.virt, 0xab, virt_queue_mem_size);
	debug("Allocated %zu bytes for virt queue at %p", virt_queue_mem_size, mem.virt);
	write_io32(dev->fd, mem.phy >> VIRTIO_PCI_QUEUE_ADDR_SHIFT, VIRTIO_PCI_QUEUE_PFN);

	// Section 2.4.2 for layout
	struct virtqueue* vq = calloc(1, sizeof(*vq) + sizeof(void*) * max_queue_size);
	virtio_legacy_vring_init(&vq->vring, max_queue_size, mem.virt, 4096);
	debug("vring desc: %p, vring avail: %p, vring used: %p", vq->vring.desc, vq->vring.avail, vq->vring.used);
	for (size_t i = 0; i < vq->vring.num; ++i) {
		vq->vring.desc[i].len = 0;
		vq->vring.desc[i].addr = 0;
		vq->vring.desc[i].flags = 0;
		vq->vring.desc[i].next = 0;
		vq->vring.avail->ring[i] = 0;
		vq->vring.used->ring[i].id = 0;
		vq->vring.used->ring[i].len = 0;
	}
	vq->vring.used->idx = 0;
	vq->vring.avail->idx = 0;
	vq->vq_used_last_idx = 0;

	// Section 4.1.4.4
	uint32_t notify_offset = read_io16(dev->fd, VIRTIO_PCI_QUEUE_NOTIFY);
	debug("vq notifcation offset %u", notify_offset);
	vq->notification_offset = notify_offset;

	// Ctrl queue packets are not supplied by the user
	if (idx == 2) {
		vq->mempool = memory_allocate_mempool(max_queue_size, 2048);
	}

	// Disable interrupts - Section 2.4.7
	vq->vring.avail->flags = VRING_AVAIL_F_NO_INTERRUPT;
	vq->vring.used->flags = 0;

	if (idx == 1) {
		dev->tx_queue = vq;
	} else {
		dev->ctrl_queue = vq;
	}
}

static void virtio_legacy_send_command(struct virtio_device* dev, void* cmd, size_t cmd_len) {
	struct virtqueue* vq = dev->ctrl_queue;

	if (cmd_len < sizeof(struct virtio_net_ctrl_hdr)) {
		error("Command can not be shorter than control header");
	}
	if (((uint8_t*)cmd)[0] != VIRTIO_NET_CTRL_RX) {
		error("Command class is not supported");
	}

	_mm_mfence();
	// Find free desciptor slot
	uint16_t idx = 0;
	for (idx = 0; idx < vq->vring.num; ++idx) {
		struct vring_desc* desc = &vq->vring.desc[idx];
		if (desc->addr == 0) {
			break;
		}
	}
	if (idx == vq->vring.num) {
		error("command queue full");
	} else {
		debug("Found free desc slot at %u (%u)", idx, vq->vring.num);
	}

	struct pkt_buf* buf = pkt_buf_alloc(vq->mempool);
	if (!buf) {
		error("Control queue ran out of buffers");
	}
	memcpy(buf->data, cmd, cmd_len);
	vq->virtual_addresses[idx] = buf;

	/* The following descriptor setup kills QEMU, but should be allowed with VIRTIO_F_ANY_LAYOUT
	 * Error: kvm: virtio-net ctrl missing headers
	 * Version: QEMU emulator version 2.7.1 pve-qemu-kvm_2.7.1-4
	 */
	// All in one descriptor
	// vq->vring.desc[idx].len = cmd_len;
	// vq->vring.desc[idx].addr = buf->buf_addr_phy + offsetof(struct pkt_buf, data);
	// vq->vring.desc[idx].flags = VRING_DESC_F_WRITE;
	// vq->vring.desc[idx].next = 0;

	// Device-readable head: cmd header
	vq->vring.desc[idx].len = 2;
	vq->vring.desc[idx].addr = buf->buf_addr_phy + offsetof(struct pkt_buf, data);
	vq->vring.desc[idx].flags = VRING_DESC_F_NEXT;
	vq->vring.desc[idx].next = idx + 1;
	// Device-readable payload: data
	vq->vring.desc[idx + 1].len = cmd_len - 2 - 1; // Header and ack byte
	vq->vring.desc[idx + 1].addr = buf->buf_addr_phy + offsetof(struct pkt_buf, data) + 2;
	vq->vring.desc[idx + 1].flags = VRING_DESC_F_NEXT;
	vq->vring.desc[idx + 1].next = idx + 2;
	// Device-writable tail: ack flag
	vq->vring.desc[idx + 2].len = 1;
	vq->vring.desc[idx + 2].addr = buf->buf_addr_phy + offsetof(struct pkt_buf, data) + cmd_len - 1;
	vq->vring.desc[idx + 2].flags = VRING_DESC_F_WRITE;
	vq->vring.desc[idx + 2].next = 0;
	vq->vring.avail->ring[vq->vring.avail->idx % vq->vring.num] = idx;
	_mm_mfence();
	vq->vring.avail->idx++;
	_mm_mfence();

	virtio_legacy_notify_queue(dev, 2);
	_mm_mfence();

	// Wait until the buffer got processed
	while (vq->vq_used_last_idx == vq->vring.used->idx) {
		_mm_mfence();
		debug("Waiting...");
		usleep(100000);
	}
	vq->vq_used_last_idx++;
	// Check status and free buffer
	struct vring_used_elem* e = &vq->vring.used->ring[vq->vring.used->idx];
	debug("e %p: id %u len %u", e, e->id, e->len);
	if (e->id != idx) {
		error("Used buffer has different index as sent one");
	}
	if (vq->virtual_addresses[idx] != buf) {
		error("buffer differ");
	}
	pkt_buf_free(buf);
	vq->vring.desc[idx] = (struct vring_desc){};
	vq->vring.desc[idx + 1] = (struct vring_desc){};
	vq->vring.desc[idx + 2] = (struct vring_desc){};
}

static void virtio_legacy_set_promiscuous(struct virtio_device* dev, bool on) {
	struct {
		struct virtio_net_ctrl_hdr hdr;
		uint8_t on;
		uint8_t ack;
	} __attribute__((__packed__)) cmd = {};
	static_assert(sizeof(cmd) == 4, "Size of command struct wrong");

	cmd.hdr.class = VIRTIO_NET_CTRL_RX;
	cmd.hdr.cmd = VIRTIO_NET_CTRL_RX_PROMISC;
	cmd.on = on ? 1 : 0;

	virtio_legacy_send_command(dev, &cmd, sizeof(cmd));
	info("Set promisc to %u", on);
}

void virtio_set_promisc(struct ixy_device* ixy, bool enabled) {
	struct virtio_device* dev = IXY_TO_VIRTIO(ixy);
	virtio_legacy_set_promiscuous(dev, enabled);
}

uint32_t virtio_get_link_speed(const struct ixy_device* dev) {
	return 1000;
}

static const struct virtio_legacy_net_hdr net_hdr = {
	.flags = 0,
	.gso_type = VIRTIO_NET_HDR_GSO_NONE,
	.hdr_len = 14 + 20 + 8,
};

static void virtio_legacy_setup_rx_queue(struct virtio_device* dev, uint16_t idx) {
	if (idx != 0) {
		error("Can't setup Tx queue as Rx");
	}

	// Create virt queue itself - Section 4.1.5.1.3
	write_io16(dev->fd, idx, VIRTIO_PCI_QUEUE_SEL);
	uint32_t max_queue_size = read_io16(dev->fd, VIRTIO_PCI_QUEUE_NUM);
	debug("Max queue size of rx queue #%u: %u", idx, max_queue_size);
	if (max_queue_size == 0) {
		return;
	}
	uint32_t notify_offset = read_io16(dev->fd, VIRTIO_PCI_QUEUE_NOTIFY);
	debug("Notifcation offset %u", notify_offset);
	size_t virt_queue_mem_size = virtio_legacy_vring_size(max_queue_size, 4096);
	struct dma_memory mem = memory_allocate_dma(virt_queue_mem_size, true);
	memset(mem.virt, 0xab, virt_queue_mem_size);
	debug("Allocated %zu bytes for virt queue at %p", virt_queue_mem_size, mem.virt);
	write_io32(dev->fd, mem.phy >> VIRTIO_PCI_QUEUE_ADDR_SHIFT, VIRTIO_PCI_QUEUE_PFN);

	// Section 2.4.2 for layout
	struct virtqueue* vq = calloc(1, sizeof(*vq) + sizeof(void*) * max_queue_size);
	virtio_legacy_vring_init(&vq->vring, max_queue_size, mem.virt, 4096);
	debug("vring desc: %p, vring avail: %p, vring used: %p", vq->vring.desc, vq->vring.avail, vq->vring.used);
	for (size_t i = 0; i < vq->vring.num; ++i) {
		vq->vring.desc[i].len = 0;
		vq->vring.desc[i].addr = 0;
		vq->vring.desc[i].flags = 0;
		vq->vring.desc[i].next = 0;
		vq->vring.avail->ring[i] = 0;
		vq->vring.used->ring[i].id = 0;
		vq->vring.used->ring[i].len = 0;
	}
	vq->vring.used->idx = 0;
	vq->vring.avail->idx = 0;
	vq->vq_used_last_idx = 0;

	// Section 4.1.4.4
	vq->notification_offset = notify_offset;

	// Disable interrupts - Section 2.4.7
	vq->vring.avail->flags = VRING_AVAIL_F_NO_INTERRUPT;
	vq->vring.used->flags = 0;

	// Allocate buffers and fill descriptor table - Section 3.2.1
	// We allocate more bufs than what would fit in the queue,
	// because we don't want to stall rx if users hold bufs for longer
	vq->mempool = memory_allocate_mempool(max_queue_size * 4, 2048);

	dev->rx_queue = vq;
}

static void virtio_legacy_init(struct virtio_device* dev) {
	// Section 3.1
	debug("Configuring bar0");
	write_io8(dev->fd, VIRTIO_CONFIG_STATUS_RESET, VIRTIO_PCI_STATUS);
	while (read_io8(dev->fd, VIRTIO_PCI_STATUS) != VIRTIO_CONFIG_STATUS_RESET) {
		usleep(100);
	}
	write_io8(dev->fd, VIRTIO_CONFIG_STATUS_ACK, VIRTIO_PCI_STATUS);
	write_io8(dev->fd, VIRTIO_CONFIG_STATUS_DRIVER, VIRTIO_PCI_STATUS);
	// Negotiate features
	uint32_t host_features = read_io32(dev->fd, VIRTIO_PCI_HOST_FEATURES);
	debug("Host features: %x", host_features);
	const uint32_t required_features = (1u << VIRTIO_NET_F_CSUM) | (1u << VIRTIO_NET_F_GUEST_CSUM) |
					   (1u << VIRTIO_NET_F_CTRL_VQ) | (1u << VIRTIO_F_ANY_LAYOUT) |
					   (1u << VIRTIO_NET_F_CTRL_RX) /*| (1u<<VIRTIO_NET_F_MQ)*/;
	if ((host_features & required_features) != required_features) {
		error("Device does not support required features");
	}
	debug("Guest features before negotiation: %x", read_io32(dev->fd, VIRTIO_PCI_GUEST_FEATURES));
	write_io32(dev->fd, required_features, VIRTIO_PCI_GUEST_FEATURES);
	debug("Guest features after negotiation: %x", read_io32(dev->fd, VIRTIO_PCI_GUEST_FEATURES));
	// Queue setup - Section 5.1.2 for queue index calculation
	// Legacy devices only have 3 queues
	virtio_legacy_setup_rx_queue(dev, 0); // Rx
	virtio_legacy_setup_tx_queue(dev, 1); // Tx
	virtio_legacy_setup_tx_queue(dev, 2); // Control
	_mm_mfence();
	// Signal OK
	write_io8(dev->fd, VIRTIO_CONFIG_STATUS_DRIVER_OK, VIRTIO_PCI_STATUS);
	info("Setup complete");
	// Recheck status
	virtio_legacy_check_status(dev);
	virtio_legacy_set_promiscuous(dev, true);
}

// read stat counters and accumulate in stats
// stats may be NULL to just reset the counters
// this is not thread-safe, (but we only support one queue anyways)
// a proper thread-safe implementation would collect per-queue stats
// and perform a read with relaxed memory ordering here without resetting the stats
void virtio_read_stats(struct ixy_device* ixy, struct device_stats* stats) {
	struct virtio_device* dev = IXY_TO_VIRTIO(ixy);
	if (stats) {
		stats->rx_pkts += dev->rx_pkts;
		stats->tx_pkts += dev->tx_pkts;
		stats->rx_bytes += dev->rx_bytes;
		stats->tx_bytes += dev->tx_bytes;
	}
	dev->rx_pkts = dev->tx_pkts = dev->rx_bytes = dev->tx_bytes = 0;
}


struct ixy_device* virtio_init(const char* pci_addr, uint16_t rx_queues, uint16_t tx_queues) {
	if (getuid()) {
		warn("Not running as root, this will probably fail");
	}
	if (rx_queues > 1) {
		error("cannot configure %d rx queues: limit is %d", rx_queues, 1);
	}
	if (tx_queues > 1) {
		error("cannot configure %d tx queues: limit is %d", tx_queues, 1);
	}
	remove_driver(pci_addr);
	struct virtio_device* dev = calloc(1, sizeof(*dev));
	dev->ixy.pci_addr = strdup(pci_addr);
	dev->ixy.driver_name = driver_name;
	dev->ixy.num_rx_queues = rx_queues;
	dev->ixy.num_tx_queues = tx_queues;
	dev->ixy.rx_batch = virtio_rx_batch;
	dev->ixy.tx_batch = virtio_tx_batch;
	dev->ixy.read_stats = virtio_read_stats;
	dev->ixy.set_promisc = virtio_set_promisc;
	dev->ixy.get_link_speed = virtio_get_link_speed;
	enable_dma(pci_addr);
	int config = pci_open_resource(pci_addr, "config", O_RDONLY);
	uint16_t device_id = read_io16(config, 2);
	close(config);
	// Check config if device is legacy network card
	if (device_id == 0x1000) {
		info("Detected virtio legacy network card");
		dev->fd = pci_open_resource(pci_addr, "resource0", O_RDWR);
		virtio_legacy_init(dev);
	} else {
		error("Modern device not supported");
	}
	return &dev->ixy;
}

uint32_t virtio_rx_batch(struct ixy_device* ixy, uint16_t queue_id, struct pkt_buf* bufs[], uint32_t num_bufs) {
	struct virtio_device* dev = IXY_TO_VIRTIO(ixy);
	struct virtqueue* vq = dev->rx_queue;
	uint32_t buf_idx;

	_mm_mfence();
	// Retrieve used bufs from the device
	for (buf_idx = 0; buf_idx < num_bufs; ++buf_idx) {
		// Section 3.2.2
		if (vq->vq_used_last_idx == vq->vring.used->idx) {
			break;
		}
		// info("Rx packet: last used %u, used idx %u", vq->vq_used_last_idx,
		// vq->vring.used->idx);
		struct vring_used_elem* e = vq->vring.used->ring + (vq->vq_used_last_idx % vq->vring.num);
		// info("Used elem %p, id %u len %u", e, e->id, e->len);
		struct vring_desc* desc = &vq->vring.desc[e->id];
		vq->vq_used_last_idx++;
		// We don't support chaining or indirect descriptors
		if (desc->flags != VRING_DESC_F_WRITE) {
			error("unsupported rx flags on descriptor: %x", desc->flags);
		}
		// info("Desc %lu %u %u %u", desc->addr, desc->len, desc->flags,
		// desc->next);
		*desc = (struct vring_desc){};

		// Section 5.1.6.4
		struct pkt_buf* buf = vq->virtual_addresses[e->id];
		buf->size = e->len - sizeof(net_hdr);
		bufs[buf_idx] = buf;
		//struct virtio_net_hdr* hdr = (void*)(buf->head_room + sizeof(buf->head_room) - sizeof(net_hdr));

		// Update rx counter
		dev->rx_bytes += buf->size;
		dev->rx_pkts++;
	}
	// Fill empty slots in descriptor table
	for (uint16_t idx = 0; idx < vq->vring.num; ++idx) {
		struct vring_desc* desc = &vq->vring.desc[idx];
		if (desc->addr != 0) { // descriptor points to something, therefore it is in use
			continue;
		}
		// info("Found free desc slot at %u (%u)", idx, vq->vring.num);
		struct pkt_buf* buf = pkt_buf_alloc(vq->mempool);
		if (!buf) {
			error("failed to allocate new mbuf for rx, you are either leaking memory or your mempool is too small");
		}
		buf->size = vq->mempool->buf_size;
		memcpy(buf->head_room + sizeof(buf->head_room) - sizeof(net_hdr), &net_hdr, sizeof(net_hdr));
		vq->vring.desc[idx].len = buf->size + sizeof(net_hdr);
		vq->vring.desc[idx].addr =
			buf->buf_addr_phy + offsetof(struct pkt_buf, head_room) + sizeof(buf->head_room) - sizeof(net_hdr);
		vq->vring.desc[idx].flags = VRING_DESC_F_WRITE;
		vq->vring.desc[idx].next = 0;
		vq->virtual_addresses[idx] = buf;
		vq->vring.avail->ring[vq->vring.avail->idx % vq->vring.num] = idx;
		_mm_mfence(); // Make sure exposed descriptors reach device before index is updated
		vq->vring.avail->idx++;
		_mm_mfence(); // Make sure the index update reaches device before it is triggered
		virtio_legacy_notify_queue(dev, 0);
	}
	return buf_idx;
}

uint32_t virtio_tx_batch(struct ixy_device* ixy, uint16_t queue_id, struct pkt_buf* bufs[], uint32_t num_bufs) {
	struct virtio_device* dev = IXY_TO_VIRTIO(ixy);
	struct virtqueue* vq = dev->tx_queue;

	_mm_mfence();
	// Free sent buffers
	while (vq->vq_used_last_idx != vq->vring.used->idx) {
		// info("We can free some buffers: %u != %u", vq->vq_used_last_idx,
		// vq->vring.used->idx);
		struct vring_used_elem* e = vq->vring.used->ring + (vq->vq_used_last_idx % vq->vring.num);
		// info("e %p, id %u", e, e->id);
		struct vring_desc* desc = &vq->vring.desc[e->id];
		desc->addr = 0;
		desc->len = 0;
		pkt_buf_free(vq->virtual_addresses[e->id]);
		vq->vq_used_last_idx++;
		_mm_mfence();
	}
	// Send buffers
	uint32_t buf_idx;
	uint16_t idx = 0; // Keep index of last found free descriptor and start searching from there
	for (buf_idx = 0; buf_idx < num_bufs; ++buf_idx) {
		struct pkt_buf* buf = bufs[buf_idx];
		// Find free desc index
		for (; idx < vq->vring.num; ++idx) {
			struct vring_desc* desc = &vq->vring.desc[idx];
			if (desc->addr == 0) {
				break;
			}
		}
		if (idx == vq->vring.num) {
			break;
		}
		// info("Found free desc slot at %u (%u)", idx, vq->vring.num);

		// Update tx counter
		dev->tx_bytes += buf->size;
		dev->tx_pkts++;

		vq->virtual_addresses[idx] = buf;

		// Copy header to headroom in front of data buffer
		memcpy(buf->head_room + sizeof(buf->head_room) - sizeof(net_hdr), &net_hdr, sizeof(net_hdr));

		vq->vring.desc[idx].len = buf->size + sizeof(net_hdr);
		vq->vring.desc[idx].addr =
			buf->buf_addr_phy + offsetof(struct pkt_buf, head_room) + sizeof(buf->head_room) - sizeof(net_hdr);
		vq->vring.desc[idx].flags = 0;
		vq->vring.desc[idx].next = 0;
		vq->vring.avail->ring[(vq->vring.avail->idx + buf_idx) % vq->vring.num] = idx;
	}
	_mm_mfence();
	vq->vring.avail->idx += buf_idx;
	_mm_mfence();
	virtio_legacy_notify_queue(dev, 1);
	return buf_idx;
}