Let's Cram It In! UART Lite Over PCIe Straight Into Linux: A Driver in One Evening (Almost)

A hands-on guide to building a Linux kernel TTY driver for an AXI UARTLite peripheral on a Xilinx FPGA connected via PCIe XDMA, complete with full driver source code, a Python direct-access alternative, and performance profiling.

In this article, I'll show how to connect a UART peripheral on an FPGA to Linux via PCIe and make it appear as a standard /dev/ttyUL0 serial device. We'll write a kernel driver, test it with a GPS module, and compare it to a Python-based alternative.

Project overview

Hardware Setup

  • FPGA: Xilinx Artix-7 (XC7A200T) — part of a custom SDR board
  • GPS Module: SIM68, outputting NMEA 0183 sentences
  • Host: LattePanda Sigma
  • Interface: PCIe Gen1 x4 via M.2 slot
  • UART Speed: 9600 baud
Hardware diagram

The data path looks like this: GPS Module sends NMEA data via UART to the UARTLite IP core on the FPGA, which is connected through an AXI interconnect to the XDMA (DMA over PCIe) block. The XDMA block communicates with the host CPU over PCIe, and finally our driver presents the data to userspace applications.

Board photo

UARTLite Register Map

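The AXI UARTLite IP core has a simple register interface. In this design it is mapped at base offset 0x40000 inside the PCIe BAR exposed by XDMA; the register offsets below are relative to that base: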

  • 0x00 — RX FIFO (read received byte)
  • 0x04 — TX FIFO (write byte to transmit)
  • 0x08 — STATUS register
  • 0x0C — CONTROL register

Status flags: RXVALID (bit 0) indicates data is available in the RX FIFO; TXFULL (bit 3) indicates the TX FIFO is full.

Register map

Approach 1: Linux Kernel Driver

The driver registers as a PCIe device (Vendor ID 0x10EE — Xilinx, Device ID 0x7011) and creates a TTY device at /dev/ttyUL0. Instead of using interrupts (which would require modifying the FPGA design), we use a workqueue-based polling approach for receiving data.

Full Driver Source Code

#include <linux/version.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/tty_flip.h>
#include <linux/io.h>
#include <linux/workqueue.h>

/* Name used for the PCI driver and as the TTY driver_name. */
#define DRIVER_NAME "uartlite_xdma"
/* PCI vendor/device IDs this driver binds to (see uartlite_pci_tbl). */
#define VENDOR_ID 0x10EE
#define DEVICE_ID 0x7011
/* Byte offset of the UARTLite register block inside BAR 0. */
#define UARTLITE_BASE_OFFSET 0x40000

/* Register offsets, relative to the UARTLite base. */
#define UARTLITE_RX_FIFO  0x00
#define UARTLITE_TX_FIFO  0x04
#define UARTLITE_STATUS   0x08
#define UARTLITE_CONTROL  0x0C

/* STATUS register bits: RX FIFO holds data / TX FIFO is full. */
#define STATUS_RXVALID    BIT(0)
#define STATUS_TXFULL     BIT(3)

/* Per-device state; allocated in uartlite_probe() and stored as PCI drvdata. */
struct uartlite_priv {
    void __iomem *base;         /* mapped BAR 0 address plus UARTLITE_BASE_OFFSET */
    struct tty_port port;       /* TTY port; container_of() on it recovers this struct */
    struct work_struct rx_work; /* self-rescheduling RX polling work item */
    bool running;               /* true while the RX poller should keep re-queuing itself */
};

/* The single TTY driver instance; allocated in uartlite_init(), released in uartlite_exit(). */
static struct tty_driver *uartlite_tty_driver;

/*
 * Report whether the TX FIFO can take another byte.
 *
 * Returns 1 when STATUS_TXFULL is clear in the STATUS register, 0 otherwise.
 */
static int uartlite_tx_ready(struct uartlite_priv *priv)
{
    unsigned int status = ioread32(priv->base + UARTLITE_STATUS);

    return (status & STATUS_TXFULL) == 0;
}

/*
 * Queue one byte for transmission by writing it to the TX FIFO register.
 * The caller is expected to have checked uartlite_tx_ready() first.
 */
static void uartlite_write_byte(struct uartlite_priv *priv, u8 val)
{
    unsigned int word = val;

    iowrite32(word, priv->base + UARTLITE_TX_FIFO);
}

/*
 * Report whether the RX FIFO holds at least one received byte.
 *
 * Returns non-zero when STATUS_RXVALID is set in the STATUS register.
 */
static int uartlite_rx_ready(struct uartlite_priv *priv)
{
    unsigned int status = ioread32(priv->base + UARTLITE_STATUS);

    return status & STATUS_RXVALID;
}

/*
 * Pop one byte from the RX FIFO register.
 * Only the low 8 bits of the 32-bit register read are returned.
 */
static u8 uartlite_read_byte(struct uartlite_priv *priv)
{
    unsigned int word = ioread32(priv->base + UARTLITE_RX_FIFO);

    return word & 0xFF;
}

/*
 * uartlite_rx_work - poll the RX FIFO and forward received bytes to the TTY.
 * @work: the rx_work member embedded in struct uartlite_priv.
 *
 * Drains the RX FIFO in chunks of up to sizeof(buf) bytes, pushes each chunk
 * into the port's flip buffer, and then re-queues itself for as long as
 * priv->running is set.
 *
 * The poller must keep re-queuing itself even when no tty is attached to the
 * port at this instant: the work item can be scheduled before the tty has
 * been attached, and bailing out without re-queuing in that window would
 * stop reception for the whole lifetime of the open file.
 *
 * Note: this is a busy poll; the work item is re-queued immediately with no
 * delay between passes.
 */
static void uartlite_rx_work(struct work_struct *work)
{
    struct uartlite_priv *priv = container_of(work, struct uartlite_priv, rx_work);
    struct tty_struct *tty;
    unsigned char buf[16];
    int count;

    if (!priv->running)
        return;

    tty = tty_port_tty_get(&priv->port);
    if (tty) {
        while (priv->running && uartlite_rx_ready(priv)) {
            /* Re-check RXVALID before every read so an empty FIFO is never popped. */
            for (count = 0; count < sizeof(buf) && uartlite_rx_ready(priv); count++)
                buf[count] = uartlite_read_byte(priv);

            if (count) {
                tty_insert_flip_string(&priv->port, buf, count);
                tty_flip_buffer_push(&priv->port);
            }
        }
        tty_kref_put(tty);
    }

    /* Keep polling until close/shutdown clears the flag and cancels the work. */
    if (priv->running)
        schedule_work(&priv->rx_work);
}

/*
 * uartlite_tty_open - TTY open callback.
 * @tty:  tty being opened; tty->port is embedded in struct uartlite_priv.
 * @filp: file being opened.
 *
 * Opens the tty_port first and only then starts the RX poller, so the poller
 * never runs before the tty has been attached to the port and is never
 * started for an open that failed.
 *
 * Returns 0 on success or the error returned by tty_port_open().
 */
static int uartlite_tty_open(struct tty_struct *tty, struct file *filp)
{
    struct uartlite_priv *priv = container_of(tty->port, struct uartlite_priv, port);
    int ret;

    ret = tty_port_open(tty->port, tty, filp);
    if (ret)
        return ret;

    priv->running = true;
    /* Harmless if the work is already queued by an earlier opener. */
    schedule_work(&priv->rx_work);
    return 0;
}

/*
 * uartlite_tty_close - TTY close callback.
 * @tty:  tty being closed.
 * @filp: file being closed.
 *
 * The device can be opened by several processes at once. The RX poller is
 * shared between them, so it is stopped only when the last opener goes away
 * (tty->count still includes the closing file while this callback runs).
 * Stopping it on every close would silently cut off reception for the
 * remaining openers.
 */
static void uartlite_tty_close(struct tty_struct *tty, struct file *filp)
{
    struct uartlite_priv *priv = container_of(tty->port, struct uartlite_priv, port);

    if (tty->count == 1) {
        priv->running = false;
        cancel_work_sync(&priv->rx_work);
    }

    tty_port_close(tty->port, tty, filp);
}

/*
 * uartlite_tty_write - TTY write callback; push bytes into the TX FIFO.
 * @tty:   tty being written; driver_data holds the struct uartlite_priv.
 * @buf:   bytes to transmit.
 * @count: number of bytes in @buf.
 *
 * Busy-waits until the TX FIFO has room before each byte, so the whole
 * buffer is always consumed. Returns the number of bytes written.
 *
 * The prototype of tty_operations::write changed in Linux 6.6 (u8 buffer,
 * size_t count, ssize_t return). Every kernel before 6.6 needs the old
 * prototype, including 6.5.x point releases whose version code is greater
 * than KERNEL_VERSION(6, 5, 0).
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
static int uartlite_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
#else
static ssize_t uartlite_tty_write(struct tty_struct *tty, const u8 *buf, size_t count)
#endif
{
    struct uartlite_priv *priv = tty->driver_data;
    int i;

    for (i = 0; i < count; i++) {
        /* Spin until the FIFO can accept another byte. */
        while (!uartlite_tx_ready(priv))
            cpu_relax();
        uartlite_write_byte(priv, buf[i]);
    }
    return i;
}

/*
 * TTY write_room callback.
 *
 * Reports 16 bytes of room whenever the TX FIFO is not full, and 0 while it
 * is full.
 */
static unsigned int uartlite_tty_write_room(struct tty_struct *tty)
{
    struct uartlite_priv *priv = tty->driver_data;

    if (!uartlite_tx_ready(priv))
        return 0;

    return 16;
}

/*
 * TTY chars_in_buffer callback.
 *
 * Always reports 0: uartlite_tty_write() pushes every byte straight into the
 * hardware FIFO and keeps no software buffer of its own.
 */
static unsigned int uartlite_tty_chars_in_buffer(struct tty_struct *tty)
{
    return 0;
}

/* TTY callbacks installed on the driver by uartlite_init(). */
static const struct tty_operations uartlite_tty_ops = {
    .open            = uartlite_tty_open,
    .close           = uartlite_tty_close,
    .write           = uartlite_tty_write,
    .write_room      = uartlite_tty_write_room,
    .chars_in_buffer = uartlite_tty_chars_in_buffer,
};

/*
 * tty_port activate callback.
 *
 * Stores the owning struct uartlite_priv in tty->driver_data so the TTY
 * callbacks can reach the device state. Always returns 0.
 */
static int uartlite_port_activate(struct tty_port *port, struct tty_struct *tty)
{
    tty->driver_data = container_of(port, struct uartlite_priv, port);

    return 0;
}

/*
 * tty_port shutdown callback.
 *
 * Clears the running flag so the RX work stops re-queuing itself, then waits
 * for any queued or in-flight instance of it to finish.
 */
static void uartlite_port_shutdown(struct tty_port *port)
{
    struct uartlite_priv *uart = container_of(port, struct uartlite_priv, port);

    uart->running = false;
    cancel_work_sync(&uart->rx_work);
}

/* tty_port callbacks attached to each port in uartlite_probe(). */
static const struct tty_port_operations uartlite_port_ops = {
    .activate = uartlite_port_activate,
    .shutdown = uartlite_port_shutdown,
};

static int uartlite_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
    struct uartlite_priv *priv;
    int ret;

    priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
    if (!priv)
        return -ENOMEM;

    ret = pcim_enable_device(pdev);
    if (ret)
        return ret;

    priv->base = pcim_iomap(pdev, 0, 0);
    if (!priv->base)
        return -ENOMEM;

    priv->base += UARTLITE_BASE_OFFSET;

    tty_port_init(&priv->port);
    priv->port.ops = &uartlite_port_ops;
    INIT_WORK(&priv->rx_work, uartlite_rx_work);
    priv->running = false;

    tty_port_register_device(&priv->port, uartlite_tty_driver, 0, &pdev->dev);

    pci_set_drvdata(pdev, priv);
    dev_info(&pdev->dev, "UARTlite over XDMA registered as TTY");
    return 0;
}

static void uartlite_remove(struct pci_dev *pdev)
{
    struct uartlite_priv *priv = pci_get_drvdata(pdev);
    tty_unregister_device(uartlite_tty_driver, 0);
    tty_port_destroy(&priv->port);
}

/* PCI IDs handled by this driver; the empty entry terminates the table. */
static const struct pci_device_id uartlite_pci_tbl[] = {
    { PCI_DEVICE(VENDOR_ID, DEVICE_ID) },
    { }
};

/* PCI driver glue: binds uartlite_probe()/uartlite_remove() to the ID table. */
static struct pci_driver uartlite_pci_driver = {
    .name     = DRIVER_NAME,
    .id_table = uartlite_pci_tbl,
    .probe    = uartlite_probe,
    .remove   = uartlite_remove,
};

/*
 * Module entry point.
 *
 * Allocates and registers a one-line "ttyUL" TTY driver (dynamic major,
 * devices created on probe, 9600 8N1 defaults) and then registers the PCI
 * driver. Anything already set up is torn down again if a later step fails.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init uartlite_init(void)
{
    struct tty_driver *drv;
    int err;

    uartlite_tty_driver = tty_alloc_driver(1, TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV);
    if (IS_ERR(uartlite_tty_driver))
        return PTR_ERR(uartlite_tty_driver);

    drv = uartlite_tty_driver;

    /* Naming and numbering: /dev/ttyUL<n>, major assigned dynamically. */
    drv->driver_name = DRIVER_NAME;
    drv->name = "ttyUL";
    drv->major = 0;
    drv->minor_start = 0;

    /* A plain serial port with 9600 baud, 8 data bits as the default line settings. */
    drv->type = TTY_DRIVER_TYPE_SERIAL;
    drv->subtype = SERIAL_TYPE_NORMAL;
    drv->init_termios = tty_std_termios;
    drv->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;

    tty_set_operations(drv, &uartlite_tty_ops);

    err = tty_register_driver(drv);
    if (err)
        goto err_put_driver;

    err = pci_register_driver(&uartlite_pci_driver);
    if (err)
        goto err_unregister_tty;

    return 0;

err_unregister_tty:
    tty_unregister_driver(drv);
err_put_driver:
    tty_driver_kref_put(drv);
    return err;
}

/*
 * Module exit point: undo uartlite_init() in reverse order.
 */
static void __exit uartlite_exit(void)
{
    /* Unbind from the PCI devices first; uartlite_remove() uses the TTY driver. */
    pci_unregister_driver(&uartlite_pci_driver);
    tty_unregister_driver(uartlite_tty_driver);
    /* Drop the reference taken by tty_alloc_driver(). */
    tty_driver_kref_put(uartlite_tty_driver);
}

/* Hook the init/exit functions into module load and unload. */
module_init(uartlite_init);
module_exit(uartlite_exit);

/* Module metadata. */
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Konstantin");
MODULE_DESCRIPTION("UARTlite TTY driver over XDMA with RX support");

Makefile

# Kbuild part: build uartlite_xdma.c as an out-of-tree kernel module.
obj-m += uartlite_xdma.o

# Keep debug info in the module. ccflags-y is the supported Kbuild variable;
# EXTRA_CFLAGS is its deprecated predecessor.
ccflags-y += -g

# Kernel build tree to compile against; defaults to the running kernel.
KDIR ?= /lib/modules/$(shell uname -r)/build

# Use $(MAKE) for the recursive invocations so flags such as -j and -n
# are passed down to the kernel build.
all:
	$(MAKE) CC=/usr/bin/gcc-13 -C $(KDIR) M=$(PWD) modules

clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean

install:
	$(MAKE) -C $(KDIR) M=$(PWD) modules_install

.PHONY: all clean install

Building and Loading

sudo apt install -y linux-headers-$(uname -r) gcc-13
make
sudo insmod uartlite_xdma.ko
sudo minicom -D /dev/ttyUL0

After loading the module, dmesg should show the registration message, and /dev/ttyUL0 will appear. You can then use minicom or gpsd to read GPS NMEA data.

Approach 2: Direct Python Access

For quick prototyping, you can bypass the kernel entirely by memory-mapping the XDMA user BAR directly from Python:

import os
import time
import numpy as np
import mmap
import asyncio

class XdmaUartLite:
    """Register-level access to an AXI UARTLite core through the XDMA user BAR.

    The ``/dev/xdma<N>_user`` character device is memory-mapped and viewed as
    an array of 32-bit words. All register addresses passed to ``read_reg`` /
    ``write_reg`` are byte offsets into that mapping.

    The object can be used as a context manager; ``close()`` is called on exit.
    """

    # Byte offset of the UARTLite register block inside the mapping.
    BASE_ADDR = 0x40000

    # Register offsets relative to BASE_ADDR.
    RX_FIFO = 0x00
    TX_FIFO = 0x04
    STATUS = 0x08
    CONTROL = 0x0C

    # STATUS register bits.
    TX_FULL = 0x08
    RX_VALID = 0x01
    # CONTROL register bits.
    TX_RESET = 0x01
    RX_RESET = 0x02

    # Number of bytes of the user BAR to map.
    MAP_SIZE = int(1e6)

    def __init__(self, device_index=0):
        """Open and map ``/dev/xdma<device_index>_user`` and reset both FIFOs."""
        base = f"/dev/xdma{device_index}"
        self.fd_user = os.open(f"{base}_user", os.O_RDWR)
        try:
            self._mmap = mmap.mmap(self.fd_user, self.MAP_SIZE)
        except BaseException:
            # Do not leak the descriptor when the mapping cannot be created.
            os.close(self.fd_user)
            raise
        self.m_rmap = np.frombuffer(self._mmap, np.uint32)
        self.reset_fifos()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def close(self):
        """Release the register mapping and close the device file."""
        # Drop the NumPy view first: an mmap cannot be closed while a buffer
        # exported from it is still alive.
        self.m_rmap = None
        mapping = getattr(self, "_mmap", None)
        if mapping is not None:
            self._mmap = None
            try:
                mapping.close()
            except BufferError:
                # Someone else still holds a view of the register map; the
                # mapping is released once that view is garbage-collected.
                pass
        os.close(self.fd_user)

    def read_reg(self, addr):
        """Return the low 16 bits of the 32-bit word at byte offset ``addr``."""
        return int(self.m_rmap[addr >> 2]) & 0xFFFF

    def write_reg(self, addr, data):
        """Write ``data`` as a 32-bit word at byte offset ``addr``."""
        self.m_rmap[addr >> 2] = np.uint32(data)

    def reset_fifos(self):
        """Reset both the TX and the RX FIFO via the CONTROL register."""
        self.write_reg(self.BASE_ADDR + self.CONTROL, self.TX_RESET | self.RX_RESET)

    def send_byte(self, data):
        """Busy-wait until the TX FIFO has room, then queue one byte."""
        while self.read_reg(self.BASE_ADDR + self.STATUS) & self.TX_FULL:
            pass
        self.write_reg(self.BASE_ADDR + self.TX_FIFO, data & 0xFF)

    def recv_byte(self):
        """Busy-wait for a received byte and return it as an int in 0..255."""
        while not (self.read_reg(self.BASE_ADDR + self.STATUS) & self.RX_VALID):
            pass
        # Only the low byte is data; masking keeps bytearray() from raising
        # if any upper bit of the register reads back as set.
        return self.read_reg(self.BASE_ADDR + self.RX_FIFO) & 0xFF

    def send_data(self, data):
        """Send every byte of the iterable ``data``."""
        for byte in data:
            self.send_byte(byte)

    def recv_data(self, size):
        """Block until ``size`` bytes were received and return them as a bytearray."""
        return bytearray([self.recv_byte() for _ in range(size)])

    async def recv_byte_async(self):
        """Like ``recv_byte`` but sleeps 1 ms between polls instead of spinning."""
        while not (self.read_reg(self.BASE_ADDR + self.STATUS) & self.RX_VALID):
            await asyncio.sleep(0.001)
        return self.read_reg(self.BASE_ADDR + self.RX_FIFO) & 0xFF

    async def recv_data_async(self, size):
        """Asynchronously receive ``size`` bytes and return them as a bytearray."""
        return bytearray([await self.recv_byte_async() for _ in range(size)])

async def main():
    """Print the incoming UART stream in 128-byte chunks until interrupted."""
    uart = XdmaUartLite()
    try:
        while True:
            data = await uart.recv_data_async(128)
            print(data.decode(errors="ignore"))
    finally:
        # Runs on Ctrl-C / task cancellation too, so the device is always closed.
        uart.close()


if __name__ == "__main__":
    asyncio.run(main())

Performance Comparison

CPU profiling results

Profiling revealed a stark difference between approaches:

  • Synchronous polling (both kernel and Python): consumes ~100% CPU on one core, spinning in a tight loop waiting for data
  • Async Python with asyncio.sleep(): reduces CPU usage to approximately 1% by sleeping for 1 ms between polls, which hands control back to the event loop instead of spinning
Async performance

GPS data output

For a 9600 baud GPS stream, the async approach is clearly the winner. The kernel driver approach makes more sense when you need standard TTY semantics (e.g., for gpsd) or when integrating with existing tools that expect a serial device.

Conclusion

We built a working Linux TTY driver for a UARTLite peripheral on an FPGA, accessible over PCIe. The driver is minimal but functional — it handles both RX and TX, registers as a proper TTY device, and works with standard tools like minicom and gpsd. For simpler use cases, the Python mmap approach offers a faster development cycle with better CPU efficiency when using async I/O. The whole project took about one evening — well, maybe a bit more than one evening, but who's counting?

FAQ

What is this article about in one sentence?

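This article shows how to expose an AXI UARTLite core on a Xilinx FPGA to Linux over PCIe (XDMA) as a standard /dev/ttyUL0 serial device using a small polling-based kernel TTY driver, and compares that driver with direct register access from Python.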

Who is this article for?

It is written for engineers, technical leaders, and curious readers who want a clear, implementation-focused explanation.

What should I read next?

Use the related articles below to continue with closely connected topics and concrete examples.