Let's Cram It In! UART Lite Over PCIe Straight Into Linux: A Driver in One Evening (Almost)
A hands-on guide to building a Linux kernel TTY driver for an AXI UARTLite peripheral on a Xilinx FPGA connected via PCIe XDMA, complete with full driver source code, a Python direct-access alternative, and performance profiling.
In this article, I'll show how to connect a UART peripheral on an FPGA to Linux via PCIe and make it appear as a standard /dev/ttyUL0 serial device. We'll write a kernel driver, test it with a GPS module, and compare it to a Python-based alternative.

Hardware Setup
- FPGA: Xilinx Artix-7 (XC7A200T) — part of a custom SDR board
- GPS Module: SIM68, outputting NMEA 0183 sentences
- Host: LattePanda Sigma
- Interface: PCIe Gen1 x4 via M.2 slot
- UART Speed: 9600 baud

The data path looks like this: GPS Module sends NMEA data via UART to the UARTLite IP core on the FPGA, which is connected through an AXI interconnect to the XDMA (DMA over PCIe) block. The XDMA block communicates with the host CPU over PCIe, and finally our driver presents the data to userspace applications.

UARTLite Register Map
The AXI UARTLite IP core has a simple register interface at base offset 0x40000:
- 0x00 — RX FIFO (read received byte)
- 0x04 — TX FIFO (write byte to transmit)
- 0x08 — STATUS register
- 0x0C — CONTROL register
Status flags: RXVALID (bit 0) indicates data is available in the RX FIFO; TXFULL (bit 3) indicates the TX FIFO is full.

Approach 1: Linux Kernel Driver
The driver registers as a PCIe device (Vendor ID 0x10EE — Xilinx, Device ID 0x7011) and creates a TTY device at /dev/ttyUL0. Instead of using interrupts (which would require modifying the FPGA design), we use a workqueue-based polling approach for receiving data.
Full Driver Source Code
#include <linux/version.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/tty_flip.h>
#include <linux/io.h>
#include <linux/workqueue.h>
#define DRIVER_NAME "uartlite_xdma"
#define VENDOR_ID 0x10EE
#define DEVICE_ID 0x7011
#define UARTLITE_BASE_OFFSET 0x40000
#define UARTLITE_RX_FIFO 0x00
#define UARTLITE_TX_FIFO 0x04
#define UARTLITE_STATUS 0x08
#define UARTLITE_CONTROL 0x0C
#define STATUS_RXVALID BIT(0)
#define STATUS_TXFULL BIT(3)
/**
 * struct uartlite_priv - per-device state of one UARTLite behind XDMA
 * @base:    MMIO pointer used for all UARTLite register accesses
 * @port:    TTY port embedded here so callbacks can recover this struct
 *           with container_of()
 * @rx_work: work item that drains the RX FIFO into the TTY flip buffer
 * @running: true while the RX work item is allowed to poll and re-queue
 */
struct uartlite_priv {
	void __iomem		*base;
	struct tty_port		port;
	struct work_struct	rx_work;
	bool			running;
};
/* TTY driver object for this module; allocated and registered in uartlite_init(). */
static struct tty_driver *uartlite_tty_driver;
static int uartlite_tx_ready(struct uartlite_priv *priv)
{
return !(ioread32(priv->base + UARTLITE_STATUS) & STATUS_TXFULL);
}
/* Write one byte into the TX FIFO register; the caller checks for room. */
static void uartlite_write_byte(struct uartlite_priv *priv, u8 val)
{
	void __iomem *tx_fifo = priv->base + UARTLITE_TX_FIFO;

	iowrite32(val, tx_fifo);
}
static int uartlite_rx_ready(struct uartlite_priv *priv)
{
return ioread32(priv->base + UARTLITE_STATUS) & STATUS_RXVALID;
}
static u8 uartlite_read_byte(struct uartlite_priv *priv)
{
return ioread32(priv->base + UARTLITE_RX_FIFO);
}
/*
 * RX polling worker.
 *
 * Drains the RX FIFO in chunks of up to 16 bytes, hands each chunk to the
 * TTY flip buffer, and then re-queues itself for as long as priv->running
 * is set. If no tty is attached to the port the worker returns without
 * re-queuing.
 */
static void uartlite_rx_work(struct work_struct *work)
{
	struct uartlite_priv *priv = container_of(work, struct uartlite_priv, rx_work);
	struct tty_struct *tty;
	unsigned char chunk[16];
	int filled;

	tty = tty_port_tty_get(&priv->port);
	if (!tty)
		return;

	while (priv->running && uartlite_rx_ready(priv)) {
		/* Pull bytes until the chunk is full or the FIFO runs dry. */
		filled = 0;
		while (filled < sizeof(chunk) && uartlite_rx_ready(priv)) {
			chunk[filled] = uartlite_read_byte(priv);
			filled++;
		}

		if (!filled)
			continue;

		tty_insert_flip_string(&priv->port, chunk, filled);
		tty_flip_buffer_push(&priv->port);
	}

	/* Polling model: queue the next pass straight away while enabled. */
	if (priv->running)
		schedule_work(&priv->rx_work);

	tty_kref_put(tty);
}
/*
 * tty_operations.open: attach the tty to the port, then start RX polling.
 *
 * The RX worker bails out (without re-queuing itself) when the port has no
 * tty attached, so it must only be scheduled after tty_port_open() has
 * linked the tty to the port. Scheduling it first lets the worker run
 * before the link exists and silently stops reception.
 *
 * Returns 0 on success or the error reported by tty_port_open().
 */
static int uartlite_tty_open(struct tty_struct *tty, struct file *filp)
{
	struct uartlite_priv *priv = container_of(tty->port, struct uartlite_priv, port);
	int ret;

	ret = tty_port_open(tty->port, tty, filp);
	if (ret)
		return ret;

	priv->running = true;
	/* No-op if the worker is already queued by an earlier opener. */
	schedule_work(&priv->rx_work);

	return 0;
}
/*
 * tty_operations.close: drop one reference on the port.
 *
 * RX polling is deliberately NOT stopped here. With several openers,
 * stopping on every close would cut reception for the remaining ones.
 * tty_port_close() invokes the port's shutdown operation when the last
 * opener goes away, and uartlite_port_shutdown() stops the worker there.
 */
static void uartlite_tty_close(struct tty_struct *tty, struct file *filp)
{
	tty_port_close(tty->port, tty, filp);
}
/*
 * tty_operations.write: push @count bytes from @buf into the TX FIFO.
 *
 * The prototype of tty_operations.write changed to (const u8 *, size_t)
 * returning ssize_t in Linux 6.6. Every 6.5.x release still uses the old
 * int-based prototype, so the guard must compare against 6.6.0 rather than
 * 6.5.0 (6.5.1 and later compare greater than KERNEL_VERSION(6, 5, 0)).
 *
 * Each byte busy-waits until the TX FIFO reports room.
 *
 * Returns the number of bytes written, which is always @count.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 6, 0)
static int uartlite_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
#else
static ssize_t uartlite_tty_write(struct tty_struct *tty, const u8 *buf, size_t count)
#endif
{
	struct uartlite_priv *priv = tty->driver_data;
	/* One unsigned length for both prototypes avoids mixed-sign compares. */
	const size_t len = count;
	size_t i;

	for (i = 0; i < len; i++) {
		while (!uartlite_tx_ready(priv))
			cpu_relax();
		uartlite_write_byte(priv, buf[i]);
	}

	return i;
}
/* tty_operations.write_room: report 16 bytes of room unless TXFULL is set. */
static unsigned int uartlite_tty_write_room(struct tty_struct *tty)
{
	struct uartlite_priv *priv = tty->driver_data;

	if (!uartlite_tx_ready(priv))
		return 0;

	return 16;
}
/* tty_operations.chars_in_buffer: this driver always reports zero pending bytes. */
static unsigned int uartlite_tty_chars_in_buffer(struct tty_struct *tty)
{
	const unsigned int pending = 0;

	return pending;
}
/* TTY callbacks installed on the driver by uartlite_init(). */
static const struct tty_operations uartlite_tty_ops = {
	.open			= uartlite_tty_open,
	.close			= uartlite_tty_close,
	.write			= uartlite_tty_write,
	.write_room		= uartlite_tty_write_room,
	.chars_in_buffer	= uartlite_tty_chars_in_buffer,
};
/*
 * tty_port_operations.activate: stash the per-device state in
 * tty->driver_data so the write-side callbacks can reach it. Always
 * returns 0.
 */
static int uartlite_port_activate(struct tty_port *port, struct tty_struct *tty)
{
	tty->driver_data = container_of(port, struct uartlite_priv, port);

	return 0;
}
/*
 * tty_port_operations.shutdown: stop RX polling.
 *
 * The flag is cleared first so the worker stops re-queuing itself, then
 * cancel_work_sync() waits for a pass that may still be running.
 */
static void uartlite_port_shutdown(struct tty_port *port)
{
	struct uartlite_priv *priv;

	priv = container_of(port, struct uartlite_priv, port);
	priv->running = false;
	cancel_work_sync(&priv->rx_work);
}
/* Port-level callbacks assigned to the port in uartlite_probe(). */
static const struct tty_port_operations uartlite_port_ops = {
	.activate	= uartlite_port_activate,
	.shutdown	= uartlite_port_shutdown,
};
static int uartlite_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
struct uartlite_priv *priv;
int ret;
priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
if (!priv)
return -ENOMEM;
ret = pcim_enable_device(pdev);
if (ret)
return ret;
priv->base = pcim_iomap(pdev, 0, 0);
if (!priv->base)
return -ENOMEM;
priv->base += UARTLITE_BASE_OFFSET;
tty_port_init(&priv->port);
priv->port.ops = &uartlite_port_ops;
INIT_WORK(&priv->rx_work, uartlite_rx_work);
priv->running = false;
tty_port_register_device(&priv->port, uartlite_tty_driver, 0, &pdev->dev);
pci_set_drvdata(pdev, priv);
dev_info(&pdev->dev, "UARTlite over XDMA registered as TTY");
return 0;
}
static void uartlite_remove(struct pci_dev *pdev)
{
struct uartlite_priv *priv = pci_get_drvdata(pdev);
tty_unregister_device(uartlite_tty_driver, 0);
tty_port_destroy(&priv->port);
}
/* PCI IDs claimed by this driver; the empty entry terminates the table. */
static const struct pci_device_id uartlite_pci_tbl[] = {
	{ PCI_DEVICE(VENDOR_ID, DEVICE_ID) },
	{ }
};
/* PCI driver glue binding probe/remove to the ID table. */
static struct pci_driver uartlite_pci_driver = {
	.name		= DRIVER_NAME,
	.id_table	= uartlite_pci_tbl,
	.probe		= uartlite_probe,
	.remove		= uartlite_remove,
};
/*
 * Module init: allocate and register the TTY driver, then register the
 * PCI driver. Failures unwind in reverse order through the labels below.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init uartlite_init(void)
{
	struct tty_driver *drv;
	int err;

	drv = tty_alloc_driver(1, TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV);
	uartlite_tty_driver = drv;
	if (IS_ERR(drv))
		return PTR_ERR(drv);

	drv->driver_name = DRIVER_NAME;
	drv->name = "ttyUL";
	drv->major = 0;
	drv->minor_start = 0;
	drv->type = TTY_DRIVER_TYPE_SERIAL;
	drv->subtype = SERIAL_TYPE_NORMAL;

	/* Start from the standard termios and override the control flags. */
	drv->init_termios = tty_std_termios;
	drv->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;

	tty_set_operations(drv, &uartlite_tty_ops);

	err = tty_register_driver(drv);
	if (err)
		goto err_put_driver;

	err = pci_register_driver(&uartlite_pci_driver);
	if (err)
		goto err_unregister_tty;

	return 0;

err_unregister_tty:
	tty_unregister_driver(drv);
err_put_driver:
	tty_driver_kref_put(drv);
	return err;
}
/* Module exit: undo uartlite_init() in reverse order. */
static void __exit uartlite_exit(void)
{
	/* Unbind the PCI driver first so no probe/remove runs afterwards. */
	pci_unregister_driver(&uartlite_pci_driver);

	/* Then unregister the TTY driver and drop its reference. */
	tty_unregister_driver(uartlite_tty_driver);
	tty_driver_kref_put(uartlite_tty_driver);
}
/* Module entry and exit points. */
module_init(uartlite_init);
module_exit(uartlite_exit);
/* Module metadata. */
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Konstantin");
MODULE_DESCRIPTION("UARTlite TTY driver over XDMA with RX support");
Makefile
# Out-of-tree Kbuild makefile for the uartlite_xdma module.
obj-m += uartlite_xdma.o
# ccflags-y is the supported Kbuild variable for extra compiler flags;
# EXTRA_CFLAGS is its deprecated predecessor.
ccflags-y += -g
KDIR ?= /lib/modules/$(shell uname -r)/build
# Recurse with $(MAKE) so flags such as -j and -n reach the kernel build.
all:
	$(MAKE) CC=/usr/bin/gcc-13 -C $(KDIR) M=$(PWD) modules
clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean
install:
	$(MAKE) -C $(KDIR) M=$(PWD) modules_install
.PHONY: all clean install
Building and Loading
# Install the headers for the running kernel and the compiler the Makefile invokes.
sudo apt install -y linux-headers-$(uname -r) gcc-13
# Build the module.
make
# Load the module into the running kernel.
sudo insmod uartlite_xdma.ko
sudo minicom -D /dev/ttyUL0
After loading the module, dmesg should show the registration message, and /dev/ttyUL0 will appear. You can then use minicom or gpsd to read GPS NMEA data.
Approach 2: Direct Python Access
For quick prototyping, you can bypass the kernel entirely by memory-mapping the XDMA user BAR directly from Python:
import os
import time
import numpy as np
import mmap
import asyncio
class XdmaUartLite:
    """Userspace access to an AXI UARTLite through the XDMA user BAR.

    The user BAR device node (``/dev/xdma<N>_user``) is memory-mapped and
    viewed as an array of 32-bit words; registers are addressed by byte
    offset from the start of the mapping.

    The object can be used as a context manager, which guarantees that the
    mapping and the file descriptor are released::

        with XdmaUartLite() as uart:
            uart.send_data(b"hello")
    """

    # Byte offset of the UARTLite register block inside the user BAR.
    BASE_ADDR = 0x40000
    # Register offsets relative to BASE_ADDR.
    RX_FIFO = 0x00
    TX_FIFO = 0x04
    STATUS = 0x08
    CONTROL = 0x0C
    # STATUS register bits.
    TX_FULL = 0x08
    RX_VALID = 0x01
    # CONTROL register bits.
    TX_RESET = 0x01
    RX_RESET = 0x02
    # Number of bytes of the user BAR to map.
    MAP_SIZE = int(1e6)

    def __init__(self, device_index=0):
        """Open and map ``/dev/xdma<device_index>_user`` and reset both FIFOs.

        Raises:
            OSError: if the device node cannot be opened or mapped.
        """
        base = f"/dev/xdma{device_index}"
        self._mmap = None
        self.m_rmap = None
        self.fd_user = os.open(f"{base}_user", os.O_RDWR)
        try:
            self._mmap = mmap.mmap(self.fd_user, self.MAP_SIZE)
            self.m_rmap = np.frombuffer(self._mmap, np.uint32)
        except Exception:
            # Do not leak the descriptor (or a half-built mapping) on failure.
            self.close()
            raise
        self.reset_fifos()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def close(self):
        """Release the register view, the mapping and the descriptor.

        Safe to call more than once. Register access is not possible after
        the object has been closed.
        """
        # Drop the numpy view first: an mmap cannot be closed while a
        # buffer exported from it is still alive.
        self.m_rmap = None
        mapping = getattr(self, "_mmap", None)
        if mapping is not None:
            self._mmap = None
            try:
                mapping.close()
            except BufferError:
                # Someone else still holds a view; the mapping is released
                # when that view is garbage-collected.
                pass
        fd = getattr(self, "fd_user", None)
        if fd is not None:
            self.fd_user = None
            os.close(fd)

    def read_reg(self, addr):
        """Return the low 16 bits of the 32-bit word at byte offset ``addr``."""
        return int(self.m_rmap[addr >> 2]) & 0xFFFF

    def write_reg(self, addr, data):
        """Store ``data`` as a 32-bit word at byte offset ``addr``."""
        self.m_rmap[addr >> 2] = np.uint32(data)

    def reset_fifos(self):
        """Write both FIFO reset bits to the CONTROL register."""
        self.write_reg(self.BASE_ADDR + self.CONTROL, self.TX_RESET | self.RX_RESET)

    def send_byte(self, data):
        """Busy-wait until TX_FULL clears, then write one byte to the TX FIFO."""
        while self.read_reg(self.BASE_ADDR + self.STATUS) & self.TX_FULL:
            pass
        self.write_reg(self.BASE_ADDR + self.TX_FIFO, data)

    def recv_byte(self):
        """Busy-wait until RX_VALID is set, then return one byte (0-255).

        Only the low 8 bits of the RX FIFO word are returned, so the result
        is always a valid element for ``bytes``/``bytearray``.
        """
        while not (self.read_reg(self.BASE_ADDR + self.STATUS) & self.RX_VALID):
            pass
        return self.read_reg(self.BASE_ADDR + self.RX_FIFO) & 0xFF

    def send_data(self, data):
        """Send every byte of the iterable ``data`` with :meth:`send_byte`."""
        for byte in data:
            self.send_byte(byte)

    def recv_data(self, size):
        """Block until ``size`` bytes are received; return them as a bytearray."""
        return bytearray(self.recv_byte() for _ in range(size))

    async def recv_byte_async(self):
        """Like :meth:`recv_byte`, but yields to the event loop between polls."""
        while not (self.read_reg(self.BASE_ADDR + self.STATUS) & self.RX_VALID):
            await asyncio.sleep(0.001)
        return self.read_reg(self.BASE_ADDR + self.RX_FIFO) & 0xFF

    async def recv_data_async(self, size):
        """Asynchronously receive ``size`` bytes; return them as a bytearray."""
        return bytearray([await self.recv_byte_async() for _ in range(size)])
async def main():
    """Print received data in 128-byte chunks until the task is cancelled."""
    uart = XdmaUartLite()
    try:
        while True:
            data = await uart.recv_data_async(128)
            print(data.decode(errors="ignore"))
    finally:
        # Runs on Ctrl-C/cancellation too, so the device handle is released.
        uart.close()
asyncio.run(main())
Performance Comparison

Profiling revealed a stark difference between approaches:
- Synchronous polling (both kernel and Python): consumes ~100% CPU on one core, spinning in a tight loop waiting for data
- Async Python with asyncio.sleep(): reduces CPU usage to approximately 1% by yielding the thread between polls


For a 9600 baud GPS stream, the async approach is clearly the winner. The kernel driver approach makes more sense when you need standard TTY semantics (e.g., for gpsd) or when integrating with existing tools that expect a serial device.
Conclusion
We built a working Linux TTY driver for a UARTLite peripheral on an FPGA, accessible over PCIe. The driver is minimal but functional — it handles both RX and TX, registers as a proper TTY device, and works with standard tools like minicom and gpsd. For simpler use cases, the Python mmap approach offers a faster development cycle with better CPU efficiency when using async I/O. The whole project took about one evening — well, maybe a bit more than one evening, but who's counting?
FAQ
What is this article about in one sentence?
This article explains the core idea in practical terms and focuses on what you can apply in real work.
Who is this article for?
It is written for engineers, technical leaders, and curious readers who want a clear, implementation-focused explanation.
What should I read next?
Use the related articles below to continue with closely connected topics and concrete examples.