diff --git a/sys/conf/files.riscv b/sys/conf/files.riscv index 90a74367a54f..58a31b5f326e 100644 --- a/sys/conf/files.riscv +++ b/sys/conf/files.riscv @@ -6,6 +6,7 @@ cddl/dev/fbt/riscv/fbt_isa.c optional dtrace_fbt | dtraceall compile-with "${ crypto/des/des_enc.c optional netsmb dev/cpufreq/cpufreq_dt.c optional cpufreq fdt dev/ofw/ofw_cpu.c optional fdt +dev/ofw/ofw_pci.c optional pci fdt dev/ofw/ofw_pcib.c optional pci fdt dev/pci/pci_dw.c optional pci fdt dev/pci/pci_dw_if.m optional pci fdt @@ -42,6 +43,14 @@ libkern/memset.c standard libkern/strcmp.c standard libkern/strlen.c standard libkern/strncmp.c standard +riscv/iommu/iommu_frontend.c standard +riscv/iommu/iommu_if.m standard +riscv/iommu/iommu.c standard +riscv/iommu/iommu_fdt.c optional fdt +riscv/iommu/iommu_pci.c optional pci +riscv/iommu/iommu_pmap.c optional iommu +dev/iommu/busdma_iommu.c optional iommu +dev/iommu/iommu_gas.c optional iommu riscv/riscv/aplic.c standard riscv/riscv/autoconf.c standard riscv/riscv/bus_machdep.c standard diff --git a/sys/riscv/conf/GENERIC b/sys/riscv/conf/GENERIC index 827d5efef50b..cce2787ed5d7 100644 --- a/sys/riscv/conf/GENERIC +++ b/sys/riscv/conf/GENERIC @@ -77,6 +77,7 @@ options RACCT # Resource accounting framework options RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default options RCTL # Resource limits options SMP +options IOMMU # RISC-V SBI console device rcons diff --git a/sys/riscv/include/bus_dma_impl.h b/sys/riscv/include/bus_dma_impl.h index 09fd29b74f8e..8c2040a68f52 100644 --- a/sys/riscv/include/bus_dma_impl.h +++ b/sys/riscv/include/bus_dma_impl.h @@ -41,6 +41,7 @@ struct bus_dma_tag_common { int flags; bus_dma_lock_t *lockfunc; void *lockfuncarg; + int domain; }; struct bus_dma_impl { @@ -52,6 +53,8 @@ struct bus_dma_impl { int (*tag_destroy)(bus_dma_tag_t dmat); int (*map_create)(bus_dma_tag_t dmat, int flags, bus_dmamap_t *mapp); int (*map_destroy)(bus_dma_tag_t dmat, bus_dmamap_t map); + int (*tag_set_domain)(bus_dma_tag_t); + bool (*id_mapped)(bus_dma_tag_t, vm_paddr_t, bus_size_t); int (*mem_alloc)(bus_dma_tag_t dmat, void** vaddr, int flags, bus_dmamap_t *mapp); void (*mem_free)(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map); diff --git a/sys/riscv/include/iommu.h b/sys/riscv/include/iommu.h new file mode 100644 index 000000000000..38214b7003f4 --- /dev/null +++ b/sys/riscv/include/iommu.h @@ -0,0 +1,10 @@ +/*- + * This file is in the public domain. + */ + +#ifndef _MACHINE_IOMMU_H_ +#define _MACHINE_IOMMU_H_ + +#include + +#endif /* !_MACHINE_IOMMU_H_ */ diff --git a/sys/riscv/iommu/iommu.c b/sys/riscv/iommu/iommu.c new file mode 100644 index 000000000000..59df2e68384f --- /dev/null +++ b/sys/riscv/iommu/iommu.c @@ -0,0 +1,1351 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2026 Ruslan Bukin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include + +#include "iommu_if.h" + +#define dprintf(fmt, ...) + +MALLOC_DEFINE(M_IOMMU, "RISCV_IOMMU", "RISC-V IOMMU"); + +#define RD4(sc, reg) bus_read_4(sc->res[0], (reg)) +#define WR4(sc, reg, val) bus_write_4(sc->res[0], (reg), (val)) +#define RD8(sc, reg) bus_read_8(sc->res[0], (reg)) +#define WR8(sc, reg, val) bus_write_8(sc->res[0], (reg), (val)) + +#define CQ_ENTRY_DWORDS 2 /* 16-byte */ +#define CQ_ENTRY_COUNT 8192 /* Amount of 16-byte entries. */ +#define FQ_ENTRY_DWORDS 4 /* 32-byte */ +#define FQ_ENTRY_COUNT 8192 /* Amount of 32-byte entries. */ +#define PQ_ENTRY_DWORDS 2 /* 16-byte */ +#define PQ_ENTRY_COUNT 8192 /* Amount of 16-byte entries. */ + +#define DDT_NON_LEAF_DWORDS 1 +#define DDT_DC_STD_DWORDS 4 /* Standard-format DC. */ +#define DDT_DC_EXT_DWORDS 8 /* Extended-format DC. */ +#define DDT_L1_DID_BITS 9 /* All formats. */ + +#define QUEUE_ALIGN (1024 * 1024) /* TODO */ +#define QUEUE_HEAD(q) ((q)->csr + RISCV_IOMMU_CQH - RISCV_IOMMU_CQB) +#define QUEUE_TAIL(q) ((q)->csr + RISCV_IOMMU_CQT - RISCV_IOMMU_CQB) +#define QUEUE_IPSR(q) (1 << (q)->idx) + +#define PHYS_TO_PPN(p) ((p) >> 12) + +struct riscv_iommu_fq_event { + uint16_t cause_id; + char *descr; +}; + +static struct riscv_iommu_fq_event fq_events[] = { + { FQ_CAUSE_INST_FAULT, "Instruction access fault" }, + { FQ_CAUSE_RD_ADDR_MISALIGNED, "Read address misaligned" }, + { FQ_CAUSE_RD_FAULT, "Read access fault" }, + { FQ_CAUSE_WR_ADDR_MISALIGNED, "Write/AMO address misaligned" }, + { FQ_CAUSE_WR_FAULT, "Write/AMO access fault" }, + { FQ_CAUSE_INST_FAULT_S, "Instruction page fault" }, + { FQ_CAUSE_RD_FAULT_S, "Read page fault" }, + { FQ_CAUSE_WR_FAULT_S, "Write/AMO page fault" }, + { FQ_CAUSE_INST_FAULT_VS, "Instruction guest page fault" }, + { FQ_CAUSE_RD_FAULT_VS, "Read guest-page fault" }, + { FQ_CAUSE_WR_FAULT_VS, "Write/AMO guest-page fault" }, + { FQ_CAUSE_DMA_DISABLED, "All inbound transactions disallowed" }, + { FQ_CAUSE_DDT_LOAD_FAULT, "DDT entry load access fault" }, + { FQ_CAUSE_DDT_INVALID, "DDT entry not valid" }, + { FQ_CAUSE_DDT_MISCONFIGURED, "DDT entry misconfigured" }, + { FQ_CAUSE_TR_TYPE_DISALLOWED, "Transaction type disallowed" }, + { FQ_CAUSE_MSI_LOAD_FAULT, "MSI PTE load access fault" }, + { FQ_CAUSE_MSI_INVALID, "MSI PTE not valid" }, + { FQ_CAUSE_MSI_MISCONFIGURED, "MSI PTE misconfigured" }, + { FQ_CAUSE_MRIF_FAULT, "MRIF access fault" }, + { FQ_CAUSE_PDT_LOAD_FAULT, "PDT entry load access fault" }, + { FQ_CAUSE_PDT_INVALID, "PDT entry not valid" }, + { FQ_CAUSE_PDT_MISCONFIGURED, "PDT entry misconfigured" }, + { FQ_CAUSE_DDT_CORRUPTED, "DDT data corruption" }, + { FQ_CAUSE_PDT_CORRUPTED, "PDT data corruption" }, + { FQ_CAUSE_MSI_PT_CORRUPTED, "MSI PT data corruption" }, + { FQ_CAUSE_MRIF_CORRUPTED, "MSI MRIF data corruption" }, + { FQ_CAUSE_INTERNAL_DP_ERROR, "Internal data path error" }, + { FQ_CAUSE_MSI_WR_FAULT, "IOMMU MSI write access fault" }, + { FQ_CAUSE_PT_CORRUPTED, "1st/2nd-stage PT data corruption" }, + { 0, NULL }, +}; + +static void +riscv_iommu_init_pscids(struct riscv_iommu_softc *sc) +{ + + sc->pscid_set_size = (1 << sc->pscid_bits); + sc->pscid_set = bit_alloc(sc->pscid_set_size, M_IOMMU, M_WAITOK); + mtx_init(&sc->pscid_set_mutex, "pscid set", NULL, MTX_SPIN); +} + +static int +riscv_iommu_pscid_alloc(struct riscv_iommu_softc *sc, int *new_pscid) +{ + + mtx_lock_spin(&sc->pscid_set_mutex); + bit_ffc(sc->pscid_set, sc->pscid_set_size, new_pscid); + if (*new_pscid == -1) { + mtx_unlock_spin(&sc->pscid_set_mutex); + return (ENOMEM); + } + bit_set(sc->pscid_set, *new_pscid); + mtx_unlock_spin(&sc->pscid_set_mutex); + + return (0); +} + +static void +riscv_iommu_pscid_free(struct riscv_iommu_softc *sc, int pscid) +{ + + mtx_lock_spin(&sc->pscid_set_mutex); + bit_clear(sc->pscid_set, pscid); + mtx_unlock_spin(&sc->pscid_set_mutex); +} + +static uint32_t +riscv_iommu_q_inc_tail(struct riscv_iommu_queue *q) +{ + + return ((q->lc.tail + 1) & q->mask); +} + +static uint32_t +riscv_iommu_q_inc_head(struct riscv_iommu_queue *q) +{ + + return ((q->lc.head + 1) & q->mask); +} + +static int +riscv_iommu_q_has_space(struct riscv_iommu_queue *q) +{ + + if (riscv_iommu_q_inc_tail(q) != q->lc.head) + return (1); + + return (0); +} + +static int +riscv_iommu_q_empty(struct riscv_iommu_queue *q) +{ + + if (q->lc.tail == q->lc.head) + return (1); + + return (0); +} + +static int +riscv_iommu_dequeue(struct riscv_iommu_softc *sc, struct riscv_iommu_queue *q, + void *data) +{ + void *entry_addr; + + q->lc.val = RD8(sc, q->head_off); + entry_addr = (void *)((uint64_t)q->vaddr + q->lc.head * q->entry_size); + memcpy(data, entry_addr, q->entry_size); + q->lc.head = riscv_iommu_q_inc_head(q); + WR4(sc, q->head_off, q->lc.head); + + return (0); +} + +static int +riscv_iommu_enqueue(struct riscv_iommu_softc *sc, struct riscv_iommu_queue *q, + void *data) +{ + void *entry_addr; + + RISCV_IOMMU_LOCK(sc); + + /* Ensure that a space is available. */ + do { + q->lc.head = RD4(sc, q->head_off); + } while (riscv_iommu_q_has_space(q) == 0); + + /* Write the command to the current tail entry. */ + entry_addr = (void *)((uint64_t)q->vaddr + q->lc.tail * q->entry_size); + memcpy(entry_addr, data, q->entry_size); + + /* Increment tail index. */ + q->lc.tail = riscv_iommu_q_inc_tail(q); + WR4(sc, q->tail_off, q->lc.tail); + + RISCV_IOMMU_UNLOCK(sc); + + return (0); +} + +static void +riscv_iommu_sync(struct riscv_iommu_softc *sc, struct riscv_iommu_queue *q) +{ + struct riscv_iommu_command cmd; + uint64_t reg; + + bzero(&cmd, sizeof(struct riscv_iommu_command)); + reg = COMMAND_OPCODE_IOFENCE; + reg |= FUNC_IOFENCE_FUNC_C | FUNC_IOFENCE_PR | FUNC_IOFENCE_PW; + cmd.dword0 = reg; + + riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd); + + /* + * FUNC_IOFENCE_WSI does not seem to be implemented in QEMU, + * so ensure all requests are processed in polling mode; + */ + do { + q->lc.head = RD4(sc, q->head_off); + } while (riscv_iommu_q_empty(q) == 0); +} + +static int +riscv_iommu_inval_ddt(struct riscv_iommu_softc *sc) +{ + struct riscv_iommu_command cmd; + uint64_t reg; + + bzero(&cmd, sizeof(struct riscv_iommu_command)); + reg = COMMAND_OPCODE_IODIR; + reg |= FUNC_IODIR_INVAL_DDT; + cmd.dword0 = reg; + + riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd); + + return (0); +} + +static int +riscv_iommu_inval_ddt_did(struct riscv_iommu_softc *sc, int did) +{ + struct riscv_iommu_command cmd; + uint64_t reg; + + bzero(&cmd, sizeof(struct riscv_iommu_command)); + reg = COMMAND_OPCODE_IODIR; + reg |= FUNC_IODIR_INVAL_DDT; + reg |= FUNC_IODIR_DV; + reg |= (uint64_t)did << FUNC_IODIR_DID_S; + cmd.dword0 = reg; + + riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd); + + return (0); +} + +/* Invalidate entire address space. */ +static int +riscv_iommu_inval_vma(struct riscv_iommu_softc *sc) +{ + struct riscv_iommu_command cmd; + uint64_t reg; + + bzero(&cmd, sizeof(struct riscv_iommu_command)); + reg = COMMAND_OPCODE_IOTINVAL; + reg |= FUNC_IOTINVAL_VMA; + cmd.dword0 = reg; + + riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd); + + return (0); +} + +static int +riscv_iommu_inval_vma_page(struct riscv_iommu_softc *sc, vm_offset_t addr, + int pscid) +{ + struct riscv_iommu_command cmd; + uint64_t reg; + + bzero(&cmd, sizeof(struct riscv_iommu_command)); + reg = COMMAND_OPCODE_IOTINVAL; + reg |= FUNC_IOTINVAL_VMA; + reg |= FUNC_IOTINVAL_AV; + reg |= FUNC_IOTINVAL_PSCV; + reg |= pscid << FUNC_IOTINVAL_PSCID_S; + cmd.dword0 = reg; + cmd.dword1 = PHYS_TO_PPN(addr) << FUNC_IOTINVAL_ADDR_S; + + riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd); + + return (0); +} + +static int +riscv_iommu_inval_vma_pscid(struct riscv_iommu_softc *sc, int pscid) +{ + struct riscv_iommu_command cmd; + uint64_t reg; + + bzero(&cmd, sizeof(struct riscv_iommu_command)); + reg = COMMAND_OPCODE_IOTINVAL; + reg |= FUNC_IOTINVAL_VMA; + reg |= FUNC_IOTINVAL_PSCV; + reg |= pscid << FUNC_IOTINVAL_PSCID_S; + cmd.dword0 = reg; + + riscv_iommu_enqueue(sc, &sc->cq, (void *)&cmd); + + return (0); +} + +static int +riscv_iommu_set_mode(struct riscv_iommu_softc *sc) +{ + struct riscv_iommu_ddt *ddt; + uint64_t reg; + uint64_t base; + + reg = RD8(sc, RISCV_IOMMU_DDTP); + if (reg & DDTP_BUSY) + return (ENXIO); + + ddt = &sc->ddt; + base = ddt->base | (sc->iommu_mode << DDTP_IOMMU_MODE_S); + WR8(sc, RISCV_IOMMU_DDTP, base); + + reg = RD8(sc, RISCV_IOMMU_DDTP); + if (reg != base) { + device_printf(sc->dev, "could not set mode\n"); + return (ENXIO); + } + + riscv_iommu_inval_ddt(sc); + riscv_iommu_inval_vma(sc); + + return (0); +} + +static int +riscv_iommu_enable_queue(struct riscv_iommu_softc *sc, + struct riscv_iommu_queue *q) +{ + uint32_t reg; + int timeout; + + if (q == &sc->cq) + WR4(sc, QUEUE_TAIL(q), 0); + else + WR4(sc, QUEUE_HEAD(q), 0); + + reg = CQCSR_CQEN | CQCSR_CIE | CQCSR_CQMF; + WR4(sc, q->csr, reg); + + timeout = 1000; + do { + reg = RD4(sc, RISCV_IOMMU_CQCSR); + if ((reg & CQCSR_BUSY) == 0) + break; + DELAY(10); + } while (timeout--); + + if (timeout <= 0) { + device_printf(sc->dev, "could not enable command queue\n"); + return (-1); + } + + if ((reg & CQCSR_CQON) == 0) { + device_printf(sc->dev, "could not activate command queue\n"); + return (-1); + } + + /* RW1C interrupt pending bit. */ + WR4(sc, RISCV_IOMMU_IPSR, QUEUE_IPSR(q)); + + return (0); +} + +static int +riscv_iommu_init_queue(struct riscv_iommu_softc *sc, + struct riscv_iommu_queue *q, uint64_t base, uint32_t dwords) +{ + uint64_t reg; + int sz; + + q->entry_size = dwords * 8; + sz = (1 << q->size_log2) * q->entry_size; + + /* Set up the command circular buffer */ + q->vaddr = contigmalloc(sz, M_IOMMU, M_WAITOK | M_ZERO, 0, + (1ul << 48) - 1, QUEUE_ALIGN, 0); + if (q->vaddr == NULL) { + device_printf(sc->dev, "failed to allocate %d bytes\n", sz); + return (-1); + } + + q->mask = (1 << q->size_log2) - 1; + q->head_off = (uint32_t)base - RISCV_IOMMU_CQB + RISCV_IOMMU_CQH; + q->tail_off = (uint32_t)base - RISCV_IOMMU_CQB + RISCV_IOMMU_CQT; + q->paddr = vtophys(q->vaddr); + q->base = (sc->cq.size_log2 - 1) << CQB_LOG2SZ_1_S; + q->base |= PHYS_TO_PPN(q->paddr) << CQB_PPN_S; + WR8(sc, base, q->base); + + /* Verify it sticks. */ + reg = RD8(sc, base); + if (reg != q->base) { + device_printf(sc->dev, "could not init queue\n"); + return (ENXIO); + } + + return (0); +} + +static int +riscv_iommu_init_queues(struct riscv_iommu_softc *sc) +{ + int error; + + sc->cq.size_log2 = ilog2(CQ_ENTRY_COUNT); + sc->fq.size_log2 = ilog2(FQ_ENTRY_COUNT); + sc->pq.size_log2 = ilog2(PQ_ENTRY_COUNT); + + sc->cq.csr = RISCV_IOMMU_CQCSR; + sc->fq.csr = RISCV_IOMMU_FQCSR; + sc->pq.csr = RISCV_IOMMU_PQCSR; + + sc->cq.idx = 0; + sc->fq.idx = 1; + sc->pq.idx = 3; + + /* Command queue (CQ). */ + error = riscv_iommu_init_queue(sc, &sc->cq, RISCV_IOMMU_CQB, + CQ_ENTRY_DWORDS); + if (error) + return (error); + + /* Fault queue (FQ). */ + error = riscv_iommu_init_queue(sc, &sc->fq, RISCV_IOMMU_FQB, + FQ_ENTRY_DWORDS); + if (error) + return (error); + + /* Page request queue (PQ). */ + error = riscv_iommu_init_queue(sc, &sc->pq, RISCV_IOMMU_PQB, + PQ_ENTRY_DWORDS); + if (error) + return (error); + + error = riscv_iommu_enable_queue(sc, &sc->cq); + if (error) + return (error); + + error = riscv_iommu_enable_queue(sc, &sc->fq); + if (error) + return (error); + + error = riscv_iommu_enable_queue(sc, &sc->pq); + if (error) + return (error); + + return (0); +} + +static int +riscv_iommu_init_pagedir(struct riscv_iommu_softc *sc) +{ + + return (0); +} + +static void +riscv_iommu_print_fault(struct riscv_iommu_softc *sc, + struct riscv_iommu_fq_record *rec) +{ + struct riscv_iommu_fq_event *ev; + uint16_t cause_id; + uint16_t ttyp; + uint32_t did; + uint32_t pid; + bool pv, priv; + int i; + + cause_id = (rec->hdr & FQR_HDR_CAUSE_M) >> FQR_HDR_CAUSE_S; + ttyp = (rec->hdr & FQR_HDR_TTYP_M) >> FQR_HDR_TTYP_S; + did = (rec->hdr & FQR_HDR_DID_M) >> FQR_HDR_DID_S; + pid = (rec->hdr & FQR_HDR_PID_M) >> FQR_HDR_PID_S; + pv = (rec->hdr & FQR_HDR_PV) ? 1 : 0; + priv = (rec->hdr & FQR_HDR_PRIV) ? 1 : 0; + + ev = NULL; + for (i = 0; fq_events[i].cause_id != 0; i++) { + if (fq_events[i].cause_id == cause_id) { + ev = &fq_events[i]; + break; + } + } + + if (ev == NULL) { + device_printf(sc->dev, "Fault: unknown fault 0x%x received\n", + cause_id); + return; + } + + device_printf(sc->dev, "Fault: event 0x%x received: %s\n", + ev->cause_id, ev->descr); + device_printf(sc->dev, " hdr 0x%lx\n", rec->hdr); + device_printf(sc->dev, " iotval 0x%lx\n", rec->iotval); + device_printf(sc->dev, " iotval2 0x%lx\n", rec->iotval2); + device_printf(sc->dev, " ttyp 0x%x did 0x%x pid 0x%x pv %d priv %d" + "\n", ttyp, did, pid, pv, priv); +} + +static int +riscv_cq_intr(void *arg) +{ + struct riscv_iommu_softc *sc; + struct riscv_iommu_queue *q; + uint32_t reg; + + sc = arg; + q = &sc->cq; + + reg = RD4(sc, q->csr); + printf("%s: pending %x\n", __func__, reg); + + /* Clear pending bit. */ + WR4(sc, RISCV_IOMMU_IPSR, IPSR_CIP); + + return (FILTER_HANDLED); +} + +static int +riscv_fq_intr(void *arg) +{ + struct riscv_iommu_fq_record rec; + struct riscv_iommu_softc *sc; + struct riscv_iommu_queue *q; + uint32_t reg; + + sc = arg; + q = &sc->fq; + + reg = RD4(sc, q->csr); + printf("%s: pending %x\n", __func__, reg); + + /* Clear pending bit. */ + WR4(sc, RISCV_IOMMU_IPSR, IPSR_FIP); + + do { + riscv_iommu_dequeue(sc, q, &rec); + riscv_iommu_print_fault(sc, &rec); + } while (!riscv_iommu_q_empty(q)); + + return (FILTER_HANDLED); +} + +static int +riscv_pm_intr(void *arg) +{ + struct riscv_iommu_softc *sc; + + sc = arg; + + printf("%s\n", __func__); + + /* Clear pending bit. */ + WR4(sc, RISCV_IOMMU_IPSR, IPSR_PMIP); + + return (FILTER_HANDLED); +} + +static int +riscv_pq_intr(void *arg) +{ + struct riscv_iommu_softc *sc; + struct riscv_iommu_queue *q; + uint32_t reg; + + sc = arg; + q = &sc->pq; + + reg = RD4(sc, q->csr); + printf("%s: pending %x\n", __func__, reg); + + /* Clear pending bit. */ + WR4(sc, RISCV_IOMMU_IPSR, IPSR_PIP); + + return (FILTER_HANDLED); +} + +static int +riscv_iommu_init_ddt_linear(struct riscv_iommu_softc *sc) +{ + struct riscv_iommu_ddt *ddt; + uint64_t size; + uint64_t reg; + + ddt = &sc->ddt; + ddt->num_top_entries = (1 << sc->l0_did_bits); + + size = ddt->num_top_entries * (sc->dc_dwords << 3); + + if (bootverbose) + device_printf(sc->dev, "linear ddt size %ld, num_top_entries " + "%d\n", size, ddt->num_top_entries); + + ddt->vaddr = contigmalloc(size, M_IOMMU, + M_WAITOK | M_ZERO, /* flags */ + 0, /* low */ + (1ul << 48) - 1, /* high */ + size, /* alignment */ + 0); /* boundary */ + if (ddt->vaddr == NULL) { + device_printf(sc->dev, "failed to allocate ddt\n"); + return (ENXIO); + } + + reg = vtophys(ddt->vaddr); + if (bootverbose) + device_printf(sc->dev, "ddt base %p size %lx\n", ddt->vaddr, + size); + ddt->base = PHYS_TO_PPN(reg) << DDTP_PPN_S; + + return (0); +} + +static int +riscv_iommu_init_ddt_2lvl(struct riscv_iommu_softc *sc) +{ + struct riscv_iommu_ddt *ddt; + uint64_t size; + uint64_t reg; + uint64_t sz; + + ddt = &sc->ddt; + ddt->num_top_entries = (1 << DDT_L1_DID_BITS); + + size = ddt->num_top_entries * (DDT_NON_LEAF_DWORDS << 3); + + if (bootverbose) + device_printf(sc->dev, "%s: size %lu, l1 entries %d, size " + "%lu\n", __func__, size, ddt->num_top_entries, size); + + ddt->vaddr = contigmalloc(size, M_IOMMU, + M_WAITOK | M_ZERO, /* flags */ + 0, /* low */ + (1ul << 48) - 1, /* high */ + size, /* alignment */ + 0); /* boundary */ + if (ddt->vaddr == NULL) { + device_printf(sc->dev, "Failed to allocate 2lvl ddt.\n"); + return (ENOMEM); + } + + sz = ddt->num_top_entries * sizeof(struct l1_desc); + ddt->l1 = malloc(sz, M_IOMMU, M_WAITOK | M_ZERO); + + reg = vtophys(ddt->vaddr); + if (bootverbose) + device_printf(sc->dev, "ddt base %p size %lx\n", ddt->vaddr, + size); + ddt->base = PHYS_TO_PPN(reg) << DDTP_PPN_S; + + return (0); +} + +static int +riscv_iommu_init_l0_directory(struct riscv_iommu_softc *sc, int sid) +{ + struct riscv_iommu_ddt *ddt; + struct l1_desc *l1_desc; + uint64_t *l1e; + uint64_t val; + size_t size; + int i; + + ddt = &sc->ddt; + l1_desc = &ddt->l1[sid >> sc->l0_did_bits]; + if (l1_desc->va) { + /* Already allocated. */ + return (0); + } + + size = (1 << sc->l0_did_bits) * (sc->dc_dwords << 3); + + l1_desc->va = contigmalloc(size, M_IOMMU, + M_WAITOK | M_ZERO, /* flags */ + 0, /* low */ + (1ul << 48) - 1, /* high */ + size, /* alignment */ + 0); /* boundary */ + if (l1_desc->va == NULL) { + device_printf(sc->dev, "failed to allocate l0 directory\n"); + return (ENXIO); + } + + l1_desc->pa = vtophys(l1_desc->va); + + i = sid >> sc->l0_did_bits; + l1e = (void *)((uint64_t)ddt->vaddr + DDT_NON_LEAF_DWORDS * 8 * i); + + /* Install the L1 entry. */ + val = PHYS_TO_PPN(l1_desc->pa) << DC_NON_LEAF_ENTRY_PPN_S; + val |= DC_NON_LEAF_ENTRY_VALID; + *l1e = val; + + return (0); +} + +static void * +riscv_iommu_get_dc_addr(struct riscv_iommu_softc *sc, int did) +{ + struct riscv_iommu_ddt *ddt; + struct l1_desc *l1_desc; + uintptr_t l0_base; + void *addr; + int l0_offs; + int l1_idx; + + ddt = &sc->ddt; + + l0_offs = sc->dc_dwords * 8 * (did & ((1 << sc->l0_did_bits) - 1)); + + if (sc->iommu_mode == DDTP_IOMMU_MODE_2LVL) { + l1_idx = (did >> sc->l0_did_bits) & + ((1 << DDT_L1_DID_BITS) - 1); + l1_desc = &ddt->l1[l1_idx]; + l0_base = (uintptr_t)l1_desc->va; + } else + l0_base = (uintptr_t)ddt->vaddr; + + addr = (void *)(l0_base + l0_offs); + + dprintf("ddt vaddr %p addr %p\n", ddt->vaddr, addr); + + return (addr); +} + +static int +riscv_iommu_init_dc(struct riscv_iommu_softc *sc, + struct riscv_iommu_domain *domain, int did, bool bypass) +{ + struct riscv_iommu_dc_base *dc_base; + struct riscv_iommu_dc *dc; + struct riscv_iommu_pmap *p; + + dc = riscv_iommu_get_dc_addr(sc, did); + dc_base = &dc->base; + + device_printf(sc->dev, "address translation for device id" + " 0x%x is %s.\n", did, bypass ? "bypassed" : "enabled"); + + p = &domain->p; + + bzero(dc_base, sizeof(struct riscv_iommu_dc_base)); + if (bypass == false) + dc_base->fsc = p->pm_satp; + dc_base->ta = (domain->pscid << DC_TA_PSCID_S) | DC_TA_V; + + riscv_iommu_inval_ddt_did(sc, did); + riscv_iommu_sync(sc, &sc->cq); + dc_base->tc |= DC_TC_V; + riscv_iommu_inval_ddt_did(sc, did); + riscv_iommu_inval_vma(sc); + riscv_iommu_sync(sc, &sc->cq); + + return (0); +} + +static void +riscv_iommu_deinit_dc(struct riscv_iommu_softc *sc, int did) +{ + struct riscv_iommu_dc_base *dc_base; + struct riscv_iommu_dc *dc; + + dc = riscv_iommu_get_dc_addr(sc, did); + dc_base = &dc->base; + dc_base->tc &= ~DC_TC_V; + + riscv_iommu_inval_ddt_did(sc, did); + riscv_iommu_sync(sc, &sc->cq); +} + +static int +riscv_iommu_setup_interrupts(struct riscv_iommu_softc *sc) +{ + device_t dev; + int error; + + dev = sc->dev; + + if (sc->res[1] == NULL || sc->res[2] == NULL || + sc->res[3] == NULL || sc->res[4] == NULL) { + device_printf(dev, "Warning: no interrupt resources " + "provided.\n"); + return (ENXIO); + } + + error = bus_setup_intr(dev, sc->res[1], INTR_TYPE_MISC, + riscv_cq_intr, NULL, sc, &sc->intr_cookie[0]); + if (error) { + device_printf(dev, "Couldn't setup cq interrupt handler\n"); + return (ENXIO); + } + + error = bus_setup_intr(dev, sc->res[2], INTR_TYPE_MISC, + riscv_fq_intr, NULL, sc, &sc->intr_cookie[1]); + if (error) { + device_printf(dev, "Couldn't setup fq interrupt handler\n"); + return (ENXIO); + } + + error = bus_setup_intr(dev, sc->res[3], INTR_TYPE_MISC, + riscv_pm_intr, NULL, sc, &sc->intr_cookie[2]); + if (error) { + device_printf(dev, "Couldn't setup pm interrupt handler\n"); + return (ENXIO); + } + + error = bus_setup_intr(dev, sc->res[4], INTR_TYPE_MISC, + riscv_pq_intr, NULL, sc, &sc->intr_cookie[3]); + if (error) { + device_printf(dev, "Couldn't setup pq interrupt handler\n"); + return (ENXIO); + } + + WR8(sc, RISCV_IOMMU_ICVEC, 0 << 0 | 1 << 4 | 2 << 8 | 3 << 12); + + return (0); +} + +int +riscv_iommu_attach(device_t dev) +{ + struct riscv_iommu_softc *sc; + uint64_t caps; + int error; + + sc = device_get_softc(dev); + + caps = bus_read_8(sc->res[0], RISCV_IOMMU_CAPABILITIES); + if (bootverbose) + device_printf(sc->dev, "IOMMU Capabilities: %lx\n", caps); + + device_printf(sc->dev, "Device-Context structure is %s.\n", + caps & CAPABILITIES_MSI_FLAT ? + "64-bytes (ext format)" : "32-bytes (std format)"); + + if (caps & CAPABILITIES_MSI_FLAT) { + sc->dc_dwords = DDT_DC_EXT_DWORDS; + sc->l0_did_bits = 6; + } else { + sc->dc_dwords = DDT_DC_STD_DWORDS; + sc->l0_did_bits = 7; + } + + if (caps & CAPABILITIES_SV48) + sc->pm_mode = PMAP_MODE_SV48; + else if (caps & CAPABILITIES_SV39) + sc->pm_mode = PMAP_MODE_SV39; + else { + device_printf(sc->dev, "Unsupported virtual memory system\n"); + return (ENXIO); + } + + mtx_init(&sc->mtx, device_get_nameunit(sc->dev), "riscv_iommu", + MTX_DEF); + + WR4(sc, RISCV_IOMMU_FCTL, FCTL_WSI); + + error = riscv_iommu_setup_interrupts(sc); + if (error) { + device_printf(sc->dev, "Could not setup interrupts. " + "Continuing with no interrupts support."); + } + + error = riscv_iommu_init_pagedir(sc); + if (error) + return (error); + + error = riscv_iommu_init_queues(sc); + if (error) + return (error); + + sc->iommu_mode = DDTP_IOMMU_MODE_2LVL; + + switch (sc->iommu_mode) { + case DDTP_IOMMU_MODE_1LVL: + error = riscv_iommu_init_ddt_linear(sc); + break; + case DDTP_IOMMU_MODE_2LVL: + error = riscv_iommu_init_ddt_2lvl(sc); + break; + default: + error = ENXIO; + } + if (error) + return (error); + + sc->pscid_bits = 8; + + riscv_iommu_init_pscids(sc); + if (error) + return (error); + + error = riscv_iommu_set_mode(sc); + if (error) + return (error); + + return (0); +} + +static int +riscv_iommu_set_buswide(device_t dev, struct riscv_iommu_domain *domain, + struct riscv_iommu_ctx *ctx) +{ + struct riscv_iommu_softc *sc; + int i; + + sc = device_get_softc(dev); + + printf("%s\n", __func__); + + for (i = 0; i < PCI_SLOTMAX; i++) + riscv_iommu_init_dc(sc, domain, (ctx->did | i), + ctx->bypass); + + return (0); +} + +static int +riscv_iommu_pci_get_did(device_t child, uintptr_t *xref0, u_int *did0) +{ + struct pci_id_ofw_iommu pi; + int err; + + dprintf("%s\n", __func__); + + err = pci_get_id(child, PCI_ID_OFW_IOMMU, (uintptr_t *)&pi); + if (err == 0) { + if (did0) + *did0 = pi.id; + if (xref0) + *xref0 = pi.xref; + } + + return (err); +} + +static int +riscv_iommu_find(device_t dev, device_t child) +{ + struct riscv_iommu_softc *sc; + uintptr_t xref; + int err; + + dprintf("%s\n", __func__); + + sc = device_get_softc(dev); + + err = riscv_iommu_pci_get_did(child, &xref, NULL); + if (err) + return (ENOENT); + + /* Check if xref is ours. */ + dprintf("xref %lx sc->xref %lx\n", xref, sc->xref); + if (xref != sc->xref) + return (EFAULT); + + return (0); +} + +struct riscv_iommu_ctx * +riscv_iommu_ctx_lookup_by_did(device_t dev, u_int did) +{ + struct riscv_iommu_softc *sc; + struct riscv_iommu_domain *domain; + struct riscv_iommu_unit *unit; + struct riscv_iommu_ctx *ctx; + + dprintf("%s\n", __func__); + sc = device_get_softc(dev); + + unit = &sc->unit; + + LIST_FOREACH(domain, &unit->domain_list, next) { + LIST_FOREACH(ctx, &domain->ctx_list, next) { + if (ctx->did == did) { + refcount_acquire(&ctx->refcnt); + return (ctx); + } + } + } + + return (NULL); +} + +static struct iommu_ctx * +riscv_iommu_ctx_lookup(device_t dev, device_t child) +{ + struct iommu_unit *iommu __diagused; + struct riscv_iommu_softc *sc; + struct riscv_iommu_domain *domain; + struct riscv_iommu_unit *unit; + struct riscv_iommu_ctx *ctx; + + dprintf("%s\n", __func__); + sc = device_get_softc(dev); + + unit = &sc->unit; + iommu = &unit->iommu; + + IOMMU_ASSERT_LOCKED(iommu); + + LIST_FOREACH(domain, &unit->domain_list, next) { + IOMMU_DOMAIN_LOCK(&domain->iodom); + LIST_FOREACH(ctx, &domain->ctx_list, next) { + if (ctx->dev == child) { + refcount_acquire(&ctx->refcnt); + IOMMU_DOMAIN_UNLOCK(&domain->iodom); + return (&ctx->ioctx); + } + } + IOMMU_DOMAIN_UNLOCK(&domain->iodom); + } + + return (NULL); +} + +static int +riscv_iommu_unmap(device_t dev, struct iommu_domain *iodom, + vm_offset_t va, bus_size_t size) +{ + struct riscv_iommu_domain *domain; + struct riscv_iommu_softc *sc; + int err; + int i; + + sc = device_get_softc(dev); + + domain = (struct riscv_iommu_domain *)iodom; + + err = 0; + + dprintf("%s: %lx, %ld, domain %d\n", __func__, va, size, domain->pscid); + + for (i = 0; i < size; i += PAGE_SIZE) { + if (iommu_pmap_remove(&domain->p, va) == 0) { + /* pmap entry removed, invalidate TLB. */ + riscv_iommu_inval_vma_page(sc, va, domain->pscid); + } else { + err = ENOENT; + break; + } + va += PAGE_SIZE; + } + + riscv_iommu_sync(sc, &sc->cq); + + return (err); +} + +static int +riscv_iommu_map(device_t dev, struct iommu_domain *iodom, + vm_offset_t va, vm_page_t *ma, vm_size_t size, + vm_prot_t prot) +{ + struct riscv_iommu_domain *domain; + struct riscv_iommu_softc *sc; + vm_paddr_t pa; + int error; + int i; + + sc = device_get_softc(dev); + + domain = (struct riscv_iommu_domain *)iodom; + + for (i = 0; size > 0; size -= PAGE_SIZE) { + pa = VM_PAGE_TO_PHYS(ma[i++]); + dprintf("%s: %lx -> %lx, %ld, domain %d\n", __func__, va, pa, + size, domain->pscid); + error = iommu_pmap_enter(&domain->p, va, pa, prot, 0); + if (error) + return (error); + riscv_iommu_inval_vma_page(sc, va, domain->pscid); + va += PAGE_SIZE; + } + + riscv_iommu_sync(sc, &sc->cq); + + return (0); +} + +static struct iommu_domain * +riscv_iommu_domain_alloc(device_t dev, struct iommu_unit *iommu) +{ + struct iommu_domain *iodom; + struct riscv_iommu_domain *domain; + struct riscv_iommu_unit *unit; + struct riscv_iommu_softc *sc; + int new_pscid; + int va_bits; + int error; + + sc = device_get_softc(dev); + + dprintf("%s\n", __func__); + + unit = (struct riscv_iommu_unit *)iommu; + + error = riscv_iommu_pscid_alloc(sc, &new_pscid); + if (error) { + device_printf(sc->dev, + "Could not allocate PSCID for a new domain.\n"); + return (NULL); + } + + domain = malloc(sizeof(*domain), M_IOMMU, M_WAITOK | M_ZERO); + domain->pscid = (uint16_t)new_pscid; + + iommu_pmap_pinit(&domain->p, sc->pm_mode); + + riscv_iommu_inval_vma_pscid(sc, domain->pscid); + + LIST_INIT(&domain->ctx_list); + + IOMMU_LOCK(iommu); + LIST_INSERT_HEAD(&unit->domain_list, domain, next); + IOMMU_UNLOCK(iommu); + + iodom = &domain->iodom; + + va_bits = sc->pm_mode == PMAP_MODE_SV48 ? 48 : 39; + + /* Avoid sign-extension. */ + va_bits -= 1; + + iodom->end = (1ULL << va_bits) - 1; + + return (iodom); +} + +static void +riscv_iommu_domain_free(device_t dev, struct iommu_domain *iodom) +{ + struct riscv_iommu_domain *domain; + struct riscv_iommu_softc *sc; + + sc = device_get_softc(dev); + + dprintf("%s\n", __func__); + + domain = (struct riscv_iommu_domain *)iodom; + + LIST_REMOVE(domain, next); + + iommu_pmap_remove_pages(&domain->p); + iommu_pmap_release(&domain->p); + + riscv_iommu_inval_vma_pscid(sc, domain->pscid); + riscv_iommu_pscid_free(sc, domain->pscid); + + free(domain, M_IOMMU); +} + +static struct iommu_ctx * +riscv_iommu_ctx_alloc(device_t dev, struct iommu_domain *iodom, device_t child, + bool disabled) +{ + struct riscv_iommu_domain *domain; + struct riscv_iommu_ctx *ctx; + + dprintf("%s\n", __func__); + + domain = (struct riscv_iommu_domain *)iodom; + + ctx = malloc(sizeof(struct riscv_iommu_ctx), M_IOMMU, + M_WAITOK | M_ZERO); + ctx->dev = child; + ctx->domain = domain; + refcount_init(&ctx->refcnt, 1); + if (disabled) + ctx->bypass = true; + + IOMMU_DOMAIN_LOCK(iodom); + LIST_INSERT_HEAD(&domain->ctx_list, ctx, next); + IOMMU_DOMAIN_UNLOCK(iodom); + + return (&ctx->ioctx); +} + +static int +riscv_iommu_ctx_init(device_t dev, struct iommu_ctx *ioctx) +{ + struct riscv_iommu_domain *domain; + struct iommu_domain *iodom; + struct riscv_iommu_softc *sc; + struct riscv_iommu_ctx *ctx; + devclass_t pci_class; + u_int did; + int error; + + ctx = (struct riscv_iommu_ctx *)ioctx; + + dprintf("%s\n", __func__); + + sc = device_get_softc(dev); + + domain = ctx->domain; + iodom = (struct iommu_domain *)domain; + + pci_class = devclass_find("pci"); + if (device_get_devclass(device_get_parent(ctx->dev)) == pci_class) { + error = riscv_iommu_pci_get_did(ctx->dev, NULL, &did); + if (error) + return (error); + + ioctx->rid = pci_get_rid(dev); + ctx->did = did; + ctx->vendor = pci_get_vendor(ctx->dev); + ctx->device = pci_get_device(ctx->dev); + } + + if (sc->iommu_mode == DDTP_IOMMU_MODE_2LVL) { + error = riscv_iommu_init_l0_directory(sc, ctx->did); + if (error) + return (error); + } + riscv_iommu_init_dc(sc, domain, ctx->did, ctx->bypass); + + if (device_get_devclass(device_get_parent(ctx->dev)) == pci_class) + if (iommu_is_buswide_ctx(iodom->iommu, pci_get_bus(ctx->dev))) + riscv_iommu_set_buswide(dev, domain, ctx); + + return (0); +} + +static bool +riscv_iommu_ctx_free(device_t dev, struct iommu_ctx *ioctx) +{ + struct riscv_iommu_softc *sc; + struct riscv_iommu_ctx *ctx; + + dprintf("%s\n", __func__); + + IOMMU_ASSERT_LOCKED(ioctx->domain->iommu); + + sc = device_get_softc(dev); + + ctx = (struct riscv_iommu_ctx *)ioctx; + if (refcount_release(&ctx->refcnt)) { + riscv_iommu_deinit_dc(sc, ctx->did); + LIST_REMOVE(ctx, next); + free(ctx, M_IOMMU); + return (true); + } + + return (false); +} + +#ifdef FDT +static int +riscv_iommu_ofw_md_data(device_t dev, struct iommu_ctx *ioctx, pcell_t *cells, + int ncells) +{ + struct riscv_iommu_ctx *ctx; + + printf("%s\n", __func__); + ctx = (struct riscv_iommu_ctx *)ioctx; + + if (ncells != 1) + return (-1); + + ctx->did = cells[0]; + + return (0); +} +#endif + +static int +riscv_iommu_read_ivar(device_t dev, device_t child, int which, + uintptr_t *result) +{ + struct riscv_iommu_softc *sc; + + sc = device_get_softc(dev); + + device_printf(sc->dev, "%s\n", __func__); + + return (ENOENT); +} + +static device_method_t riscv_iommu_methods[] = { + /* IOMMU interface */ + DEVMETHOD(iommu_find, riscv_iommu_find), + DEVMETHOD(iommu_map, riscv_iommu_map), + DEVMETHOD(iommu_unmap, riscv_iommu_unmap), + DEVMETHOD(iommu_domain_alloc, riscv_iommu_domain_alloc), + DEVMETHOD(iommu_domain_free, riscv_iommu_domain_free), + DEVMETHOD(iommu_ctx_alloc, riscv_iommu_ctx_alloc), + DEVMETHOD(iommu_ctx_init, riscv_iommu_ctx_init), + DEVMETHOD(iommu_ctx_free, riscv_iommu_ctx_free), + DEVMETHOD(iommu_ctx_lookup, riscv_iommu_ctx_lookup), +#ifdef FDT + DEVMETHOD(iommu_ofw_md_data, riscv_iommu_ofw_md_data), +#endif + + /* Bus interface */ + DEVMETHOD(bus_read_ivar, riscv_iommu_read_ivar), + + /* End */ + DEVMETHOD_END +}; + +DEFINE_CLASS_0(riscv_iommu, riscv_iommu_driver, riscv_iommu_methods, + sizeof(struct riscv_iommu_softc)); diff --git a/sys/riscv/iommu/iommu.h b/sys/riscv/iommu/iommu.h new file mode 100644 index 000000000000..502a0e121bd1 --- /dev/null +++ b/sys/riscv/iommu/iommu.h @@ -0,0 +1,359 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2026 Ruslan Bukin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _RISCV_IOMMU_IOMMU_H_ +#define _RISCV_IOMMU_IOMMU_H_ + +#define RISCV_IOMMU_CAPABILITIES 0x0000 +#define CAPABILITIES_VERSION_S 0 +#define CAPABILITIES_VERSION_M (0xff << CAPABILITIES_VERSION_S) +#define CAPABILITIES_SV32 (1 << 8) +#define CAPABILITIES_SV39 (1 << 9) +#define CAPABILITIES_SV48 (1 << 10) +#define CAPABILITIES_SV57 (1 << 11) +#define CAPABILITIES_SVPBMT (1 << 15) +#define CAPABILITIES_SV32X4 (1 << 16) +#define CAPABILITIES_SV39X4 (1 << 17) +#define CAPABILITIES_SV48X4 (1 << 18) +#define CAPABILITIES_SV57X4 (1 << 19) +#define CAPABILITIES_AMO_MRIF (1 << 21) +#define CAPABILITIES_MSI_FLAT (1 << 22) +#define CAPABILITIES_MSI_MRIF (1 << 23) +#define CAPABILITIES_AMO_HWAD (1 << 24) +#define CAPABILITIES_ATS (1 << 25) +#define CAPABILITIES_T2GPA (1 << 26) +#define CAPABILITIES_END (1 << 27) +#define CAPABILITIES_IGS_S 28 +#define CAPABILITIES_IGS_M (0x3 << CAPABILITIES_IGS_S) +#define CAPABILITIES_HPM (1 << 30) +#define CAPABILITIES_DBG (1 << 31) +#define CAPABILITIES_PAS_S 32ULL +#define CAPABILITIES_PAS_M (0x3f << CAPABILITIES_PAS_S) +#define CAPABILITIES_PD8 (1ULL << 38) +#define CAPABILITIES_PD17 (1ULL << 39) +#define CAPABILITIES_PD20 (1ULL << 40) +#define RISCV_IOMMU_FCTL 0x0008 +#define FCTL_BE (1 << 0) /* Big-endian */ +#define FCTL_WSI (1 << 1) /* Wire-signalled Ints. */ +#define FCTL_GXL (1 << 2) /* Guest physical addresses */ +#define RISCV_IOMMU_DDTP 0x0010 +#define DDTP_IOMMU_MODE_S 0 +#define DDTP_IOMMU_MODE_OFF (0 << DDTP_IOMMU_MODE_S) +#define DDTP_IOMMU_MODE_BARE (1 << DDTP_IOMMU_MODE_S) +#define DDTP_IOMMU_MODE_1LVL (2 << DDTP_IOMMU_MODE_S) +#define DDTP_IOMMU_MODE_2LVL (3 << DDTP_IOMMU_MODE_S) +#define DDTP_IOMMU_MODE_3LVL (4 << DDTP_IOMMU_MODE_S) +#define DDTP_BUSY (1 << 4) +#define DDTP_PPN_S 10 +#define DDTP_PPN_M (0xfffffffffffULL << DDTP_PPN_S) +#define RISCV_IOMMU_CQB 0x18 /* Command queue base. */ +#define CQB_LOG2SZ_1_S 0 +#define CQB_LOG2SZ_1_M (0x3f << CQB_LOG2SZ_1_S) +#define CQB_PPN_S 10 +#define CQB_PPN_M (0xfffffffffffULL << CQB_PPN_S) +#define RISCV_IOMMU_CQH 0x20 +#define RISCV_IOMMU_CQT 0x24 +#define RISCV_IOMMU_FQB 0x28 /* Fault queue base. */ +#define RISCV_IOMMU_FQH 0x30 +#define RISCV_IOMMU_FQT 0x34 +#define RISCV_IOMMU_PQB 0x38 /* Page queue base. */ +#define RISCV_IOMMU_PQH 0x40 +#define RISCV_IOMMU_PQT 0x44 +#define RISCV_IOMMU_CQCSR 0x48 +#define CQCSR_BUSY (1 << 17) /* Write is observed */ +#define CQCSR_CQON (1 << 16) /* Active */ +#define CQCSR_FENCE_W_IP (1 << 11) /* iofence.c completed */ +#define CQCSR_CMD_ILL (1 << 10) /* Illegal command */ +#define CQCSR_CMD_TO (1 << 9) /* Timeout */ +#define CQCSR_CQMF (1 << 8) /* Memory Fault */ +#define CQCSR_CIE (1 << 1) /* Interrupt Enable */ +#define CQCSR_CQEN (1 << 0) /* Enable */ +#define RISCV_IOMMU_FQCSR 0x4C +#define FQCSR_BUSY (1 << 17) /* Write is observed */ +#define FQCSR_FQON (1 << 16) /* Active */ +#define FQCSR_FQOF (1 << 9) /* Overflow */ +#define FQCSR_FQMF (1 << 8) /* Memory Fault */ +#define FQCSR_FIE (1 << 1) /* Interrupt Enable */ +#define FQCSR_FQEN (1 << 0) /* Enable */ +#define RISCV_IOMMU_PQCSR 0x50 +#define PQCSR_BUSY (1 << 17) /* Write is observed */ +#define PQCSR_PQON (1 << 16) /* Active */ +#define PQCSR_PQOF (1 << 9) /* Overflow */ +#define PQCSR_PQMF (1 << 8) /* Memory Fault */ +#define PQCSR_PIE (1 << 1) /* Interrupt Enable */ +#define PQCSR_PQEN (1 << 0) /* Enable */ +#define RISCV_IOMMU_IPSR 0x54 +#define IPSR_CIP (1 << 0) /* Command queue interrupt pending */ +#define IPSR_FIP (1 << 1) /* Fault queue interrupt pending */ +#define IPSR_PMIP (1 << 2) /* Performance monitoring int pend */ +#define IPSR_PIP (1 << 3) /* Page queue interrupt pending */ + +#define RISCV_IOMMU_IOCOUNTOVF 0x0058 +#define RISCV_IOMMU_IOCOUNTINH 0x005C + +#define RISCV_IOMMU_IOHPMCYCLES 0x0060 +#define RISCV_IOMMU_IOHPMCTR_BASE 0x0068 +#define RISCV_IOMMU_IOHPMCTR(_n) (RISCV_IOMMU_IOHPMCTR_BASE + ((_n) * 0x8)) +#define RISCV_IOMMU_IOHPMEVT_BASE 0x0160 +#define RISCV_IOMMU_IOHPMEVT(_n) (RISCV_IOMMU_IOHPMEVT_BASE + ((_n) * 0x8)) +#define RISCV_IOMMU_TR_REQ_IOVA 0x0258 +#define RISCV_IOMMU_TR_REQ_CTL 0x0260 +#define RISCV_IOMMU_TR_RESPONSE 0x0268 +#define RISCV_IOMMU_ICVEC 0x02F8 + +#define RISCV_IOMMU_LOCK(_sc) mtx_lock(&(_sc)->mtx) +#define RISCV_IOMMU_UNLOCK(_sc) mtx_unlock(&(_sc)->mtx) + +DECLARE_CLASS(riscv_iommu_driver); + +MALLOC_DECLARE(M_IOMMU); + +struct riscv_iommu_unit { + struct iommu_unit iommu; + LIST_HEAD(, riscv_iommu_domain) domain_list; + LIST_ENTRY(riscv_iommu_unit) next; + device_t dev; + intptr_t xref; +}; + +struct riscv_iommu_domain { + struct iommu_domain iodom; + LIST_HEAD(, riscv_iommu_ctx) ctx_list; + LIST_ENTRY(riscv_iommu_domain) next; + u_int entries_cnt; + struct riscv_iommu_cd *cd; + struct riscv_iommu_pmap p; + uint16_t pscid; +}; + +struct riscv_iommu_ctx { + struct iommu_ctx ioctx; + struct riscv_iommu_domain *domain; + LIST_ENTRY(riscv_iommu_ctx) next; + device_t dev; + bool bypass; + int did; + uint16_t vendor; + uint16_t device; + u_int refcnt; +}; + +struct riscv_iommu_queue_local_copy { + union { + uint64_t val; + struct { + uint32_t head; + uint32_t tail; + }; + }; +}; + +struct riscv_iommu_queue { + struct riscv_iommu_queue_local_copy lc; + vm_paddr_t paddr; + void *vaddr; + uint64_t mask; + uint32_t head_off; + uint32_t tail_off; + int size_log2; + uint64_t base; + uint64_t csr; + int idx; + uint8_t entry_size; +}; + +struct l1_desc { + uint8_t span; + void *va; + vm_paddr_t pa; +}; + +/* Base-format device-context. */ +struct riscv_iommu_dc_base { + uint64_t tc; /* Translation control */ +#define DC_TC_V (1 << 0) +#define DC_TC_EN_ATS (1 << 1) +#define DC_TC_EN_PRI (1 << 2) +#define DC_TC_T2GPA (1 << 3) +#define DC_TC_DTF (1 << 4) +#define DC_TC_PDTV (1 << 5) +#define DC_TC_PRPR (1 << 6) +#define DC_TC_GADE (1 << 7) +#define DC_TC_SADE (1 << 8) +#define DC_TC_DPE (1 << 9) +#define DC_TC_SBE (1 << 10) +#define DC_TC_SXL (1 << 11) + uint64_t iohgatp; /* IO Hyp guest address translation */ + uint64_t ta; /* Translation attributes */ +#define DC_TA_V (1 << 0) +#define DC_TA_ENS (1 << 1) +#define DC_TA_SUM (1 << 2) +#define DC_TA_PSCID_S 12 +#define DC_TA_PSCID_M (0xfffff << DC_TA_PSCID_S) + uint64_t fsc; /* First-stage-context */ +}; + +/* Extended-format device-context. */ +struct riscv_iommu_dc { + struct riscv_iommu_dc_base base; + uint64_t msiptp; /* MSI page table pointer */ + uint64_t msi_addr_mask; + uint64_t msi_addr_pattern; + uint64_t _reserved; +}; + +#define DC_NON_LEAF_ENTRY_PPN_S 10 +#define DC_NON_LEAF_ENTRY_VALID (1 << 0) + +struct riscv_iommu_ddt { + void *vaddr; + uint64_t base; + uint32_t base_cfg; + uint32_t num_top_entries; + struct l1_desc *l1; + struct riscv_iommu_dc *dc; +}; + +struct riscv_iommu_softc { + device_t dev; + intptr_t xref; + struct riscv_iommu_unit unit; + struct resource *res[5]; + void *intr_cookie[4]; + struct riscv_iommu_queue cq; + struct riscv_iommu_queue fq; + struct riscv_iommu_queue pq; + struct riscv_iommu_ddt ddt; + struct mtx mtx; + uint32_t l0_did_bits; + uint32_t dc_dwords; + + /* PSCID management. */ + bitstr_t *pscid_set; + int pscid_set_size; + struct mtx pscid_set_mutex; + uint32_t pscid_bits; + + enum pmap_mode pm_mode; + int iommu_mode; +}; + +/* + * Command queue request. + */ +struct riscv_iommu_command { + uint64_t dword0; + uint64_t dword1; +}; + +enum riscv_iommu_fq_causes { + FQ_CAUSE_INST_FAULT = 1, + FQ_CAUSE_RD_ADDR_MISALIGNED = 4, + FQ_CAUSE_RD_FAULT = 5, + FQ_CAUSE_WR_ADDR_MISALIGNED = 6, + FQ_CAUSE_WR_FAULT = 7, + FQ_CAUSE_INST_FAULT_S = 12, + FQ_CAUSE_RD_FAULT_S = 13, + FQ_CAUSE_WR_FAULT_S = 15, + FQ_CAUSE_INST_FAULT_VS = 20, + FQ_CAUSE_RD_FAULT_VS = 21, + FQ_CAUSE_WR_FAULT_VS = 23, + FQ_CAUSE_DMA_DISABLED = 256, + FQ_CAUSE_DDT_LOAD_FAULT = 257, + FQ_CAUSE_DDT_INVALID = 258, + FQ_CAUSE_DDT_MISCONFIGURED = 259, + FQ_CAUSE_TR_TYPE_DISALLOWED = 260, + FQ_CAUSE_MSI_LOAD_FAULT = 261, + FQ_CAUSE_MSI_INVALID = 262, + FQ_CAUSE_MSI_MISCONFIGURED = 263, + FQ_CAUSE_MRIF_FAULT = 264, + FQ_CAUSE_PDT_LOAD_FAULT = 265, + FQ_CAUSE_PDT_INVALID = 266, + FQ_CAUSE_PDT_MISCONFIGURED = 267, + FQ_CAUSE_DDT_CORRUPTED = 268, + FQ_CAUSE_PDT_CORRUPTED = 269, + FQ_CAUSE_MSI_PT_CORRUPTED = 270, + FQ_CAUSE_MRIF_CORRUPTED = 271, + FQ_CAUSE_INTERNAL_DP_ERROR = 272, + FQ_CAUSE_MSI_WR_FAULT = 273, + FQ_CAUSE_PT_CORRUPTED = 274, +}; + +/* + * Fault queue record. + */ +struct riscv_iommu_fq_record { + uint64_t hdr; +#define FQR_HDR_CAUSE_S 0 +#define FQR_HDR_CAUSE_M (0xfff << FQR_HDR_CAUSE_S) +#define FQR_HDR_PID_S 12 +#define FQR_HDR_PID_M (0xfffffULL << FQR_HDR_PID_S) +#define FQR_HDR_PV (1ULL << 32) +#define FQR_HDR_PRIV (1ULL << 33) +#define FQR_HDR_TTYP_S 34ULL +#define FQR_HDR_TTYP_M (0x3fULL << FQR_HDR_TTYP_S) +#define FQR_HDR_DID_S 40ULL +#define FQR_HDR_DID_M (0xffffffULL << FQR_HDR_DID_S) + uint32_t custom; + uint32_t reserved; + uint64_t iotval; + uint64_t iotval2; +}; + +#define COMMAND_OPCODE_S 0 +#define COMMAND_OPCODE_IOTINVAL (1 << COMMAND_OPCODE_S) +#define COMMAND_OPCODE_IOFENCE (2 << COMMAND_OPCODE_S) +#define COMMAND_OPCODE_IODIR (3 << COMMAND_OPCODE_S) +#define COMMAND_OPCODE_ATS (4 << COMMAND_OPCODE_S) +#define COMMAND_OPCODE_FUNC_S 7 +#define COMMAND_OPCODE_FUNC_M (0x3 << COMMAND_OPCODE_FUNC_S) +#define FUNC_IODIR_INVAL_DDT (0 << COMMAND_OPCODE_FUNC_S) +#define FUNC_IODIR_INVAL_PDT (1 << COMMAND_OPCODE_FUNC_S) +#define FUNC_IODIR_PID_S 12 +#define FUNC_IODIR_DV (1ULL << 33) /* DID Valid */ +#define FUNC_IODIR_DID_S 40ULL +/* dword0 */ +#define FUNC_IOTINVAL_VMA (0 << COMMAND_OPCODE_FUNC_S) +#define FUNC_IOTINVAL_GVMA (1 << COMMAND_OPCODE_FUNC_S) +#define FUNC_IOTINVAL_AV (1 << 10) /* Address Valid */ +#define FUNC_IOTINVAL_PSCID_S 12 /* Process-Soft-Context ID */ +#define FUNC_IOTINVAL_PSCV (1ULL << 32) /* PSCID Valid */ +#define FUNC_IOTINVAL_GV (1ULL << 33) /* GSCID Valid */ +#define FUNC_IOTINVAL_GSCID_S 44 /* Guest-Soft-Context ID */ +/* dword1 */ +#define FUNC_IOTINVAL_ADDR_S 10 +#define FUNC_IOFENCE_FUNC_C (0 << 7) +#define FUNC_IOFENCE_AV (1 << 10) +#define FUNC_IOFENCE_WSI (1 << 11) +#define FUNC_IOFENCE_PR (1 << 12) +#define FUNC_IOFENCE_PW (1 << 13) +#define FUNC_IOFENCE_DATA_S 32ULL + +int riscv_iommu_attach(device_t dev); +struct riscv_iommu_ctx *riscv_iommu_ctx_lookup_by_did(device_t dev, u_int did); + +#endif /* _RISCV_IOMMU_IOMMU_H_ */ diff --git a/sys/riscv/iommu/iommu_fdt.c b/sys/riscv/iommu/iommu_fdt.c new file mode 100644 index 000000000000..99fb41d55f8c --- /dev/null +++ b/sys/riscv/iommu/iommu_fdt.c @@ -0,0 +1,145 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2026 Ruslan Bukin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include + +static struct ofw_compat_data compat_data[] = { + { "riscv,iommu", 1 }, + { NULL, 0 } +}; + +static struct resource_spec riscv_iommu_spec[] = { + { SYS_RES_MEMORY, 0, RF_ACTIVE }, + { SYS_RES_IRQ, 0, RF_ACTIVE }, /* CQ */ + { SYS_RES_IRQ, 1, RF_ACTIVE }, /* FQ */ + { SYS_RES_IRQ, 2, RF_ACTIVE }, /* PM */ + { SYS_RES_IRQ, 3, RF_ACTIVE }, /* PQ */ + RESOURCE_SPEC_END +}; + +static int +riscv_iommu_fdt_probe(device_t dev) +{ + if (!ofw_bus_status_okay(dev)) + return (ENXIO); + + if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0) + return (ENXIO); + + device_set_desc(dev, "RISC-V IOMMU"); + + return (BUS_PROBE_DEFAULT); +} + +static int +riscv_iommu_fdt_attach(device_t dev) +{ + struct riscv_iommu_softc *sc; + struct riscv_iommu_unit *unit; + struct iommu_unit *iommu; + phandle_t node; + int error; + + sc = device_get_softc(dev); + sc->dev = dev; + + node = ofw_bus_get_node(dev); + + error = bus_alloc_resources(dev, riscv_iommu_spec, sc->res); + if (error) { + device_printf(dev, "could not allocate resources\n"); + goto error; + } + + error = riscv_iommu_attach(dev); + if (error != 0) { + device_printf(dev, "Failed to attach. Error %d\n", error); + goto error; + } + + unit = &sc->unit; + unit->dev = dev; + + iommu = &unit->iommu; + iommu->dev = dev; + + LIST_INIT(&unit->domain_list); + + sc->xref = OF_xref_from_node(node); + + error = iommu_register(iommu); + if (error) { + device_printf(dev, "Failed to register RISC-V IOMMU.\n"); + goto error; + } + + return (0); + +error: + bus_release_resources(dev, riscv_iommu_spec, sc->res); + + return (error); +} + +static device_method_t riscv_iommu_fdt_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, riscv_iommu_fdt_probe), + DEVMETHOD(device_attach, riscv_iommu_fdt_attach), + DEVMETHOD_END +}; + +DEFINE_CLASS_1(riscv_iommu, riscv_iommu_fdt_driver, riscv_iommu_fdt_methods, + sizeof(struct riscv_iommu_softc), riscv_iommu_driver); +EARLY_DRIVER_MODULE(riscv_iommu, simplebus, riscv_iommu_fdt_driver, 0, 0, + BUS_PASS_INTERRUPT + BUS_PASS_ORDER_LATE); diff --git a/sys/riscv/iommu/iommu_frontend.c b/sys/riscv/iommu/iommu_frontend.c new file mode 100644 index 000000000000..1de8b3623987 --- /dev/null +++ b/sys/riscv/iommu/iommu_frontend.c @@ -0,0 +1,505 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2026 Ruslan Bukin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "opt_platform.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef FDT +#include +#include +#include +#endif + +#include "iommu_frontend.h" +#include "iommu_if.h" + +static MALLOC_DEFINE(M_IOMMU, "IOMMU", "IOMMU framework"); + +#define IOMMU_LIST_LOCK() sx_xlock(&iommu_sx) +#define IOMMU_LIST_UNLOCK() sx_xunlock(&iommu_sx) +#define IOMMU_LIST_ASSERT_LOCKED() sx_assert(&iommu_sx, SA_XLOCKED) + +#define dprintf(fmt, ...) + +static struct sx iommu_sx; + +struct iommu_entry { + struct iommu_unit *iommu; + LIST_ENTRY(iommu_entry) next; +}; +static LIST_HEAD(, iommu_entry) iommu_list = LIST_HEAD_INITIALIZER(iommu_list); + +static int +iommu_domain_unmap_buf(struct iommu_domain *iodom, + struct iommu_map_entry *entry, int flags) +{ + struct iommu_unit *iommu; + int error; + + iommu = iodom->iommu; + error = IOMMU_UNMAP(iommu->dev, iodom, entry->start, entry->end - + entry->start); + return (error); +} + +static int +iommu_domain_map_buf(struct iommu_domain *iodom, struct iommu_map_entry *entry, + vm_page_t *ma, uint64_t eflags, int flags) +{ + struct iommu_unit *iommu; + vm_prot_t prot; + vm_offset_t va; + int error; + + dprintf("%s: base %lx, size %lx\n", __func__, base, size); + + prot = 0; + if (eflags & IOMMU_MAP_ENTRY_READ) + prot |= VM_PROT_READ; + if (eflags & IOMMU_MAP_ENTRY_WRITE) + prot |= VM_PROT_WRITE; + + va = entry->start; + iommu = iodom->iommu; + error = IOMMU_MAP(iommu->dev, iodom, va, ma, entry->end - + entry->start, prot); + return (error); +} + +static const struct iommu_domain_map_ops domain_map_ops = { + .map = iommu_domain_map_buf, + .unmap = iommu_domain_unmap_buf, +}; + +static struct iommu_domain * +iommu_domain_alloc(struct iommu_unit *iommu) +{ + struct iommu_domain *iodom; + + iodom = IOMMU_DOMAIN_ALLOC(iommu->dev, iommu); + if (iodom == NULL) + return (NULL); + + KASSERT(iodom->end != 0, ("domain end is not set")); + + iommu_domain_init(iommu, iodom, &domain_map_ops); + iodom->iommu = iommu; + iommu_gas_init_domain(iodom); + + return (iodom); +} + +static int +iommu_domain_free(struct iommu_domain *iodom) +{ + struct iommu_unit *iommu; + + iommu = iodom->iommu; + + IOMMU_LOCK(iommu); + + if ((iodom->flags & IOMMU_DOMAIN_GAS_INITED) != 0) { + IOMMU_DOMAIN_LOCK(iodom); + iommu_gas_fini_domain(iodom); + IOMMU_DOMAIN_UNLOCK(iodom); + } + + iommu_domain_fini(iodom); + + IOMMU_DOMAIN_FREE(iommu->dev, iodom); + IOMMU_UNLOCK(iommu); + + return (0); +} + +static void +iommu_tag_init(struct iommu_domain *iodom, struct bus_dma_tag_iommu *t) +{ + bus_addr_t maxaddr; + + maxaddr = MIN(iodom->end, BUS_SPACE_MAXADDR); + + t->common.impl = &bus_dma_iommu_impl; + t->common.alignment = 1; + t->common.boundary = 0; + t->common.lowaddr = maxaddr; + t->common.highaddr = maxaddr; + t->common.maxsize = maxaddr; + t->common.nsegments = BUS_SPACE_UNRESTRICTED; + t->common.maxsegsz = maxaddr; +} + +static struct iommu_ctx * +iommu_ctx_alloc(device_t requester, struct iommu_domain *iodom, bool disabled) +{ + struct iommu_unit *iommu; + struct iommu_ctx *ioctx; + + iommu = iodom->iommu; + + ioctx = IOMMU_CTX_ALLOC(iommu->dev, iodom, requester, disabled); + if (ioctx == NULL) + return (NULL); + + ioctx->domain = iodom; + + return (ioctx); +} + +static int +iommu_ctx_init(device_t requester, struct iommu_ctx *ioctx) +{ + struct bus_dma_tag_iommu *tag; + struct iommu_domain *iodom; + struct iommu_unit *iommu; + int error; + + iodom = ioctx->domain; + iommu = iodom->iommu; + + error = IOMMU_CTX_INIT(iommu->dev, ioctx); + if (error) + return (error); + + tag = ioctx->tag = malloc(sizeof(struct bus_dma_tag_iommu), + M_DEVBUF, M_WAITOK | M_ZERO); + tag->owner = requester; + tag->ctx = ioctx; + tag->ctx->domain = iodom; + + iommu_tag_init(iodom, tag); + + return (error); +} + +static struct iommu_unit * +iommu_lookup(device_t dev) +{ + struct iommu_entry *entry; + struct iommu_unit *iommu; + + IOMMU_LIST_LOCK(); + LIST_FOREACH(entry, &iommu_list, next) { + iommu = entry->iommu; + if (iommu->dev == dev) { + IOMMU_LIST_UNLOCK(); + return (iommu); + } + } + IOMMU_LIST_UNLOCK(); + + return (NULL); +} + +#ifdef FDT +struct iommu_ctx * +iommu_get_ctx_ofw(device_t dev, int channel) +{ + struct iommu_domain *iodom; + struct iommu_unit *iommu; + struct iommu_ctx *ioctx; + phandle_t node, parent; + device_t iommu_dev; + pcell_t *cells; + int niommus; + int ncells; + int error; + + node = ofw_bus_get_node(dev); + if (node <= 0) { + device_printf(dev, + "%s called on not ofw based device.\n", __func__); + return (NULL); + } + + error = ofw_bus_parse_xref_list_get_length(node, + "iommus", "#iommu-cells", &niommus); + if (error) { + device_printf(dev, "%s can't get iommu list.\n", __func__); + return (NULL); + } + + if (niommus == 0) { + device_printf(dev, "%s iommu list is empty.\n", __func__); + return (NULL); + } + + error = ofw_bus_parse_xref_list_alloc(node, "iommus", "#iommu-cells", + channel, &parent, &ncells, &cells); + if (error != 0) { + device_printf(dev, "%s can't get iommu device xref.\n", + __func__); + return (NULL); + } + + iommu_dev = OF_device_from_xref(parent); + if (iommu_dev == NULL) { + device_printf(dev, "%s can't get iommu device.\n", __func__); + return (NULL); + } + + iommu = iommu_lookup(iommu_dev); + if (iommu == NULL) { + device_printf(dev, "%s can't lookup iommu.\n", __func__); + return (NULL); + } + + /* + * In our current configuration we have a domain per each ctx, + * so allocate a domain first. + */ + iodom = iommu_domain_alloc(iommu); + if (iodom == NULL) { + device_printf(dev, "%s can't allocate domain.\n", __func__); + return (NULL); + } + + ioctx = iommu_ctx_alloc(dev, iodom, false); + if (ioctx == NULL) { + iommu_domain_free(iodom); + return (NULL); + } + + ioctx->domain = iodom; + + error = IOMMU_OFW_MD_DATA(iommu->dev, ioctx, cells, ncells); + if (error) { + device_printf(dev, "%s can't set MD data\n", __func__); + return (NULL); + } + + error = iommu_ctx_init(dev, ioctx); + if (error) { + IOMMU_CTX_FREE(iommu->dev, ioctx); + iommu_domain_free(iodom); + return (NULL); + } + + return (ioctx); +} +#endif + +struct iommu_ctx * +iommu_get_ctx(struct iommu_unit *iommu, device_t requester, + uint16_t rid, bool disabled, bool rmrr) +{ + struct iommu_domain *iodom; + struct iommu_ctx *ioctx; + int error; + + IOMMU_LOCK(iommu); + ioctx = IOMMU_CTX_LOOKUP(iommu->dev, requester); + if (ioctx) { + IOMMU_UNLOCK(iommu); + return (ioctx); + } + IOMMU_UNLOCK(iommu); + + /* + * In our current configuration we have a domain per each ctx. + * So allocate a domain first. + */ + iodom = iommu_domain_alloc(iommu); + if (iodom == NULL) + return (NULL); + + ioctx = iommu_ctx_alloc(requester, iodom, disabled); + if (ioctx == NULL) { + iommu_domain_free(iodom); + return (NULL); + } + + error = iommu_ctx_init(requester, ioctx); + if (error) { + IOMMU_CTX_FREE(iommu->dev, ioctx); + iommu_domain_free(iodom); + return (NULL); + } + + return (ioctx); +} + +void +iommu_free_ctx_locked(struct iommu_unit *iommu, struct iommu_ctx *ioctx) +{ + struct iommu_domain *domain; + bool released; + int error; + + IOMMU_ASSERT_LOCKED(iommu); + + domain = ioctx->domain; + + released = IOMMU_CTX_FREE(iommu->dev, ioctx); + IOMMU_UNLOCK(iommu); + + if (released) { + /* Since we have a domain per each ctx, remove it too. */ + error = iommu_domain_free(domain); + if (error) + device_printf(iommu->dev, "Could not free a domain\n"); + } +} + +static void +iommu_domain_free_entry(struct iommu_map_entry *entry, bool free) +{ + iommu_gas_free_space(entry); + + if (free) + iommu_gas_free_entry(entry); + else + entry->flags = 0; +} + +void +iommu_domain_unload(struct iommu_domain *iodom, + struct iommu_map_entries_tailq *entries, bool cansleep) +{ + struct iommu_map_entry *entry, *entry1; + int error __diagused; + + TAILQ_FOREACH_SAFE(entry, entries, dmamap_link, entry1) { + KASSERT((entry->flags & IOMMU_MAP_ENTRY_MAP) != 0, + ("not mapped entry %p %p", iodom, entry)); + error = iodom->ops->unmap(iodom, entry, + cansleep ? IOMMU_PGF_WAITOK : 0); + KASSERT(error == 0, ("unmap %p error %d", iodom, error)); + TAILQ_REMOVE(entries, entry, dmamap_link); + iommu_domain_free_entry(entry, true); + } + + if (TAILQ_EMPTY(entries)) + return; + + panic("entries map is not empty"); +} + +int +iommu_register(struct iommu_unit *iommu) +{ + struct iommu_entry *entry; + + mtx_init(&iommu->lock, "IOMMU", NULL, MTX_DEF); + + entry = malloc(sizeof(struct iommu_entry), M_IOMMU, M_WAITOK | M_ZERO); + entry->iommu = iommu; + + IOMMU_LIST_LOCK(); + LIST_INSERT_HEAD(&iommu_list, entry, next); + IOMMU_LIST_UNLOCK(); + + sysctl_ctx_init(&iommu->sysctl_ctx); + iommu_init_busdma(iommu); + + return (0); +} + +int +iommu_unregister(struct iommu_unit *iommu) +{ + struct iommu_entry *entry, *tmp; + + IOMMU_LIST_LOCK(); + LIST_FOREACH_SAFE(entry, &iommu_list, next, tmp) { + if (entry->iommu == iommu) { + LIST_REMOVE(entry, next); + free(entry, M_IOMMU); + } + } + IOMMU_LIST_UNLOCK(); + + iommu_fini_busdma(iommu); + sysctl_ctx_free(&iommu->sysctl_ctx); + + mtx_destroy(&iommu->lock); + + return (0); +} + +struct iommu_unit * +iommu_find(device_t dev, bool verbose) +{ + struct iommu_entry *entry; + struct iommu_unit *iommu; + int error; + + IOMMU_LIST_LOCK(); + LIST_FOREACH(entry, &iommu_list, next) { + iommu = entry->iommu; + error = IOMMU_FIND(iommu->dev, dev); + if (error == 0) { + IOMMU_LIST_UNLOCK(); + return (entry->iommu); + } + } + IOMMU_LIST_UNLOCK(); + + return (NULL); +} + +void +iommu_unit_pre_instantiate_ctx(struct iommu_unit *unit) +{ +} + +void +iommu_domain_unload_entry(struct iommu_map_entry *entry, bool free, + bool cansleep __unused) +{ + + dprintf("%s\n", __func__); + + iommu_domain_free_entry(entry, free); +} + +static void +iommu_init(void) +{ + + sx_init(&iommu_sx, "IOMMU list"); +} + +SYSINIT(iommu, SI_SUB_DRIVERS, SI_ORDER_FIRST, iommu_init, NULL); diff --git a/sys/riscv/iommu/iommu_frontend.h b/sys/riscv/iommu/iommu_frontend.h new file mode 100644 index 000000000000..f42856037383 --- /dev/null +++ b/sys/riscv/iommu/iommu_frontend.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2026 Ruslan Bukin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _ARM64_IOMMU_IOMMU_H_ +#define _ARM64_IOMMU_IOMMU_H_ + +#define IOMMU_PAGE_SIZE 4096 +#define IOMMU_PAGE_MASK (IOMMU_PAGE_SIZE - 1) + +int iommu_unregister(struct iommu_unit *unit); +int iommu_register(struct iommu_unit *unit); +struct iommu_ctx * iommu_get_ctx_ofw(device_t dev, int channel); + +#endif /* _ARM64_IOMMU_IOMMU_H_ */ diff --git a/sys/riscv/iommu/iommu_if.m b/sys/riscv/iommu/iommu_if.m new file mode 100644 index 000000000000..56a04b7b8638 --- /dev/null +++ b/sys/riscv/iommu/iommu_if.m @@ -0,0 +1,147 @@ +#- +# SPDX-License-Identifier: BSD-2-Clause +#: +# Copyright (c) 2026 Ruslan Bukin +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# + +#include "opt_platform.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef FDT +#include +#include +#include +#endif + +INTERFACE iommu; + +# +# Check if the iommu controller dev is responsible to serve traffic +# for a given child. +# +METHOD int find { + device_t dev; + device_t child; +}; + +# +# Map a virtual address VA to a physical address PA. +# +METHOD int map { + device_t dev; + struct iommu_domain *iodom; + vm_offset_t va; + vm_page_t *ma; + bus_size_t size; + vm_prot_t prot; +}; + +# +# Unmap a virtual address VA. +# +METHOD int unmap { + device_t dev; + struct iommu_domain *iodom; + vm_offset_t va; + bus_size_t size; +}; + +# +# Allocate an IOMMU domain. +# +METHOD struct iommu_domain * domain_alloc { + device_t dev; + struct iommu_unit *iommu; +}; + +# +# Release all the resources held by IOMMU domain. +# +METHOD void domain_free { + device_t dev; + struct iommu_domain *iodom; +}; + +# +# Find a domain allocated for a dev. +# +METHOD struct iommu_domain * domain_lookup { + device_t dev; +}; + +# +# Find an allocated context for a device. +# +METHOD struct iommu_ctx * ctx_lookup { + device_t dev; + device_t child; +}; + +# +# Allocate a new iommu context. +# +METHOD struct iommu_ctx * ctx_alloc { + device_t dev; + struct iommu_domain *iodom; + device_t child; + bool disabled; +}; + +# +# Initialize the new iommu context. +# +METHOD int ctx_init { + device_t dev; + struct iommu_ctx *ioctx; +}; + +# +# Free the iommu context. +# +METHOD bool ctx_free { + device_t dev; + struct iommu_ctx *ioctx; +}; + +#ifdef FDT +# +# Notify controller we have machine-dependent data. +# +METHOD int ofw_md_data { + device_t dev; + struct iommu_ctx *ioctx; + pcell_t *cells; + int ncells; +}; +#endif diff --git a/sys/riscv/iommu/iommu_pci.c b/sys/riscv/iommu/iommu_pci.c new file mode 100644 index 000000000000..2ec5b2dc95b5 --- /dev/null +++ b/sys/riscv/iommu/iommu_pci.c @@ -0,0 +1,172 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2026 Ruslan Bukin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include + +#define PCI_DEVICE_ID_REDHAT_RISCV_IOMMU 0x0014 +#define PCI_VENDOR_ID_REDHAT 0x1b36 +#define PCI_DEVICE_ID_RIVOS_RISCV_IOMMU_GA 0x0008 +#define PCI_VENDOR_ID_RIVOS 0x1efd + +static int +iommu_pci_probe(device_t dev) +{ + uint16_t vendor_id, device_id; + + vendor_id = pci_get_vendor(dev); + device_id = pci_get_device(dev); + + if (vendor_id == PCI_VENDOR_ID_REDHAT && + device_id == PCI_DEVICE_ID_REDHAT_RISCV_IOMMU) { + device_set_desc(dev, "RedHat IOMMU"); + return (BUS_PROBE_DEFAULT); + } + + if (vendor_id == PCI_VENDOR_ID_RIVOS && + device_id == PCI_DEVICE_ID_RIVOS_RISCV_IOMMU_GA) { + device_set_desc(dev, "Rivos IOMMU"); + return (BUS_PROBE_DEFAULT); + } + + return (ENXIO); +} + +static int +iommu_pci_attach(device_t dev) +{ + struct riscv_iommu_unit *unit; + struct riscv_iommu_softc *sc; + struct iommu_unit *iommu; + phandle_t node; + int count; + int error; + int rid; + int i; + + sc = device_get_softc(dev); + sc->dev = dev; + + node = ofw_bus_get_node(dev); + + pci_enable_busmaster(dev); + + rid = PCIR_BAR(0); + sc->res[0] = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, + RF_ACTIVE); + if (sc->res[0] == NULL) { + device_printf(dev, "couldn't map memory\n"); + goto error; + } + + count = 4; + if (pci_alloc_msix(dev, &count) == 0) { + for (i = 0; i < 4; i++) { + rid = i; + sc->res[1 + i] = bus_alloc_resource_any(dev, + SYS_RES_IRQ, &rid, RF_ACTIVE); + if (sc->res[i + 1] == NULL) { + device_printf(dev, "Can't allocate IRQ " + " resource.\n"); + goto error; + } + } + } else + device_printf(dev, "Can't allocate MSI-X interrupts." + " Ignoring.\n"); + + error = riscv_iommu_attach(dev); + if (error) + goto error; + + unit = &sc->unit; + unit->dev = dev; + + iommu = &unit->iommu; + iommu->dev = dev; + + LIST_INIT(&unit->domain_list); + + sc->xref = OF_xref_from_node(node); + + error = iommu_register(iommu); + if (error) { + device_printf(dev, "Failed to register RISC-V IOMMU.\n"); + goto error; + } + + return (0); + +error: + if (sc->res[0]) + bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(0), + sc->res[0]); + + for (i = 0; i < 4; i++) + if (sc->res[i + 1]) + bus_release_resource(dev, SYS_RES_IRQ, i, + sc->res[i + 1]); + + return (error); +} + +static device_method_t riscv_iommu_pci_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, iommu_pci_probe), + DEVMETHOD(device_attach, iommu_pci_attach), + DEVMETHOD_END +}; + +DEFINE_CLASS_1(riscv_iommu, riscv_iommu_pci_driver, + riscv_iommu_pci_methods, sizeof(struct riscv_iommu_softc), + riscv_iommu_driver); +DRIVER_MODULE(riscv_iommu, pci, riscv_iommu_pci_driver, NULL, NULL); diff --git a/sys/riscv/iommu/iommu_pmap.c b/sys/riscv/iommu/iommu_pmap.c new file mode 100644 index 000000000000..751ba4cf1ac3 --- /dev/null +++ b/sys/riscv/iommu/iommu_pmap.c @@ -0,0 +1,629 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2026 Ruslan Bukin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +/* + * Boundary values for the page table page index space: + * + * L3 pages: [0, NUL2E) + * L2 pages: [NUL2E, NUL2E + NUL1E) + * L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E) + * + * Note that these ranges are used in both SV39 and SV48 mode. In SV39 mode the + * ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages + * in a set of page tables. + */ +#define NUL0E Ln_ENTRIES +#define NUL1E (Ln_ENTRIES * NUL0E) +#define NUL2E (Ln_ENTRIES * NUL1E) + +#define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT)) +#define pmap_l2_pindex(v) ((v) >> L2_SHIFT) + +#define pmap_clear(pte) pmap_store(pte, 0) +#define pmap_clear_bits(pte, bits) atomic_clear_64(pte, bits) +#define pmap_load_store(pte, entry) atomic_swap_64(pte, entry) +#define pmap_load_clear(pte) pmap_load_store(pte, 0) +#define pmap_load(pte) atomic_load_64(pte) +#define pmap_store(pte, entry) atomic_store_64(pte, entry) +#define pmap_store_bits(pte, bits) atomic_set_64(pte, bits) + +#define pmap_l0_index(va) (((va) >> L0_SHIFT) & Ln_ADDR_MASK) +#define pmap_l1_index(va) (((va) >> L1_SHIFT) & Ln_ADDR_MASK) +#define pmap_l2_index(va) (((va) >> L2_SHIFT) & Ln_ADDR_MASK) +#define pmap_l3_index(va) (((va) >> L3_SHIFT) & Ln_ADDR_MASK) + +#define PTE_TO_PHYS(pte) \ + ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE) +#define L2PTE_TO_PHYS(l2) \ + ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT) +#define L1PTE_TO_PHYS(l1) \ + ((((l1) & ~PTE_HI_MASK) >> PTE_PPN2_S) << L1_SHIFT) +#define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte)) + +/********************/ +/* Inline functions */ +/********************/ + +static __inline pd_entry_t * +pmap_l0(struct riscv_iommu_pmap *pmap, vm_offset_t va) +{ + KASSERT(pmap->pm_mode != PMAP_MODE_SV39, + ("%s: in SV39 mode", __func__)); + KASSERT(VIRT_IS_VALID(va), + ("%s: malformed virtual address %#lx", __func__, va)); + return (&pmap->pm_top[pmap_l0_index(va)]); +} + +static __inline pd_entry_t * +pmap_l0_to_l1(struct riscv_iommu_pmap *pmap, pd_entry_t *l0, vm_offset_t va) +{ + vm_paddr_t phys; + pd_entry_t *l1; + + KASSERT(pmap->pm_mode != PMAP_MODE_SV39, + ("%s: in SV39 mode", __func__)); + phys = PTE_TO_PHYS(pmap_load(l0)); + l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); + + return (&l1[pmap_l1_index(va)]); +} + +static __inline pd_entry_t * +pmap_l1(struct riscv_iommu_pmap *pmap, vm_offset_t va) +{ + pd_entry_t *l0; + + KASSERT(VIRT_IS_VALID(va), + ("%s: malformed virtual address %#lx", __func__, va)); + if (pmap->pm_mode == PMAP_MODE_SV39) { + return (&pmap->pm_top[pmap_l1_index(va)]); + } else { + l0 = pmap_l0(pmap, va); + if ((pmap_load(l0) & PTE_V) == 0) + return (NULL); + if ((pmap_load(l0) & PTE_RX) != 0) + return (NULL); + return (pmap_l0_to_l1(pmap, l0, va)); + } +} + +static __inline pd_entry_t * +pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va) +{ + vm_paddr_t phys; + pd_entry_t *l2; + + phys = PTE_TO_PHYS(pmap_load(l1)); + l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); + + return (&l2[pmap_l2_index(va)]); +} + +static __inline pd_entry_t * +pmap_l2(struct riscv_iommu_pmap *pmap, vm_offset_t va) +{ + pd_entry_t *l1; + + l1 = pmap_l1(pmap, va); + if (l1 == NULL) + return (NULL); + if ((pmap_load(l1) & PTE_V) == 0) + return (NULL); + if ((pmap_load(l1) & PTE_RX) != 0) + return (NULL); + + return (pmap_l1_to_l2(l1, va)); +} + +static __inline pt_entry_t * +pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va) +{ + vm_paddr_t phys; + pt_entry_t *l3; + + phys = PTE_TO_PHYS(pmap_load(l2)); + l3 = (pd_entry_t *)PHYS_TO_DMAP(phys); + + return (&l3[pmap_l3_index(va)]); +} + +static __inline pt_entry_t * +pmap_l3(struct riscv_iommu_pmap *pmap, vm_offset_t va) +{ + pd_entry_t *l2; + + l2 = pmap_l2(pmap, va); + if (l2 == NULL) + return (NULL); + if ((pmap_load(l2) & PTE_V) == 0) + return (NULL); + if ((pmap_load(l2) & PTE_RX) != 0) + return (NULL); + + return (pmap_l2_to_l3(l2, va)); +} + +static __inline void +pmap_resident_count_inc(struct riscv_iommu_pmap *pmap, int count) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + pmap->sp_resident_count += count; +} + +static __inline void +pmap_resident_count_dec(struct riscv_iommu_pmap *pmap, int count) +{ + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + KASSERT(pmap->sp_resident_count >= count, + ("pmap %p resident count underflow %ld %d", pmap, + pmap->sp_resident_count, count)); + pmap->sp_resident_count -= count; +} + +/*************************************************** + * Page table page management routines..... + ***************************************************/ + +int +iommu_pmap_pinit(struct riscv_iommu_pmap *pmap, enum pmap_mode pm_mode) +{ + vm_paddr_t topphys; + vm_page_t m; + + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO | + VM_ALLOC_WAITOK); + topphys = VM_PAGE_TO_PHYS(m); + pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys); + pmap->pm_mode = pm_mode; + + switch (pm_mode) { + case PMAP_MODE_SV39: + pmap->pm_satp = SATP_MODE_SV39; + break; + case PMAP_MODE_SV48: + pmap->pm_satp = SATP_MODE_SV48; + break; + default: + panic("Unknown virtual memory system"); + }; + + pmap->pm_satp |= (topphys >> PAGE_SHIFT); + +#ifdef INVARIANTS + pmap->sp_resident_count = 0; +#endif + + mtx_init(&pmap->pm_mtx, "iommu pmap", NULL, MTX_DEF); + + return (1); +} + +/* + * Release any resources held by the given physical map. + * Called when a pmap initialized by pmap_pinit is being released. + * Should only be called if the map contains no valid mappings. + */ +void +iommu_pmap_release(struct riscv_iommu_pmap *pmap) +{ + vm_page_t m; + + KASSERT(pmap->sp_resident_count == 0, + ("pmap_release: pmap resident count %ld != 0", + pmap->sp_resident_count)); + + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top)); + vm_page_unwire_noq(m); + vm_page_free_zero(m); + mtx_destroy(&pmap->pm_mtx); +} + +/* + * This routine is called if the desired page table page does not exist. + * + * If page table page allocation fails, this routine may sleep before + * returning NULL. It sleeps only if a lock pointer was given. + * + * Note: If a page allocation fails at page table level two or three, + * one or two pages may be held during the wait, only to be released + * afterwards. This conservative approach is easily argued to avoid + * race conditions. + */ +static vm_page_t +_pmap_alloc_l3(struct riscv_iommu_pmap *pmap, vm_pindex_t ptepindex) +{ + vm_page_t m, pdpg; + pt_entry_t entry; + vm_paddr_t phys; + pn_t pn; + + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + + /* + * Allocate a page table page. + */ + m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO); + if (m == NULL) { + /* + * Indicate the need to retry. While waiting, the page table + * page may have been allocated. + */ + return (NULL); + } + m->pindex = ptepindex; + + /* + * Map the pagetable page into the process address space, if + * it isn't already there. + */ + pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT; + if (ptepindex >= NUL2E + NUL1E) { + pd_entry_t *l0; + vm_pindex_t l0index; + + KASSERT(pmap->pm_mode != PMAP_MODE_SV39, + ("%s: pindex %#lx in SV39 mode", __func__, ptepindex)); + KASSERT(ptepindex < NUL2E + NUL1E + NUL0E, + ("%s: pindex %#lx out of range", __func__, ptepindex)); + + l0index = ptepindex - (NUL2E + NUL1E); + l0 = &pmap->pm_top[l0index]; + KASSERT((pmap_load(l0) & PTE_V) == 0, + ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0))); + + entry = PTE_V | (pn << PTE_PPN0_S); + pmap_store(l0, entry); + } else if (ptepindex >= NUL2E) { + pd_entry_t *l0, *l1; + vm_pindex_t l0index, l1index; + + l1index = ptepindex - NUL2E; + if (pmap->pm_mode == PMAP_MODE_SV39) { + l1 = &pmap->pm_top[l1index]; + } else { + l0index = l1index >> Ln_ENTRIES_SHIFT; + l0 = &pmap->pm_top[l0index]; + if (pmap_load(l0) == 0) { + /* Recurse to allocate the L1 page. */ + if (_pmap_alloc_l3(pmap, + NUL2E + NUL1E + l0index) == NULL) + goto fail; + phys = PTE_TO_PHYS(pmap_load(l0)); + } else { + phys = PTE_TO_PHYS(pmap_load(l0)); + pdpg = PHYS_TO_VM_PAGE(phys); + pdpg->ref_count++; + } + l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); + l1 = &l1[ptepindex & Ln_ADDR_MASK]; + } + KASSERT((pmap_load(l1) & PTE_V) == 0, + ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1))); + + entry = PTE_V | (pn << PTE_PPN0_S); + pmap_store(l1, entry); + } else { + vm_pindex_t l0index, l1index; + pd_entry_t *l0, *l1, *l2; + + l1index = ptepindex >> (L1_SHIFT - L2_SHIFT); + if (pmap->pm_mode == PMAP_MODE_SV39) { + l1 = &pmap->pm_top[l1index]; + if (pmap_load(l1) == 0) { + /* recurse for allocating page dir */ + if (_pmap_alloc_l3(pmap, NUL2E + l1index) + == NULL) + goto fail; + } else { + pdpg = PTE_TO_VM_PAGE(pmap_load(l1)); + pdpg->ref_count++; + } + } else { + l0index = l1index >> Ln_ENTRIES_SHIFT; + l0 = &pmap->pm_top[l0index]; + if (pmap_load(l0) == 0) { + /* Recurse to allocate the L1 entry. */ + if (_pmap_alloc_l3(pmap, NUL2E + l1index) + == NULL) + goto fail; + phys = PTE_TO_PHYS(pmap_load(l0)); + l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); + l1 = &l1[l1index & Ln_ADDR_MASK]; + } else { + phys = PTE_TO_PHYS(pmap_load(l0)); + l1 = (pd_entry_t *)PHYS_TO_DMAP(phys); + l1 = &l1[l1index & Ln_ADDR_MASK]; + if (pmap_load(l1) == 0) { + /* Recurse to allocate the L2 page. */ + if (_pmap_alloc_l3(pmap, + NUL2E + l1index) == NULL) + goto fail; + } else { + pdpg = PTE_TO_VM_PAGE(pmap_load(l1)); + pdpg->ref_count++; + } + } + } + + phys = PTE_TO_PHYS(pmap_load(l1)); + l2 = (pd_entry_t *)PHYS_TO_DMAP(phys); + l2 = &l2[ptepindex & Ln_ADDR_MASK]; + KASSERT((pmap_load(l2) & PTE_V) == 0, + ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2))); + + entry = PTE_V | (pn << PTE_PPN0_S); + pmap_store(l2, entry); + } + + pmap_resident_count_inc(pmap, 1); + + return (m); + +fail: + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); +} + +/* + * Remove a single IOMMU entry. + */ +int +iommu_pmap_remove(struct riscv_iommu_pmap *pmap, vm_offset_t va) +{ + pt_entry_t *l3; + int rc; + + PMAP_LOCK(pmap); + + l3 = pmap_l3(pmap, va); + if (l3 != NULL) { + pmap_resident_count_dec(pmap, 1); + pmap_clear(l3); + rc = KERN_SUCCESS; + } else + rc = KERN_FAILURE; + + PMAP_UNLOCK(pmap); + + return (rc); +} + +/* Add a single IOMMU entry. This function does not sleep. */ +int +iommu_pmap_enter(struct riscv_iommu_pmap *pmap, vm_offset_t va, vm_paddr_t pa, + vm_prot_t prot, u_int flags) +{ + pd_entry_t *l2, l2e; + pt_entry_t new_l3; + pt_entry_t *l3; + vm_page_t mpte; + pn_t pn; + int rv; + + pn = (pa / PAGE_SIZE); + + new_l3 = PTE_V | PTE_R | PTE_A; + if (prot & VM_PROT_EXECUTE) + new_l3 |= PTE_X; + if (flags & VM_PROT_WRITE) + new_l3 |= PTE_D; + if (prot & VM_PROT_WRITE) + new_l3 |= PTE_W; + if (va < VM_MAX_USER_ADDRESS) + new_l3 |= PTE_U; + + new_l3 |= (pn << PTE_PPN0_S); + new_l3 |= PTE_MA_IO; + + /* + * Set modified bit gratuitously for writeable mappings if + * the page is unmanaged. We do not want to take a fault + * to do the dirty bit accounting for these mappings. + */ + if (prot & VM_PROT_WRITE) + new_l3 |= PTE_D; + + CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa); + + mpte = NULL; + PMAP_LOCK(pmap); + + l2 = pmap_l2(pmap, va); + if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 && + ((l2e & PTE_RWX) == 0)) { + l3 = pmap_l2_to_l3(l2, va); + } else if (va < VM_MAXUSER_ADDRESS) { + mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va)); + if (mpte == NULL) { + CTR0(KTR_PMAP, "pmap_enter: mpte == NULL"); + rv = KERN_RESOURCE_SHORTAGE; + goto out; + } + l3 = pmap_l3(pmap, va); + } else + panic("pmap_enter: missing L3 table for kernel va %#lx", va); + + KASSERT((pmap_load(l3) & PTE_V) == 0, ("l3 is valid")); + + pmap_store(l3, new_l3); + pmap_resident_count_inc(pmap, 1); + + rv = KERN_SUCCESS; +out: + PMAP_UNLOCK(pmap); + + return (rv); +} + +static void +iommu_pmap_remove_pages_sv48(struct riscv_iommu_pmap *pmap) +{ + pd_entry_t l0e, *l1, l1e, *l2, l2e, *l3, l3e; + vm_paddr_t pa0, pa1, pa; + vm_page_t m0, m1, m; + int i, j, k, l; + + PMAP_LOCK(pmap); + + for (i = 0; i < Ln_ENTRIES; i++) { + l0e = pmap->pm_top[i]; + if ((l0e & PTE_V) == 0) + continue; + pa0 = PTE_TO_PHYS(l0e); + m0 = PHYS_TO_VM_PAGE(pa0); + l1 = (pd_entry_t *)PHYS_TO_DMAP(pa0); + + for (j = 0; j < Ln_ENTRIES; j++) { + l1e = l1[j]; + if ((l1e & PTE_V) == 0) + continue; + pa1 = PTE_TO_PHYS(l1e); + m1 = PHYS_TO_VM_PAGE(pa1); + l2 = (pd_entry_t *)PHYS_TO_DMAP(pa1); + + for (k = 0; k < Ln_ENTRIES; k++) { + l2e = l2[k]; + if ((l2e & PTE_V) == 0) + continue; + pa = PTE_TO_PHYS(l2e); + m = PHYS_TO_VM_PAGE(pa); + l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); + + for (l = 0; l < Ln_ENTRIES; l++) { + l3e = l3[l]; + if ((l3e & PTE_V) == 0) + continue; + panic("%s: l3e found (idx %d %d %d %d)", + __func__, i, j, k, l); + } + + vm_page_unwire_noq(m1); + vm_page_unwire_noq(m); + pmap_resident_count_dec(pmap, 1); + vm_page_free(m); + pmap_clear(&l2[k]); + } + + vm_page_unwire_noq(m0); + pmap_resident_count_dec(pmap, 1); + vm_page_free(m1); + pmap_clear(&l1[j]); + } + + pmap_resident_count_dec(pmap, 1); + vm_page_free(m0); + pmap_clear(&pmap->pm_top[i]); + } + + KASSERT(pmap->sp_resident_count == 0, + ("Invalid resident count %jd", pmap->sp_resident_count)); + + PMAP_UNLOCK(pmap); +} + +static void +iommu_pmap_remove_pages_sv39(struct riscv_iommu_pmap *pmap) +{ + pd_entry_t l1e, *l2, l2e, *l3, l3e; + vm_paddr_t pa1, pa; + vm_page_t m1, m; + int j, k, l; + + PMAP_LOCK(pmap); + + for (j = 0; j < Ln_ENTRIES; j++) { + l1e = pmap->pm_top[j]; + if ((l1e & PTE_V) == 0) + continue; + pa1 = PTE_TO_PHYS(l1e); + m1 = PHYS_TO_VM_PAGE(pa1); + l2 = (pd_entry_t *)PHYS_TO_DMAP(pa1); + + for (k = 0; k < Ln_ENTRIES; k++) { + l2e = l2[k]; + if ((l2e & PTE_V) == 0) + continue; + pa = PTE_TO_PHYS(l2e); + m = PHYS_TO_VM_PAGE(pa); + l3 = (pt_entry_t *)PHYS_TO_DMAP(pa); + + for (l = 0; l < Ln_ENTRIES; l++) { + l3e = l3[l]; + if ((l3e & PTE_V) == 0) + continue; + panic("%s: l3e found (idx %d %d %d)", + __func__, j, k, l); + } + + vm_page_unwire_noq(m1); + vm_page_unwire_noq(m); + pmap_resident_count_dec(pmap, 1); + vm_page_free(m); + pmap_clear(&l2[k]); + } + + pmap_resident_count_dec(pmap, 1); + vm_page_free(m1); + pmap_clear(&pmap->pm_top[j]); + } + + KASSERT(pmap->sp_resident_count == 0, + ("Invalid resident count %jd", pmap->sp_resident_count)); + + PMAP_UNLOCK(pmap); +} + +void +iommu_pmap_remove_pages(struct riscv_iommu_pmap *pmap) +{ + + switch (pmap->pm_mode) { + case PMAP_MODE_SV39: + iommu_pmap_remove_pages_sv39(pmap); + break; + case PMAP_MODE_SV48: + iommu_pmap_remove_pages_sv48(pmap); + break; + default: + panic("Unknown virtual memory system"); + } +} diff --git a/sys/riscv/iommu/iommu_pmap.h b/sys/riscv/iommu/iommu_pmap.h new file mode 100644 index 000000000000..25347e239119 --- /dev/null +++ b/sys/riscv/iommu/iommu_pmap.h @@ -0,0 +1,49 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2026 Ruslan Bukin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _RISCV_IOMMU_IOMMU_PMAP_H_ +#define _RISCV_IOMMU_IOMMU_PMAP_H_ + +struct riscv_iommu_pmap { + pd_entry_t *pm_top; + enum pmap_mode pm_mode; + uint64_t pm_satp; + struct mtx pm_mtx; +#ifdef INVARIANTS + long sp_resident_count; +#endif +}; + +int iommu_pmap_enter(struct riscv_iommu_pmap *pmap, vm_offset_t va, + vm_paddr_t pa, vm_prot_t prot, u_int flags); +int iommu_pmap_remove(struct riscv_iommu_pmap *pmap, vm_offset_t va); +void iommu_pmap_remove_pages(struct riscv_iommu_pmap *pmap); +int iommu_pmap_pinit(struct riscv_iommu_pmap *pmap, enum pmap_mode pm_mode); +void iommu_pmap_release(struct riscv_iommu_pmap *pmap); +void iommu_pmap_remove_pages(struct riscv_iommu_pmap *pmap); + +#endif /* !_RISCV_IOMMU_IOMMU_PMAP_H_ */